Improve swipe env var handling (#15)
jgadling authored Oct 14, 2021
1 parent a2e6926 commit 955b88d
Showing 5 changed files with 115 additions and 100 deletions.
5 changes: 3 additions & 2 deletions Dockerfile
@@ -1,6 +1,6 @@
FROM ubuntu:20.04
ARG DEBIAN_FRONTEND=noninteractive
ARG MINIWDL_VERSION=1.1.5
ARG MINIWDL_VERSION=1.2.2

LABEL maintainer="IDseq Team idseq-tech@chanzuckerberg.com"

@@ -41,11 +41,11 @@ RUN apt-get -q install -y \
awscli

RUN pip3 install miniwdl==${MINIWDL_VERSION} miniwdl-s3parcp==0.0.5 miniwdl-s3upload==0.0.8
RUN pip3 install https://github.com/chanzuckerberg/miniwdl-plugins/archive/akislyuk-swipe-aspen-vars.zip#subdirectory=sfn-wdl

RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v1.0.1-alpha/s3parcp_1.0.1-alpha_linux_amd64.tar.gz | tar -C /usr/bin -xz s3parcp

ADD https://raw.githubusercontent.com/chanzuckerberg/miniwdl/v${MINIWDL_VERSION}/examples/clean_download_cache.sh /usr/local/bin
ADD scripts/init.sh /usr/local/bin
RUN chmod +x /usr/local/bin/clean_download_cache.sh

# docker.io is the largest package at 250MB+ / half of all package disk space usage.
@@ -56,3 +56,4 @@ RUN cd /usr/bin; curl -O https://amazon-ecr-credential-helper-releases.s3.amazon
RUN chmod +x /usr/bin/docker-credential-ecr-login
RUN mkdir -p /root/.docker
RUN jq -n '.credsStore="ecr-login"' > /root/.docker/config.json
ENTRYPOINT ["/usr/local/bin/init.sh"]
79 changes: 79 additions & 0 deletions scripts/init.sh
@@ -0,0 +1,79 @@
#!/bin/bash
set -a
if [ -f /etc/environment ]; then source /etc/environment; fi
if [ -f /etc/default/locale ]; then source /etc/default/locale; else export LC_ALL=C.UTF-8 LANG=C.UTF-8; fi
set +a

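# Poll EC2 instance metadata (IMDSv2) for spot interruption notices and warn
# loudly if this instance has been scheduled for termination.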
check_for_termination() {
count=0
while true; do
if TOKEN=`curl -m 10 -sX PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600"` && curl -H "X-aws-ec2-metadata-token: $TOKEN" -sf http://169.254.169.254/latest/meta-data/spot/instance-action; then
echo WARNING: THIS SPOT INSTANCE HAS BEEN SCHEDULED FOR TERMINATION >> /dev/stderr
fi
# Print an update roughly every 10 mins (60 iterations x 10s sleep)
if [ $((count++ % 60)) -eq 0 ]; then
echo $(date --iso-8601=seconds) termination check
fi
sleep 10
done
}

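# Report scratch-disk, CPU, and memory utilization to CloudWatch, tagged with
# the current step function state.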
put_metric() {
aws cloudwatch put-metric-data --metric-name $1 --namespace swipe-$DEPLOYMENT_ENVIRONMENT --unit Percent --value $2 --dimensions SFNCurrentState=$SFN_CURRENT_STATE
}

put_metrics() {
while true; do
put_metric ScratchSpaceInUse $(df --output=pcent /mnt | tail -n 1 | cut -f 1 -d %)
put_metric CPULoad $(cat /proc/loadavg | cut -f 1 -d ' ' | cut -f 2 -d .)
put_metric MemoryInUse $(python3 -c 'import psutil; m=psutil.virtual_memory(); print(100*(1-m.available/m.total))')
sleep 60
done
}

check_for_termination &
put_metrics &

mkdir -p /mnt/download_cache; touch /mnt/download_cache/_miniwdl_flock

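# Reap finished workflow run directories (those no longer flock'd by a live
# miniwdl) and trim the download cache to $DOWNLOAD_CACHE_MAX_GB.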
clean_wd() {
(shopt -s nullglob;
for wf_log in /mnt/20??????_??????_*/workflow.log; do
flock -n $wf_log rm -rf $(dirname $wf_log) || true;
done;
flock -x /mnt/download_cache/_miniwdl_flock clean_download_cache.sh /mnt/download_cache $DOWNLOAD_CACHE_MAX_GB)
}
clean_wd
df -h / /mnt
export MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX=$(dirname "$WDL_OUTPUT_URI")
if [ -f /etc/profile ]; then source /etc/profile; fi
miniwdl --version

# Env vars that need to be forwarded to miniwdl's tasks in AWS Batch.
BATCH_SWIPE_ENVVARS="AWS_DEFAULT_REGION DEPLOYMENT_ENVIRONMENT AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"
# Set $WDL_PASSTHRU_ENVVARS to a space-separated list of env var names
# to pass the values of those vars through to miniwdl's task containers.
PASSTHRU_VARS=( $BATCH_SWIPE_ENVVARS $WDL_PASSTHRU_ENVVARS )
PASSTHRU_ARGS=${PASSTHRU_VARS[@]/#/--env }

set -euo pipefail
export CURRENT_STATE=$(echo "$SFN_CURRENT_STATE" | sed -e s/SPOT// -e s/EC2//)
aws s3 cp "$WDL_WORKFLOW_URI" .
aws s3 cp "$WDL_INPUT_URI" wdl_input.json

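# On exit, if miniwdl recorded an error in wdl_output.json, prefer the task's
# structured WDL error (the last line of its stderr file, when it parses as
# JSON with a .wdl_error_message key) and upload the result to $WDL_OUTPUT_URI.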
handle_error() {
OF=wdl_output.json;
EP=.cause.stderr_file;
if jq -re .error $OF; then
if jq -re $EP $OF; then
if tail -n 1 $(jq -r $EP $OF) | jq -re .wdl_error_message; then
tail -n 1 $(jq -r $EP $OF) > $OF;
fi;
fi;
aws s3 cp $OF "$WDL_OUTPUT_URI";
fi
}

trap handle_error EXIT
miniwdl run $PASSTHRU_ARGS --dir /mnt $(basename "$WDL_WORKFLOW_URI") --input wdl_input.json --verbose --error-json -o wdl_output.json
clean_wd
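
For illustration, a minimal sketch of how the env var passthrough in init.sh expands; the WDL_PASSTHRU_ENVVARS value here is hypothetical:

    #!/bin/bash
    # Hypothetical inputs, for illustration only.
    BATCH_SWIPE_ENVVARS="AWS_DEFAULT_REGION DEPLOYMENT_ENVIRONMENT AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"
    WDL_PASSTHRU_ENVVARS="MY_API_TOKEN MY_FEATURE_FLAG"  # normally supplied by the job definition

    # Unquoted expansion word-splits both lists into one array element per name,
    PASSTHRU_VARS=( $BATCH_SWIPE_ENVVARS $WDL_PASSTHRU_ENVVARS )
    # and the ${var/#/prefix} substitution prepends "--env " to every element.
    PASSTHRU_ARGS=${PASSTHRU_VARS[@]/#/--env }

    echo $PASSTHRU_ARGS
    # prints: --env AWS_DEFAULT_REGION --env DEPLOYMENT_ENVIRONMENT
    #         --env AWS_CONTAINER_CREDENTIALS_RELATIVE_URI --env MY_API_TOKEN --env MY_FEATURE_FLAG

miniwdl then receives one --env flag per variable name, forwarding the calling environment's value of each named variable into its task containers.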
89 changes: 0 additions & 89 deletions terraform/modules/swipe-sfn-batch-job/batch_job_container_properties.yml
@@ -1,93 +1,4 @@
image: "${batch_docker_image}"
command:
- "/bin/bash"
- "-c"
- "for i in \"$@\"; do eval \"$i\"; done; cd /"
- "swipe"
- "set -a"
- "if [ -f /etc/environment ]; then source /etc/environment; fi"
- "if [ -f /etc/default/locale ]; then source /etc/default/locale; else export LC_ALL=C.UTF-8 LANG=C.UTF-8; fi"
- "set +a"
- >-
while true; do
if curl -sf http://169.254.169.254/latest/meta-data/spot/instance-action; then
echo WARNING: THIS SPOT INSTANCE HAS BEEN SCHEDULED FOR TERMINATION >> /dev/stderr;
fi;
sleep 10;
done &
- >-
put_metric() {
aws cloudwatch put-metric-data --metric-name $1 --namespace swipe-$DEPLOYMENT_ENVIRONMENT --unit Percent --value $2 --dimensions SFNCurrentState=$SFN_CURRENT_STATE;
}
- >-
while true; do
put_metric ScratchSpaceInUse $(df --output=pcent /mnt | tail -n 1 | cut -f 1 -d %);
put_metric CPULoad $(cat /proc/loadavg | cut -f 1 -d ' ' | cut -f 2 -d .);
put_metric MemoryInUse $(python3 -c 'import psutil; m=psutil.virtual_memory(); print(100*(1-m.available/m.total))');
sleep 60;
done &
- "mkdir -p /mnt/download_cache; touch /mnt/download_cache/_miniwdl_flock"
- >-
clean_wd() {
(shopt -s nullglob;
for wf_log in /mnt/20??????_??????_*/workflow.log; do
flock -n $wf_log rm -rf $(dirname $wf_log) || true;
done;
flock -x /mnt/download_cache/_miniwdl_flock clean_download_cache.sh /mnt/download_cache $DOWNLOAD_CACHE_MAX_GB)
}
- "clean_wd"
- "df -h / /mnt"
- 'export MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX=$(dirname "$WDL_OUTPUT_URI")'
- "if [ -f /etc/profile ]; then source /etc/profile; fi"
- "miniwdl --version"
- "set -euo pipefail"
- 'export CURRENT_STATE=$(echo "$SFN_CURRENT_STATE" | sed -e s/SPOT// -e s/EC2//)'
- 'aws s3 cp "$WDL_WORKFLOW_URI" .'
- 'aws s3 cp "$WDL_INPUT_URI" wdl_input.json'
- >-
handle_error() {
OF=wdl_output.json;
EP=.cause.stderr_file;
if jq -re .error $OF; then
if jq -re $EP $OF; then
if tail -n 1 $(jq -r $EP $OF) | jq -re .wdl_error_message; then
tail -n 1 $(jq -r $EP $OF) > $OF;
fi;
fi;
aws s3 cp $OF "$WDL_OUTPUT_URI";
fi
}
- "trap handle_error EXIT"
- 'miniwdl run --dir /mnt $(basename "$WDL_WORKFLOW_URI") --input wdl_input.json --verbose --log-json -o wdl_output.json'
- 'aws s3 cp wdl_output.json "$WDL_OUTPUT_URI"'
- "clean_wd"
environment:
- name: "WDL_INPUT_URI"
value: "Set this variable to the S3 URI of the WDL input JSON"
- name: "WDL_WORKFLOW_URI"
value: "Set this variable to the S3 URI of the WDL workflow"
- name: "WDL_OUTPUT_URI"
value: "Set this variable to the S3 URI where the WDL output JSON will be written"
- name: "SFN_EXECUTION_ID"
value: "Set this variable to the current step function execution ARN"
- name: "SFN_CURRENT_STATE"
value: "Set this variable to the current step function state name, like HostFilterEC2 or HostFilterSPOT"
- name: "DEPLOYMENT_ENVIRONMENT"
value: "${deployment_environment}"
- name: "AWS_DEFAULT_REGION"
value: "${aws_region}"
- name: "MINIWDL__S3PARCP__DOCKER_IMAGE"
value: "${batch_docker_image}"
- name: "MINIWDL__DOWNLOAD_CACHE__PUT"
value: "true"
- name: "MINIWDL__DOWNLOAD_CACHE__GET"
value: "true"
- name: "MINIWDL__DOWNLOAD_CACHE__DIR"
value: /mnt/download_cache
- name: "MINIWDL__DOWNLOAD_CACHE__DISABLE_PATTERNS"
value: '["s3://swipe-samples-*/*"]'
- name: "DOWNLOAD_CACHE_MAX_GB"
value: "500"
jobRoleArn: "${batch_job_role_arn}"
volumes:
- name: "scratch"
36 changes: 27 additions & 9 deletions terraform/modules/swipe-sfn-batch-job/main.tf
@@ -5,6 +5,32 @@ data "aws_caller_identity" "current" {}
locals {
app_slug = "${var.app_name}-${var.deployment_environment}"
ecr_url = "${data.aws_caller_identity.current.account_id}.dkr.ecr.${data.aws_region.current.name}.amazonaws.com"
container_config = yamldecode(templatefile("${path.module}/batch_job_container_properties.yml", {
app_name = var.app_name,
batch_job_role_arn = aws_iam_role.swipe_batch_main_job.arn,

# TODO: fix docker image
# batch_docker_image = var.use_ecr_private_registry ? "${local.ecr_url}/${var.batch_job_docker_image_name}" : var.batch_job_docker_image_name,
batch_docker_image = "ghcr.io/chanzuckerberg/swipe:sha-c145a0ab"
}))
batch_env_vars = merge(var.extra_env_vars, {
"WDL_INPUT_URI" = "Set this variable to the S3 URI of the WDL input JSON",
"WDL_WORKFLOW_URI" = "Set this variable to the S3 URI of the WDL workflow",
"WDL_OUTPUT_URI" = "Set this variable to the S3 URI where the WDL output JSON will be written",
"SFN_EXECUTION_ID" = "Set this variable to the current step function execution ARN",
"SFN_CURRENT_STATE" = "Set this variable to the current step function state name, like HostFilterEC2 or HostFilterSPOT",
"DEPLOYMENT_ENVIRONMENT" = var.deployment_environment,
"AWS_DEFAULT_REGION" = data.aws_region.current.name,
"MINIWDL__S3PARCP__DOCKER_IMAGE" = var.use_ecr_private_registry ? "${local.ecr_url}/${var.batch_job_docker_image_name}" : var.batch_job_docker_image_name,
"MINIWDL__DOWNLOAD_CACHE__PUT" = "true",
"MINIWDL__DOWNLOAD_CACHE__GET" = "true",
"MINIWDL__DOWNLOAD_CACHE__DIR" = "/mnt/download_cache",
"MINIWDL__DOWNLOAD_CACHE__DISABLE_PATTERNS" = "[\"s3://swipe-samples-*/*\"]",
"DOWNLOAD_CACHE_MAX_GB" = "500",
"WDL_PASSTHRU_ENVVARS" = join(" ", [for k, v in var.extra_env_vars : k]),
})
container_env_vars = { "environment" : [for k, v in local.batch_env_vars : { "name" : k, "value" : v }] }
final_container_config = merge(local.container_config, local.container_env_vars)
}

resource "aws_iam_policy" "swipe_batch_main_job" {
@@ -45,13 +71,5 @@ resource "aws_batch_job_definition" "swipe_main" {
timeout {
attempt_duration_seconds = var.batch_job_timeout_seconds
}
container_properties = jsonencode(yamldecode(templatefile("${path.module}/batch_job_container_properties.yml", {
app_name = var.app_name,
deployment_environment = var.deployment_environment,
# TODO: fix docker image
# batch_docker_image = var.use_ecr_private_registry ? "${local.ecr_url}/${var.batch_job_docker_image_name}" : var.batch_job_docker_image_name,
batch_docker_image = "ghcr.io/chanzuckerberg/swipe:sha-c145a0ab"
aws_region = data.aws_region.current.name,
batch_job_role_arn = aws_iam_role.swipe_batch_main_job.arn,
})))
container_properties = jsonencode(local.final_container_config)
}
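
For reference, a minimal standalone sketch of the decode-merge-encode pattern used above; all names and values here are illustrative:

    locals {
      # Stand-in for the decoded batch_job_container_properties.yml template.
      example_template = {
        "image"      = "ghcr.io/chanzuckerberg/swipe:example"
        "jobRoleArn" = "arn:aws:iam::123456789012:role/example"
      }

      example_env_vars = {
        "DEPLOYMENT_ENVIRONMENT" = "staging"
        "WDL_PASSTHRU_ENVVARS"   = "MY_API_TOKEN"
      }

      # AWS Batch expects environment as a list of {name, value} objects.
      example_environment = {
        "environment" : [for k, v in local.example_env_vars : { "name" : k, "value" : v }]
      }

      # merge() grafts the generated environment list onto the template, so the
      # YAML no longer hard-codes any env var entries.
      example_container_config = merge(local.example_template, local.example_environment)
    }

    output "example_container_properties" {
      value = jsonencode(local.example_container_config)
    }

Moving the environment list out of the YAML template into HCL is what lets var.extra_env_vars be merged in without re-templating the file.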
6 changes: 6 additions & 0 deletions terraform/modules/swipe-sfn-batch-job/variables.tf
@@ -4,6 +4,12 @@ variable "app_name" {
default = "swipe"
}

variable "extra_env_vars" {
description = "Additional env vars to set on batch task definitions"
type = map(string)
default = {}
}

variable "batch_job_docker_image_name" {
description = "Docker image (name or name:tag) that will be used for Batch jobs"
type = string
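
A hypothetical caller of the module using the new variable; the module address and values are illustrative, and required arguments are omitted:

    module "swipe_batch_job" {
      source = "./terraform/modules/swipe-sfn-batch-job"
      # ... required arguments omitted ...

      # Each entry is set on the Batch job definition and, via the generated
      # WDL_PASSTHRU_ENVVARS list, forwarded into miniwdl's task containers.
      extra_env_vars = {
        "MY_API_TOKEN"    = "example-token"
        "MY_FEATURE_FLAG" = "enabled"
      }
    }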
