From a9b9d3ee0363a20719d19ee7da93d655a04cc394 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 31 Oct 2024 14:09:28 -0700 Subject: [PATCH] runner.aws_batch: Gracefully handle errors when fetching logs for completed jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For completed jobs, it's more useful to continue on with printing the job status (e.g. success or reason for failure) and downloading job results even if an error occurs when fetching logs. As a concrete example, we've observed cases where a failed job has a log stream associated with it in Batch but that log stream does not actually exist in CloudWatch Logs.¹ The log fetch error hid the reason for job failure, hampering troubleshooting. ¹ --- CHANGES.md | 8 ++++++++ nextstrain/cli/runner/aws_batch/__init__.py | 8 ++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 66f53250..0b81313a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,14 @@ development source code and as such may not be routinely kept up to date. # __NEXT__ +## Bug fixes + +* It is no longer a fatal error if the logs for a completed AWS Batch job + cannot be fetched for some reason. `nextstrain build` will warn about the + error but continue on with printing the job status (e.g. success or reason + for failure) and, if applicable, downloading job results. + ([#406](https://github.com/nextstrain/cli/pull/406)) + # 8.5.3 (3 September 2024) diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py index eac5bfb4..b3a0fdee 100644 --- a/nextstrain/cli/runner/aws_batch/__init__.py +++ b/nextstrain/cli/runner/aws_batch/__init__.py @@ -77,6 +77,7 @@ Default for ``--aws-batch-memory``. """ +import botocore.exceptions import os import shlex from datetime import datetime @@ -404,8 +405,11 @@ def interrupt_signaled(sig, frame): # The watcher never started, so we probably missed the # transition to running. Display the whole log now! if opts.logs: - for entry in job.log_entries(): - print_job_log(entry) + try: + for entry in job.log_entries(): + print_job_log(entry) + except botocore.exceptions.ClientError as error: + warn(f"Unable to fetch job logs: {error}") print_stage( "Job %s after %0.1f minutes" % (job.status, job.elapsed_time / 60),