From a9b9d3ee0363a20719d19ee7da93d655a04cc394 Mon Sep 17 00:00:00 2001
From: Thomas Sibley <tsibley@fredhutch.org>
Date: Thu, 31 Oct 2024 14:09:28 -0700
Subject: [PATCH] runner.aws_batch: Gracefully handle errors when fetching logs
 for completed jobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For completed jobs, it's more useful to continue on with printing the
job status (e.g. success or reason for failure) and downloading job
results even if an error occurs when fetching logs.  As a concrete
example, we've observed cases where a failed job has a log stream
associated with it in Batch but that log stream does not actually exist
in CloudWatch Logs.¹  The log fetch error hid the reason for job
failure, hampering troubleshooting.

¹ <https://bedfordlab.slack.com/archives/C01LCTT7JNN/p1730406138009409>
---
 CHANGES.md                                  | 8 ++++++++
 nextstrain/cli/runner/aws_batch/__init__.py | 8 ++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 66f53250..0b81313a 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,6 +13,14 @@ development source code and as such may not be routinely kept up to date.
 
 # __NEXT__
 
+## Bug fixes
+
+* It is no longer a fatal error if the logs for a completed AWS Batch job
+  cannot be fetched (e.g. a missing CloudWatch log stream).  `nextstrain build`
+  will warn about the error but continue on with printing the job status (e.g.
+  success or reason for failure) and, if applicable, downloading job results.
+  ([#406](https://github.com/nextstrain/cli/pull/406))
+
 
 # 8.5.3 (3 September 2024)
 
diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py
index eac5bfb4..b3a0fdee 100644
--- a/nextstrain/cli/runner/aws_batch/__init__.py
+++ b/nextstrain/cli/runner/aws_batch/__init__.py
@@ -77,6 +77,7 @@
     Default for ``--aws-batch-memory``.
 """
 
+import botocore.exceptions
 import os
 import shlex
 from datetime import datetime
@@ -404,8 +405,11 @@ def interrupt_signaled(sig, frame):
                 # The watcher never started, so we probably missed the
                 # transition to running.  Display the whole log now!
                 if opts.logs:
-                    for entry in job.log_entries():
-                        print_job_log(entry)
+                    try:
+                        for entry in job.log_entries():
+                            print_job_log(entry)
+                    except botocore.exceptions.ClientError as error:
+                        warn(f"Unable to fetch job logs: {error}")
 
             print_stage(
                 "Job %s after %0.1f minutes" % (job.status, job.elapsed_time / 60),