diff --git a/CHANGES.md b/CHANGES.md index 393d3565..b899be18 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,6 +24,15 @@ installation method we recommend in the [Nextstrain installation documentation](https://docs.nextstrain.org/page/install.html). In that case, a supported Python version is always bundled with `nextstrain`. +## Features + +* `nextstrain build` now supports two new options when using the AWS Batch + runtime: [`--exclude-from-upload`][] and [`--exclude-from-download`][]. The + former is useful for avoiding the upload of large, ancillary files not needed + by the build. The latter exists to parallel the former and make it easier to + exclude files from both upload and download. + ([#370](https://github.com/nextstrain/cli/pull/370)) + ## Improvements * The Conda runtime now uses Micromamba 1.5.8 (upgraded from 1.1.0) to manage @@ -32,6 +41,19 @@ supported Python version is always bundled with `nextstrain`. the Conda runtime. ([#367](https://github.com/nextstrain/cli/pull/367)) +## Bug fixes + +* The [`--download`][] option of `nextstrain build` now supports passing _only_ + negated patterns (e.g. `!…` and `!(…)`). All files which _don't_ match the + negated patterns will be downloaded. Previously, no files were downloaded + unless at least one positive pattern was given. + ([#370](https://github.com/nextstrain/cli/pull/370)) + + +[`--exclude-from-upload`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-exclude-from-upload +[`--exclude-from-download`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-exclude-from-download +[`--download`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-download + # 8.3.0 (30 April 2024) diff --git a/doc/commands/build.rst b/doc/commands/build.rst index a2c61e99..e1acc08c 100644 --- a/doc/commands/build.rst +++ b/doc/commands/build.rst @@ -97,6 +97,8 @@ options wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), and negation (``!…``). + Patterns should be relative to the build directory. + @@ -104,6 +106,58 @@ options Do not download any files from the remote build when it completes. Currently only supported when also using :option:`--aws-batch`. +.. option:: --exclude-from-download + + Exclude files matching ```` from being downloaded from + the remote build. Equivalent to passing a negated pattern to + :option:`--download`. That is, the following are equivalent:: + + --exclude-from-download 'xyz' + --download '!xyz' + + Refer to :option:`--download` for usage details, but note that this + option doesn't support already-negated patterns (e.g. ``!…`` or + ``!(…)``). + + This option exists to parallel :option:`--exclude-from-upload`. + + + + +.. option:: --exclude-from-upload + + Exclude files matching ```` from being uploaded as part of + the remote build. Shell-style advanced globbing is supported, but + be sure to escape wildcards or quote the whole pattern so your + shell doesn't expand them. May be passed more than once. + Currently only supported when also using :option:`--aws-batch`. + Default is to upload the entire pathogen build directory (except + for some ancillary files which are always excluded). + + Note that files excluded from upload may still be downloaded from + the remote build, e.g. if they're created by it, and if downloaded + will overwrite the local files. Use :option:`--no-download` to + avoid downloading any files, or :option:`--exclude-from-download` + to avoid downloading specific files, e.g.:: + + --exclude-from-upload 'results/**' \ + --exclude-from-download 'results/**' + + Your shell's brace expansion can also be used to shorten this, e.g.:: + + --exclude-from-{up,down}load='results/**' + + Besides basic glob features like single-part wildcards (``*``), + character classes (``[…]``), and brace expansion (``{…, …}``), + several advanced globbing features are also supported: multi-part + wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), + and negation (``!…``). + + Patterns should be relative to the build directory. + + + + .. option:: --no-logs Do not show the log messages of the remote build. Currently only supported when also using :option:`--aws-batch`. Default is to show all log messages, even when attaching to a completed build. diff --git a/nextstrain/cli/command/build.py b/nextstrain/cli/command/build.py index 1c489304..02000d8f 100644 --- a/nextstrain/cli/command/build.py +++ b/nextstrain/cli/command/build.py @@ -104,6 +104,8 @@ def register_parser(subparser): wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), and negation (``!…``). + Patterns should be relative to the build directory. + {SKIP_AUTO_DEFAULT_IN_HELP} """), default = True, @@ -117,6 +119,66 @@ def register_parser(subparser): dest = "download", action = "store_false") + parser.add_argument( + "--exclude-from-download", + metavar = "", + help = dedent(f"""\ + Exclude files matching ```` from being downloaded from + the remote build. Equivalent to passing a negated pattern to + :option:`--download`. That is, the following are equivalent:: + + --exclude-from-download 'xyz' + --download '!xyz' + + Refer to :option:`--download` for usage details, but note that this + option doesn't support already-negated patterns (e.g. ``!…`` or + ``!(…)``). + + This option exists to parallel :option:`--exclude-from-upload`. + + {SKIP_AUTO_DEFAULT_IN_HELP} + """), + dest = "download", + type = lambda pattern: f"!{pattern}", + action = AppendOverwriteDefault) + + parser.add_argument( + "--exclude-from-upload", + metavar = "", + help = dedent(f"""\ + Exclude files matching ```` from being uploaded as part of + the remote build. Shell-style advanced globbing is supported, but + be sure to escape wildcards or quote the whole pattern so your + shell doesn't expand them. May be passed more than once. + Currently only supported when also using :option:`--aws-batch`. + Default is to upload the entire pathogen build directory (except + for some ancillary files which are always excluded). + + Note that files excluded from upload may still be downloaded from + the remote build, e.g. if they're created by it, and if downloaded + will overwrite the local files. Use :option:`--no-download` to + avoid downloading any files, or :option:`--exclude-from-download` + to avoid downloading specific files, e.g.:: + + --exclude-from-upload 'results/**' \\ + --exclude-from-download 'results/**' + + Your shell's brace expansion can also be used to shorten this, e.g.:: + + --exclude-from-{{up,down}}load='results/**' + + Besides basic glob features like single-part wildcards (``*``), + character classes (``[…]``), and brace expansion (``{{…, …}}``), + several advanced globbing features are also supported: multi-part + wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), + and negation (``!…``). + + Patterns should be relative to the build directory. + + {SKIP_AUTO_DEFAULT_IN_HELP} + """), + action = "append") + # A --logs option doesn't make much sense right now for most of our # runtimes, but I can see how it might in the future. So we're ready if # that future comes to pass, set up --no-logs as if there's a --logs option diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py index 78ad13a4..eac5bfb4 100644 --- a/nextstrain/cli/runner/aws_batch/__init__.py +++ b/nextstrain/cli/runner/aws_batch/__init__.py @@ -213,7 +213,7 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None print_stage("Uploading %s to S3" % local_workdir) bucket = s3.bucket(opts.s3_bucket) - remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id) + remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload) print("uploaded:", s3.object_url(remote_workdir)) diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py index 6aa7331a..02343e1f 100644 --- a/nextstrain/cli/runner/aws_batch/s3.py +++ b/nextstrain/cli/runner/aws_batch/s3.py @@ -38,17 +38,20 @@ def object_from_url(s3url: str) -> S3Object: return bucket(url.netloc).Object(key) -def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object: +def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object: """ Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for the given *run_id*. + An optional list of *patterns* (shell-style advanced globs) can be passed + to selectively exclude part of the local *workdir* from being uploaded. + Returns the S3.Object instance of the uploaded archive. """ remote_workdir = bucket.Object(run_id + ".zip") - excluded = path_matcher([ + always_excluded = path_matcher([ # Jobs don't use .git, so save the bandwidth/space/time. It may also # contain information in history that shouldn't be uploaded. ".git/", @@ -65,6 +68,13 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object: "__pycache__/", ]) + if patterns: + deselected = glob_matcher(patterns, root = workdir) + else: + deselected = lambda path: False + + excluded = lambda path: always_excluded(path) or deselected(path) + # Stream writes directly to the remote ZIP file remote_file: Any with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file: @@ -86,6 +96,15 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str to selectively download only part of the remote workdir. """ + # XXX TODO: Consider extending excluded patterns with the globs from any + # --exclude-from-upload options given, so that files excluded from upload + # are by default also excluded from download. That behaviour would seems + # sometimes useful but other times confusing. See also my discussion with + # Trevor and James about this.¹ + # -trs, 23 May 2024 + # + # ¹ + excluded = path_matcher([ # Jobs don't use .git and it may also contain information that # shouldn't be uploaded. diff --git a/nextstrain/cli/util.py b/nextstrain/cli/util.py index 7ffc65e3..eb834dcb 100644 --- a/nextstrain/cli/util.py +++ b/nextstrain/cli/util.py @@ -13,7 +13,7 @@ from shlex import quote as shquote from shutil import which from textwrap import dedent, indent -from wcmatch.glob import globmatch, GLOBSTAR, EXTGLOB, BRACE, MATCHBASE, NEGATE +from wcmatch.glob import globmatch, GLOBSTAR, EXTGLOB, BRACE, MATCHBASE, NEGATE, NEGATEALL, REALPATH from .__version__ import __version__ from .debug import debug from .types import RunnerModule, RunnerTestResults, RunnerTestResultStatus @@ -537,20 +537,23 @@ def split_image_name(name: str, implicit_latest: bool = True) -> Tuple[str, Opti return (repository, tag) -def glob_matcher(patterns: Sequence[str]) -> Callable[[Union[str, Path]], bool]: +def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path]], bool]: """ Generate a function which matches a string or path-like object against the list of Bash-like glob *patterns*. + If a *root* path is provided, paths will be matched against *patterns* as + real filesystem paths relative to the given *root* directory. + See :func:`glob_match` for supported pattern features. """ def matcher(path: Union[str, Path]) -> bool: - return glob_match(path, patterns) + return glob_match(path, patterns, root = root) return matcher -def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]]) -> bool: +def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool: """ Test if *path* matches any of the glob *patterns*. @@ -559,10 +562,17 @@ def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]]) -> b globbing features are also supported: multi-part wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), basename matching for patterns containing only a single path part, and negation (``!…``). + Passing only negated patterns is explicitly supported and will match + anything that doesn't match the given patterns. + + If a *root* path is provided, *path* will be matched against *patterns* as + real filesystem paths relative to the given *root* directory. Implemented with with :func:`wcmatch.glob.globmatch`. """ - return globmatch(path, patterns, flags = GLOBSTAR | BRACE | EXTGLOB | MATCHBASE | NEGATE) + if root: + path = Path(path).resolve(strict = True).relative_to(root) + return globmatch(path, patterns, flags = GLOBSTAR | BRACE | EXTGLOB | MATCHBASE | NEGATE | NEGATEALL | (REALPATH if root else 0), root_dir = root) def runner_tests_ok(tests: RunnerTestResults) -> bool: diff --git a/tests/command-build.py b/tests/command-build.py new file mode 100644 index 00000000..78d4ff86 --- /dev/null +++ b/tests/command-build.py @@ -0,0 +1,29 @@ +from nextstrain.cli import make_parser + + +def pytest_build_download_options(): + parser = make_parser() + + opts = parser.parse_args(["build", "."]) + assert opts.download is True + + opts = parser.parse_args(["build", "--no-download", "."]) + assert opts.download is False + + opts = parser.parse_args(["build", "--download", "x", "."]) + assert opts.download == ["x"] + + opts = parser.parse_args(["build", "--download", "x", "--download", "y", "."]) + assert opts.download == ["x", "y"] + + opts = parser.parse_args(["build", "--exclude-from-download", "z", "."]) + assert opts.download == ["!z"] + + opts = parser.parse_args(["build", "--exclude-from-download", "z", "--exclude-from-download", "a", "."]) + assert opts.download == ["!z", "!a"] + + opts = parser.parse_args(["build", "--download", "y", "--exclude-from-download", "z", "."]) + assert opts.download == ["y", "!z"] + + opts = parser.parse_args(["build", "--download", "y", "--download", "!z", "."]) + assert opts.download == ["y", "!z"]