Skip to content

Commit 9ee336c

Browse files
committed
runner.aws_batch: Support overlay volumes (e.g. --augur)
The latest image, nextstrain/base:build-20250304T041009Z, provides a mechanism in the entrypoint to support bundling of overlays in the workdir ZIP archive by way of upwards-traversing archive member paths.¹ For example, an Augur overlay is bundled into the workdir ZIP archive with member paths starting with ../augur/ and ends up overwriting files in the image's /nextstrain/augur/ since the AWS Batch workdir is always /nextstrain/build/. Extending overlay support to AWS Batch has been very low priority and something I thought was unlikely to ever happen. However, in the course of working on AWS Batch support for `nextstrain run`, it turned out that the easiest, most straightforward, and most minimal change was to bundle the pathogen source directory with the working analysis directory in the workdir ZIP archive, i.e. as a "pathogen" overlay. This naturally led to supporting overlays more generally, which I've done here. One caveat compared to overlays in runtimes with the concept of volume mounts (Docker, Singularity) is that any files in the image that do not exist in the overlaid files will remain present since nothing removes them. This is potentially problematic and will be annoying if run into but most of the time should be a non-issue. It is also solvable if we care to exert the effort and extra code to do so. I don't right now. ¹ <nextstrain/docker-base#241>
1 parent 410c735 commit 9ee336c

File tree

5 files changed

+84
-21
lines changed

5 files changed

+84
-21
lines changed

nextstrain/cli/command/build.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@
2424
from ..argparse import add_extended_help_flags, AppendOverwriteDefault, SKIP_AUTO_DEFAULT_IN_HELP
2525
from ..debug import debug
2626
from ..errors import UsageError, UserError
27-
from ..runner import docker, singularity
28-
from ..util import byte_quantity, runner_name, warn
27+
from ..runner import docker, singularity, aws_batch
28+
from ..util import byte_quantity, runner_name, split_image_name, warn
2929
from ..volume import NamedVolume
3030

3131

@@ -306,10 +306,31 @@ def assert_overlay_volumes_support(opts):
306306
"""
307307
overlay_volumes = opts.volumes
308308

309-
if overlay_volumes and opts.__runner__ not in {docker, singularity}:
309+
if not overlay_volumes:
310+
return
311+
312+
if opts.__runner__ not in {docker, singularity, aws_batch}:
310313
raise UserError(f"""
311314
The {runner_name(opts.__runner__)} runtime does not support overlays (e.g. of {overlay_volumes[0].name}).
312-
Use the Docker or Singularity runtimes (via --docker or --singularity) if overlays are necessary.
315+
Use the Docker, Singularity, or AWS Batch runtimes (via --docker,
316+
--singularity, or --aws-batch) if overlays are necessary.
317+
""")
318+
319+
if opts.__runner__ is aws_batch and not docker.image_supports(docker.IMAGE_FEATURE.aws_batch_overlays, opts.image):
320+
raise UserError(f"""
321+
The Nextstrain runtime image version in use
322+
323+
{opts.image}
324+
325+
is too old to support overlays (e.g. of {overlay_volumes[0].name}) with AWS Batch.
326+
327+
If overlays are necessary, please update the runtime image to at
328+
least version
329+
330+
{split_image_name(opts.image)[0]}:{docker.IMAGE_FEATURE.aws_batch_overlays.value}
331+
332+
using `nextstrain update docker`. Alternatively, use the Docker or
333+
Singularity runtime (via --docker or --singularity) instead.
313334
""")
314335

315336

nextstrain/cli/runner/aws_batch/__init__.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,15 @@ def register_arguments(parser) -> None:
164164

165165

166166
def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None, memory: int = None) -> int:
167+
docker.assert_volumes_exist(opts.volumes)
168+
169+
# "build" is a special-cased volume for AWS Batch, as /nextstrain/build is
170+
# the fixed initial working directory and what we'll populate by extracting
171+
# a ZIP file.
167172
build_volume = next((v for v in opts.volumes if v and v.name == "build"), None)
168173

174+
opts.volumes = [v for v in opts.volumes if v is not build_volume]
175+
169176
# Unlike other runners, the AWS Batch runner currently *requires* a working
170177
# dir in most usages. This is ok as we only provide the AWS Batch runner
171178
# for commands which also require a working dir (e.g. build), whereas other
@@ -213,8 +220,11 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None
213220
# Upload workdir to S3 so it can be fetched at the start of the Batch job.
214221
print_stage("Uploading %s to S3" % local_workdir)
215222

223+
for volume in opts.volumes:
224+
print(" and %s as %s" % (volume.src.resolve(strict = True), volume.name))
225+
216226
bucket = s3.bucket(opts.s3_bucket)
217-
remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload)
227+
remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload, opts.volumes)
218228

219229
print("uploaded:", s3.object_url(remote_workdir))
220230

nextstrain/cli/runner/aws_batch/s3.py

Lines changed: 40 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,24 @@
55
import binascii
66
import boto3
77
import fsspec
8+
import os.path
89
from botocore.config import Config
910
from botocore.exceptions import ClientError
1011
from calendar import timegm
1112
from os import utime
12-
from pathlib import Path
13+
from pathlib import Path, PurePath
1314
from time import struct_time
14-
from typing import Callable, Generator, Iterable, List, Optional, Any
15+
from typing import Callable, Generator, Iterable, List, Optional, Any, Union
1516
from urllib.parse import urlparse
1617
from zipfile import ZipFile, ZipInfo
1718
from ... import env
19+
from ...debug import DEBUGGING
1820
from ...types import Env, S3Bucket, S3Object
1921
from ...util import glob_matcher
22+
from ...volume import NamedVolume
2023

2124

22-
PathMatcher = Callable[[Path], bool]
25+
PathMatcher = Callable[[Union[Path, PurePath]], bool]
2326

2427

2528
def object_url(object: S3Object) -> str:
@@ -38,10 +41,10 @@ def object_from_url(s3url: str) -> S3Object:
3841
return bucket(url.netloc).Object(key)
3942

4043

41-
def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object:
44+
def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None, volumes: List[NamedVolume] = []) -> S3Object:
4245
"""
43-
Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for
44-
the given *run_id*.
46+
Upload a ZIP archive of the local *workdir* (and optional *volumes*) to the
47+
remote S3 *bucket* for the given *run_id*.
4548
4649
An optional list of *patterns* (shell-style advanced globs) can be passed
4750
to selectively exclude part of the local *workdir* from being uploaded.
@@ -80,8 +83,23 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[
8083
with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file:
8184
with ZipFile(remote_file, "w") as zipfile:
8285
for path in walk(workdir, excluded):
83-
print("zipping:", path)
84-
zipfile.write(str(path), str(path.relative_to(workdir)))
86+
dst = path.relative_to(workdir)
87+
print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
88+
zipfile.write(str(path), dst)
89+
90+
for volume in volumes:
91+
# XXX TODO: Use the "walk_up" argument to Path.relative_to()
92+
# once we require Python 3.12.
93+
# -trs, 10 Feb 2025
94+
try:
95+
prefix = PurePath(volume.name).relative_to("build")
96+
except ValueError:
97+
prefix = PurePath("..", volume.name)
98+
99+
for path in walk(volume.src, always_excluded):
100+
dst = prefix / path.relative_to(volume.src)
101+
print(f"zipping: {path}" + (f" (as {dst})" if DEBUGGING else ""))
102+
zipfile.write(str(path), dst)
85103

86104
return remote_workdir
87105

@@ -138,9 +156,19 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str
138156

139157
# …and extract its contents to the workdir.
140158
with ZipFile(remote_file) as zipfile:
141-
for member in zipfile.infolist():
142-
path = Path(member.filename)
143-
159+
# Completely ignore archive members with unsafe paths (absolute or
160+
# upwards-traversing) instead of relying on zipfile.extract()'s
161+
# default of munging them to be "safe". Munging seems more
162+
# confusing than skipping, and skipping is essential in the case of
163+
# additional volumes being uploaded in the workdir initially.
164+
safe_members = [
165+
(filename, member)
166+
for filename, member
167+
in ((PurePath(m.filename), m) for m in zipfile.infolist())
168+
if not filename.is_absolute()
169+
and os.path.pardir not in filename.parts ]
170+
171+
for path, member in safe_members:
144172
# Inclusions negate exclusions but aren't an exhaustive
145173
# list of what is included.
146174
if selected(path) and (included(path) or not excluded(path)):
@@ -179,7 +207,7 @@ def path_matcher(patterns: Iterable[str]) -> PathMatcher:
179207
Generate a function which matches a Path object against the list of glob
180208
*patterns*.
181209
"""
182-
def matches(path: Path) -> bool:
210+
def matches(path: Union[Path, PurePath]) -> bool:
183211
return any(map(path.match, patterns))
184212

185213
return matches

nextstrain/cli/runner/docker.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ class IMAGE_FEATURE(Enum):
112112
# /nextstrain/env.d support first present.
113113
envd = "build-20230613T204512Z"
114114

115+
# AWS Batch: support for volume overlays (i.e. ../ in archive members and
116+
# file overwriting) in ZIP extraction.
117+
aws_batch_overlays = "build-20250304T041009Z"
118+
115119

116120
def register_arguments(parser) -> None:
117121
# Docker development options

nextstrain/cli/util.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from importlib.metadata import distribution as distribution_info, PackageNotFoundError
1010
from typing import Any, Callable, Iterable, Literal, Mapping, List, Optional, Sequence, Tuple, Union, overload
1111
from packaging.version import parse as parse_version
12-
from pathlib import Path
12+
from pathlib import Path, PurePath
1313
from shlex import quote as shquote
1414
from shutil import which
1515
from textwrap import dedent, indent
@@ -553,7 +553,7 @@ def split_image_name(name: str, implicit_latest: bool = True) -> Tuple[str, Opti
553553
return (repository, tag)
554554

555555

556-
def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path]], bool]:
556+
def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Union[str, Path, PurePath]], bool]:
557557
"""
558558
Generate a function which matches a string or path-like object against the
559559
list of Bash-like glob *patterns*.
@@ -563,13 +563,13 @@ def glob_matcher(patterns: Sequence[str], *, root: Path = None) -> Callable[[Uni
563563
564564
See :func:`glob_match` for supported pattern features.
565565
"""
566-
def matcher(path: Union[str, Path]) -> bool:
566+
def matcher(path: Union[str, Path, PurePath]) -> bool:
567567
return glob_match(path, patterns, root = root)
568568

569569
return matcher
570570

571571

572-
def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool:
572+
def glob_match(path: Union[str, Path, PurePath], patterns: Union[str, Sequence[str]], *, root: Path = None) -> bool:
573573
"""
574574
Test if *path* matches any of the glob *patterns*.
575575

0 commit comments

Comments
 (0)