Skip to content

Commit 6e83831

Browse files
Nikhil Bansal, Orbax Authors
authored and committed
Cloud Run OSS IT
PiperOrigin-RevId: 880680159
1 parent f75444c commit 6e83831

File tree

6 files changed

+261
-1
lines changed

6 files changed

+261
-1
lines changed

.github/workflows/build_image.yml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
# Copyright 2025 Google LLC
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
# https://www.apache.org/licenses/LICENSE-2.0
7+
# Unless required by applicable law or agreed to in writing, software
8+
# distributed under the License is distributed on an "AS IS" BASIS,
9+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10+
# See the License for the specific language governing permissions and
11+
# limitations under the License.
12+
# This workflow builds and pushes the Orbax-checkpoint benchmark runner Docker image to GCR.
13+
name: Build and Push Orbax-checkpoint Docker Images
14+
on:
15+
schedule:
16+
# Run the job daily at 12AM UTC
17+
- cron: '0 0 * * *'
18+
push:
19+
branches:
20+
- "test_*"
21+
permissions:
22+
contents: read
23+
jobs:
24+
build_and_push:
25+
runs-on: linux-x86-n2-16-buildkit
26+
container: google/cloud-sdk:524.0.0
27+
steps:
28+
- name: Checkout Orbax-checkpoint
29+
uses: actions/checkout@v5
30+
- name: Mark git repositories as safe
31+
run: git config --global --add safe.directory '*'
32+
- name: Configure Docker
33+
run: gcloud auth configure-docker us-docker.pkg.dev,gcr.io -q
34+
- name: Set up Docker BuildX
35+
uses: docker/setup-buildx-action@v3.11.1
36+
with:
37+
driver: remote
38+
endpoint: tcp://localhost:1234
39+
- name: Build and push Docker image
40+
uses: docker/build-push-action@v6
41+
with:
42+
push: true
43+
context: .
44+
file: ./checkpoint/orbax/checkpoint/_src/testing/oss/Dockerfile
45+
tags: gcr.io/orbax-checkpoint/orbax-benchmarks-it-runner:latest
46+
cache-from: type=gha
47+
outputs: type=image,compression=zstd,force-compression=true
48+
build-args: |
49+
DEVICE=tpu
50+
JAX_VERSION=newest
51+
BRANCH=main
52+
GITHUB_RUNNER=true
53+
- name: Add tags to Docker images
54+
shell: bash
55+
run: |
56+
SOURCE_IMAGE="gcr.io/orbax-checkpoint/orbax-benchmarks-it-runner"
57+
# Add Orbax-checkpoint tag
58+
orbax_hash=$(git rev-parse --short HEAD)
59+
gcloud container images add-tag "$SOURCE_IMAGE:latest" "$SOURCE_IMAGE:orbax_${orbax_hash}" --quiet

checkpoint/orbax/checkpoint/_src/testing/benchmarks/emergency_checkpoint_manager_benchmark.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ def test_fn(
196196

197197
with metrics.measure("create_directories"):
198198
if jax.process_index() == 0:
199-
persistent_directory.mkdir(parents=True)
199+
persistent_directory.mkdir(parents=True, exist_ok=True)
200200
local_directory.mkdir(parents=True, exist_ok=True)
201201
multihost.sync_global_processes("create directories")
202202

checkpoint/orbax/checkpoint/_src/testing/benchmarks/xpk/launch_xpk.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,11 @@
274274
False,
275275
'If True, run workload creation and execution twice to test restart.',
276276
)
277+
_SKIP_VALIDATION = flags.DEFINE_boolean(
278+
'skip_validation',
279+
False,
280+
'If True, skip validation of the benchmark results.',
281+
)
277282

278283
# --- Pathways Flags ---
279284
# Pathways uses a "Sidecar" architecture on XPK:
@@ -646,6 +651,8 @@ def construct_xpk_command(
646651
base_cmd.append('--enable-ops-agent')
647652
if _RAMDISK_DIRECTORY.value is not None:
648653
base_cmd.append('--mtc-enabled')
654+
if _SKIP_VALIDATION.value:
655+
base_cmd.append('--skip-validation')
649656

650657
if _ENABLE_PATHWAYS.value:
651658
if not _PATHWAYS_SERVER_IMAGE.value:
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# Base image argument (defaulting to slim python image)
2+
ARG BASE_IMAGE=python:3.11-slim
3+
FROM $BASE_IMAGE
4+
5+
WORKDIR /app
6+
7+
# 1. Install System Dependencies
8+
# common utils + git (needed for checkout)
9+
# --no-install-recommends limits bloat
10+
# python3-pip is standard in python images, no need to install
11+
RUN apt-get update && apt-get install -y --no-install-recommends \
12+
git \
13+
dnsutils \
14+
&& rm -rf /var/lib/apt/lists/*
15+
16+
RUN apt-get update && apt-get install -y --no-install-recommends \
17+
git \
18+
dnsutils \
19+
curl \
20+
ca-certificates \
21+
gnupg \
22+
apt-transport-https \
23+
gettext-base \
24+
gawk \
25+
&& curl -fsSL https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg \
26+
&& echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee /etc/apt/sources.list.d/google-cloud-sdk.list \
27+
&& apt-get update \
28+
&& apt-get install -y --no-install-recommends \
29+
google-cloud-cli \
30+
google-cloud-cli-gke-gcloud-auth-plugin \
31+
kubectl \
32+
&& apt-get clean \
33+
&& rm -rf /var/lib/apt/lists/*
34+
35+
COPY ./checkpoint/orbax/checkpoint/_src/testin[g] /app/orbax_repo/checkpoint/orbax/checkpoint/_src/testing
36+
37+
# 3. Setup Python Environment & Dependencies
38+
# Uninstall pre-installed orbax if present in base image to avoid conflicts
39+
RUN pip uninstall -y orbax-checkpoint orbax || true
40+
41+
# # Create a fake docker binary that always returns success (exit 0)
42+
# RUN echo '#!/bin/bash\nexit 0' > /usr/local/bin/docker && chmod +x /usr/local/bin/docker
43+
44+
# Install dependencies from requirements.txt in the current working directory (/app), if that file exists
45+
RUN if [ -f "requirements.txt" ]; then pip install --no-cache-dir -r requirements.txt; fi
46+
47+
RUN pip install --no-cache-dir gcsfs portpicker clu tensorflow pyyaml
48+
49+
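# 4. Switch to the Orbax source tree and install xpk (Orbax itself is not pip-installed; it is imported from source via PYTHONPATH below)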
50+
WORKDIR /app/orbax_repo/checkpoint
51+
RUN pip install xpk
52+
53+
# 5. Environment Setup
54+
# Set PYTHONPATH so 'import orbax' works from the correctly mapped directory
55+
ENV PYTHONPATH=/app/orbax_repo/checkpoint
56+
57+
58+
CMD ["python3", "orbax/checkpoint/_src/testing/oss/cloud_run_it.py"]
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright 2026 The Orbax Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Runs Orbax benchmarks on GCP."""
16+
17+
import datetime
18+
import os
19+
import subprocess
20+
import sys
21+
import yaml
22+
23+
24+
def run_benchmark(test_config):
25+
"""Runs a single benchmark test based on the given config.
26+
27+
Args:
28+
test_config: A dictionary containing the test configuration.
29+
30+
Returns:
31+
True if benchmark ran successfully, False otherwise.
32+
"""
33+
print(f"Running benchmark: {test_config['name']}")
34+
35+
# Get credentials
36+
try:
37+
subprocess.run(
38+
[
39+
'gcloud',
40+
'container',
41+
'clusters',
42+
'get-credentials',
43+
test_config['cluster_name'],
44+
'--region',
45+
test_config['gcp_region'],
46+
'--project',
47+
test_config['gcp_project'],
48+
],
49+
check=True,
50+
)
51+
except subprocess.CalledProcessError as e:
52+
print(f'Failed to get cluster credentials: {e}')
53+
return False
54+
55+
# Build command
56+
output_dir = os.path.join(
57+
test_config['output_directory'],
58+
datetime.datetime.now().strftime('%Y%m%d'),
59+
)
60+
61+
cmd = [
62+
'python3',
63+
'orbax/checkpoint/_src/testing/benchmarks/xpk/launch_xpk.py',
64+
'--cluster_name',
65+
test_config['cluster_name'],
66+
'--tpu_type',
67+
test_config['tpu_type'],
68+
'--zone',
69+
test_config['zone'],
70+
'--config_file',
71+
test_config['config_file'],
72+
'--docker_image',
73+
test_config['docker_image'],
74+
'--output_directory',
75+
output_dir,
76+
'--num_slices',
77+
str(test_config['num_slices']),
78+
]
79+
if test_config.get('nodelete_cluster_on_completion'):
80+
cmd.append('--nodelete_cluster_on_completion')
81+
if test_config.get('ramdisk_directory'):
82+
cmd.extend(['--ramdisk_directory', test_config['ramdisk_directory']])
83+
if test_config.get('test_restart_workflow'):
84+
cmd.append('--test_restart_workflow')
85+
if test_config.get('verbose'):
86+
cmd.append('--verbose')
87+
if test_config.get('skip_validation'):
88+
cmd.append('--skip_validation')
89+
if test_config.get('enable_pathways'):
90+
cmd.append('--enable_pathways')
91+
92+
print(f"Executing command: {' '.join(cmd)}")
93+
try:
94+
subprocess.run(cmd, check=True)
95+
except subprocess.CalledProcessError as e:
96+
print(f'Benchmark script failed: {e}')
97+
return False
98+
99+
return True
100+
101+
102+
def main():
103+
"""Loads test configurations and runs benchmarks."""
104+
config_path = 'orbax/checkpoint/_src/testing/oss/cloud_run_it.yaml'
105+
with open(config_path, 'r') as f:
106+
config = yaml.safe_load(f)
107+
108+
failures = 0
109+
for test in config.get('tests', []):
110+
if not run_benchmark(test):
111+
failures += 1
112+
113+
if failures:
114+
print(f'{failures} benchmarks failed.')
115+
sys.exit(1)
116+
117+
118+
if __name__ == '__main__':
119+
main()
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
tests:
2+
- name: emergency_checkpoint_manager_benchmark
3+
cluster_name: orbax-cluster-test-mtc
4+
tpu_type: v5litepod-16
5+
zone: us-west1-c
6+
gcp_region: us-west1
7+
gcp_project: orbax-checkpoint
8+
config_file: orbax/checkpoint/_src/testing/benchmarks/configs/emergency_checkpoint_manager_benchmark.yaml
9+
docker_image: gcr.io/orbax-checkpoint/orbax-benchmarks-runner:latest
10+
output_directory: gs://orbax-benchmarks/cloud_runs/
11+
nodelete_cluster_on_completion: true
12+
ramdisk_directory: /local/test
13+
num_slices: 2
14+
test_restart_workflow: false
15+
verbose: true
16+
skip_validation: true
17+
enable_pathways: false

0 commit comments

Comments
 (0)