Commit 0954eb7

working version, no dockerfile
Signed-off-by: jwilber <[email protected]>
1 parent 4465b26 commit 0954eb7

File tree: 1 file changed, 146 additions (+) and 139 deletions (−)
@@ -1,82 +1,82 @@
-############################################################
-# Template Type
-# Defines the template type for the job.
-# - convergence_tests: for convergence tests
-# - scdl_performance: for SCDL performance tests
-############################################################
-template_type: convergence_tests
+# @package _global_
+defaults:
+  - /base
+  - _self_
 
-job_name: container_test
+job_name: "conatinertest"
 
 ############################################################
-# Container Runtime
-# Defines the base Docker image and registry auth needed
+# lepton job info
 ############################################################
-container:
-  image: nvcr.io/nvidia/pytorch:25.11-py3
-  registry_auth: lepton-nvidia
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1
+num_nodes: 1
+device_type: gpu
+num_devices: 2
+gpu_type: h100-sxm
+resource_shape: "${device_type}.${num_devices}x${gpu_type}"
 
 ############################################################
-# Environment Variables
-# These keys must be present for the job to authenticate with
-# external services (W&B, Kratos, Lepton) and control runtime caching.
-# HF_HOME is optional but recommended to speed up Hugging Face model loading.
+# kratos info: where to log data
 ############################################################
-environment_variables:
-  - name: WANDB_API_KEY
-    value_from: JWILBER_WANDB_API_KEY
-  - name: KRATOS_SSA_URL
-    value_from: KRATOS_SSA_URL
-  - name: KRATOS_SSA_CLIENT_ID
-    value_from: KRATOS_SSA_CLIENT_ID
-  - name: KRATOS_SSA_SECRET
-    value_from: KRATOS_SSA_SECRET.jwilber
-  - name: LEP_LOGIN_CREDENTIALS
-    value_from: LEP_LOGIN_CREDENTIALS
-  - name: HF_HOME
-    value: /data/esm2/cache
-  - name: HF_TOKEN
-    value_from: HUGGING_FACE_HUB_TOKEN.jwilber
+kratos_subject: "convergence_tests_v0.0.3"
 
 ############################################################
-# Lepton Cluster Selection & Node Group
-# Select the GPU cluster where the job will run.
-# - h100: yo-bom-lepton-001
-# - h200: nv-int-multiteam-nebius-h200-01
-# - a100: az-sat-lepton-001
+# recipe identifiers
+# mostly used for logging and observability
 ############################################################
-node_group: yo-bom-lepton-001
+recipe_subdir: esm2_native_te
+model_type: esm2
+variant: train  # train, finetune
 
-############################################################
-# Shared Mounts
-# Mount paths for accessing shared datasets, model checkpoints,
-# or intermediate artifacts. The NFS source should match the cluster.
-# - yo-bom-lepton-001 uses node-nfs:fs1
-# - nv-int-multiteam-nebius-h200-01 uses node-nfs:lepton-shared-fs
-############################################################
-mount_from: node-nfs:fs1
+# Core identifiers for filtering
+framework: native  # native, accelerate
+precision: fp16  # likely bf16 or fp8
+te_enabled: true
+fp8_enabled: false
+# thd_enabled: false
 
-mounts:
-  - path: /BioNeMo
-    mount_path: /data
-    from_: ${mount_from}
+# Catchall for additional features/configs
+extras: []  # e.g. [thd]
 
 ############################################################
-# W&B Initialization
-# Configure how runs are logged to Weights & Biases.
+# wandb info (total_gpus used for group name)
 ############################################################
+# `total_gpus` calculated from lepton job info above
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
 wandb_init_args:
-  group: "model_convergence__recipes"
-  mode: "online"
+  project: "test_convergence__recipes__${sanitize:${branch}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
+  job_type: "${recipe_subdir}"
+  name: null
 
 ############################################################
-# Git Checkout Options
-# Configure which version of the recipe to pull from GitHub.
-# - `branch`: defaults to main
-# - `commit_sha`: overrides branch if provided
+# task commands
+# shared across all products (if not explicitly overridden)
 ############################################################
-branch: jwilber/lepton-build-container
-commit_sha: ""
+
+# script overrides
+# these should match the keys in the recipe's config file
+model_tag: nvidia/esm2_t36_3B_UR50D
+task_cmd: train_fsdp2  # mfsdp
+num_train_steps: 20_000
+# dataset commands
+micro_batch_size: 16
+load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_streaming: true
+load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995  # pragma: allowlist secret
+
+# lr commands
+num_warmup_steps: 2_000
+# checkpoint controls
+ckpt_dir: ""
+save_checkpoints: false
+save_final_model: false
+resume_from_checkpoint: false
+use_distributed_checkpoint_fsdp2: false
+
+log_to_kratos: false
 
 ############################################################
 # Checkout Script
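
Note: the new config relies on two non-standard OmegaConf resolvers, `multiply` (used for `total_gpus`) and `sanitize` (used in the W&B project and group names). Below is a minimal sketch of how such resolvers could be registered, assuming the recipe loads this file through Hydra/OmegaConf; the actual implementations, and keys pulled in from the `/base` default such as `branch`, live elsewhere in the repo and may differ. The filename is hypothetical.

```python
# Illustrative only: the resolver names come from the config above,
# but their implementations here are assumptions, not from this commit.
import re
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("multiply", lambda a, b: int(a) * int(b))
OmegaConf.register_new_resolver("sanitize", lambda s: re.sub(r"[^A-Za-z0-9_.\-]+", "-", str(s)))

cfg = OmegaConf.load("convergence_test.yaml")  # hypothetical filename for this config
print(cfg.total_gpus)             # ${multiply:2,1} -> 2
print(cfg.wandb_init_args.group)  # "esm2__train_fsdp2__2gpus__h100-sxm"
```
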
@@ -88,95 +88,102 @@ checkout_script: |
   set -euo pipefail
 
   echo "========================================"
-  echo "DIAGNOSTIC: System Capabilities Check"
+  echo "Setting up BioNeMo environment"
   echo "========================================"
 
-  echo -e "\n=== User Info ==="
-  whoami
-  id
-  groups
-  echo "HOME: $HOME"
-  echo "PWD: $PWD"
-
-  echo -e "\n=== Sudo Access ==="
-  if sudo -n true 2>/dev/null; then
-    echo "✓ Sudo available WITHOUT password"
-    sudo -V | head -n 1
-  elif sudo -v 2>/dev/null; then
-    echo "⚠ Sudo available but requires password"
-  else
-    echo "✗ No sudo access"
-  fi
+  # Clone repo
+  git clone https://github.com/NVIDIA/bionemo-framework.git
+  cd bionemo-framework/
+  git checkout jstjohn/evo2_megatron_bridge_recipe
+  # build container from dockerfile here
+  cd bionemo-recipes/recipes/evo2_megatron
 
-  echo -e "\n=== Docker Availability ==="
-  if which docker >/dev/null 2>&1; then
-    echo "✓ Docker binary found: $(which docker)"
-    docker --version || echo "✗ Docker version check failed"
-    if docker info >/dev/null 2>&1; then
-      echo "✓ Docker daemon accessible!"
-      docker info | grep -E "Server Version|Storage Driver|Runtimes"
-    else
-      echo "✗ Docker daemon not accessible (may need sudo or socket permissions)"
-    fi
-  else
-    echo "✗ Docker not installed"
+  # Install uv (if not already available)
+  if ! command -v uv &> /dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="$HOME/.cargo/bin:$PATH"
   fi
 
-  echo -e "\n=== Docker Socket Check ==="
-  if [ -S /var/run/docker.sock ]; then
-    echo "✓ Docker socket exists: /var/run/docker.sock"
-    ls -la /var/run/docker.sock
-    if [ -r /var/run/docker.sock ] && [ -w /var/run/docker.sock ]; then
-      echo "✓ Socket is readable and writable"
-    else
-      echo "⚠ Socket exists but may not be accessible"
-    fi
-  else
-    echo "✗ Docker socket not found"
-  fi
+  # Fix TransformerEngine direct_url issue
+  rm -f /usr/local/lib/python*/dist-packages/transformer_engine-*.dist-info/direct_url.json
 
-  echo -e "\n=== GPU Access ==="
-  if which nvidia-smi >/dev/null 2>&1; then
-    echo "✓ nvidia-smi found"
-    nvidia-smi --query-gpu=name,driver_version --format=csv,noheader | head -n 1
-  else
-    echo "✗ nvidia-smi not found"
-  fi
+  # Create venv with system site packages
+  export UV_LINK_MODE=copy
+  export VIRTUAL_ENV=/workspace/.venv
+  export PATH="$VIRTUAL_ENV/bin:$PATH"
 
-  echo -e "\n=== Package Management ==="
-  if apt-get --version >/dev/null 2>&1; then
-    echo "✓ apt-get available"
-    if sudo -n apt-get update -y >/dev/null 2>&1; then
-      echo "✓ Can run apt-get with sudo"
-    else
-      echo "✗ Cannot run apt-get (no sudo or permission denied)"
-    fi
-  fi
+  uv venv --system-site-packages --seed $VIRTUAL_ENV
 
-  echo -e "\n=== Writable Locations ==="
-  for dir in /tmp $HOME /data; do
-    if [ -d "$dir" ] && [ -w "$dir" ]; then
-      echo "✓ $dir is writable"
-    else
-      echo "✗ $dir not writable or doesn't exist"
-    fi
-  done
+  # Create constraints file
+  pip freeze | grep transformer_engine > pip-constraints.txt
 
-  echo -e "\n=== Installed Tools ==="
-  for tool in git python3 pip curl wget; do
-    if which $tool >/dev/null 2>&1; then
-      echo "✓ $tool: $(which $tool)"
-    else
-      echo "✗ $tool: not found"
-    fi
-  done
+  # Install dependencies
+  uv pip install -r build_requirements.txt --no-build-isolation
+  uv pip install -c pip-constraints.txt -e . --no-build-isolation
 
-  echo -e "\n========================================"
-  echo "DIAGNOSTIC COMPLETE"
   echo "========================================"
+  echo "BioNeMo environment ready!"
+  echo "========================================"
+
+
+run_script: |
+  pwd
+
+  ls
+
+  echo "ls ../../.."
+  ls ../../..
+  echo "ls ../../../.."
+  ls ../../../..
+  echo "ls ../../../../../.."
+  ls ../../../../../..
 
-run_script: ""
 
-script: |
-  ${checkout_script}
-  ${run_script}
+  train_evo2 \
+    --hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_256 \
+    --sharded-eden-data \
+    --seq-length=8192 \
+    --stride 7992 \
+    --sequence-db-dir ../../../../../data/bcr_eden/OG2_database_splits \
+    --train-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__train__short.sqlite \
+    --val-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__validation__short.sqlite \
+    --test-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__test__short.sqlite \
+    --most-recent-k 3 \
+    --max-steps=72926 \
+    --constant-steps 1024 \
+    --seed 1234 \
+    --dataset-seed 1234 \
+    --no-weight-decay-embeddings \
+    --grad-reduce-in-fp32 \
+    --activation-checkpoint-recompute-num-layers 1 \
+    --mixed-precision-recipe bf16-with-fp8-delayed-scaling-mixed \
+    --hybrid-override-pattern SDH*SDHSDH*SDHSDH*SDHSDH* \
+    --use-precision-aware-optimizer \
+    --log-num-zeros-in-grad \
+    --enable-preemption \
+    --no-fp32-residual-connection \
+    --ckpt-async-save \
+    --overlap-grad-reduce \
+    --clip-grad 1 \
+    --eod-pad-in-loss-mask \
+    --wandb-project evo2-recipes-verification \
+    --lr 3e-04 \
+    --wd 0.01 \
+    --min-lr 6e-06 \
+    --warmup-steps 1024 \
+    --attention-dropout 0.001 \
+    --hidden-dropout 0.001 \
+    --eval-iters=10 \
+    --eval-interval=100 \
+    --debug-ddp-parity-freq 100 \
+    --experiment-name=pretrain_striped_hyena_1b_nv_parallel \
+    --result-dir=FIXME \
+    --tensor-model-parallel-size=1 \
+    --context-parallel-size=1 \
+    --pipeline-model-parallel-size=1 \
+    --workers 8 \
+    --log-interval 5 \
+    --no-renormalize-loss \
+    --micro-batch-size=20 \
+    --global-batch-size=960 \
+    --model-size=striped_hyena_1b_nv_parallel
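
Note on the config keys introduced in the first hunk: the `wandb_init_args` mapping and the `load_dataset_kwargs_*` keys are named after `wandb.init()` and `datasets.load_dataset()` parameters. A hedged sketch of how a training script would plausibly consume them follows; the real wiring lives in the bionemo-recipes code and is not part of this commit, and the function name below is illustrative.

```python
# Sketch: plausible consumption of the new config keys. Assumes they map
# directly onto wandb.init() and datasets.load_dataset() arguments, as their
# names suggest; the actual recipe code may differ.
import wandb
from datasets import load_dataset

def init_run_and_data(cfg):
    # wandb_init_args (project, group, job_type, name) map 1:1 onto wandb.init kwargs
    run = wandb.init(**cfg.wandb_init_args)

    # load_dataset_kwargs_* mirror datasets.load_dataset() parameters,
    # with `revision` pinning the dataset to a specific commit
    ds = load_dataset(
        cfg.load_dataset_kwargs_path,                # nvidia/esm2_uniref_pretraining_data
        streaming=cfg.load_dataset_kwargs_streaming,
        revision=cfg.load_dataset_kwargs_revision,
    )
    return run, ds
```
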

0 commit comments