1- # ###########################################################
2- # Template Type
3- # Defines the template type for the job.
4- # - convergence_tests: for convergence tests
5- # - scdl_performance: for SCDL performance tests
6- # ###########################################################
7- template_type : convergence_tests
1+ # @package _global_
2+ defaults :
3+ - /base
4+ - _self_
85
9- job_name : container_test
6+ job_name : " containertest "
107
118# ###########################################################
12- # Container Runtime
13- # Defines the base Docker image and registry auth needed
9+ # lepton job info
1410# ###########################################################
15- container :
16- image : nvcr.io/nvidia/pytorch:25.11-py3
17- registry_auth : lepton-nvidia
11+ node_group : yo-bom-lepton-001
12+ mount_from : node-nfs:fs1
13+ num_nodes : 1
14+ device_type : gpu
15+ num_devices : 2
16+ gpu_type : h100-sxm
17+ resource_shape : " ${device_type}.${num_devices}x${gpu_type}"
1818
1919# ###########################################################
20- # Environment Variables
21- # These keys must be present for the job to authenticate with
22- # external services (W&B, Kratos, Lepton) and control runtime caching.
23- # HF_HOME is optional but recommended to speed up Hugging Face model loading.
20+ # kratos info: where to log data
2421# ###########################################################
25- environment_variables :
26- - name : WANDB_API_KEY
27- value_from : JWILBER_WANDB_API_KEY
28- - name : KRATOS_SSA_URL
29- value_from : KRATOS_SSA_URL
30- - name : KRATOS_SSA_CLIENT_ID
31- value_from : KRATOS_SSA_CLIENT_ID
32- - name : KRATOS_SSA_SECRET
33- value_from : KRATOS_SSA_SECRET.jwilber
34- - name : LEP_LOGIN_CREDENTIALS
35- value_from : LEP_LOGIN_CREDENTIALS
36- - name : HF_HOME
37- value : /data/esm2/cache
38- - name : HF_TOKEN
39- value_from : HUGGING_FACE_HUB_TOKEN.jwilber
22+ kratos_subject : " convergence_tests_v0.0.3"
4023
4124# ###########################################################
42- # Lepton Cluster Selection & Node Group
43- # Select the GPU cluster where the job will run.
44- # - h100: yo-bom-lepton-001
45- # - h200: nv-int-multiteam-nebius-h200-01
46- # - a100: az-sat-lepton-001
25+ # recipe identifiers
26+ # mostly used for logging and observability
4727# ###########################################################
48- node_group : yo-bom-lepton-001
28+ recipe_subdir : esm2_native_te
29+ model_type : esm2
30+ variant : train # train, finetune
4931
50- # ###########################################################
51- # Shared Mounts
52- # Mount paths for accessing shared datasets, model checkpoints,
53- # or intermediate artifacts. The NFS source should match the cluster.
54- # - yo-bom-lepton-001 uses node-nfs:fs1
55- # - nv-int-multiteam-nebius-h200-01 uses node-nfs:lepton-shared-fs
56- # ###########################################################
57- mount_from : node-nfs:fs1
32+ # Core identifiers for filtering
33+ framework : native # native, accelerate
34+ precision : fp16 # likely bf16 or fp8
35+ te_enabled : true
36+ fp8_enabled : false
37+ # thd_enabled: false
5838
59- mounts :
60- - path : /BioNeMo
61- mount_path : /data
62- from_ : ${mount_from}
39+ # Catchall for additional features/configs
40+ extras : [] # e.g. [thd]
6341
6442# ###########################################################
65- # W&B Initialization
66- # Configure how runs are logged to Weights & Biases.
43+ # wandb info (total_gpus used for group name)
6744# ###########################################################
45+ # `total_gpus` calculated from lepton job info above
46+ total_gpus : ${multiply:${num_devices},${num_nodes}}
47+
6848wandb_init_args :
69- group : " model_convergence__recipes"
70- mode : " online"
49+ project : " test_convergence__recipes__${sanitize:${branch}}"
50+ group : " ${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
51+ job_type : " ${recipe_subdir}"
52+ name : null
7153
7254# ###########################################################
73- # Git Checkout Options
74- # Configure which version of the recipe to pull from GitHub.
75- # - `branch`: defaults to main
76- # - `commit_sha`: overrides branch if provided
55+ # task commands
56+ # shared across all products (if not explicitly overridden)
7757# ###########################################################
78- branch : jwilber/lepton-build-container
79- commit_sha : " "
58+
59+ # script overrides
60+ # these should match the keys in the recipe's config file
61+ model_tag : nvidia/esm2_t36_3B_UR50D
62+ task_cmd : train_fsdp2 # mfsdp
63+ num_train_steps : 20_000
64+ # dataset commands
65+ micro_batch_size : 16
66+ load_dataset_kwargs_path : nvidia/esm2_uniref_pretraining_data
67+ load_dataset_kwargs_streaming : true
68+ load_dataset_kwargs_revision : 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
69+
70+ # lr commands
71+ num_warmup_steps : 2_000
72+ # checkpoint controls
73+ ckpt_dir : " "
74+ save_checkpoints : false
75+ save_final_model : false
76+ resume_from_checkpoint : false
77+ use_distributed_checkpoint_fsdp2 : false
78+
79+ log_to_kratos : false
8080
8181# ###########################################################
8282# Checkout Script
@@ -88,95 +88,102 @@ checkout_script: |
8888 set -euo pipefail
8989
9090 echo "========================================"
91- echo "DIAGNOSTIC: System Capabilities Check "
91+ echo "Setting up BioNeMo environment "
9292 echo "========================================"
9393
94- echo -e "\n=== User Info ==="
95- whoami
96- id
97- groups
98- echo "HOME: $HOME"
99- echo "PWD: $PWD"
100-
101- echo -e "\n=== Sudo Access ==="
102- if sudo -n true 2>/dev/null; then
103- echo "✓ Sudo available WITHOUT password"
104- sudo -V | head -n 1
105- elif sudo -v 2>/dev/null; then
106- echo "⚠ Sudo available but requires password"
107- else
108- echo "✗ No sudo access"
109- fi
94+ # Clone repo
95+ git clone https://github.com/NVIDIA/bionemo-framework.git
96+ cd bionemo-framework/
97+ git checkout jstjohn/evo2_megatron_bridge_recipe
98+ # build container from dockerfile here
99+ cd bionemo-recipes/recipes/evo2_megatron
110100
111- echo -e "\n=== Docker Availability ==="
112- if which docker >/dev/null 2>&1; then
113- echo "✓ Docker binary found: $(which docker)"
114- docker --version || echo "✗ Docker version check failed"
115- if docker info >/dev/null 2>&1; then
116- echo "✓ Docker daemon accessible!"
117- docker info | grep -E "Server Version|Storage Driver|Runtimes"
118- else
119- echo "✗ Docker daemon not accessible (may need sudo or socket permissions)"
120- fi
121- else
122- echo "✗ Docker not installed"
101+ # Install uv (if not already available) and expose the installer's bin dir on PATH
102+ if ! command -v uv &> /dev/null; then
103+ curl -LsSf https://astral.sh/uv/install.sh | sh
104+ export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"  # current installer targets ~/.local/bin; older releases used ~/.cargo/bin
123105 fi
124106
125- echo -e "\n=== Docker Socket Check ==="
126- if [ -S /var/run/docker.sock ]; then
127- echo "✓ Docker socket exists: /var/run/docker.sock"
128- ls -la /var/run/docker.sock
129- if [ -r /var/run/docker.sock ] && [ -w /var/run/docker.sock ]; then
130- echo "✓ Socket is readable and writable"
131- else
132- echo "⚠ Socket exists but may not be accessible"
133- fi
134- else
135- echo "✗ Docker socket not found"
136- fi
107+ # Fix TransformerEngine direct_url issue
108+ rm -f /usr/local/lib/python*/dist-packages/transformer_engine-*.dist-info/direct_url.json
137109
138- echo -e "\n=== GPU Access ==="
139- if which nvidia-smi >/dev/null 2>&1; then
140- echo "✓ nvidia-smi found"
141- nvidia-smi --query-gpu=name,driver_version --format=csv,noheader | head -n 1
142- else
143- echo "✗ nvidia-smi not found"
144- fi
110+ # Create venv with system site packages
111+ export UV_LINK_MODE=copy
112+ export VIRTUAL_ENV=/workspace/.venv
113+ export PATH="$VIRTUAL_ENV/bin:$PATH"
145114
146- echo -e "\n=== Package Management ==="
147- if apt-get --version >/dev/null 2>&1; then
148- echo "✓ apt-get available"
149- if sudo -n apt-get update -y >/dev/null 2>&1; then
150- echo "✓ Can run apt-get with sudo"
151- else
152- echo "✗ Cannot run apt-get (no sudo or permission denied)"
153- fi
154- fi
115+ uv venv --system-site-packages --seed "$VIRTUAL_ENV"
155116
156- echo -e "\n=== Writable Locations ==="
157- for dir in /tmp $HOME /data; do
158- if [ -d "$dir" ] && [ -w "$dir" ]; then
159- echo "✓ $dir is writable"
160- else
161- echo "✗ $dir not writable or doesn't exist"
162- fi
163- done
117+ # Create constraints file
118+ pip freeze | grep transformer_engine > pip-constraints.txt
164119
165- echo -e "\n=== Installed Tools ==="
166- for tool in git python3 pip curl wget; do
167- if which $tool >/dev/null 2>&1; then
168- echo "✓ $tool: $(which $tool)"
169- else
170- echo "✗ $tool: not found"
171- fi
172- done
120+ # Install dependencies
121+ uv pip install -r build_requirements.txt --no-build-isolation
122+ uv pip install -c pip-constraints.txt -e . --no-build-isolation
173123
174- echo -e "\n========================================"
175- echo "DIAGNOSTIC COMPLETE"
176124 echo "========================================"
125+ echo "BioNeMo environment ready!"
126+ echo "========================================"
127+
128+
129+ run_script : |
130+ pwd
131+
132+ ls
133+
134+ echo "ls ../../.."
135+ ls ../../..
136+ echo "ls ../../../.."
137+ ls ../../../..
138+ echo "ls ../../../../../.."  # label matches the directory actually listed below
139+ ls ../../../../../..
177140
178- run_script : " "
179141
180- script : |
181- ${checkout_script}
182- ${run_script}
142+ train_evo2 \
143+ --hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_256 \
144+ --sharded-eden-data \
145+ --seq-length=8192 \
146+ --stride 7992 \
147+ --sequence-db-dir ../../../../../data/bcr_eden/OG2_database_splits \
148+ --train-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__train__short.sqlite \
149+ --val-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__validation__short.sqlite \
150+ --test-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__test__short.sqlite \
151+ --most-recent-k 3 \
152+ --max-steps=72926 \
153+ --constant-steps 1024 \
154+ --seed 1234 \
155+ --dataset-seed 1234 \
156+ --no-weight-decay-embeddings \
157+ --grad-reduce-in-fp32 \
158+ --activation-checkpoint-recompute-num-layers 1 \
159+ --mixed-precision-recipe bf16-with-fp8-delayed-scaling-mixed \
160+ --hybrid-override-pattern SDH*SDHSDH*SDHSDH*SDHSDH* \
161+ --use-precision-aware-optimizer \
162+ --log-num-zeros-in-grad \
163+ --enable-preemption \
164+ --no-fp32-residual-connection \
165+ --ckpt-async-save \
166+ --overlap-grad-reduce \
167+ --clip-grad 1 \
168+ --eod-pad-in-loss-mask \
169+ --wandb-project evo2-recipes-verification \
170+ --lr 3e-04 \
171+ --wd 0.01 \
172+ --min-lr 6e-06 \
173+ --warmup-steps 1024 \
174+ --attention-dropout 0.001 \
175+ --hidden-dropout 0.001 \
176+ --eval-iters=10 \
177+ --eval-interval=100 \
178+ --debug-ddp-parity-freq 100 \
179+ --experiment-name=pretrain_striped_hyena_1b_nv_parallel \
180+ --result-dir=FIXME \
181+ --tensor-model-parallel-size=1 \
182+ --context-parallel-size=1 \
183+ --pipeline-model-parallel-size=1 \
184+ --workers 8 \
185+ --log-interval 5 \
186+ --no-renormalize-loss \
187+ --micro-batch-size=20 \
188+ --global-batch-size=960 \
189+ --model-size=striped_hyena_1b_nv_parallel
0 commit comments