working version, no dockerfile

jwilber · jwilber · commit 0954eb7f0a12 · 2025-12-19T11:42:02.000-08:00
Signed-off-by: jwilber <jwilber@nvidia.com>
diff --git a/ci/lepton/model_convergence/configs/recipes/container.yaml b/ci/lepton/model_convergence/configs/recipes/container.yaml
@@ -1,82 +1,82 @@
-############################################################
-# Template Type
-# Defines the template type for the job.
-# - convergence_tests: for convergence tests
-# - scdl_performance: for SCDL performance tests
-############################################################
-template_type: convergence_tests
+# @package _global_
+defaults:
+  - /base  # composed first; presumably the shared base config (not shown here)
+  - _self_  # listed last so keys set in this file override those from /base
 
-job_name: container_test
+job_name: "containertest"  # job identifier; spelling corrected from "conatinertest"
 
 ############################################################
-# Container Runtime
-# Defines the base Docker image and registry auth needed
+# lepton job info
 ############################################################
-container:
-  image: nvcr.io/nvidia/pytorch:25.11-py3
-  registry_auth: lepton-nvidia
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1  # NFS source; should match the cluster selected by node_group
+num_nodes: 1
+device_type: gpu
+num_devices: 2  # per node: total_gpus is computed from num_devices and num_nodes
+gpu_type: h100-sxm
+resource_shape: "${device_type}.${num_devices}x${gpu_type}"  # -> "gpu.2xh100-sxm" with the values above
 
 ############################################################
-# Environment Variables
-# These keys must be present for the job to authenticate with
-# external services (W&B, Kratos, Lepton) and control runtime caching.
-# HF_HOME is optional but recommended to speed up Hugging Face model loading.
+# kratos info: where to log data
 ############################################################
-environment_variables:
-  - name: WANDB_API_KEY
-    value_from: JWILBER_WANDB_API_KEY
-  - name: KRATOS_SSA_URL
-    value_from: KRATOS_SSA_URL
-  - name: KRATOS_SSA_CLIENT_ID
-    value_from: KRATOS_SSA_CLIENT_ID
-  - name: KRATOS_SSA_SECRET
-    value_from: KRATOS_SSA_SECRET.jwilber
-  - name: LEP_LOGIN_CREDENTIALS
-    value_from: LEP_LOGIN_CREDENTIALS
-  - name: HF_HOME
-    value: /data/esm2/cache
-  - name: HF_TOKEN
-    value_from: HUGGING_FACE_HUB_TOKEN.jwilber
+kratos_subject: "convergence_tests_v0.0.3"
 
 ############################################################
-# Lepton Cluster Selection & Node Group
-# Select the GPU cluster where the job will run.
-# - h100: yo-bom-lepton-001
-# - h200: nv-int-multiteam-nebius-h200-01
-# - a100: az-sat-lepton-001
+# recipe identifiers
+# mostly used for logging and observability
 ############################################################
-node_group: yo-bom-lepton-001
+recipe_subdir: esm2_native_te  # NOTE(review): the scripts below run recipes/evo2_megatron — confirm these esm2 labels
+model_type: esm2
+variant: train # train, finetune
 
-############################################################
-# Shared Mounts
-# Mount paths for accessing shared datasets, model checkpoints,
-# or intermediate artifacts. The NFS source should match the cluster.
-# - yo-bom-lepton-001 uses node-nfs:fs1
-# - nv-int-multiteam-nebius-h200-01 uses node-nfs:lepton-shared-fs
-############################################################
-mount_from: node-nfs:fs1
+# Core identifiers for filtering
+framework: native # native, accelerate
+precision: fp16 # likely bf16 or fp8; NOTE(review): run_script passes bf16-with-fp8-delayed-scaling-mixed — confirm
+te_enabled: true
+fp8_enabled: false
+# thd_enabled: false
 
-mounts:
-  - path: /BioNeMo
-    mount_path: /data
-    from_: ${mount_from}
+# Catchall for additional features/configs
+extras: [] # e.g. [thd]
 
 ############################################################
-# W&B Initialization
-# Configure how runs are logged to Weights & Biases.
+# wandb info (total_gpus used for group name)
 ############################################################
+# `total_gpus` is derived from the lepton job info above: presumably num_devices * num_nodes via the `multiply` resolver (2 * 1 here) — confirm
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
 wandb_init_args:
-  group: "model_convergence__recipes"
-  mode: "online"
+  project: "test_convergence__recipes__${sanitize:${branch}}"  # `branch` is not set in the visible part of this file; presumably inherited from /base
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
+  job_type: "${recipe_subdir}"  # "esm2_native_te" with the value set above
+  name: null
 
 ############################################################
-# Git Checkout Options
-# Configure which version of the recipe to pull from GitHub.
-# - `branch`: defaults to main
-# - `commit_sha`: overrides branch if provided
+# task commands
+# shared across all products (if not explicitly overridden)
 ############################################################
-branch: jwilber/lepton-build-container
-commit_sha: ""
+
+# script overrides
+# these should match the keys in the recipe's config file
+model_tag: nvidia/esm2_t36_3B_UR50D
+task_cmd: train_fsdp2 # mfsdp
+num_train_steps: 20000  # plain digits: `20_000` is an integer only under YAML 1.1 parsers
+# dataset commands
+micro_batch_size: 16
+load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_streaming: true
+load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
+
+# lr commands
+num_warmup_steps: 2000  # plain digits, same reason as num_train_steps
+# checkpoint controls
+ckpt_dir: ""
+save_checkpoints: false
+save_final_model: false
+resume_from_checkpoint: false
+use_distributed_checkpoint_fsdp2: false
+
+log_to_kratos: false
 
 ############################################################
 # Checkout Script
@@ -88,95 +88,102 @@ checkout_script: |
   set -euo pipefail
   
   echo "========================================"
-  echo "DIAGNOSTIC: System Capabilities Check"
+  echo "Setting up BioNeMo environment"
   echo "========================================"
   
-  echo -e "\n=== User Info ==="
-  whoami
-  id
-  groups
-  echo "HOME: $HOME"
-  echo "PWD: $PWD"
-  
-  echo -e "\n=== Sudo Access ==="
-  if sudo -n true 2>/dev/null; then
-    echo "✓ Sudo available WITHOUT password"
-    sudo -V | head -n 1
-  elif sudo -v 2>/dev/null; then
-    echo "⚠ Sudo available but requires password"
-  else
-    echo "✗ No sudo access"
-  fi
+  # Clone the repo and check out a hard-coded feature branch (NOTE(review): not driven by the config's branch setting — confirm)
+  git clone https://github.com/NVIDIA/bionemo-framework.git
+  cd bionemo-framework/
+  git checkout jstjohn/evo2_megatron_bridge_recipe
+  # TODO: build container from the recipe's Dockerfile here (not done yet); for now just enter the recipe directory
+  cd bionemo-recipes/recipes/evo2_megatron
   
-  echo -e "\n=== Docker Availability ==="
-  if which docker >/dev/null 2>&1; then
-    echo "✓ Docker binary found: $(which docker)"
-    docker --version || echo "✗ Docker version check failed"
-    if docker info >/dev/null 2>&1; then
-      echo "✓ Docker daemon accessible!"
-      docker info | grep -E "Server Version|Storage Driver|Runtimes"
-    else
-      echo "✗ Docker daemon not accessible (may need sudo or socket permissions)"
-    fi
-  else
-    echo "✗ Docker not installed"
+  # Install uv (if not already available); the installer's target dir depends on its version, so put both candidates on PATH
+  if ! command -v uv &> /dev/null; then
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
+  fi
   
-  echo -e "\n=== Docker Socket Check ==="
-  if [ -S /var/run/docker.sock ]; then
-    echo "✓ Docker socket exists: /var/run/docker.sock"
-    ls -la /var/run/docker.sock
-    if [ -r /var/run/docker.sock ] && [ -w /var/run/docker.sock ]; then
-      echo "✓ Socket is readable and writable"
-    else
-      echo "⚠ Socket exists but may not be accessible"
-    fi
-  else
-    echo "✗ Docker socket not found"
-  fi
+  # Remove TransformerEngine's direct_url.json from the system dist-packages metadata (presumably so pip freeze emits a plain name==version pin for the constraints file — TODO confirm)
+  rm -f /usr/local/lib/python*/dist-packages/transformer_engine-*.dist-info/direct_url.json
   
-  echo -e "\n=== GPU Access ==="
-  if which nvidia-smi >/dev/null 2>&1; then
-    echo "✓ nvidia-smi found"
-    nvidia-smi --query-gpu=name,driver_version --format=csv,noheader | head -n 1
-  else
-    echo "✗ nvidia-smi not found"
-  fi
+  # Environment for the venv with system site packages (the venv itself is created by the `uv venv` call further down)
+  export UV_LINK_MODE=copy
+  export VIRTUAL_ENV=/workspace/.venv
+  export PATH="$VIRTUAL_ENV/bin:$PATH"  # venv executables take precedence on PATH from here on
   
-  echo -e "\n=== Package Management ==="
-  if apt-get --version >/dev/null 2>&1; then
-    echo "✓ apt-get available"
-    if sudo -n apt-get update -y >/dev/null 2>&1; then
-      echo "✓ Can run apt-get with sudo"
-    else
-      echo "✗ Cannot run apt-get (no sudo or permission denied)"
-    fi
-  fi
+  uv venv --system-site-packages --seed "$VIRTUAL_ENV"  # quoted so the path is always passed as a single argument
   
-  echo -e "\n=== Writable Locations ==="
-  for dir in /tmp $HOME /data; do
-    if [ -d "$dir" ] && [ -w "$dir" ]; then
-      echo "✓ $dir is writable"
-    else
-      echo "✗ $dir not writable or doesn't exist"
-    fi
-  done
+  # Write the installed transformer_engine entries to a constraints file used by the `-e .` install below; under `set -euo pipefail` the script aborts if grep matches nothing
+  pip freeze | grep transformer_engine > pip-constraints.txt
   
-  echo -e "\n=== Installed Tools ==="
-  for tool in git python3 pip curl wget; do
-    if which $tool >/dev/null 2>&1; then
-      echo "✓ $tool: $(which $tool)"
-    else
-      echo "✗ $tool: not found"
-    fi
-  done
+  # Install build requirements first, then the recipe itself (editable) under the transformer_engine constraints; both without build isolation
+  uv pip install -r build_requirements.txt --no-build-isolation
+  uv pip install -c pip-constraints.txt -e . --no-build-isolation
   
-  echo -e "\n========================================"
-  echo "DIAGNOSTIC COMPLETE"
   echo "========================================"
+  echo "BioNeMo environment ready!"
+  echo "========================================"
+
+
+run_script: |
+  pwd
+
+  ls
+
+  echo "ls ../../.."  # debug listing of ancestor directories; each echo labels the ls that follows
+  ls ../../..
+  echo "ls ../../../.."
+  ls ../../../..
+  echo "ls ../../../../../.."
+  ls ../../../../../..
 
-run_script: ""
 
-script: |
-  ${checkout_script}
-  ${run_script}
+  train_evo2 \
+  --hf-tokenizer-model-path tokenizers/nucleotide_fast_tokenizer_256 \
+  --sharded-eden-data \
+  --seq-length=8192 \
+  --stride 7992 \
+  --sequence-db-dir ../../../../../data/bcr_eden/OG2_database_splits \
+  --train-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__train__short.sqlite \
+  --val-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__validation__short.sqlite \
+  --test-window-db ../../../../../data/bcr_eden/OG2_database_splits/og2__test__short.sqlite \
+  --most-recent-k 3 \
+  --max-steps=72926 \
+  --constant-steps 1024 \
+  --seed 1234 \
+  --dataset-seed 1234 \
+  --no-weight-decay-embeddings \
+  --grad-reduce-in-fp32 \
+  --activation-checkpoint-recompute-num-layers 1 \
+  --mixed-precision-recipe bf16-with-fp8-delayed-scaling-mixed \
+  --hybrid-override-pattern SDH*SDHSDH*SDHSDH*SDHSDH* \
+  --use-precision-aware-optimizer \
+  --log-num-zeros-in-grad \
+  --enable-preemption \
+  --no-fp32-residual-connection \
+  --ckpt-async-save \
+  --overlap-grad-reduce \
+  --clip-grad 1 \
+  --eod-pad-in-loss-mask \
+  --wandb-project evo2-recipes-verification \
+  --lr 3e-04 \
+  --wd 0.01 \
+  --min-lr 6e-06 \
+  --warmup-steps 1024 \
+  --attention-dropout 0.001 \
+  --hidden-dropout 0.001 \
+  --eval-iters=10 \
+  --eval-interval=100 \
+  --debug-ddp-parity-freq 100 \
+  --experiment-name=pretrain_striped_hyena_1b_nv_parallel \
+  --result-dir=FIXME \
+  --tensor-model-parallel-size=1 \
+  --context-parallel-size=1 \
+  --pipeline-model-parallel-size=1 \
+  --workers 8 \
+  --log-interval 5 \
+  --no-renormalize-loss \
+  --micro-batch-size=20 \
+  --global-batch-size=960 \
+  --model-size=striped_hyena_1b_nv_parallel  # NOTE(review): --result-dir=FIXME above is a placeholder; data paths are relative to the recipe dir entered by checkout_script