1- scope : partial-conv
2- time_limit : 14400
1+ scope : perf
2+ time_limit : 900
33script_args :
44 # All arguments referenced in the script string must be specified here.
55 # Arguments not referenced in the script string must have the 'arg' field specified.
@@ -14,35 +14,50 @@ script_args:
1414 value : evo2
1515 variant :
1616 value : train
17- config_name :
18- value : 7b
19- precision :
20- value : fp8
21- nodes :
22- value : 4
23- gpus :
24- value : 8
25- batch_size :
26- value : 2
27- pp :
28- value : 1
29- tp :
30- value : 8
31- cp :
32- value : 1
33- acc_grad :
34- value : 1
35- max_steps :
36- value : 20000
17+ config_name : 1b
18+ precision : fp8
19+ gpus : 8
20+ nodes : 4
21+ batch_size : 8
22+ max_steps : 490000
23+ pp : 1
24+ cp : 1
25+ tp : 1
26+ seq_len : 8192
27+ acc_grad : 1
28+ clip_grad :
29+ value : 250
30+ key_segment : False
31+ seed : 3735928559
32+ lr :
33+ value : 0.00015
34+ key_segment : False
35+ min_lr :
36+ value : 0.000015
37+ key_segment : False
38+ wu_steps :
39+ value : 5000
40+ key_segment : False
41+ wd :
42+ value : 0.1
43+ key_segment : False
3744script : |-
38- WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py \
39- -d ${workspace}/sub-packages/bionemo-evo2/tests/config/test_dataset_config.yaml \
45+ WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/${variant}.py \
46+ -d /workspace/bionemo2/sub-packages/bionemo-evo2/examples/configs/full_pretrain_shortphase_config.yaml \
4047 --dataset-dir ${data_path} \
4148 --grad-acc-batches ${acc_grad} \
42- --fp8 \
49+ --fp8 --fp8-wgrad --activation-checkpoint-recompute-num-layers 5 \
4350 --enable-preemption \
4451 --ckpt-async-save \
45- --seq-length=8192 \
52+ --use-megatron-comm-overlap-llama3-8k \
53+ --overlap-grad-reduce \
54+ --clip-grad=${clip_grad} \
55+ --eod-pad-in-loss-mask \
56+ --seq-length=${seq_len} \
57+ --lr=${lr} \
58+ --wd=${wd} \
59+ --min-lr=${min_lr} \
60+ --warmup-steps=${wu_steps} \
4661 --tensor-parallel-size=${tp} \
4762 --context-parallel-size=${cp} \
4863 --pipeline-model-parallel-size=${pp} \
@@ -54,10 +69,10 @@ script: |-
5469 --max-steps=${max_steps} \
5570 --limit-val-batches=20 \
5671 --log-every-n-steps=50 \
57- --val-check-interval=500 \
72+ --val-check-interval=${max_steps} \
5873 --tflops-callback \
59- --experiment-dir=${tensorboard_dir}/${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
74+ --experiment-dir=${tensorboard_dir} \
6075 --wandb-project=${wandb_project_name} \
61- --wandb-group=${model}_${variant}_${config_name}__${target} \
76+ --wandb-group=${model}_${variant}_${config_name}__${target}__slen${seq_len} \
6277 --wandb-job-type=${pipeline_label} \
6378 --disable-checkpointing;
0 commit comments