Skip to content

Commit 9adfd42

Browse files
committed
temporarily changed wall time to 15 mins
1 parent 65c453a commit 9adfd42

File tree

2 files changed

+44
-96
lines changed

2 files changed

+44
-96
lines changed
Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
scope: partial-conv
2-
time_limit: 14400
1+
scope: perf
2+
time_limit: 900
33
script_args:
44
# All arguments referenced in the script string must be specified here.
55
# Arguments not referenced in the script string must have the 'arg' field specified.
@@ -14,35 +14,50 @@ script_args:
1414
value: evo2
1515
variant:
1616
value: train
17-
config_name:
18-
value: 7b
19-
precision:
20-
value: fp8
21-
nodes:
22-
value: 4
23-
gpus:
24-
value: 8
25-
batch_size:
26-
value: 2
27-
pp:
28-
value: 1
29-
tp:
30-
value: 8
31-
cp:
32-
value: 1
33-
acc_grad:
34-
value: 1
35-
max_steps:
36-
value: 20000
17+
config_name: 1b
18+
precision: fp8
19+
gpus: 8
20+
nodes: 4
21+
batch_size: 8
22+
max_steps: 490000
23+
pp: 1
24+
cp: 1
25+
tp: 1
26+
seq_len: 8192
27+
acc_grad: 1
28+
clip_grad:
29+
value: 250
30+
key_segment: False
31+
seed: 3735928559
32+
lr:
33+
value: 0.00015
34+
key_segment: False
35+
min_lr:
36+
value: 0.000015
37+
key_segment: False
38+
wu_steps:
39+
value: 5000
40+
key_segment: False
41+
wd:
42+
value: 0.1
43+
key_segment: False
3744
script: |-
38-
WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py \
39-
-d ${workspace}/sub-packages/bionemo-evo2/tests/config/test_dataset_config.yaml \
45+
WANDB_API_KEY=$BIONEMO_WANDB_API_KEY python ${workspace}/sub-packages/bionemo-evo2/src/bionemo/evo2/run/${variant}.py \
46+
-d /workspace/bionemo2/sub-packages/bionemo-evo2/examples/configs/full_pretrain_shortphase_config.yaml \
4047
--dataset-dir ${data_path} \
4148
--grad-acc-batches ${acc_grad} \
42-
--fp8 \
49+
--fp8 --fp8-wgrad --activation-checkpoint-recompute-num-layers 5 \
4350
--enable-preemption \
4451
--ckpt-async-save \
45-
--seq-length=8192 \
52+
--use-megatron-comm-overlap-llama3-8k \
53+
--overlap-grad-reduce \
54+
--clip-grad=${clip_grad} \
55+
--eod-pad-in-loss-mask \
56+
--seq-length=${seq_len} \
57+
--lr=${lr} \
58+
--wd=${wd} \
59+
--min-lr=${min_lr} \
60+
--warmup-steps=${wu_steps} \
4661
--tensor-parallel-size=${tp} \
4762
--context-parallel-size=${cp} \
4863
--pipeline-model-parallel-size=${pp} \
@@ -54,10 +69,10 @@ script: |-
5469
--max-steps=${max_steps} \
5570
--limit-val-batches=20 \
5671
--log-every-n-steps=50 \
57-
--val-check-interval=500 \
72+
--val-check-interval=${max_steps} \
5873
--tflops-callback \
59-
--experiment-dir=${tensorboard_dir}/${batch_size}bs_${nodes}node_${gpus}gpu_${max_steps}s_${precision}prec \
74+
--experiment-dir=${tensorboard_dir} \
6075
--wandb-project=${wandb_project_name} \
61-
--wandb-group=${model}_${variant}_${config_name}__${target} \
76+
--wandb-group=${model}_${variant}_${config_name}__${target}__slen${seq_len} \
6277
--wandb-job-type=${pipeline_label} \
6378
--disable-checkpointing;

ci/benchmarks/perf/evo2_pretrain.yaml

Lines changed: 0 additions & 67 deletions
This file was deleted.

0 commit comments

Comments
 (0)