@@ -11,21 +11,14 @@ trainable: true
1111# training, optimizer, checkpoint, loss, distributed, recompute, data, profile, logging
1212
1313# training
14- yaml_cfg : null # not support
15- spec : null
1614micro_batch_size : 2
17- batch_size : null # deprecated
1815global_batch_size : 128
19- rampup_batch_size : null
2016decrease_batch_size_if_needed : false
2117check_for_nan_in_loss_and_grad : true
2218check_for_spiky_loss : false
2319check_for_large_grads : false
2420make_vocab_size_divisible_by : 128
2521exit_signal_handler : false
26- exit_duration_in_mins : null
27- exit_interval : null
28- onnx_safe : null
2922bert_binary_head : true
3023
3124use_flash_attn : false
@@ -39,7 +32,6 @@ fp16: false
3932bf16 : true
4033grad_reduce_in_bf16 : false
4134calculate_per_token_loss : false
42- loss_scale : null
4335initial_loss_scale : 4294967296
4436min_loss_scale : 1.0
4537loss_scale_window : 1000
@@ -48,7 +40,6 @@ accumulate_allreduce_grads_in_fp32: false
4840fp16_lm_cross_entropy : false
4941
5042# fp8
51- fp8 : null # e4m3, hybrid
5243fp8_margin : 0
5344fp8_recipe : delayed
5445fp8_interval : 1 # deprecated
@@ -60,7 +51,6 @@ te_rng_tracker: false
6051inference_rng_tracker : false
6152
6253# fp4
63- fp4 : null
6454fp4_recipe : nvfp4
6555fp4_param : false
6656
@@ -72,20 +62,13 @@ num_layers_at_end_in_bf16: 1
7262optimizer : adam
7363lr : 2.5e-4
7464lr_decay_style : cosine
75- lr_decay_iters : null
76- lr_decay_samples : null
77- lr_warmup_fraction : null
7865lr_warmup_iters : 0
7966lr_warmup_samples : 0
8067lr_warmup_init : 0.0
8168min_lr : 2.5e-5
8269lr_wsd_decay_style : exponential
83- lr_wsd_decay_samples : null
84- lr_wsd_decay_iters : null
8570head_lr_mult : 1.0
8671weight_decay : 0.01
87- start_weight_decay : null
88- end_weight_decay : null
8972weight_decay_incr_style : constant
9073clip_grad : 1.0
9174adam_beta1 : 0.9
@@ -94,9 +77,6 @@ adam_eps: 1.0e-08
9477sgd_momentum : 0.9
9578override_opt_param_scheduler : false
9679use_checkpoint_opt_param_scheduler : false
97- warmup : null
98- decoupled_lr : null
99- decoupled_min_lr : null
10080# muon
10181muon_extra_scale_factor : 1.0
10282muon_scale_mode : " spectral"
@@ -117,41 +97,22 @@ pin_cpu_grads: true
11797pin_cpu_params : true
11898
11999# checkpointing arguments
120- save : null
121100save_interval : 20000
122- save_retain_interval : null
123- no_save_optim : null
124- no_save_rng : null
125- load : null
126101load_main_params_from_ckpt : false
127- no_load_optim : null
128- no_load_rng : null
129102finetune : false
130103use_checkpoint_args : false
131104use_mp_args_from_checkpoint_args : false
132105use_tokenizer_model_from_checkpoint_args : true
133106exit_on_missing_checkpoint : true
134- non_persistent_save_interval : null # int
135- non_persistent_ckpt_type : null # 'global', 'local', 'in_memory', null
136- non_persistent_global_ckpt_dir : null # str
137- non_persistent_local_ckpt_dir : null # str
138107non_persistent_local_ckpt_algo : " fully_parallel" # 'fully_parallel', 'atomic'
139- dist_ckpt_save_pre_mcore_014 : null
140- dist_ckpt_optim_fully_reshardable : null
141108
142- pretrained_checkpoint : null
143- ckpt_step : null
144109use_dist_ckpt_deprecated : false
145110use_persistent_ckpt_worker : false
146111auto_detect_ckpt_format : false
147- dist_ckpt_format_deprecated : null
148112ckpt_format : torch_dist # 'torch', 'torch_dist', 'zarr'
149- ckpt_convert_format : null # 'torch', 'torch_dist', 'zarr'
150- ckpt_convert_save : null
151113ckpt_convert_update_legacy_dist_opt_format : false
152114ckpt_fully_parallel_save_deprecated : false
153115ckpt_fully_parallel_save : true
154- async_save : null
155116ckpt_fully_parallel_load : false
156117ckpt_assume_constant_structure : false
157118dist_ckpt_strictness : assume_ok_unexpected
@@ -163,8 +124,6 @@ distributed_timeout_minutes: 10
163124defer_embedding_wgrad_compute : false
164125wgrad_deferral_limit : 0 # int
165126align_grad_reduce : true
166- ddp_num_buckets : null # int
167- ddp_bucket_size : null # int
168127ddp_pad_buckets_for_high_nccl_busbw : false
169128ddp_average_in_collective : false
170129overlap_grad_reduce : false
@@ -173,15 +132,12 @@ overlap_param_gather_with_optimizer_step: false
173132align_param_gather : true
174133scatter_gather_tensors_in_pipeline : true
175134use_ring_exchange_p2p : false
176- local_rank : null
177- lazy_mpu_init : null
178135account_for_embedding_in_pipeline_split : false
179136account_for_loss_in_pipeline_split : false
180137empty_unused_memory_level : 0
181138standalone_embedding_stage : false
182139use_distributed_optimizer : false
183140use_sharp : false
184- sharp_enabled_group : null # options: [dp, dp_replica]
185141use_custom_fsdp : false
186142use_megatron_fsdp : false
187143init_model_with_meta_device : false
@@ -191,31 +147,22 @@ suggested_communication_unit_size: 400000000 # int
191147keep_fp8_transpose_cache_when_using_custom_fsdp : false
192148num_distributed_optimizer_instances : 1 # int
193149use_torch_fsdp2 : false
194- nccl_communicator_config_path : null
195150use_tp_pp_dp_mapping : false
196151replication : false
197- replication_jump : null # int
198- replication_factor : null # int
199152deterministic_mode : false
200- check_weight_hash_across_dp_replicas_interval : null
201153overlap_moe_expert_parallel_comm : false
202154
203- train_iters : null
204155eval_iters : 32
205156full_validation : false
206157multiple_validation_sets : false
207158eval_interval : 2000
208159skip_train : false
209- train_sync_interval : null # int
210160
211161adlr_autoresume : false
212162adlr_autoresume_interval : 1000
213163
214164# activation recomputation
215165recompute_activations : false
216- recompute_granularity : null # full, selective
217- recompute_method : null # uniform, block
218- recompute_num_layers : null # int
219166distribute_saved_activations : false
220167checkpoint_activations : false # deprecated
221168
@@ -225,20 +172,10 @@ manual_gc_interval: 1 # int, default 0
225172manual_gc_eval : false
226173
227174# data
228- data_path : null
229175data_sharding : true
230176split : " 99,1,0"
231- train_data_path : null
232- valid_data_path : null
233- test_data_path : null
234- data_args_path : null # str
235- per_split_data_args_path : null # str
236- data_cache_path : null
237177mock_data : false
238- merge_file : null
239178seq_length : 4096
240- encoder_seq_length : null
241- decoder_seq_length : null
242179retriever_seq_length : 256
243180sample_rate : 1.0
244181mask_prob : 0.15
@@ -247,8 +184,6 @@ num_workers: 8
247184reset_position_ids : false
248185reset_attention_mask : false
249186eod_mask_loss : false
250- train_samples : null
251- dataloader_type : null
252187mmap_bin_files : true
253188
254189# profile:
@@ -257,8 +192,6 @@ use_pytorch_profiler: false
257192profile_ranks : [0]
258193profile_step_end : 12
259194profile_step_start : 10
260- iterations_to_skip : null
261- result_rejected_tracker_filename : null
262195enable_gloo_process_groups : true
263196record_memory_history : false
264197memory_snapshot_path : snapshot.pickle # str
@@ -281,20 +214,12 @@ log_validation_ppl_to_tensorboard: false
281214log_memory_to_tensorboard : false
282215log_world_size_to_tensorboard : false
283216log_loss_scale_to_tensorboard : true
284- wandb_project : null
285- wandb_exp_name : null
286- wandb_save_dir : null
287- wandb_entity : null
288217enable_one_logger : true
289218one_logger_project : megatron-lm
290- one_logger_run_name : null
291219log_interval : 100
292- tensorboard_dir : null
293- logging_level : null # int
294220config_logger_dir : " "
295221
296222one_logger_async : false
297- app_tag_run_name : null
298223app_tag_run_version : 0.0.0
299224
300225# rerun
@@ -338,9 +263,7 @@ classes_fraction: 1.0
338263data_per_class_fraction : 1.0
339264
340265# others
341- retro_project_dir : null
342266retro_add_retriever : false
343- retro_cyclic_train_iters : null
344267retro_encoder_layers : 2
345268retro_encoder_hidden_dropout : 0.1
346269retro_encoder_attention_dropout : 0.1
@@ -370,9 +293,6 @@ inference_batch_times_seqlen_threshold: -1
370293inference_dynamic_batching : false
371294inference_dynamic_batching_buffer_size_gb : 40.0 # float
372295inference_dynamic_batching_buffer_guaranteed_fraction : 0.2 # float
373- inference_dynamic_batching_buffer_overflow_factor : null # float
374- inference_dynamic_batching_max_requests_override : null # int
375- inference_dynamic_batching_max_tokens_override : null # int
376296max_tokens_to_oom : 12000
377297output_bert_embeddings : false
378298bert_embedder_type : megatron # "megatron", "huggingface"
@@ -386,28 +306,19 @@ inference_max_seq_length: 2560 # int, (prefill + decode)
386306
387307create_attention_mask_in_dataloader : true
388308num_dataset_builder_threads : 1
389- ict_head_size : null
390309biencoder_projection_dim : 0
391310biencoder_shared_query_context_model : false
392- ict_load : null
393- bert_load : null
394- titles_data_path : null
395311query_in_block_prob : 0.1
396312use_one_sent_docs : false
397- evidence_data_path : null
398313retriever_report_topk_accuracies : []
399314retriever_score_scaling : false
400- block_data_path : null
401- embedding_path : null
402315indexer_batch_size : 128
403316indexer_log_interval : 1000
404317
405318enable_ft_package : false
406319calc_ft_timeouts : false
407320run_workload_inspector_server : false
408321
409- heterogeneous_layers_config_path : null
410- heterogeneous_layers_config_encoded_json : null
411322inprocess_restart : false
412323
413324# rl_args
@@ -424,13 +335,10 @@ grpo_filter_groups_with_same_reward: false
424335grpo_default_temperature : 1.0
425336grpo_default_top_p : 0
426337langrl_inference_server_type : inplace_megatron
427- langrl_inference_server_conversation_template : null
428- langrl_env_config : null
429338rl_offload_optimizer_during_inference : false
430339rl_offload_kv_cache_during_training : false
431340rl_remove_kv_cache_during_training : false
432341rl_reset_cuda_graphs : false
433342rl_partial_rollouts : false
434343rl_inference_logprobs_is_correction : false
435- rl_importance_sampling_truncation_coef : null
436344rl_calculate_intra_group_similarity : false
0 commit comments