@@ -66,7 +66,7 @@ class MegatronBaseModel(NLPModel):
     - Initialize the model parallel world for nemo.
     - Turn on all of the nvidia optimizations.
-    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
+    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
       correct size for tensor model parallelism.
     - If using distributed optimizer, configure to be compatible
       with O2 level optimizations and/or model parallelism.
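
Note on the vocab padding mentioned in the docstring above: the tokenizer's vocabulary is rounded up so the embedding table splits evenly across tensor-parallel ranks. A minimal sketch of that rounding, assuming a Megatron-style `make_vocab_size_divisible_by` knob (the helper name and arguments are illustrative, not code this diff touches):

```python
# Illustrative sketch only; not the NeMo implementation.
def pad_vocab_size(orig_vocab_size: int,
                   make_vocab_size_divisible_by: int,
                   tensor_model_parallel_size: int) -> int:
    """Round the vocab size up so each tensor-parallel rank gets an equal shard."""
    multiple = make_vocab_size_divisible_by * tensor_model_parallel_size
    return ((orig_vocab_size + multiple - 1) // multiple) * multiple

# Example: GPT-2's 50,257-token vocab padded for TP=8 with a divisor of 128.
assert pad_vocab_size(50257, 128, 8) == 51200
```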
@@ -405,9 +405,8 @@ def setup_optimization(
         optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()
         if self.with_distributed_adam:

-            # Allocate contiguous buffers to avoid extra copies
+            # Allocate contiguous buffer to avoid extra copies
             optim_kwargs['contiguous_grad_buffer'] = True
-            optim_kwargs['contiguous_param_buffer'] = True

             # Make sure optimizer state is in FP32
             optim_dtype = torch.float32
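
The "contiguous buffer" configured above keeps every parameter's gradient as a view into one flat allocation, so gradient reductions can operate on a single tensor instead of copying per-parameter grads into a staging area. A minimal PyTorch sketch of the idea (toy code, not NeMo/Apex internals):

```python
import torch

# Toy illustration of a contiguous gradient buffer: each .grad is a view
# into one flat tensor, so a collective can act on the whole buffer at once
# without per-parameter copies.
params = [torch.nn.Parameter(torch.randn(4, 4)), torch.nn.Parameter(torch.randn(8))]
grad_buffer = torch.zeros(sum(p.numel() for p in params), dtype=torch.float32)

offset = 0
for p in params:
    n = p.numel()
    p.grad = grad_buffer[offset:offset + n].view_as(p)  # view, not a copy
    offset += n
```

With this change, only the gradient buffer is requested unconditionally; the parameter buffer is left to the optimizer's own configuration, which is why the call site in the next hunk now checks for it.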
@@ -507,7 +506,8 @@ def configure_optimizers(self):
             self._optimizer.init_params(reversed(no_overlap_params))

             # Initialize contiguous parameter buffer
-            self._optimizer.init_param_buffer()
+            if self._optimizer.contiguous_param_buffer:
+                self._optimizer.init_param_buffer()

         if self._scheduler is None:
             return self._optimizer
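
Since `contiguous_param_buffer` is no longer forced on in `setup_optimization`, the guarded call above only materializes the flat parameter buffer when the optimizer was actually constructed with one. A toy stand-in showing the guard pattern (not the real distributed optimizer class):

```python
# Toy stand-in for the distributed optimizer; illustrative only.
class _ToyDistributedOptimizer:
    def __init__(self, contiguous_param_buffer: bool = False):
        self.contiguous_param_buffer = contiguous_param_buffer
        self._param_buffer = None

    def init_param_buffer(self):
        # The real optimizer would allocate one flat buffer holding all params here.
        self._param_buffer = "allocated"

opt = _ToyDistributedOptimizer(contiguous_param_buffer=False)
if opt.contiguous_param_buffer:
    opt.init_param_buffer()  # skipped here, mirroring the guarded call in the diff
```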