
Commit 2e7f392

Disable distopt contiguous grad buffer by default
Signed-off-by: Tim Moon <[email protected]>
1 parent: bee0d3f

File tree

1 file changed: +4 / -4 lines


nemo/collections/nlp/models/language_modeling/megatron_base_model.py

Lines changed: 4 additions & 4 deletions
@@ -66,7 +66,7 @@ class MegatronBaseModel(NLPModel):
 
     - Initialize the model parallel world for nemo.
     - Turn on all of the nvidia optimizations.
-    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
+    - If `cfg.tokenizer` is available, it loads the tokenizer and pad the vocab to the
      correct size for tensor model parallelism.
     - If using distributed optimizer, configure to be compatible
      with O2 level optimizations and/or model parallelism.
@@ -405,9 +405,8 @@ def setup_optimization(
         optim_kwargs = {} if optim_kwargs is None else optim_kwargs.copy()
         if self.with_distributed_adam:
 
-            # Allocate contiguous buffers to avoid extra copies
+            # Allocate contiguous buffer to avoid extra copies
             optim_kwargs['contiguous_grad_buffer'] = True
-            optim_kwargs['contiguous_param_buffer'] = True
 
             # Make sure optimizer state is in FP32
             optim_dtype = torch.float32
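
In this hunk, setup_optimization stops forcing contiguous_param_buffer=True for the distributed Adam optimizer; only contiguous_grad_buffer is still set here. Below is a minimal sketch of how a caller could opt back in, assuming setup_optimization still forwards extra entries in optim_kwargs to the optimizer. Only the kwarg names visible in the diff come from the source; the helper name and the model instance in the usage comment are illustrative.

from typing import Any, Dict, Optional


def distopt_kwargs(contiguous_param_buffer: bool = False,
                   extra: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    # Mirror the post-change defaults: the grad-buffer flag is still set
    # inside setup_optimization in this hunk, so only the parameter buffer
    # needs to be requested explicitly.
    kwargs: Dict[str, Any] = dict(extra or {})
    if contiguous_param_buffer:
        kwargs['contiguous_param_buffer'] = True
    return kwargs


# Hypothetical usage, with `model` an already-built MegatronBaseModel subclass:
#     model.setup_optimization(optim_kwargs=distopt_kwargs(contiguous_param_buffer=True))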
@@ -507,7 +506,8 @@ def configure_optimizers(self):
             self._optimizer.init_params(reversed(no_overlap_params))
 
             # Initialize contiguous parameter buffer
-            self._optimizer.init_param_buffer()
+            if self._optimizer.contiguous_param_buffer:
+                self._optimizer.init_param_buffer()
 
         if self._scheduler is None:
             return self._optimizer
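
This hunk makes parameter-buffer initialization conditional: init_param_buffer() is only called when the optimizer was built with contiguous_param_buffer enabled. The sketch below shows the same guard written defensively, assuming the attribute may be missing on optimizer implementations that never expose a contiguous parameter buffer; only contiguous_param_buffer and init_param_buffer() come from the diff, and the helper name is illustrative.

def maybe_init_param_buffer(optimizer) -> None:
    # Initialize the contiguous parameter buffer only when the optimizer
    # actually maintains one; treat a missing attribute as disabled.
    if getattr(optimizer, 'contiguous_param_buffer', False):
        optimizer.init_param_buffer()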
