From 7bce6eccb62cf5cda67895abc2e1fa244a4a877a Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 11 Jul 2023 14:48:31 -0500 Subject: [PATCH 01/21] add builder class Signed-off-by: Maanu Grover --- .../megatron_trainer_builder.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 examples/nlp/language_modeling/megatron_trainer_builder.py diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py new file mode 100644 index 000000000000..c65279c649f6 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -0,0 +1,55 @@ +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + PipelineMixedPrecisionPlugin, +) + + +class MegatronTrainerBuilder: + + def __init__(self, cfg) -> None: + self.cfg = cfg + + def _training_strategy(self) -> NLPDDPStrategy: + return NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=self.cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + + def _grad_scaler(self) -> GradScaler: + return GradScaler( + init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=self.cfg.model.get('hysteresis', 2), + ) + + def _plugins(self) -> list: + megatron_amp_o2 = self.cfg.model.get('megatron_amp_O2', False) + with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' + + plugins = [] + if self.cfg.trainer.precision in [16, 'bf16']: + scaler = None + if self.cfg.trainer.precision == 16: + scaler = self.grad_scaler() + + if megatron_amp_o2 and not with_distributed_adam: + plugins.append(MegatronHalfPrecisionPlugin( + precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin( + precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + + if self.cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + return plugins + + def create_trainer(self) -> Trainer: + strategy = self._training_strategy() + plugins = self._plugins() + return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer) From a188faf66cf9c7a713f47a0f181c4081d7c23b14 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:14:26 -0500 Subject: [PATCH 02/21] formatting Signed-off-by: Maanu Grover --- .../nlp/language_modeling/megatron_trainer_builder.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index c65279c649f6..dbcbbd33073d 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -9,7 +9,6 @@ class MegatronTrainerBuilder: - def __init__(self, cfg) -> None: self.cfg = cfg @@ -38,11 +37,13 @@ def _plugins(self) -> list: scaler = self.grad_scaler() if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin( - precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + plugins.append( + MegatronHalfPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + ) else: - plugins.append(PipelineMixedPrecisionPlugin( - 
precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + plugins.append( + PipelineMixedPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + ) if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) From 6763963656dfc1464cfad158640d3868951f9626 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:18:11 -0500 Subject: [PATCH 03/21] use trainer builder for gpt pretraining example Signed-off-by: Maanu Grover --- .../megatron_gpt_pretraining.py | 39 +------------------ .../megatron_trainer_builder.py | 11 ++++-- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 5068f5d2222d..10ec4f526253 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -14,18 +14,13 @@ import torch.multiprocessing as mp +from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -38,37 +33,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index dbcbbd33073d..000132b6e3bb 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ 
b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -31,18 +31,21 @@ def _plugins(self) -> list: with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' plugins = [] - if self.cfg.trainer.precision in [16, 'bf16']: + if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None - if self.cfg.trainer.precision == 16: + if self.cfg.trainer.precision == [16, '16', '16-mixed']: scaler = self.grad_scaler() + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' if megatron_amp_o2 and not with_distributed_adam: plugins.append( - MegatronHalfPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) ) else: plugins.append( - PipelineMixedPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) ) if self.cfg.get('cluster_type', None) == 'BCP': From 6fab8cc585c197b07bc2a116feabcce542be571d Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:21:42 -0500 Subject: [PATCH 04/21] subclass trainer builder for bert Signed-off-by: Maanu Grover --- .../nlp/language_modeling/megatron_trainer_builder.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 000132b6e3bb..9464456d34d8 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -57,3 +57,11 @@ def create_trainer(self) -> Trainer: strategy = self._training_strategy() plugins = self._plugins() return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer) + + +class MegatronBertTrainerBuilder(MegatronTrainerBuilder): + def _grad_scaler(self) -> GradScaler: + return GradScaler( + init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), + ) From bfd62b92cfe3413704747a6142700e69034bef77 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:24:26 -0500 Subject: [PATCH 05/21] use trainer builder for bert pretraining example Signed-off-by: Maanu Grover --- .../megatron_bert_pretraining.py | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index 9199f03f0890..b73bd6cd6fa7 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -13,18 +13,13 @@ # limitations under the License. 
import torch.multiprocessing as mp +from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -38,36 +33,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + trainer = MegatronBertTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) From e18579d6407bbf293aee0a876e671874bc165909 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:44:23 -0500 Subject: [PATCH 06/21] subclass t5 builder and use in t5 pretraining Signed-off-by: Maanu Grover --- .../megatron_t5_pretraining.py | 36 ++----------------- .../megatron_trainer_builder.py | 8 +++++ 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index 1674faab773f..bf7617497f13 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,6 +13,7 @@ # limitations under the License. 
+from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary @@ -20,12 +21,6 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -36,34 +31,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = MegatronT5TrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 9464456d34d8..8857ebe4c27e 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -1,4 +1,5 @@ from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, @@ -65,3 +66,10 @@ def _grad_scaler(self) -> GradScaler: init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), ) + + +class MegatronT5TrainerBuilder(MegatronTrainerBuilder): + def create_trainer(self) -> Trainer: + strategy = self._training_strategy() + plugins = self._plugins() + return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) From 2cb1b0dfba3b02a17fecb348fc9f280b8a10d605 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 1 Aug 2023 18:26:50 -0500 Subject: [PATCH 07/21] move resume_from_checkpoint logic to exp_manager Signed-off-by: Maanu Grover --- 
.../nlp/language_modeling/megatron_gpt_pretraining.py | 7 ++----- .../nlp/language_modeling/megatron_trainer_builder.py | 2 +- nemo/utils/exp_manager.py | 9 +++++++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 10ec4f526253..48f3af604ff6 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -34,14 +34,11 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronTrainerBuilder(cfg).create_trainer() - + with open_dict(cfg): + cfg.exp_manager.resume_from_checkpoint = cfg.model.resume_from_checkpoint exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 8857ebe4c27e..f83a556e010a 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -35,7 +35,7 @@ def _plugins(self) -> list: if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None if self.cfg.trainer.precision == [16, '16', '16-mixed']: - scaler = self.grad_scaler() + scaler = self._grad_scaler() plugin_precision = '16-mixed' else: plugin_precision = 'bf16-mixed' diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 31d188776a41..c7ec98bdc07b 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -35,6 +35,7 @@ from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger from pytorch_lightning.loops import _TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION @@ -156,6 +157,7 @@ class ExpManagerConfig: create_early_stopping_callback: Optional[bool] = False early_stopping_callback_params: Optional[EarlyStoppingParams] = EarlyStoppingParams() create_preemption_callback: Optional[bool] = True + resume_from_checkpoint: Optional[str] = None # Additional exp_manager arguments files_to_copy: Optional[List[str]] = None # logs timing of train/val/test steps @@ -343,6 +345,13 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo else: check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) + if cfg.resume_from_checkpoint is not None: + trainer.ckpt_path = cfg.model.resume_from_checkpoint + + logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') + + trainer._checkpoint_connector = _CheckpointConnector(trainer) + checkpoint_name = name # If name returned from get_log_dir is "", use cfg.name for checkpointing if checkpoint_name is None or checkpoint_name == '': From 77f1c86ab15c5fee7fc7a46a3e7c633582615bc7 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 2 Aug 2023 20:34:54 -0500 Subject: [PATCH 08/21] add docstring for 
resume_from_checkpoint Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index c7ec98bdc07b..ee118e45f1d2 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -140,6 +140,7 @@ class ExpManagerConfig: resume_if_exists: Optional[bool] = False resume_past_end: Optional[bool] = False resume_ignore_no_checkpoint: Optional[bool] = False + resume_from_checkpoint: Optional[str] = None # Logging parameters create_tensorboard_logger: Optional[bool] = True summary_writer_kwargs: Optional[Dict[Any, Any]] = None @@ -157,7 +158,6 @@ class ExpManagerConfig: create_early_stopping_callback: Optional[bool] = False early_stopping_callback_params: Optional[EarlyStoppingParams] = EarlyStoppingParams() create_preemption_callback: Optional[bool] = True - resume_from_checkpoint: Optional[str] = None # Additional exp_manager arguments files_to_copy: Optional[List[str]] = None # logs timing of train/val/test steps @@ -259,6 +259,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint could be found. This behaviour can be disabled, in which case exp_manager will print a message and continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False. + - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will + override any checkpoint found when resume_if_exists is True. Defaults to None. - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch lightning trainer. Defaults to True. 
- summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger From 0344f4ea603040cfcf979009c75461b44f9b6e06 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 2 Aug 2023 20:36:00 -0500 Subject: [PATCH 09/21] set resume_from_checkpoint with interpolation Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 + examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 3747e85fc622..98711d9a41cd 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -33,6 +33,7 @@ exp_manager: name: null resume_if_exists: True resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} create_checkpoint_callback: True checkpoint_callback_params: monitor: val_loss diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 48f3af604ff6..878dd0b76c8f 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -34,8 +34,6 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronTrainerBuilder(cfg).create_trainer() - with open_dict(cfg): - cfg.exp_manager.resume_from_checkpoint = cfg.model.resume_from_checkpoint exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager From 96dc68e60ea81e9e38c5523816e89cbadb2e7fae Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 3 Aug 2023 15:19:03 -0500 Subject: [PATCH 10/21] remove refactored lines Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 4 ---- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 -- examples/nlp/language_modeling/megatron_t5_pretraining.py | 5 ----- 3 files changed, 11 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index b73bd6cd6fa7..75eb1e4b930b 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -34,12 +34,8 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 878dd0b76c8f..fd090cccf5ec 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -36,8 +36,6 @@ def main(cfg) -> None: trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # update resume from checkpoint found by exp_manager - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with 
open_dict(cfg): cfg.model.precision = cfg.trainer.precision diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index bf7617497f13..d01a6ab7aa59 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -34,11 +34,6 @@ def main(cfg) -> None: trainer = MegatronT5TrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision From caaebd0f6db87b7aa3381d9488317a04e5d38530 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 3 Aug 2023 15:22:35 -0500 Subject: [PATCH 11/21] unused import Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 3 --- examples/nlp/language_modeling/megatron_t5_pretraining.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index 75eb1e4b930b..ee1a1e9a1776 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -15,9 +15,6 @@ import torch.multiprocessing as mp from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.core.config import hydra_runner diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index d01a6ab7aa59..fcca0ea27b0c 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -15,10 +15,6 @@ from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.core.config import hydra_runner From 84c0130b2c9feaf96d1af06bab21a3af6b3c374c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 22:18:38 +0000 Subject: [PATCH 12/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/language_modeling/megatron_trainer_builder.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index f83a556e010a..fb992b6a1a22 100644 --- 
a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -41,13 +41,9 @@ def _plugins(self) -> list: plugin_precision = 'bf16-mixed' if megatron_amp_o2 and not with_distributed_adam: - plugins.append( - MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) - ) + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: - plugins.append( - PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) - ) + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) From 703f3e4bf2b6add42f859f9defda38b686dc8761 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 7 Aug 2023 16:09:18 -0500 Subject: [PATCH 13/21] another unused import Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 3 --- nemo/utils/exp_manager.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index fd090cccf5ec..e8563a07555f 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -16,9 +16,6 @@ import torch.multiprocessing as mp from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.core.config import hydra_runner diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ee118e45f1d2..53b21700dbcf 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -259,7 +259,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint could be found. This behaviour can be disabled, in which case exp_manager will print a message and continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False. - - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will + - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will override any checkpoint found when resume_if_exists is True. Defaults to None. - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch lightning trainer. Defaults to True. 
From ca25161ad60de27fb836dd75ef651586571d47de Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 8 Aug 2023 16:35:32 -0500 Subject: [PATCH 14/21] bug fix Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_trainer_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index fb992b6a1a22..67c34fd4910a 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -34,7 +34,7 @@ def _plugins(self) -> list: plugins = [] if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None - if self.cfg.trainer.precision == [16, '16', '16-mixed']: + if self.cfg.trainer.precision in [16, '16', '16-mixed']: scaler = self._grad_scaler() plugin_precision = '16-mixed' else: From fd098f05d8f49528b40ae49ae6a6c024c8427988 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 8 Aug 2023 21:40:53 -0500 Subject: [PATCH 15/21] another bug missed in rebase Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 53b21700dbcf..60efa5d9518e 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -348,7 +348,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) if cfg.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint + trainer.ckpt_path = cfg.resume_from_checkpoint logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') From f81cc4de63c03343af544b68b706465597428120 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 13:57:09 -0500 Subject: [PATCH 16/21] add copyright Signed-off-by: Maanu Grover --- .../language_modeling/megatron_trainer_builder.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 67c34fd4910a..49901b173761 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment From 744f9e4cdb0f448ae5a025d9f3de935dce8c2b95 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:20:49 -0500 Subject: [PATCH 17/21] add type annotation Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_trainer_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 49901b173761..d4d5a4fbb587 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from omegaconf import DictConfig from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment @@ -24,7 +25,7 @@ class MegatronTrainerBuilder: - def __init__(self, cfg) -> None: + def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg def _training_strategy(self) -> NLPDDPStrategy: From 5d8bb3a48226bbabf3d7f3c394e6065f14d5f0e6 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:21:16 -0500 Subject: [PATCH 18/21] docstrings for trainer builder Signed-off-by: Maanu Grover --- .../megatron_trainer_builder.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index d4d5a4fbb587..e5af76cbd1ec 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -25,10 +25,18 @@ class MegatronTrainerBuilder: + """ + Builder type to hide complex configuration of PTL Trainers for Megatron LLM models. + Can be extended to change behavior for a specific model. + """ + def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg def _training_strategy(self) -> NLPDDPStrategy: + """ + Returns a ddp strategy passed to Trainer.strategy. + """ return NLPDDPStrategy( no_ddp_communication_hook=True, gradient_as_bucket_view=self.cfg.model.gradient_as_bucket_view, @@ -36,6 +44,9 @@ def _training_strategy(self) -> NLPDDPStrategy: ) def _grad_scaler(self) -> GradScaler: + """ + Returns a scaler for precision plugins. + """ return GradScaler( init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), @@ -43,6 +54,10 @@ def _grad_scaler(self) -> GradScaler: ) def _plugins(self) -> list: + """ + Returns: + plugins: list of plugins passed to Trainer.plugins including precision plugins. 
+ """ megatron_amp_o2 = self.cfg.model.get('megatron_amp_O2', False) with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' @@ -72,6 +87,8 @@ def create_trainer(self) -> Trainer: class MegatronBertTrainerBuilder(MegatronTrainerBuilder): + """Builder for BERT model Trainer with overrides.""" + def _grad_scaler(self) -> GradScaler: return GradScaler( init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -80,6 +97,8 @@ def _grad_scaler(self) -> GradScaler: class MegatronT5TrainerBuilder(MegatronTrainerBuilder): + """Builder for T5 model Trainer with overrides.""" + def create_trainer(self) -> Trainer: strategy = self._training_strategy() plugins = self._plugins() From 4040cccfa5474c2e82daccb7299f610afbb7c98d Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:42:01 -0500 Subject: [PATCH 19/21] move trainer builder file Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_t5_pretraining.py | 2 +- .../collections/nlp/parts}/megatron_trainer_builder.py | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename {examples/nlp/language_modeling => nemo/collections/nlp/parts}/megatron_trainer_builder.py (100%) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index ee1a1e9a1776..5b4876141f74 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. import torch.multiprocessing as mp -from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index e8563a07555f..291a85ac8a05 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -14,10 +14,10 @@ import torch.multiprocessing as mp -from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index fcca0ea27b0c..ea5d751ab59a 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. 
-from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronT5TrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py similarity index 100% rename from examples/nlp/language_modeling/megatron_trainer_builder.py rename to nemo/collections/nlp/parts/megatron_trainer_builder.py From d7000ee32532e68db3acbdda1065acb153741513 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 16:46:52 -0500 Subject: [PATCH 20/21] not needed for ptl 2.0 Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 60efa5d9518e..6cfede2d7573 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -35,7 +35,6 @@ from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger from pytorch_lightning.loops import _TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION @@ -352,8 +351,6 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - trainer._checkpoint_connector = _CheckpointConnector(trainer) - checkpoint_name = name # If name returned from get_log_dir is "", use cfg.name for checkpointing if checkpoint_name is None or checkpoint_name == '': From a9c4a6511b4817c3be864ae535b76097f2ec1673 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 10 Aug 2023 19:59:34 -0500 Subject: [PATCH 21/21] disable resume_from_checkpoint logic in exp_manager Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 6cfede2d7573..63775f4058c5 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -346,8 +346,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo else: check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) - if cfg.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.resume_from_checkpoint + # TODO: this behavior is undesirable, need ckpts in exp_dir to take priority if present over resume_from_checkpoint + # if cfg.resume_from_checkpoint is not None: + # trainer.ckpt_path = cfg.resume_from_checkpoint logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')
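
Taken together, the series leaves the example scripts with a very small surface: build the appropriate trainer builder from the hydra config, call create_trainer(), and hand the result to exp_manager, which now also carries the resume_from_checkpoint setting. The sketch below shows what a pretraining entry point looks like against the final state of the branch; it mirrors megatron_gpt_pretraining.py as patched above, but it is only an illustration. The @hydra_runner config name, the MegatronGPTModel(cfg.model, trainer) wiring, and the omission of the multiprocessing setup are assumptions carried over from the existing example scripts rather than lines shown in the hunks.

# Sketch of a pretraining entry point against the final state of this series.
# The hydra config name and the model construction below are assumptions taken
# from the existing example scripts, not part of the hunks above.
from omegaconf.omegaconf import OmegaConf, open_dict

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
def main(cfg) -> None:
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    # All NLPDDPStrategy / precision-plugin / GradScaler wiring now lives in the builder.
    trainer = MegatronTrainerBuilder(cfg).create_trainer()

    # Resume handling now flows through exp_manager; the GPT config interpolates
    # exp_manager.resume_from_checkpoint from ${model.resume_from_checkpoint}
    # (the override itself is commented out pending the TODO in the last patch).
    exp_manager(trainer, cfg.exp_manager)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel(cfg.model, trainer)
    trainer.fit(model)


if __name__ == '__main__':
    main()

The same entry-point shape holds for the other scripts: megatron_bert_pretraining.py swaps in MegatronBertTrainerBuilder, whose GradScaler drops the hysteresis argument, and megatron_t5_pretraining.py uses MegatronT5TrainerBuilder, whose create_trainer() adds the ModelSummary(max_depth=3) callback. Further model types can subclass MegatronTrainerBuilder from nemo.collections.nlp.parts and override _grad_scaler(), _training_strategy(), _plugins(), or create_trainer() in the same way.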