From 7bce6eccb62cf5cda67895abc2e1fa244a4a877a Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 11 Jul 2023 14:48:31 -0500 Subject: [PATCH 01/21] add builder class Signed-off-by: Maanu Grover --- .../megatron_trainer_builder.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 examples/nlp/language_modeling/megatron_trainer_builder.py diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py new file mode 100644 index 000000000000..c65279c649f6 --- /dev/null +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -0,0 +1,55 @@ +from pytorch_lightning import Trainer +from pytorch_lightning.plugins.environments import TorchElasticEnvironment +from nemo.collections.nlp.parts.nlp_overrides import ( + GradScaler, + MegatronHalfPrecisionPlugin, + NLPDDPStrategy, + PipelineMixedPrecisionPlugin, +) + + +class MegatronTrainerBuilder: + + def __init__(self, cfg) -> None: + self.cfg = cfg + + def _training_strategy(self) -> NLPDDPStrategy: + return NLPDDPStrategy( + no_ddp_communication_hook=True, + gradient_as_bucket_view=self.cfg.model.gradient_as_bucket_view, + find_unused_parameters=False, + ) + + def _grad_scaler(self) -> GradScaler: + return GradScaler( + init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), + hysteresis=self.cfg.model.get('hysteresis', 2), + ) + + def _plugins(self) -> list: + megatron_amp_o2 = self.cfg.model.get('megatron_amp_O2', False) + with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' + + plugins = [] + if self.cfg.trainer.precision in [16, 'bf16']: + scaler = None + if self.cfg.trainer.precision == 16: + scaler = self.grad_scaler() + + if megatron_amp_o2 and not with_distributed_adam: + plugins.append(MegatronHalfPrecisionPlugin( + precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + else: + plugins.append(PipelineMixedPrecisionPlugin( + precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + + if self.cfg.get('cluster_type', None) == 'BCP': + plugins.append(TorchElasticEnvironment()) + + return plugins + + def create_trainer(self) -> Trainer: + strategy = self._training_strategy() + plugins = self._plugins() + return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer) From a188faf66cf9c7a713f47a0f181c4081d7c23b14 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:14:26 -0500 Subject: [PATCH 02/21] formatting Signed-off-by: Maanu Grover --- .../nlp/language_modeling/megatron_trainer_builder.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index c65279c649f6..dbcbbd33073d 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -9,7 +9,6 @@ class MegatronTrainerBuilder: - def __init__(self, cfg) -> None: self.cfg = cfg @@ -38,11 +37,13 @@ def _plugins(self) -> list: scaler = self.grad_scaler() if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin( - precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + plugins.append( + MegatronHalfPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + ) else: - plugins.append(PipelineMixedPrecisionPlugin( - 
precision=self.cfg.trainer.precision, device='cuda', scaler=scaler)) + plugins.append( + PipelineMixedPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + ) if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) From 6763963656dfc1464cfad158640d3868951f9626 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:18:11 -0500 Subject: [PATCH 03/21] use trainer builder for gpt pretraining example Signed-off-by: Maanu Grover --- .../megatron_gpt_pretraining.py | 39 +------------------ .../megatron_trainer_builder.py | 11 ++++-- 2 files changed, 9 insertions(+), 41 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 5068f5d2222d..10ec4f526253 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -14,18 +14,13 @@ import torch.multiprocessing as mp +from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -38,37 +33,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index dbcbbd33073d..000132b6e3bb 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ 
b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -31,18 +31,21 @@ def _plugins(self) -> list: with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' plugins = [] - if self.cfg.trainer.precision in [16, 'bf16']: + if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None - if self.cfg.trainer.precision == 16: + if self.cfg.trainer.precision == [16, '16', '16-mixed']: scaler = self.grad_scaler() + plugin_precision = '16-mixed' + else: + plugin_precision = 'bf16-mixed' if megatron_amp_o2 and not with_distributed_adam: plugins.append( - MegatronHalfPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) ) else: plugins.append( - PipelineMixedPrecisionPlugin(precision=self.cfg.trainer.precision, device='cuda', scaler=scaler) + PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) ) if self.cfg.get('cluster_type', None) == 'BCP': From 6fab8cc585c197b07bc2a116feabcce542be571d Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:21:42 -0500 Subject: [PATCH 04/21] subclass trainer builder for bert Signed-off-by: Maanu Grover --- .../nlp/language_modeling/megatron_trainer_builder.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 000132b6e3bb..9464456d34d8 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -57,3 +57,11 @@ def create_trainer(self) -> Trainer: strategy = self._training_strategy() plugins = self._plugins() return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer) + + +class MegatronBertTrainerBuilder(MegatronTrainerBuilder): + def _grad_scaler(self) -> GradScaler: + return GradScaler( + init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), + growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), + ) From bfd62b92cfe3413704747a6142700e69034bef77 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:24:26 -0500 Subject: [PATCH 05/21] use trainer builder for bert pretraining example Signed-off-by: Maanu Grover --- .../megatron_bert_pretraining.py | 38 +------------------ 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index 9199f03f0890..b73bd6cd6fa7 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -13,18 +13,13 @@ # limitations under the License. 
import torch.multiprocessing as mp +from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.plugins.environments import TorchElasticEnvironment from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -38,36 +33,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - ) - # MixedPrecisionPlugin in PTL >= 2.0 requires precision to be 16-mixed or bf16-mixed - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer) + trainer = MegatronBertTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) From e18579d6407bbf293aee0a876e671874bc165909 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 13 Jul 2023 15:44:23 -0500 Subject: [PATCH 06/21] subclass t5 builder and use in t5 pretraining Signed-off-by: Maanu Grover --- .../megatron_t5_pretraining.py | 36 ++----------------- .../megatron_trainer_builder.py | 8 +++++ 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index 1674faab773f..bf7617497f13 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,6 +13,7 @@ # limitations under the License. 
+from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary @@ -20,12 +21,6 @@ from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model -from nemo.collections.nlp.parts.nlp_overrides import ( - GradScaler, - MegatronHalfPrecisionPlugin, - NLPDDPStrategy, - PipelineMixedPrecisionPlugin, -) from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager @@ -36,34 +31,7 @@ def main(cfg) -> None: logging.info("\n\n************** Experiment configuration ***********") logging.info(f'\n{OmegaConf.to_yaml(cfg)}') - megatron_amp_o2 = cfg.model.get('megatron_amp_O2', False) - with_distributed_adam = cfg.model.optim.get('name') == 'distributed_fused_adam' - plugins = [] - strategy = NLPDDPStrategy( - no_ddp_communication_hook=True, # we don't use DDP for async grad allreduce - gradient_as_bucket_view=cfg.model.gradient_as_bucket_view, - find_unused_parameters=False, - ) - if cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: - scaler = None - if cfg.trainer.precision in [16, '16', '16-mixed']: - scaler = GradScaler( - init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32), - growth_interval=cfg.model.get('native_amp_growth_interval', 1000), - hysteresis=cfg.model.get('hysteresis', 2), - ) - plugin_precision = '16-mixed' - else: - plugin_precision = 'bf16-mixed' - if megatron_amp_o2 and not with_distributed_adam: - plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - else: - plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) - - if cfg.get('cluster_type', None) == 'BCP': - plugins.append(TorchElasticEnvironment()) - - trainer = Trainer(plugins=plugins, strategy=strategy, **cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) + trainer = MegatronT5TrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 9464456d34d8..8857ebe4c27e 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -1,4 +1,5 @@ from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment from nemo.collections.nlp.parts.nlp_overrides import ( GradScaler, @@ -65,3 +66,10 @@ def _grad_scaler(self) -> GradScaler: init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), ) + + +class MegatronT5TrainerBuilder(MegatronTrainerBuilder): + def create_trainer(self) -> Trainer: + strategy = self._training_strategy() + plugins = self._plugins() + return Trainer(plugins=plugins, strategy=strategy, **self.cfg.trainer, callbacks=[ModelSummary(max_depth=3)]) From 2cb1b0dfba3b02a17fecb348fc9f280b8a10d605 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 1 Aug 2023 18:26:50 -0500 Subject: [PATCH 07/21] move resume_from_checkpoint logic to exp_manager Signed-off-by: Maanu Grover --- 
.../nlp/language_modeling/megatron_gpt_pretraining.py | 7 ++----- .../nlp/language_modeling/megatron_trainer_builder.py | 2 +- nemo/utils/exp_manager.py | 9 +++++++++ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 10ec4f526253..48f3af604ff6 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -34,14 +34,11 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronTrainerBuilder(cfg).create_trainer() - + with open_dict(cfg): + cfg.exp_manager.resume_from_checkpoint = cfg.model.resume_from_checkpoint exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 8857ebe4c27e..f83a556e010a 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -35,7 +35,7 @@ def _plugins(self) -> list: if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None if self.cfg.trainer.precision == [16, '16', '16-mixed']: - scaler = self.grad_scaler() + scaler = self._grad_scaler() plugin_precision = '16-mixed' else: plugin_precision = 'bf16-mixed' diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 31d188776a41..c7ec98bdc07b 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -35,6 +35,7 @@ from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger from pytorch_lightning.loops import _TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION @@ -156,6 +157,7 @@ class ExpManagerConfig: create_early_stopping_callback: Optional[bool] = False early_stopping_callback_params: Optional[EarlyStoppingParams] = EarlyStoppingParams() create_preemption_callback: Optional[bool] = True + resume_from_checkpoint: Optional[str] = None # Additional exp_manager arguments files_to_copy: Optional[List[str]] = None # logs timing of train/val/test steps @@ -343,6 +345,13 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo else: check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) + if cfg.resume_from_checkpoint is not None: + trainer.ckpt_path = cfg.model.resume_from_checkpoint + + logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') + + trainer._checkpoint_connector = _CheckpointConnector(trainer) + checkpoint_name = name # If name returned from get_log_dir is "", use cfg.name for checkpointing if checkpoint_name is None or checkpoint_name == '': From 77f1c86ab15c5fee7fc7a46a3e7c633582615bc7 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 2 Aug 2023 20:34:54 -0500 Subject: [PATCH 08/21] add docstring for 
resume_from_checkpoint Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index c7ec98bdc07b..ee118e45f1d2 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -140,6 +140,7 @@ class ExpManagerConfig: resume_if_exists: Optional[bool] = False resume_past_end: Optional[bool] = False resume_ignore_no_checkpoint: Optional[bool] = False + resume_from_checkpoint: Optional[str] = None # Logging parameters create_tensorboard_logger: Optional[bool] = True summary_writer_kwargs: Optional[Dict[Any, Any]] = None @@ -157,7 +158,6 @@ class ExpManagerConfig: create_early_stopping_callback: Optional[bool] = False early_stopping_callback_params: Optional[EarlyStoppingParams] = EarlyStoppingParams() create_preemption_callback: Optional[bool] = True - resume_from_checkpoint: Optional[str] = None # Additional exp_manager arguments files_to_copy: Optional[List[str]] = None # logs timing of train/val/test steps @@ -259,6 +259,8 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint could be found. This behaviour can be disabled, in which case exp_manager will print a message and continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False. + - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will + override any checkpoint found when resume_if_exists is True. Defaults to None. - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch lightning trainer. Defaults to True. 
- summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger From 0344f4ea603040cfcf979009c75461b44f9b6e06 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 2 Aug 2023 20:36:00 -0500 Subject: [PATCH 09/21] set resume_from_checkpoint with interpolation Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/conf/megatron_gpt_config.yaml | 1 + examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index 3747e85fc622..98711d9a41cd 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -33,6 +33,7 @@ exp_manager: name: null resume_if_exists: True resume_ignore_no_checkpoint: True + resume_from_checkpoint: ${model.resume_from_checkpoint} create_checkpoint_callback: True checkpoint_callback_params: monitor: val_loss diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 48f3af604ff6..878dd0b76c8f 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -34,8 +34,6 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronTrainerBuilder(cfg).create_trainer() - with open_dict(cfg): - cfg.exp_manager.resume_from_checkpoint = cfg.model.resume_from_checkpoint exp_manager(trainer, cfg.exp_manager) # update resume from checkpoint found by exp_manager From 96dc68e60ea81e9e38c5523816e89cbadb2e7fae Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 3 Aug 2023 15:19:03 -0500 Subject: [PATCH 10/21] remove refactored lines Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 4 ---- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 -- examples/nlp/language_modeling/megatron_t5_pretraining.py | 5 ----- 3 files changed, 11 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index b73bd6cd6fa7..75eb1e4b930b 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -34,12 +34,8 @@ def main(cfg) -> None: logging.info(f'\n{OmegaConf.to_yaml(cfg)}') trainer = MegatronBertTrainerBuilder(cfg).create_trainer() - exp_manager(trainer, cfg.exp_manager) - # resume_from_checkpoint = uninject_model_parallel_rank(resume_from_checkpoint) - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index 878dd0b76c8f..fd090cccf5ec 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -36,8 +36,6 @@ def main(cfg) -> None: trainer = MegatronTrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # update resume from checkpoint found by exp_manager - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with 
open_dict(cfg): cfg.model.precision = cfg.trainer.precision diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index bf7617497f13..d01a6ab7aa59 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -34,11 +34,6 @@ def main(cfg) -> None: trainer = MegatronT5TrainerBuilder(cfg).create_trainer() exp_manager(trainer, cfg.exp_manager) - # update resume from checkpoint found by exp_manager - if cfg.model.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint - logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams with open_dict(cfg): cfg.model.precision = cfg.trainer.precision From caaebd0f6db87b7aa3381d9488317a04e5d38530 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 3 Aug 2023 15:22:35 -0500 Subject: [PATCH 11/21] unused import Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 3 --- examples/nlp/language_modeling/megatron_t5_pretraining.py | 4 ---- 2 files changed, 7 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index 75eb1e4b930b..ee1a1e9a1776 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -15,9 +15,6 @@ import torch.multiprocessing as mp from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel from nemo.core.config import hydra_runner diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index d01a6ab7aa59..fcca0ea27b0c 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -15,10 +15,6 @@ from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelSummary -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model from nemo.core.config import hydra_runner From 84c0130b2c9feaf96d1af06bab21a3af6b3c374c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 22:18:38 +0000 Subject: [PATCH 12/21] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../nlp/language_modeling/megatron_trainer_builder.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index f83a556e010a..fb992b6a1a22 100644 --- 
a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -41,13 +41,9 @@ def _plugins(self) -> list: plugin_precision = 'bf16-mixed' if megatron_amp_o2 and not with_distributed_adam: - plugins.append( - MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) - ) + plugins.append(MegatronHalfPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) else: - plugins.append( - PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler) - ) + plugins.append(PipelineMixedPrecisionPlugin(precision=plugin_precision, device='cuda', scaler=scaler)) if self.cfg.get('cluster_type', None) == 'BCP': plugins.append(TorchElasticEnvironment()) From 703f3e4bf2b6add42f859f9defda38b686dc8761 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 7 Aug 2023 16:09:18 -0500 Subject: [PATCH 13/21] another unused import Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 3 --- nemo/utils/exp_manager.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index fd090cccf5ec..e8563a07555f 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -16,9 +16,6 @@ import torch.multiprocessing as mp from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict -from pytorch_lightning import Trainer -from pytorch_lightning.plugins.environments import TorchElasticEnvironment -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel from nemo.core.config import hydra_runner diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index ee118e45f1d2..53b21700dbcf 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -259,7 +259,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint could be found. This behaviour can be disabled, in which case exp_manager will print a message and continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False. - - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will + - resume_from_checkpoint (str): Can be used to specify a path to a specific checkpoint file to load from. This will override any checkpoint found when resume_if_exists is True. Defaults to None. - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch lightning trainer. Defaults to True. 
From ca25161ad60de27fb836dd75ef651586571d47de Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 8 Aug 2023 16:35:32 -0500 Subject: [PATCH 14/21] bug fix Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_trainer_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index fb992b6a1a22..67c34fd4910a 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -34,7 +34,7 @@ def _plugins(self) -> list: plugins = [] if self.cfg.trainer.precision in [16, '16', 'bf16', '16-mixed', 'bf16-mixed']: scaler = None - if self.cfg.trainer.precision == [16, '16', '16-mixed']: + if self.cfg.trainer.precision in [16, '16', '16-mixed']: scaler = self._grad_scaler() plugin_precision = '16-mixed' else: From fd098f05d8f49528b40ae49ae6a6c024c8427988 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 8 Aug 2023 21:40:53 -0500 Subject: [PATCH 15/21] another bug missed in rebase Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 53b21700dbcf..60efa5d9518e 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -348,7 +348,7 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) if cfg.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.model.resume_from_checkpoint + trainer.ckpt_path = cfg.resume_from_checkpoint logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') From f81cc4de63c03343af544b68b706465597428120 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 13:57:09 -0500 Subject: [PATCH 16/21] add copyright Signed-off-by: Maanu Grover --- .../language_modeling/megatron_trainer_builder.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 67c34fd4910a..49901b173761 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -1,3 +1,17 @@ +# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment From 744f9e4cdb0f448ae5a025d9f3de935dce8c2b95 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:20:49 -0500 Subject: [PATCH 17/21] add type annotation Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_trainer_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index 49901b173761..d4d5a4fbb587 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from omegaconf import DictConfig from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelSummary from pytorch_lightning.plugins.environments import TorchElasticEnvironment @@ -24,7 +25,7 @@ class MegatronTrainerBuilder: - def __init__(self, cfg) -> None: + def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg def _training_strategy(self) -> NLPDDPStrategy: From 5d8bb3a48226bbabf3d7f3c394e6065f14d5f0e6 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:21:16 -0500 Subject: [PATCH 18/21] docstrings for trainer builder Signed-off-by: Maanu Grover --- .../megatron_trainer_builder.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/examples/nlp/language_modeling/megatron_trainer_builder.py index d4d5a4fbb587..e5af76cbd1ec 100644 --- a/examples/nlp/language_modeling/megatron_trainer_builder.py +++ b/examples/nlp/language_modeling/megatron_trainer_builder.py @@ -25,10 +25,18 @@ class MegatronTrainerBuilder: + """ + Builder type to hide complex configuration of PTL Trainers for Megatron LLM models. + Can be extended to change behavior for a specific model. + """ + def __init__(self, cfg: DictConfig) -> None: self.cfg = cfg def _training_strategy(self) -> NLPDDPStrategy: + """ + Returns a ddp strategy passed to Trainer.strategy. + """ return NLPDDPStrategy( no_ddp_communication_hook=True, gradient_as_bucket_view=self.cfg.model.gradient_as_bucket_view, @@ -36,6 +44,9 @@ def _training_strategy(self) -> NLPDDPStrategy: ) def _grad_scaler(self) -> GradScaler: + """ + Returns a scaler for precision plugins. + """ return GradScaler( init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), growth_interval=self.cfg.model.get('native_amp_growth_interval', 1000), @@ -43,6 +54,10 @@ def _grad_scaler(self) -> GradScaler: ) def _plugins(self) -> list: + """ + Returns: + plugins: list of plugins passed to Trainer.plugins including precision plugins. 
+ """ megatron_amp_o2 = self.cfg.model.get('megatron_amp_O2', False) with_distributed_adam = self.cfg.model.optim.get('name') == 'distributed_fused_adam' @@ -72,6 +87,8 @@ def create_trainer(self) -> Trainer: class MegatronBertTrainerBuilder(MegatronTrainerBuilder): + """Builder for BERT model Trainer with overrides.""" + def _grad_scaler(self) -> GradScaler: return GradScaler( init_scale=self.cfg.model.get('native_amp_init_scale', 2 ** 32), @@ -80,6 +97,8 @@ def _grad_scaler(self) -> GradScaler: class MegatronT5TrainerBuilder(MegatronTrainerBuilder): + """Builder for T5 model Trainer with overrides.""" + def create_trainer(self) -> Trainer: strategy = self._training_strategy() plugins = self._plugins() From 4040cccfa5474c2e82daccb7299f610afbb7c98d Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 14:42:01 -0500 Subject: [PATCH 19/21] move trainer builder file Signed-off-by: Maanu Grover --- examples/nlp/language_modeling/megatron_bert_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_gpt_pretraining.py | 2 +- examples/nlp/language_modeling/megatron_t5_pretraining.py | 2 +- .../collections/nlp/parts}/megatron_trainer_builder.py | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename {examples/nlp/language_modeling => nemo/collections/nlp/parts}/megatron_trainer_builder.py (100%) diff --git a/examples/nlp/language_modeling/megatron_bert_pretraining.py b/examples/nlp/language_modeling/megatron_bert_pretraining.py index ee1a1e9a1776..5b4876141f74 100644 --- a/examples/nlp/language_modeling/megatron_bert_pretraining.py +++ b/examples/nlp/language_modeling/megatron_bert_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. import torch.multiprocessing as mp -from megatron_trainer_builder import MegatronBertTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_bert_model import MegatronBertModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_gpt_pretraining.py b/examples/nlp/language_modeling/megatron_gpt_pretraining.py index e8563a07555f..291a85ac8a05 100644 --- a/examples/nlp/language_modeling/megatron_gpt_pretraining.py +++ b/examples/nlp/language_modeling/megatron_gpt_pretraining.py @@ -14,10 +14,10 @@ import torch.multiprocessing as mp -from megatron_trainer_builder import MegatronTrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_t5_pretraining.py b/examples/nlp/language_modeling/megatron_t5_pretraining.py index fcca0ea27b0c..ea5d751ab59a 100644 --- a/examples/nlp/language_modeling/megatron_t5_pretraining.py +++ b/examples/nlp/language_modeling/megatron_t5_pretraining.py @@ -13,10 +13,10 @@ # limitations under the License. 
-from megatron_trainer_builder import MegatronT5TrainerBuilder from omegaconf.omegaconf import OmegaConf, open_dict from nemo.collections.nlp.models.language_modeling.megatron_t5_model import MegatronT5Model +from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronT5TrainerBuilder from nemo.core.config import hydra_runner from nemo.utils import logging from nemo.utils.exp_manager import exp_manager diff --git a/examples/nlp/language_modeling/megatron_trainer_builder.py b/nemo/collections/nlp/parts/megatron_trainer_builder.py similarity index 100% rename from examples/nlp/language_modeling/megatron_trainer_builder.py rename to nemo/collections/nlp/parts/megatron_trainer_builder.py From d7000ee32532e68db3acbdda1065acb153741513 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 9 Aug 2023 16:46:52 -0500 Subject: [PATCH 20/21] not needed for ptl 2.0 Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 60efa5d9518e..6cfede2d7573 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -35,7 +35,6 @@ from pytorch_lightning.loggers import MLFlowLogger, TensorBoardLogger, WandbLogger from pytorch_lightning.loops import _TrainingEpochLoop from pytorch_lightning.strategies.ddp import DDPStrategy -from pytorch_lightning.trainer.connectors.checkpoint_connector import _CheckpointConnector from nemo.collections.common.callbacks import EMA from nemo.constants import NEMO_ENV_VARNAME_TESTING, NEMO_ENV_VARNAME_VERSION @@ -352,8 +351,6 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}') - trainer._checkpoint_connector = _CheckpointConnector(trainer) - checkpoint_name = name # If name returned from get_log_dir is "", use cfg.name for checkpointing if checkpoint_name is None or checkpoint_name == '': From a9c4a6511b4817c3be864ae535b76097f2ec1673 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Thu, 10 Aug 2023 19:59:34 -0500 Subject: [PATCH 21/21] disable resume_from_checkpoint logic in exp_manager Signed-off-by: Maanu Grover --- nemo/utils/exp_manager.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py index 6cfede2d7573..63775f4058c5 100644 --- a/nemo/utils/exp_manager.py +++ b/nemo/utils/exp_manager.py @@ -346,8 +346,9 @@ def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictCo else: check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint) - if cfg.resume_from_checkpoint is not None: - trainer.ckpt_path = cfg.resume_from_checkpoint + # TODO: this behavior is undesirable, need ckpts in exp_dir to take priority if present over resume_from_checkpoint + # if cfg.resume_from_checkpoint is not None: + # trainer.ckpt_path = cfg.resume_from_checkpoint logging.info(f'Resuming training from checkpoint: {trainer.ckpt_path}')
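
Taken together, the series leaves the example scripts with a very small surface: build the appropriate trainer builder from the hydra config, call create_trainer(), and hand the result to exp_manager, which now also carries the resume_from_checkpoint setting. The sketch below shows what a pretraining entry point looks like against the final state of the branch; it mirrors megatron_gpt_pretraining.py as patched above, but it is only an illustration. The @hydra_runner config name, the MegatronGPTModel(cfg.model, trainer) wiring, and the omission of the multiprocessing setup are assumptions carried over from the existing example scripts rather than lines shown in the hunks.

# Sketch of a pretraining entry point against the final state of this series.
# The hydra config name and the model construction below are assumptions taken
# from the existing example scripts, not part of the hunks above.
from omegaconf.omegaconf import OmegaConf, open_dict

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronTrainerBuilder
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
def main(cfg) -> None:
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    # All NLPDDPStrategy / precision-plugin / GradScaler wiring now lives in the builder.
    trainer = MegatronTrainerBuilder(cfg).create_trainer()

    # Resume handling now flows through exp_manager; the GPT config interpolates
    # exp_manager.resume_from_checkpoint from ${model.resume_from_checkpoint}
    # (the override itself is commented out pending the TODO in the last patch).
    exp_manager(trainer, cfg.exp_manager)

    # hydra interpolation does not work here as the interpolation key is lost when PTL saves hparams
    with open_dict(cfg):
        cfg.model.precision = cfg.trainer.precision

    model = MegatronGPTModel(cfg.model, trainer)
    trainer.fit(model)


if __name__ == '__main__':
    main()

The same entry-point shape holds for the other scripts: megatron_bert_pretraining.py swaps in MegatronBertTrainerBuilder, whose GradScaler drops the hysteresis argument, and megatron_t5_pretraining.py uses MegatronT5TrainerBuilder, whose create_trainer() adds the ModelSummary(max_depth=3) callback. Further model types can subclass MegatronTrainerBuilder from nemo.collections.nlp.parts and override _grad_scaler(), _training_strategy(), _plugins(), or create_trainer() in the same way.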