
Commit 8d3d043

Update evo2 ModelCheckpoint args (#935)
### Description

Adds new arguments to control `ModelCheckpoint`.

### Type of changes

- [ ] Bug fix (non-breaking change which fixes an issue)
- [x] New feature (non-breaking change which adds functionality)
- [ ] Refactor
- [x] Documentation update
- [ ] Other (please describe):

### Pre-submit Checklist

- [x] I have tested these changes locally
- [x] I have updated the documentation accordingly
- [x] I have added/updated tests as needed
- [x] All existing tests pass successfully

---

Signed-off-by: Jared Wilber <[email protected]>
1 parent 9e34dd9 commit 8d3d043

File tree

2 files changed: +41 −5 lines changed

sub-packages/bionemo-evo2/README.md

Lines changed: 9 additions & 1 deletion

````diff
@@ -49,7 +49,7 @@ usage: train_evo2 [-h] (-d DATASET_CONFIG | --mock-data) [--dataset-dir DATASET_
 [--debug-ddp-parity-freq DEBUG_DDP_PARITY_FREQ] [--hybrid-override-pattern HYBRID_OVERRIDE_PATTERN] [--num-layers NUM_LAYERS] [--create-tflops-callback] [--log-parameters-and-shapes] [--lr LR] [--min-lr MIN_LR]
 [--warmup-steps WARMUP_STEPS] [--nsys-profiling] [--nsys-start-step NSYS_START_STEP] [--nsys-end-step NSYS_END_STEP] [--no-renormalize-loss] [--nsys-ranks NSYS_RANKS [NSYS_RANKS ...]]
 [--activation-checkpoint-recompute-num-layers ACTIVATION_CHECKPOINT_RECOMPUTE_NUM_LAYERS] [--disable-checkpointing] [--clip-grad CLIP_GRAD] [--seq-len-interpolation-factor SEQ_LEN_INTERPOLATION_FACTOR]
-[--overlap-param-gather] [--overlap-grad-reduce] [--hidden-dropout HIDDEN_DROPOUT] [--attention-dropout ATTENTION_DROPOUT] [--no-activation-checkpointing | --selective-activation-checkpointing]
+[--overlap-param-gather] [--overlap-grad-reduce] [--hidden-dropout HIDDEN_DROPOUT] [--attention-dropout ATTENTION_DROPOUT] [--save-top-k SAVE_TOP_K] [--metric-to-monitor-for-checkpoints METRIC_TO_MONITOR_FOR_CHECKPOINTS] [--save-last-checkpoint] [--no-save-last-checkpoint] [--no-activation-checkpointing | --selective-activation-checkpointing]

 Train a Hyena model using NeMo 2.0.

@@ -179,6 +179,14 @@ options:
     Dropout probability for the hyena layers (default: 0.0)
 --attention-dropout ATTENTION_DROPOUT
     Dropout probability for the attention layers. (default: 0.0)
+--save-top-k SAVE_TOP_K
+    Number of best checkpoints to keep. Set to -1 to save all checkpoints. (default: 5)
+--metric-to-monitor-for-checkpoints METRIC_TO_MONITOR_FOR_CHECKPOINTS
+    Metric to monitor for checkpoints. (default: val_loss)
+--save-last-checkpoint
+    Save the last checkpoint. (default: True)
+--no-save-last-checkpoint
+    Disable saving the last checkpoint. (default: True)
 --no-activation-checkpointing
 --selective-activation-checkpointing
 ```
````
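The `--save-last-checkpoint` / `--no-save-last-checkpoint` pair uses the common argparse idiom of two flags writing to one destination. A minimal standalone sketch of that idiom (this is an illustrative script, not the actual `train_evo2` parser):

```python
import argparse

# Reproduction of the paired on/off flags added in this commit.
# Both flags share the dest "save_last_checkpoint", so whichever
# appears on the command line wins; with neither, the default is True.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--save-last-checkpoint",
    action="store_true",
    default=True,
    help="Save the last checkpoint.",
)
parser.add_argument(
    "--no-save-last-checkpoint",
    action="store_false",
    dest="save_last_checkpoint",
    default=True,
    help="Disable saving the last checkpoint.",
)

print(parser.parse_args([]).save_last_checkpoint)                             # True
print(parser.parse_args(["--no-save-last-checkpoint"]).save_last_checkpoint)  # False
```

Note this is why the README's help text shows `(default: True)` for both flags: argparse reports the default of the shared destination, not a per-flag default.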

sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py

Lines changed: 32 additions & 4 deletions

```diff
@@ -38,7 +38,6 @@
 )
 from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
 from nemo.lightning.pytorch import callbacks as nl_callbacks
-from nemo.lightning.pytorch.callbacks import ModelCheckpoint
 from nemo.lightning.pytorch.callbacks.flops_callback import FLOPsMeasurementCallback
 from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
 from nemo.lightning.pytorch.optim import CosineAnnealingScheduler
@@ -389,6 +388,31 @@ def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
         default=0.0,
         help="Dropout probability for the attention layers.",
     )
+    parser.add_argument(
+        "--save-top-k",
+        type=int,
+        default=5,
+        help="Number of best checkpoints to keep. Set to -1 to save all checkpoints.",
+    )
+    parser.add_argument(
+        "--metric-to-monitor-for-checkpoints",
+        type=str,
+        default="val_loss",
+        help="Metric to monitor for checkpoints.",
+    )
+    parser.add_argument(
+        "--save-last-checkpoint",
+        action="store_true",
+        default=True,
+        help="Save the last checkpoint.",
+    )
+    parser.add_argument(
+        "--no-save-last-checkpoint",
+        action="store_false",
+        dest="save_last_checkpoint",
+        default=True,
+        help="Disable saving the last checkpoint.",
+    )
     recompute_group = parser.add_mutually_exclusive_group(required=False)
     recompute_group.add_argument("--no-activation-checkpointing", action="store_true", default=False)
     recompute_group.add_argument("--selective-activation-checkpointing", action="store_true", default=False)
@@ -601,11 +625,15 @@ def train(args: argparse.Namespace) -> nl.Trainer:

     if args.create_checkpoint_callback:
         checkpoint_path = str(Path(nemo_logger.save_dir) / "checkpoints")
-        checkpoint_callback = ModelCheckpoint(
-            every_n_train_steps=args.val_check_interval,
+        checkpoint_callback = nl_callbacks.ModelCheckpoint(
             dirpath=checkpoint_path,
-            save_top_k=5,
+            save_last=args.save_last_checkpoint,
+            monitor=args.metric_to_monitor_for_checkpoints,
+            save_top_k=args.save_top_k,
+            every_n_train_steps=args.val_check_interval,
             always_save_context=True,
+            filename="{epoch}-{step}-{consumed_samples}",
+            save_weights_only=False,
             save_optim_on_train_end=True,
             save_context_on_train_end=True,
         )
```
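The wiring from the new CLI arguments to the checkpoint callback's constructor kwargs can be shown without importing NeMo. In this sketch, `CheckpointConfig` and `build_checkpoint_config` are hypothetical stand-ins for `nl_callbacks.ModelCheckpoint` and the call in `train()`; only the argument-to-kwarg mapping mirrors the diff:

```python
import argparse
from dataclasses import dataclass

# Hypothetical stand-in for nl_callbacks.ModelCheckpoint, showing which
# CLI arguments feed which constructor kwargs after this commit.
@dataclass
class CheckpointConfig:
    dirpath: str
    save_last: bool
    monitor: str
    save_top_k: int
    every_n_train_steps: int
    filename: str = "{epoch}-{step}-{consumed_samples}"

def build_checkpoint_config(args: argparse.Namespace) -> CheckpointConfig:
    # Mirrors the call in train(): --save-top-k replaces the previously
    # hard-coded save_top_k=5, and the new flags drive save_last/monitor.
    return CheckpointConfig(
        dirpath="results/checkpoints",  # stand-in for Path(save_dir) / "checkpoints"
        save_last=args.save_last_checkpoint,
        monitor=args.metric_to_monitor_for_checkpoints,
        save_top_k=args.save_top_k,
        every_n_train_steps=args.val_check_interval,
    )

cfg = build_checkpoint_config(argparse.Namespace(
    save_last_checkpoint=True,
    metric_to_monitor_for_checkpoints="val_loss",
    save_top_k=5,
    val_check_interval=100,
))
print(cfg.monitor, cfg.save_top_k)  # val_loss 5
```

Keeping the defaults (`save_top_k=5`, `monitor="val_loss"`, `save_last=True`) identical to the previously hard-coded behavior makes this a non-breaking change, as the PR description states.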
