Commit e522d6a

refactor
1 parent e3b4b64 commit e522d6a

File tree: 2 files changed (+5, −5 lines)

megatron/core/tensor_parallel/random.py

Lines changed: 1 addition & 4 deletions
@@ -4,7 +4,6 @@
 # repo: https://github.com/pytorch/pytorch
 
 import contextlib
-import functools
 import logging
 from typing import Optional, Union
 
@@ -543,11 +542,9 @@ def backward(ctx, *args):
         return (None, None) + grads
 
 
-def checkpoint(function, distribute_saved_activations, *args, **kwargs):
+def checkpoint(function, distribute_saved_activations, *args):
     """Checkpoint a model or part of the model.
     This has been directly copied from torch.utils.checkpoint."""
-    if kwargs:
-        function = functools.partial(function, **kwargs)
     return CheckpointFunction.apply(function, distribute_saved_activations, *args)
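The effect of this change is that checkpoint() no longer accepts keyword arguments and no longer wraps the callable itself; a caller that needs keyword arguments must bind them before checkpointing. A minimal sketch of the before/after calling convention, using a hypothetical helper and placeholder names (mlp_fn, x) that are not part of this commit:

import functools

from megatron.core import tensor_parallel


def run_mlp_with_recompute(mlp_fn, x, padding_mask=None):
    # Hypothetical helper for illustration only.
    #
    # Before this commit, keyword arguments could be handed to checkpoint()
    # directly, and checkpoint() bound them internally via functools.partial:
    #     tensor_parallel.checkpoint(mlp_fn, False, x, padding_mask=padding_mask)
    #
    # After this commit, checkpoint() takes only positional args, so the
    # caller binds the keyword argument itself before checkpointing:
    bound_fn = functools.partial(mlp_fn, padding_mask=padding_mask)
    return tensor_parallel.checkpoint(bound_fn, False, x)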

megatron/core/transformer/transformer_layer.py

Lines changed: 4 additions & 1 deletion
@@ -1,5 +1,6 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import functools
 import logging
 import warnings
 from abc import ABC
@@ -672,7 +673,9 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None):
                 )
             else:
                 mlp_output_with_bias = tensor_parallel.checkpoint(
-                    self.mlp, False, pre_mlp_layernorm_output, padding_mask=padding_mask
+                    functools.partial(self.mlp, padding_mask=padding_mask),
+                    False,
+                    pre_mlp_layernorm_output,
                 )
         elif should_chunk_mlp_for_prefill:
             # Chunk input along sequence dimension
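Wrapping self.mlp with functools.partial is behaviorally equivalent to the old keyword pass-through: padding_mask is baked into the callable, so only the positional activation tensor flows through CheckpointFunction.apply. A small self-contained sketch of that equivalence, with a toy mlp standing in for the real module (an assumption for illustration, not the actual Megatron MLP):

import functools

import torch


def mlp(hidden_states, padding_mask=None):
    # Toy stand-in for self.mlp, used only to show the partial-binding pattern.
    if padding_mask is not None:
        hidden_states = hidden_states.masked_fill(padding_mask, 0.0)
    return hidden_states * 2


x = torch.randn(4, 8)
mask = torch.zeros(4, 8, dtype=torch.bool)

# functools.partial bakes the keyword argument into the callable; calling the
# partial with just the positional tensor matches the direct keyword call.
bound_mlp = functools.partial(mlp, padding_mask=mask)
assert torch.equal(bound_mlp(x), mlp(x, padding_mask=mask))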
