
Commit 1068d77

Revert "[Dev] Remove calculation of padding token in moe routing loss (#2121)" (#2747)
Signed-off-by: Charlie Truong <[email protected]>
1 parent 0b6714e commit 1068d77

File tree: 15 files changed (+90, -646 lines)
megatron/core/extensions/transformer_engine.py

Lines changed: 1 addition & 1 deletion
@@ -1851,7 +1851,7 @@ def forward_post_hook(module, *_) -> None:
                 "TEFusedMLP module does not support submodules with post-backward hooks"
             )

-    def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]:
+    def forward(self, hidden_states: torch.Tensor) -> Tuple[Tensor, Optional[Tensor]]:
         """Forward."""

         # Construct fused impl if needed

megatron/core/models/common/model_chunk_schedule_plan.py

Lines changed: 0 additions & 2 deletions
@@ -305,7 +305,6 @@ def __init__(
         extra_block_kwargs=None,
         runtime_gather_output: Optional[bool] = None,
         loss_mask: Optional[Tensor] = None,
-        padding_mask=None,
     ):
         """Initialize the schedule plan of all Transformer layers' sub-modules.

@@ -348,7 +347,6 @@
         self._model_chunk_state.mtp_hidden_states = None
         self._model_chunk_state.loss_mask = loss_mask
         self._model_chunk_state.packed_seq_params = packed_seq_params
-        self._model_chunk_state.padding_mask = padding_mask
         self._model_chunk_state.extra_block_kwargs = extra_block_kwargs
         self._model_chunk_state.runtime_gather_output = runtime_gather_output
         self._model_chunk_state.model = model

megatron/core/models/gpt/fine_grained_callables.py

Lines changed: 7 additions & 14 deletions
@@ -120,19 +120,13 @@ def forward_impl(self):
         if not self.gpt_model.pre_process:
             self.chunk_state.decoder_input = self.gpt_model.decoder.input_tensor
         # Run GPTModel._preprocess
-        (
-            decoder_input,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            sequence_len_offset,
-            padding_mask,
-        ) = self.gpt_model._preprocess(
-            input_ids=self.chunk_state.input_ids,
-            position_ids=self.chunk_state.position_ids,
-            decoder_input=self.chunk_state.decoder_input,
-            packed_seq_params=self.chunk_state.packed_seq_params,
-            padding_mask=self.chunk_state.padding_mask,
+        decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset = (
+            self.gpt_model._preprocess(
+                input_ids=self.chunk_state.input_ids,
+                position_ids=self.chunk_state.position_ids,
+                decoder_input=self.chunk_state.decoder_input,
+                packed_seq_params=self.chunk_state.packed_seq_params,
+            )
         )

         # Saved for later use
@@ -141,7 +135,6 @@ def forward_impl(self):
         self.chunk_state.rotary_pos_cos = rotary_pos_cos
         self.chunk_state.rotary_pos_sin = rotary_pos_sin
         self.chunk_state.sequence_len_offset = sequence_len_offset
-        self.chunk_state.padding_mask = padding_mask
         return decoder_input


megatron/core/models/gpt/gpt_model.py

Lines changed: 4 additions & 33 deletions
@@ -284,7 +284,6 @@ def _preprocess(
         decoder_input: Tensor = None,
         inference_context: BaseInferenceContext = None,
         packed_seq_params: PackedSeqParams = None,
-        padding_mask: Optional[Tensor] = None,
     ):
         """Preprocesses inputs for the transformer decoder.

@@ -301,20 +300,7 @@
         if decoder_input is not None:
             pass
         elif self.pre_process:
-            if padding_mask is not None:
-                assert padding_mask.shape == input_ids.shape, (
-                    f"padding_mask shape {padding_mask.shape} does not match "
-                    f"input_ids shape {input_ids.shape}"
-                )
             decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
-            if padding_mask is not None and self.config.sequence_parallel:
-                padding_mask = (
-                    tensor_parallel.scatter_to_sequence_parallel_region(
-                        padding_mask.transpose(0, 1).contiguous()
-                    )
-                    .transpose(0, 1)
-                    .contiguous()
-                )
         else:
             # intermediate stage of pipeline
             # decoder will get hidden_states from encoder.input_tensor
@@ -417,7 +403,6 @@ def _preprocess(
             rotary_pos_cos,
             rotary_pos_sin,
             sequence_len_offset,
-            padding_mask,
         )
         if rotary_pos_cos_sin is not None:
             # only in the case of flashinfer fused rope will we
@@ -461,7 +446,6 @@ def forward(
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
-        padding_mask: Optional[Tensor] = None,
     ) -> Tensor:
         """Forward function of the GPT Model This function passes the input tensors
         through the embedding layer, and then the decoder and finally into the post
@@ -472,9 +456,6 @@
         Args:
             runtime_gather_output (bool): Gather output at runtime. Default None means
                 `parallel_output` arg in the constructor will be used.
-            padding_mask (Tensor, optional): Padding mask for MoE routing.
-                Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
-                Only used for MoE layers to exclude padding tokens from routing computations.
         """
         if self.config.fine_grained_activation_offloading:
             self.preprocess_for_fine_grained_offloading()
@@ -487,19 +468,13 @@
             decoder_input=decoder_input,
             inference_context=inference_context,
             packed_seq_params=packed_seq_params,
-            padding_mask=padding_mask,
         )

-        (
-            decoder_input,
-            rotary_pos_emb,
-            rotary_pos_cos,
-            rotary_pos_sin,
-            sequence_len_offset,
-            padding_mask,
-        ) = preproc_output[:6]
+        (decoder_input, rotary_pos_emb, rotary_pos_cos, rotary_pos_sin, sequence_len_offset) = (
+            preproc_output[:5]
+        )

-        rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None
+        rotary_pos_cos_sin = preproc_output[5] if len(preproc_output) == 6 else None

         # Run decoder.
         hidden_states = self.decoder(
@@ -512,7 +487,6 @@ def forward(
             rotary_pos_cos_sin=rotary_pos_cos_sin,
             packed_seq_params=packed_seq_params,
             sequence_len_offset=sequence_len_offset,
-            padding_mask=padding_mask,
             **(extra_block_kwargs or {}),
         )

@@ -750,7 +724,6 @@ def build_schedule_plan(
         runtime_gather_output: Optional[bool] = None,
         inference_params: Optional[BaseInferenceContext] = None,
         loss_mask: Optional[Tensor] = None,
-        padding_mask: Optional[Tensor] = None,
     ):
         """Builds a computation schedule plan for the model.

@@ -776,7 +749,6 @@
             inference_params (InferenceParams, optional):
                 Parameters for inference. Defaults to None.
             loss_mask (Optional[Tensor], optional): Loss mask. Defaults to None.
-            padding_mask (Optional[Tensor], optional): Padding mask. Defaults to None.

         Returns:
             TransformerModelChunkSchedulePlan: The model chunk schedule plan.
@@ -798,7 +770,6 @@
             extra_block_kwargs,
             runtime_gather_output,
             loss_mask,
-            padding_mask,
         )

     def sharded_state_dict(
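For readers tracking the interface change above: after the revert, GPTModel._preprocess returns five fixed items plus an optional sixth (rotary_pos_cos_sin). A minimal sketch of the unpacking pattern restored in forward, using a hypothetical stand-in tuple in place of the real preprocessing output:

# Sketch only: fake_preproc_output stands in for the value returned by
# GPTModel._preprocess; the variable names mirror the diff above.
fake_preproc_output = ("decoder_input", "rotary_pos_emb", "rotary_pos_cos",
                       "rotary_pos_sin", "sequence_len_offset")  # 5 or 6 items

(decoder_input, rotary_pos_emb, rotary_pos_cos,
 rotary_pos_sin, sequence_len_offset) = fake_preproc_output[:5]

# The sixth slot is only present when fused RoPE cos/sin values are precomputed.
rotary_pos_cos_sin = fake_preproc_output[5] if len(fake_preproc_output) == 6 else None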

megatron/core/transformer/mlp.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def __init__(
             tp_group=tp_group,
         )

-    def forward(self, hidden_states, per_token_scale=None, **kwargs):
+    def forward(self, hidden_states, per_token_scale=None):
         """Perform the forward pass through the MLP block."""
         # [s, b, 4 * h/p]
         nvtx_range_push(suffix="linear_fc1")

megatron/core/transformer/moe/moe_layer.py

Lines changed: 8 additions & 19 deletions
@@ -178,13 +178,13 @@ def __init__(
         self.cudagraph_tensor_store = MoECudaGraphTensorStore()

     @maybe_skip_or_early_return_by_cudagraph("route")
-    def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+    def route(self, hidden_states: torch.Tensor):
         """Compute token routing for preprocessing.

         This method uses the router to determine which experts to send each token to,
         producing routing probabilities and a mapping.
         """
-        probs, routing_map = self.router(hidden_states, padding_mask=padding_mask)
+        probs, routing_map = self.router(hidden_states)
         return probs, routing_map

     @maybe_skip_or_early_return_by_cudagraph("preprocess")
@@ -270,7 +270,7 @@ def combine(self, output: torch.Tensor, shared_expert_output: Optional[torch.Ten
         output = output + shared_expert_output
         return output

-    def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+    def forward(self, hidden_states: torch.Tensor):
         """Forward pass for the MoE layer.

         The forward pass comprises four main steps:
@@ -280,11 +280,7 @@ def forward(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tens
         4. Combine: The outputs from the experts are combined and returned.

         Args:
-            hidden_states (torch.Tensor): The input tensor shape [seq_length, bsz, hidden_size].
-            padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions.
-                used for correct auxiliary loss computation for packed sequence.
-                Shape = [bsz, seq_length]. True = padding (exclude), False = valid (include).
-                Defaults to None (all tokens are valid).
+            hidden_states (torch.Tensor): The input tensor to the MoE layer.

         Returns:
             A tuple containing the output tensor and the MLP bias, if any.
@@ -295,15 +291,11 @@ def forward(self, hidden_states: torch.Tens
                 "are enabled without also enabling sequence parallelism."
             )

-        # Transpose from [bsz, seq_length] to [seq_length, bsz] to align with hidden_states
-        if padding_mask is not None:
-            padding_mask = padding_mask.transpose(0, 1).bool()
-
         # MoE forward: route -> dispatch -> compute -> combine
-        def custom_forward(hidden_states, padding_mask=None):
+        def custom_forward(hidden_states):
             try:
                 shared_expert_output = self.shared_experts_compute(hidden_states)
-                probs, routing_map = self.route(hidden_states, padding_mask=padding_mask)
+                probs, routing_map = self.route(hidden_states)
                 hidden_states, probs, residual = self.preprocess(hidden_states, probs, routing_map)
             except MoECudaGraphPartialCaptureSignal as e:
                 # This signal is raised from the maybe_skip_or_early_return_by_cudagraph decorator.
@@ -326,14 +318,11 @@ def custom_forward(hidden_states, padding_mask=None):
                     tensor_parallel.random.get_cuda_rng_tracker,
                     parallel_state.get_tensor_model_parallel_group(),
                     hidden_states,
-                    padding_mask,
                 )
             else:
-                outputs = tensor_parallel.checkpoint(
-                    custom_forward, False, hidden_states, padding_mask
-                )
+                outputs = tensor_parallel.checkpoint(custom_forward, False, hidden_states)
         else:
-            outputs = custom_forward(hidden_states, padding_mask)
+            outputs = custom_forward(hidden_states)

         return outputs
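The forward docstring above describes a four-step pipeline: route, dispatch, experts compute, combine. The sketch below traces that flow using only the method names visible in this diff (route, preprocess, shared_experts_compute, combine); the expert-compute stage is elided because it is not part of this change, and the layer argument is a hypothetical object standing in for the real MoELayer.

# Hedged sketch of MoELayer's post-revert forward flow; not the actual implementation.
def moe_forward_sketch(layer, hidden_states):
    # Shared-expert branch runs alongside the routed experts.
    shared_expert_output = layer.shared_experts_compute(hidden_states)
    # 1. Route: score each token and build the token-to-expert routing map.
    probs, routing_map = layer.route(hidden_states)
    # 2. Dispatch/preprocess: rearrange tokens toward their assigned experts.
    hidden_states, probs, residual = layer.preprocess(hidden_states, probs, routing_map)
    # 3. Experts compute: omitted here; not shown in this diff.
    expert_output = hidden_states  # placeholder for the experts' output
    # 4. Combine: merge routed-expert output with the shared-expert branch.
    return layer.combine(expert_output, shared_expert_output)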

megatron/core/transformer/moe/moe_utils.py

Lines changed: 16 additions & 67 deletions
@@ -1,4 +1,5 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import math
 from dataclasses import dataclass
 from typing import List, Optional, Union
@@ -10,7 +11,6 @@
 from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel import get_cuda_rng_tracker, get_expert_parallel_rng_tracker_name
-from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region
 from megatron.core.transformer.cuda_graphs import is_graph_capturing
 from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -120,34 +120,18 @@ def switch_load_balancing_loss_func(
     return aux_loss


-def z_loss_func(logits, z_loss_coeff, padding_mask: Optional[torch.Tensor] = None):
+def z_loss_func(logits, z_loss_coeff):
     """Encourages the router's logits to remain small to enhance stability.
     Please refer to the ST-MoE paper (https://arxiv.org/pdf/2202.08906.pdf) for details.

     Args:
         logits (torch.Tensor): The logits of the router.
-        z_loss_coeff (float): The coefficient for the z-loss.
-        padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions.
-            Shape [num_tokens]. True = padding (exclude),
-            False = valid (include). Defaults to None.

     Returns:
         torch.Tensor: The logits after applying the z-loss.
     """
-    logsum = torch.logsumexp(logits, dim=-1)
-    z_loss_values = torch.square(logsum)
-
-    if padding_mask is not None:
-        # Invert padding_mask: True (padding) -> 0, False (valid) -> 1
-        valid_mask = ~padding_mask
-        # Only compute z_loss for valid (non-padding) tokens
-        z_loss_values = z_loss_values * valid_mask
-        # Compute mean over valid tokens only
-        num_valid_tokens = valid_mask.sum()
-        z_loss = z_loss_values.sum() / torch.clamp(num_valid_tokens, min=1.0) * z_loss_coeff
-    else:
-        z_loss = torch.mean(z_loss_values) * z_loss_coeff

+    z_loss = torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff
     return z_loss


@@ -187,28 +171,6 @@ def get_capacity(num_tokens: int, num_experts: int, capacity_factor: float, min_
     return capacity


-def get_tokens_per_expert_and_token_count(
-    routing_map: torch.Tensor,
-    reduce_group: torch.distributed.ProcessGroup,
-    topk: int = None,
-    with_padding_mask: bool = False,
-) -> torch.Tensor:
-    """
-    Compute global_tokens_per_expert, local_num_tokens and total_num_tokens with padding mask.
-    """
-    local_tokens_per_expert = routing_map.sum(dim=0)
-    global_tokens_per_expert = reduce_from_tensor_model_parallel_region(
-        local_tokens_per_expert, reduce_group
-    )
-    if with_padding_mask:
-        local_num_tokens = local_tokens_per_expert.sum() / topk
-        total_num_tokens = global_tokens_per_expert.sum() / topk
-    else:
-        local_num_tokens = routing_map.shape[0]
-        total_num_tokens = local_num_tokens * reduce_group.size()
-    return global_tokens_per_expert, local_num_tokens, total_num_tokens
-
-
 class MoEAuxLossAutoScaler(torch.autograd.Function):
     """An AutoScaler that triggers the backward pass and scales the grad for auxiliary loss."""

@@ -667,48 +629,35 @@ def compute_topk(scores, topk, num_groups=None, group_topk=None):


 def compute_routing_scores_for_aux_loss(
-    logits: torch.Tensor,
-    topk: int,
-    score_function: str,
-    fused: bool = False,
-    padding_mask: Optional[torch.Tensor] = None,
+    logits: torch.Tensor, topk: int, score_function: str, fused: bool = False
 ):
     """Compute routing scores based on the score function.

     Args:
         logits (torch.Tensor): The logits tensor after gating, shape: [num_tokens, num_experts].
-        padding_mask (torch.Tensor, optional): Boolean mask indicating padding positions.
-            Shape [num_tokens]. True = padding (exclude),
-            False = valid (include). Defaults to None.
+
     Returns:
-        Tuple[torch.Tensor, torch.Tensor]: routing_map and scores.
+        torch.Tensor: The normalized routing scores.
     """
     if fused:
         if not HAVE_TE or fused_compute_score_for_moe_aux_loss is None:
             raise ValueError(
                 "fused_compute_score_for_moe_aux_loss is not available. Please install TE >= 2.6.0."
             )
-        routing_map, scores = fused_compute_score_for_moe_aux_loss(
+        return fused_compute_score_for_moe_aux_loss(
            logits=logits, topk=topk, score_function=score_function
         )
-    else:
-        if score_function == "softmax":
-            scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
-        elif score_function == "sigmoid":
-            scores = torch.sigmoid(logits)
-            scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20)
-        else:
-            raise ValueError(f"Invalid score_function: {score_function}")

-        _, top_indices = torch.topk(scores, k=topk, dim=1)
-        routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
+    if score_function == "softmax":
+        scores = torch.softmax(logits, dim=-1, dtype=torch.float32)
+    elif score_function == "sigmoid":
+        scores = torch.sigmoid(logits)
+        scores = scores / (scores.sum(dim=-1, keepdim=True) + 1e-20)
+    else:
+        raise ValueError(f"Invalid score_function: {score_function}")

-        # Apply padding mask to scores if provided
-        if padding_mask is not None:
-            # Invert padding_mask and make True indicates valid tokens
-            valid_mask = (~padding_mask).unsqueeze(-1)
-            routing_map = routing_map * valid_mask
-            scores = scores * valid_mask
+    _, top_indices = torch.topk(scores, k=topk, dim=1)
+    routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
     return routing_map, scores
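For reference, the reverted z-loss is the plain ST-MoE form: the squared logsumexp of the router logits, averaged over every token (padding tokens included again) and scaled by the coefficient. A self-contained sketch with made-up shapes:

import torch

def z_loss(logits: torch.Tensor, z_loss_coeff: float) -> torch.Tensor:
    # logits: [num_tokens, num_experts]; matches the single-line body restored above.
    return torch.mean(torch.square(torch.logsumexp(logits, dim=-1))) * z_loss_coeff

logits = torch.randn(8, 4)  # 8 tokens routed over 4 experts (illustrative shapes only)
print(z_loss(logits, z_loss_coeff=1e-3))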
