import triton
import triton.language as tl

+from tensorrt_llm.math_utils import ceil_div
+
IS_TRITON_KERNELS_AVAILABLE = False
# We expect to find triton_kernels under $TRITON_ROOT/python/triton_kernels
# Triton upstream commit f3067cd3bd0c29065fa4ecdb724b6f29cbabea5f has been verified.
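Note for readers of the shape math below: `ceil_div` from `tensorrt_llm.math_utils` is presumably the usual integer ceiling division. A minimal stand-in sketch (an assumption, not the library source):

```python
def ceil_div(a: int, b: int) -> int:
    # Ceiling of a / b using only integer arithmetic.
    return (a + b - 1) // b

assert ceil_div(2880, 32) == 90  # exact multiple
assert ceil_div(2881, 32) == 91  # rounds up
```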
@@ -625,7 +627,14 @@ class TritonMXFP4FusedMoEQuantScales(NamedTuple):
def swizzle_weight_and_scale(w: torch.Tensor, w_scale: torch.Tensor):
-    w = w.transpose(-1, -2).contiguous().transpose(-1, -2)
+    # (num_experts, in_dim//2, out_dim)
+    w_shape = w.shape
+    # (num_experts, in_dim//32, out_dim)
+    w_scale_shape = w_scale.shape
+    assert w_shape[0] == w_scale_shape[0]
+    assert w_shape[1] * 2 == w_scale_shape[1] * 32
+    assert w_shape[2] == w_scale_shape[2]
+    w = maybe_update_stride(w)
    #num_warps = 4 if batch <= 512 else 8
    num_warps = int(os.getenv("TRITON_MOE_MXFP4_NUM_WARPS", 4))
    assert num_warps in [4, 8], \
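The three new asserts pin down the mxfp4 layout contract: dim 1 of `w` counts packed bytes (two 4-bit values each) while dim 1 of `w_scale` counts 32-value scale blocks, so both recover the same logical `in_dim`. A small illustrative sketch (shapes and dtypes are hypothetical, not taken from the kernel):

```python
import torch

num_experts, in_dim, out_dim = 8, 2880, 5760
w = torch.empty(num_experts, in_dim // 2, out_dim, dtype=torch.uint8)         # two fp4 values per byte
w_scale = torch.empty(num_experts, in_dim // 32, out_dim, dtype=torch.uint8)  # one scale per 32 values

assert w.shape[0] == w_scale.shape[0]
assert w.shape[1] * 2 == w_scale.shape[1] * 32  # both sides equal in_dim
assert w.shape[2] == w_scale.shape[2]
```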
@@ -662,6 +671,8 @@ def __init__(self, activation_dtype):
        assert activation_dtype in [torch.float8_e4m3fn, torch.bfloat16], \
            f"TritonMXFP4FusedMoEMethod only supports float8_e4m3fn or bfloat16 activation, got {activation_dtype}"
        self.activation_dtype = activation_dtype
+        self.in_dim_padding_multiple = 128
+        self.out_dim_padding_multiple = 256

    def create_weights(self, module: torch.nn.Module):
        weight_dtype = torch.uint8
@@ -673,11 +684,11 @@ def create_weights(self, module: torch.nn.Module):
            module.intermediate_size_per_partition * 2,
        )

+        # Full scale is loaded at the beginning; we slice it properly for TP later
        w3_w1_scale_shape = (
            w3_w1_weight_shape[0],
-            w3_w1_weight_shape[1] //
-            16,  # block size of 32 for mxfp4; we already divided by 2 before, so only divide by 16
-            w3_w1_weight_shape[2],
+            ceil_div(module.hidden_size, 32),  # block size of 32 for mxfp4
+            module.intermediate_size * 2,
        )

        # The Triton kernel accepts the w2_weight in (num_experts, intermediate_dim, hidden_dim) format
@@ -688,27 +699,14 @@ def create_weights(self, module: torch.nn.Module):
            module.hidden_size,
        )

+        # Full scale is loaded at the beginning; we slice it properly for TP later
        w2_scale_shape = (
            w2_weight_shape[0],
-            w2_weight_shape[1] //
-            16,  # block size of 32 for mxfp4; we already divided by 2 before, so only divide by 16
+            ceil_div(module.intermediate_size,
+                     32),  # block size of 32 for mxfp4
            w2_weight_shape[2],
        )

-        def _check_shape_requirement(shape):
-            # Reject shapes that may cause kernels to fail.
-            # For hidden_size = 2880 and intermediate_size = 2880,
-            # we have w3_w1_weight_shape = (?, 1440, 5760)
-            # and w2_weight_shape = (?, 1440, 2880).
-            # Note that the div 2 here is because we are using mxfp4, which packs two values into one byte.
-            # This check allows the 2880 case with tp1 to pass while rejecting larger tp.
-            assert len(shape) == 3
-            assert shape[1] % 32 == 0 and shape[
-                2] % 32 == 0, "Shape not well-supported by Triton kernel, try EP instead"
-
-        _check_shape_requirement(w3_w1_weight_shape)
-        _check_shape_requirement(w2_weight_shape)
-
        FusedMoEMethodBase.create_weights(self,
                                          module,
                                          weight_dtype,
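For the hidden_size = intermediate_size = 2880 case called out in the removed `_check_shape_requirement` comment, the new `ceil_div`-based scale shapes work out as follows (a worked sketch; `num_experts` and the local `ceil_div` are stand-ins):

```python
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

num_experts, hidden_size, intermediate_size = 32, 2880, 2880

w3_w1_scale_shape = (num_experts, ceil_div(hidden_size, 32),
                     intermediate_size * 2)  # (32, 90, 5760)
w2_scale_shape = (num_experts, ceil_div(intermediate_size, 32),
                  hidden_size)               # (32, 90, 2880)
```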
@@ -1011,25 +1009,63 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
            tmp_w3_w1_weight_scale)

        # For Hopper style swizzle, we need to pad the out dim to a multiple of 256, otherwise it sometimes produces NaN
-        def _maybe_pad_weight_and_scale(weight, scale=None):
+        def _maybe_pad_weight_and_scale(weight,
+                                        scale=None,
+                                        in_dim_padding_offset=0):
+            # Both weight and bias are handled here
+            assert weight.dim() in [2, 3], "Weight should be 2D or 3D tensor"
+            # out_dim padding is only required for Hopper
            if torch.cuda.get_device_capability()[0] == 9:
-                # Both weight and bias are handled here
-                assert weight.dim() in [2,
-                                        3], "Weight should be 2D or 3D tensor"
-
                out_dim = weight.shape[-1]
                assert scale is None or scale.shape[
                    -1] == out_dim, "Out dim of weight and scale should match"
-                pad_size = (256 - out_dim % 256) % 256
+                pad_size = (self.out_dim_padding_multiple -
+                            out_dim % self.out_dim_padding_multiple
+                            ) % self.out_dim_padding_multiple
                weight = F.pad(
                    weight,
                    (0, pad_size))  # Pad the last dimension on the right side
                if scale is not None:
                    scale = F.pad(scale, (0, pad_size))
+            # in_dim padding is always required when we have TP because of the mxfp4 scale block size
+            # We only do in_dim padding for weights but not for bias
+            if weight.dim() == 3:
+                in_dim = weight.shape[
+                    -2] * 2  # mxfp4 packs two values into one byte
+                assert scale is None or scale.shape[-2] == ceil_div(
+                    in_dim, 32), "In dim of weight and scale should match"
+                pad_size = (self.in_dim_padding_multiple -
+                            in_dim % self.in_dim_padding_multiple
+                            ) % self.in_dim_padding_multiple
+                assert pad_size % 2 == 0
+                pad_size //= 2  # pad_size is in mxfp4 units
+                assert in_dim_padding_offset % 2 == 0
+                in_dim_padding_offset //= 2
+                assert in_dim_padding_offset <= pad_size, "TP offset larger than pad size"
+                weight = F.pad(weight, (0, 0, in_dim_padding_offset,
+                                        pad_size - in_dim_padding_offset))
+                assert scale is not None  # Bias won't enter this branch
+                new_in_dim = weight.shape[-2] * 2
+                assert new_in_dim % 32 == 0
+                new_scale_in_dim = new_in_dim // 32
+                scale_pad_size = new_scale_in_dim - scale.shape[-2]
+                assert scale_pad_size >= 0
+                scale = F.pad(scale, (0, 0, 0, scale_pad_size))
+
            return (weight, scale) if scale is not None else weight

        # Handle w3_w1_weight

+        # Slice scales for TP
+        tp_slice_start = module.intermediate_size_per_partition * module.tp_rank
+        tp_slice_end = tp_slice_start + module.intermediate_size_per_partition
+        # (num_experts, in_dim / 32, out_dim)
+        assert tmp_w3_w1_weight_scale.dim() == 3
+        assert tmp_w3_w1_weight_scale.shape[-1] == module.intermediate_size * 2
+        # The scale is already shuffled
+        tmp_w3_w1_weight_scale = tmp_w3_w1_weight_scale[:, :, tp_slice_start *
+                                                        2:tp_slice_end * 2]
+
        tmp_w3_w1_weight, tmp_w3_w1_weight_scale = _maybe_pad_weight_and_scale(
            module.w3_w1_weight, tmp_w3_w1_weight_scale)
@@ -1045,8 +1081,21 @@ def _maybe_pad_weight_and_scale(weight, scale=None):
        # Handle w2_weight

+        # Slice scales for TP
+        # TP might make the weight start from the middle of a 32-wide mxfp4 block
+        # For example, if we start from index 20, there are 12 elements in the first block instead of 32
+        # We need to pad 20 elements to the first block
+        self.w2_tp_offset = tp_slice_start % 32
+        assert tmp_w2_weight_scale.dim() == 3
+        # assert tmp_w2_weight_scale.shape[-2] * 32 == module.intermediate_size
+        # We skip this assert to allow intermediate_size not divisible by 32; this is used in the unit test to test TP shapes on a single GPU
+        scale_slice_start = tp_slice_start // 32
+        scale_slice_end = (tp_slice_end - 1) // 32 + 1
+        tmp_w2_weight_scale = tmp_w2_weight_scale[:, scale_slice_start:
+                                                  scale_slice_end, :]
+
        tmp_w2_weight, tmp_w2_weight_scale = _maybe_pad_weight_and_scale(
-            module.w2_weight, tmp_w2_weight_scale)
+            module.w2_weight, tmp_w2_weight_scale, self.w2_tp_offset)

        module._parameters.pop('w2_weight', None)
        module._parameters.pop('fc2_dequant', None)
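The offset and slice arithmetic above can be checked in isolation. A sketch with a hypothetical per-rank slice that starts mid-block, matching the index-20 example in the comment (the slice bounds are illustrative):

```python
tp_slice_start, tp_slice_end = 20, 1460  # hypothetical per-rank range

w2_tp_offset = tp_slice_start % 32              # 20 zeros of left padding needed
scale_slice_start = tp_slice_start // 32        # block 0 still covers index 20
scale_slice_end = (tp_slice_end - 1) // 32 + 1  # one past the last touched block

assert (w2_tp_offset, scale_slice_start, scale_slice_end) == (20, 0, 46)
```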
@@ -1116,6 +1165,18 @@ def apply(self, module: torch.nn.Module, x: torch.Tensor,
        # Step 2: Gemm1
        # Setup quantization context
+        def _maybe_pad_activation(hidden_states, in_dim_padding_offset):
+            assert hidden_states.dim() == 2, "Hidden states should be 2D tensor"
+            in_dim = hidden_states.shape[-1]
+            pad_size_in = (self.in_dim_padding_multiple -
+                           in_dim % self.in_dim_padding_multiple
+                           ) % self.in_dim_padding_multiple
+            assert in_dim_padding_offset <= pad_size_in
+            padding = (in_dim_padding_offset,
+                       pad_size_in - in_dim_padding_offset)
+            hidden_states = F.pad(hidden_states, padding)
+            return hidden_states
+
        if self.activation_dtype == torch.float8_e4m3fn:
            flex_ctx_1 = FlexCtx(
                lhs_data=InFlexData(scale=hidden_states_scale), )
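`_maybe_pad_activation` mirrors the weight-side in-dim padding: the last dimension grows to a multiple of `in_dim_padding_multiple` (128), with `in_dim_padding_offset` zeros placed on the left. A standalone sketch of the same arithmetic (sizes are illustrative):

```python
import torch
import torch.nn.functional as F

def pad_activation(hidden_states, offset, multiple=128):
    in_dim = hidden_states.shape[-1]
    pad = (multiple - in_dim % multiple) % multiple  # total zeros to add
    assert offset <= pad
    return F.pad(hidden_states, (offset, pad - offset))  # left/right split

x = torch.randn(4, 2880)                         # 2880 % 128 == 64, so pad = 64
assert pad_activation(x, 0).shape == (4, 2944)
assert pad_activation(x, 20).shape == (4, 2944)  # 20 left + 44 right
```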
@@ -1129,6 +1190,7 @@ def apply(self, module: torch.nn.Module, x: torch.Tensor,
        # Call the Triton gemm kernel, which also does permutation and activation
        alpha = module.swiglu_alpha or 1.0
        beta = module.swiglu_beta or 0.0
+        hidden_states = _maybe_pad_activation(hidden_states, 0)
        if beta == 1.0:
            act = FusedActivation(
                FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn,
@@ -1183,6 +1245,7 @@ def _maybe_remove_padding(gemm_output, expected_size):
                                  out_dtype=module.dtype)

        # Call the Triton kernel, which also does finalization
+        act_out = _maybe_pad_activation(act_out, self.w2_tp_offset)
        gemm2_output = matmul_ogs(act_out,
                                  gemm2_weights,
                                  module.w2_bias if module.bias else None,
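Why `act_out` is padded with the same `self.w2_tp_offset` as the w2 weights: left-padding the activation columns and the weight rows by the same amount keeps the matmul result unchanged, since the zero columns only ever multiply the zero rows. A toy float32 check of that alignment property (a stand-in, not the mxfp4 path):

```python
import torch
import torch.nn.functional as F

x = torch.randn(4, 60)
w = torch.randn(60, 16)
x_pad = F.pad(x, (20, 48))        # columns: 60 -> 128, offset 20 on the left
w_pad = F.pad(w, (0, 0, 20, 48))  # rows (dim -2) padded identically

assert torch.allclose(x @ w, x_pad @ w_pad, atol=1e-5)
```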