Enhance MNNVL unittest.

timlee0212 · timlee0212 · commit 3a82827285bd · 2025-07-22T15:02:47.000-07:00
Signed-off-by: Shiyu Li &lt;shili@nvidia.com&gt;
diff --git a/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py b/tests/unittest/_torch/multi_gpu/test_mnnvl_allreduce.py
@@ -47,29 +47,29 @@ def rms_norm(x: torch.Tensor, weight: torch.Tensor = None, eps: float = 1e-6):
 def run_single_rank(
     tensor_parallel_size,
     single_rank_forward_func,
-    input,
-    residual,
+    input_list,
+    residual_list,
     norm_weight,
     eps,
     hidden_size,
     dtype,
     fused_add_norm,
-    reference_output,
+    reference_output_list,
 ):
     rank = tensorrt_llm.mpi_rank()
     torch.cuda.set_device(rank)
     try:
         single_rank_forward_func(
-            input,
-            residual,
+            input_list,
+            residual_list,
             norm_weight,
             eps,
             hidden_size,
             dtype,
             tensor_parallel_size,
             rank,
             fused_add_norm,
-            reference_output,
+            reference_output_list,
         )
     except Exception:
         traceback.print_exc()
@@ -79,25 +79,30 @@ def run_single_rank(
 
 @torch.inference_mode()
 def row_linear_residual_norm_fusion_forward(
-    x: torch.Tensor,
-    residual: torch.Tensor,
+    x_list: list[torch.Tensor],
+    residual_list: list[torch.Tensor],
     norm_weight: torch.Tensor,
     eps: float,
     hidden_size: int,
     dtype: torch.dtype,
     tensor_parallel_size: int,
     tensor_parallel_rank: int,
     fusion: bool,
-    reference_output: tuple[torch.Tensor, ...],
+    reference_output_list: list[tuple[torch.Tensor, ...]],
 ):
 
-    x = x.cuda()
-    residual = residual.cuda()
+    # Move all tensors to GPU
+    x_list = [x.cuda() for x in x_list]
+    residual_list = [residual.cuda() for residual in residual_list]
     norm_weight = norm_weight.cuda()
-    reference_output = tuple(t.cuda() for t in reference_output)
+    reference_output_list = [
+        tuple(t.cuda() for t in ref_output)
+        for ref_output in reference_output_list
+    ]
 
     MPI.COMM_WORLD.barrier()
 
+    # Create a single AllReduce instance to be reused for all sequence lengths
     allreduce = AllReduce(
         mapping=Mapping(
             world_size=tensor_parallel_size,
@@ -119,72 +124,116 @@ def func(input, residual, norm_weight, eps, enable_fusion):
                     residual=residual,
                     norm_weight=norm_weight,
                     eps=eps,
-                ))
+                ),
+            )
             return (output, residual)
         else:
             output = allreduce(input)
             return (output, )
 
-    output = func(x.clone(), residual.clone(), norm_weight, eps, fusion)
+    # Process each sequence length using the same AllReduce instance
+    for i, (x, residual, reference_output) in enumerate(
+            zip(x_list, residual_list, reference_output_list)):
+        output = func(x.clone(), residual.clone(), norm_weight, eps, fusion)
 
-    torch.testing.assert_close(
-        output[0],
-        reference_output[0],
-        rtol=0.05,
-        atol=0.15,
-    )
-
-    if fusion:
         torch.testing.assert_close(
-            output[1],
-            reference_output[1],
+            output[0],
+            reference_output[0],
             rtol=0.05,
             atol=0.15,
         )
 
+        if fusion:
+            torch.testing.assert_close(
+                output[1],
+                reference_output[1],
+                rtol=0.05,
+                atol=0.15,
+            )
+
 
 @skip_pre_blackwell
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="needs 2 GPUs to run this test")
-@pytest.mark.parametrize("seq_len", [1, 4, 32, 128],
-                         ids=lambda x: f"seqlen:{x}")
+@pytest.mark.parametrize(
+    "seq_len",
+    [
+        [
+            1,
+        ],
+        [
+            4,
+        ],
+        [
+            15,
+        ],
+        [
+            32,
+        ],
+        [
+            128,
+        ],
+        [31, 11, 27, 4],
+    ],
+    ids=lambda x: f"seqlen:{x}",
+)
 @pytest.mark.parametrize("hidden_size", [7168], ids=lambda x: f"hidden:{x}")
+@pytest.mark.parametrize("dtype",
+                         [torch.float16, torch.bfloat16, torch.float32],
+                         ids=lambda x: f"dtype:{torch.finfo(x).dtype}")
 @pytest.mark.parametrize(
     "fusion",
     [True, False],
     ids=["fusion", "no_fusion"],
 )
-def test_row_linear_residual_norm_fusion(seq_len, hidden_size, fusion):
+def test_row_linear_residual_norm_fusion(seq_len, hidden_size, dtype, fusion):
 
     torch.manual_seed(42)
-    dtype = torch.bfloat16
     tensor_parallel_size = 2
 
-    x = torch.randn((tensor_parallel_size, seq_len, hidden_size), dtype=dtype)
-    residual = torch.randn((seq_len, hidden_size), dtype=dtype)
+    # Create norm_weight once (same for all sequence lengths)
     norm_weight = torch.randn((hidden_size, ), dtype=dtype)
     eps = 1e-5
-    reference_output = (torch.sum(x, dim=0), )
-    if fusion:
-        residual_out = reference_output[0] + residual
-        reference_output = (rms_norm(residual_out.to(torch.float32),
-                                     norm_weight, eps).to(dtype), residual_out)
+
+    # Create lists of tensors for each sequence length
+    x_list = []
+    residual_list = []
+    reference_output_list = []
+
+    for seq_len_val in seq_len:
+        x = torch.randn((tensor_parallel_size, seq_len_val, hidden_size),
+                        dtype=dtype)
+        residual = torch.randn((seq_len_val, hidden_size), dtype=dtype)
+        reference_output = (torch.sum(x, dim=0), )
+        if fusion:
+            residual_out = reference_output[0] + residual
+            reference_output = (rms_norm(residual_out.to(torch.float32),
+                                         norm_weight,
+                                         eps).to(dtype), residual_out)
+
+        x_list.append(x)
+        residual_list.append(residual)
+        reference_output_list.append(reference_output)
 
     with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
         results = executor.map(
             run_single_rank,
-            *zip(*[(
-                tensor_parallel_size,
-                row_linear_residual_norm_fusion_forward,
-                x[i, :, :],
-                residual,
-                norm_weight,
-                eps,
-                hidden_size,
-                dtype,
-                fusion,
-                reference_output,
-            ) for i in range(tensor_parallel_size)]),
+            *zip(*[
+                (
+                    tensor_parallel_size,
+                    row_linear_residual_norm_fusion_forward,
+                    [
+                        x[i, :, :] for x in x_list
+                    ],  # Extract the i-th rank's data from each sequence length
+                    residual_list,
+                    norm_weight,
+                    eps,
+                    hidden_size,
+                    dtype,
+                    fusion,
+                    reference_output_list,
+                ) for i in range(tensor_parallel_size)
+            ]),
         )
         for r in results:
             assert r is True