From 9270ea457d5e6d51b5592ea3bb8cd1a378e33260 Mon Sep 17 00:00:00 2001
From: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Date: Mon, 15 Sep 2025 09:52:17 +0000
Subject: [PATCH] Fix trtllm moe backend test config and Qwen3 MoE multi node

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
---
 jenkins/L0_Test.groovy                              |  4 ++--
 tensorrt_llm/_torch/models/modeling_qwen3_moe.py    | 13 ++++++++-----
 .../defs/accuracy/test_llm_api_pytorch.py           |  4 ++++
 .../integration/test_lists/qa/llm_function_full.txt |  1 +
 .../test_lists/test-db/l0_gb200_multi_nodes.yml     |  1 +
 tests/integration/test_lists/waives.txt             |  1 +
 6 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 92e5f8b6ad7..9b8f75674f5 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1892,8 +1892,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     ]
 
     fullSet += SBSASlurmTestConfigs.keySet()
 
-    multiNodesSBSAConfigs = (1..7).collectEntries { i ->
-        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 7, 8, 2]]
+    multiNodesSBSAConfigs = (1..8).collectEntries { i ->
+        ["GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-${i}".toString(), ["gb200-multi-node", "l0_gb200_multi_nodes", i, 8, 8, 2]]
     }
     fullSet += multiNodesSBSAConfigs.keySet()
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
index eeefecb4223..c72d69d7905 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -5,6 +5,7 @@
 from torch import nn
 from transformers import Qwen3MoeConfig
 
+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
     BaseWeightMapper
 
@@ -187,6 +188,8 @@ def __init__(self, model_config: ModelConfig[Qwen3MoeConfig],
             strategy=model_config.allreduce_strategy)
         self.next_layer_layernorm: RMSNorm = None
 
+        self.is_p2p_supported = can_access_peer(model_config.mapping)
+
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
             "TRTLLM_QWEN3_EAGER_FUSION_DISABLED", "0") == "0"
@@ -242,11 +245,11 @@ def forward(
                 hidden_states, residual)
 
         # Note: this fusion pattern is only supported for TRTLLM-nvfp4 backend now
-        do_finalize = not (hidden_states.shape[0]
-                           <= self.moe_allreduce.max_token
-                           and self.fusion_config.POST_MOE_FUSION
-                           and self.model_config.moe_backend == 'TRTLLM'
-                           and self.mlp.experts.has_nvfp4)
+        do_finalize = not (
+            hidden_states.shape[0] <= self.moe_allreduce.max_token
+            and self.fusion_config.POST_MOE_FUSION
+            and self.model_config.moe_backend == 'TRTLLM'
+            and self.mlp.experts.has_nvfp4 and self.is_p2p_supported)
 
         hidden_states = self.mlp(
             hidden_states,
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index ab0df93a356..518480f8d2d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2294,13 +2294,17 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
             (8, 1, 8, True, True, True, "CUTLASS", False),
+            (8, 1, 8, False, True, True, "TRTLLM", False),
             (8, 1, 8, True, True, True, "TRTLLM", False),
+            (8, 1, 8, False, True, True, "TRTLLM", True),
             (8, 1, 8, True, True, True, "TRTLLM", True),
         ],
         ids=[
             "latency_moe_cutlass",
             "latency_moe_trtllm",
+            "latency_moe_trtllm_attention_dp",
             "latency_moe_trtllm_eagle3",
+            "latency_moe_trtllm_eagle3_attention_dp",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index c2d289361a3..26542ee0952 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -537,6 +537,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index 124b0d0c462..9ca82496d1c 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -18,4 +18,5 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 0b68e774151..bfad9928fb8 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -269,6 +269,7 @@ examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SK
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3_attention_dp] SKIP (https://nvbugs/5437384)
 test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)