
Commit 98d10df

tomeras91 authored and lancelly committed
[nvbug 5380101][fix] Fix nemotronNAS loading for TP>1 (NVIDIA#6447)
Signed-off-by: Tomer Asida <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent 0cb1a69 commit 98d10df

File tree

4 files changed: +23 -14 lines changed


tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py

Lines changed: 7 additions & 7 deletions
@@ -3,7 +3,7 @@
 
 from torch import nn
 
-from tensorrt_llm._torch.model_config import ModelConfig, TConfig
+from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_utils import DecoderModelForCausalLM
 
 
@@ -14,11 +14,11 @@ def __init__(self):
         self._mapping: dict = {}
         self._skip_modules = []
         self._model: Union[nn.Module, DecoderModelForCausalLM] | None = None
-        self._config: TConfig | None = None
+        self._config: ModelConfig | None = None
 
     def init_model_and_config(self, model: Union[nn.Module,
                                                   DecoderModelForCausalLM],
-                              config: TConfig):
+                              config: ModelConfig):
         self._model = model
         self._config = config
 
@@ -29,9 +29,9 @@ def init_model_and_config(self, model: Union[nn.Module,
             raise ValueError("model must have a config attribute")
 
         self._tp_size = 1 if model.model_config.mapping.enable_attention_dp else model.model_config.mapping.tp_size
-        self._num_kv_heads = model.config.num_key_value_heads if hasattr(
-            model.config, 'num_key_value_heads'
-        ) and model.config.num_key_value_heads is not None else model.config.num_attention_heads
+        self._head_dim = model.config.head_dim if hasattr(
+            model.config, 'head_dim'
+        ) and model.config.head_dim is not None else model.config.hidden_size // model.config.num_attention_heads
 
         self.map_weights()
 
@@ -153,7 +153,7 @@ def mapping(self) -> dict:
         return self._mapping
 
     @property
-    def config(self) -> TConfig:
+    def config(self) -> ModelConfig:
         if self._config is None:
             raise RuntimeError("Weight mapper is not initialized")
         return self._config
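The switch from caching a single num_kv_heads to caching head_dim is the core of the fix: NemotronNAS-style checkpoints can use a different number of KV heads per layer, so a per-layer value has to be recovered later from each weight tensor, while head_dim stays constant across layers. A minimal sketch of the fallback above, assuming a Hugging Face-style config object (the standalone helper and its name are illustrative, not TensorRT-LLM API):

# Sketch of the head_dim fallback introduced above (illustrative helper,
# not part of the TensorRT-LLM API). `hf_config` stands for any HF-style
# config object exposing these attributes.
def resolve_head_dim(hf_config) -> int:
    head_dim = getattr(hf_config, "head_dim", None)
    if head_dim is not None:
        return head_dim
    # Common convention: hidden_size divided evenly across attention heads.
    return hf_config.hidden_size // hf_config.num_attention_heads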

tensorrt_llm/_torch/models/checkpoints/hf/qwen3_moe_weight_mapper.py

Lines changed: 12 additions & 0 deletions
@@ -1,13 +1,25 @@
+from typing import Union
+
 from torch import nn
 
+from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.checkpoints.hf.qwen2_moe_weight_mapper import \
     Qwen2MoeHfWeightMapper
 from tensorrt_llm._torch.models.modeling_utils import register_mapper
+from tensorrt_llm.models.modeling_utils import DecoderModelForCausalLM
 
 
 @register_mapper("HF", "Qwen3MoeForCausalLM")
 class Qwen3MoeHfWeightMapper(Qwen2MoeHfWeightMapper):
 
+    def init_model_and_config(self, model: Union[nn.Module,
+                                                  DecoderModelForCausalLM],
+                              config: ModelConfig):
+        super().init_model_and_config(model, config)
+        self._num_kv_heads = model.config.num_key_value_heads if hasattr(
+            model.config, 'num_key_value_heads'
+        ) and model.config.num_key_value_heads is not None else model.config.num_attention_heads
+
     def should_skip_module(self, module_name: str) -> bool:
         if module_name.startswith("draft_model"):
             return True
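Because the base mapper no longer computes self._num_kv_heads, the Qwen3-MoE mapper restores it in its own init_model_and_config, presumably because the Qwen-MoE weight handling still relies on a model-wide KV-head count. The fallback follows the usual Hugging Face convention; a standalone sketch (helper name is illustrative, not TensorRT-LLM API):

# Sketch of the num_key_value_heads fallback used in the override above.
def resolve_num_kv_heads(hf_config) -> int:
    num_kv_heads = getattr(hf_config, "num_key_value_heads", None)
    # Configs without the field describe plain multi-head attention,
    # where every attention head has its own K/V head.
    return num_kv_heads if num_kv_heads is not None else hf_config.num_attention_heads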

tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py

Lines changed: 4 additions & 6 deletions
@@ -59,17 +59,15 @@ def should_skip_module(self, module_name: str) -> bool:
     def _duplicate_kv_weights(self, module: nn.Module, new_name: str,
                               weights: dict):
         if new_name in ['k_proj', 'v_proj']:
-            num_kv_heads_list = [self._num_kv_heads
-                                 ] * len(weights) if isinstance(
-                                     self._num_kv_heads,
-                                     int) else self._num_kv_heads
+            # k_proj and v_proj shape is [num_kv_heads*head_dim, hidden_dim]
+            num_kv_heads = weights['weight'].shape[0] // self._head_dim
             processed_weights = {
                 k:
                 self._duplicate_kv(weight=v[:],
-                                   num_kv_heads=num_kv_heads_list[i],
+                                   num_kv_heads=num_kv_heads,
                                    tensor_parallel_size=self._tp_size)
                 if k in ["weight", "bias"] else v
-                for i, (k, v) in enumerate(weights.items())
+                for k, v in weights.items()
             }
             return processed_weights
 
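With head_dim cached, the number of KV heads is now derived per module from the weight itself (rows // head_dim), which is what lets heterogeneous-GQA checkpoints such as NemotronNAS load correctly when TP > 1. The _duplicate_kv helper is unchanged and not shown in this diff; the sketch below only illustrates the usual replication scheme it implies, where KV heads are repeated until they can be sharded evenly across tensor-parallel ranks (function name and exact layout are assumptions, not the library implementation):

import torch

# Sketch of KV-head replication for tensor parallelism (illustrative; the real
# _duplicate_kv implementation is not part of this diff). Assumes k/v projection
# weights laid out as [num_kv_heads * head_dim, hidden_size], as noted in the
# comment added in the hunk above.
def duplicate_kv_for_tp(weight: torch.Tensor, head_dim: int,
                        tp_size: int) -> torch.Tensor:
    num_kv_heads = weight.shape[0] // head_dim
    if tp_size <= num_kv_heads:
        return weight  # enough KV heads to shard one or more per rank
    reps = tp_size // num_kv_heads  # copies needed so every rank owns a KV head
    heads = weight.reshape(num_kv_heads, head_dim, -1)
    return heads.repeat_interleave(reps, dim=0).reshape(-1, weight.shape[-1])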

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -400,7 +400,6 @@ full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_l
 accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5375620)
 test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1] SKIP (https://nvbugs/5377465)
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5377465)
-accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5380101)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570)
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
