Commit f4adb25

[TRTLLM-6825][fix] Update lora for phi4-mm (#6817)
Signed-off-by: Wanli Jiang <[email protected]>
1 parent: 1e5a6be

8 files changed, +134 -25 lines changed

tensorrt_llm/_torch/models/modeling_phi4mm.py

Lines changed: 4 additions & 6 deletions
@@ -243,23 +243,21 @@ def forward(
     @staticmethod
     def lora_config(model_dir: str):
         _lora_config = LoraConfig(
-            lora_dir=[
-                f"{model_dir}/vision-lora",
-                f"{model_dir}/speech-lora",
-            ],
             lora_target_modules=[
                 "attn_qkv",
                 "attn_dense",
-                "mlp_h_to_4h",
+                "mlp_gate_up",
                 "mlp_4h_to_h",
             ],
             trtllm_modules_to_hf_modules={
                 "attn_qkv": "qkv_proj",
                 "attn_dense": "o_proj",
-                "mlp_h_to_4h": "gate_up_proj",
+                "mlp_gate_up": "gate_up_proj",
                 "mlp_4h_to_h": "down_proj",
             },
             max_lora_rank=320,  # Max rank for Phi4MM.
+            swap_gate_up_proj_lora_b_weight=
+            False,  # Disable swap gate_up_proj.lora_B.weight for Phi4MM.
         )
         return _lora_config

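In short: the Phi4-MM config drops the hard-coded lora_dir entries, retargets the fused gate/up projection as mlp_gate_up instead of mlp_h_to_4h, and opts out of the gate_up_proj.lora_B weight swap. A minimal sketch of the equivalent config object, assuming LoraConfig is imported from the new tensorrt_llm/lora_helper.py added later in this commit:

from tensorrt_llm.lora_helper import LoraConfig

# Sketch of what the updated lora_config() above builds (not the verbatim method).
phi4mm_lora_config = LoraConfig(
    lora_target_modules=["attn_qkv", "attn_dense", "mlp_gate_up", "mlp_4h_to_h"],
    trtllm_modules_to_hf_modules={
        "attn_qkv": "qkv_proj",
        "attn_dense": "o_proj",
        "mlp_gate_up": "gate_up_proj",
        "mlp_4h_to_h": "down_proj",
    },
    max_lora_rank=320,  # Max rank for Phi4MM.
    swap_gate_up_proj_lora_b_weight=False,  # Skip the lora_B half-swap for Phi4MM.
)
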
tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 2 additions & 1 deletion
@@ -509,7 +509,8 @@ def create_py_executor_instance(
         resources[ResourceManagerType.PEFT_CACHE_MANAGER] = peft_cache_manager
         model_engine.set_lora_model_config(
             lora_config.lora_target_modules,
-            lora_config.trtllm_modules_to_hf_modules)
+            lora_config.trtllm_modules_to_hf_modules,
+            lora_config.swap_gate_up_proj_lora_b_weight)

         max_num_sequences = executor_config.max_batch_size * mapping.pp_size

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 4 additions & 2 deletions
@@ -438,12 +438,14 @@ def __init__(
         self.cache_indirection_attention = None

     def set_lora_model_config(self, lora_target_modules: list[str],
-                              trtllm_modules_to_hf_modules: dict[str, str]):
+                              trtllm_modules_to_hf_modules: dict[str, str],
+                              swap_gate_up_proj_lora_b_weight: bool = True):
         self.lora_model_config = LoraModelConfig(
             lora_target_modules=lora_target_modules,
             trtllm_modules_to_hf_modules=trtllm_modules_to_hf_modules,
             hidden_size=self.model.config.hidden_size,
-            dtype=torch_dtype_to_str(self.model.config.torch_dtype))
+            dtype=torch_dtype_to_str(self.model.config.torch_dtype),
+            swap_gate_up_proj_lora_b_weight=swap_gate_up_proj_lora_b_weight)

     @property
     def use_mrope(self):

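The new argument defaults to True, so existing call sites that omit it keep the current swap behavior; only the Phi4-MM path passes False. A hypothetical pair of calls to illustrate (engine, modules and mapping are placeholder names):

# Placeholder names; only the third argument differs between the two calls.
engine.set_lora_model_config(modules, mapping)         # default: swap enabled
engine.set_lora_model_config(modules, mapping, False)  # Phi4-MM path: swap disabled
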
tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 2 additions & 1 deletion
@@ -1206,7 +1206,8 @@ def __init__(self,
         self._lora_model_config = LoraModelConfig(
             lora_config.lora_target_modules,
             lora_config.trtllm_modules_to_hf_modules, model_config.hidden_size,
-            binding_to_str_dtype(model_config.data_type))
+            binding_to_str_dtype(model_config.data_type),
+            lora_config.swap_gate_up_proj_lora_b_weight)
         self._lora_manager = LoraManager()

     def add_request_peft(self, request: LlmRequest):

tensorrt_llm/lora_helper.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+from ._utils import DictConversion
+
+
+def get_missing_qkv_modules_from_lora_modules(
+        lora_target_modules: List[str]) -> List[str]:
+    """Get missing QKV modules from LoRA target modules.
+
+    In current design, q_lora_params, k_lora_params and v_lora_params should be all enabled or
+    all disabled at the same time. However, some lora checkpoints (e.g. BART) only contain two of them,
+    so we use zero tensor to fill the missing ones.
+    """
+    missing_qkv_modules = []
+    if any(x in lora_target_modules for x in ["attn_q", "attn_k", "attn_v"]):
+        for lora_module in ["attn_q", "attn_k", "attn_v"]:
+            if lora_module not in lora_target_modules:
+                missing_qkv_modules.append(lora_module)
+    if any(x in lora_target_modules
+           for x in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]):
+        for lora_module in ["cross_attn_q", "cross_attn_k", "cross_attn_v"]:
+            if lora_module not in lora_target_modules:
+                missing_qkv_modules.append(lora_module)
+    return missing_qkv_modules
+
+
+def get_default_trtllm_modules_to_hf_modules():
+    """Get default mapping from TensorRT-LLM module names to HuggingFace module names."""
+    return {
+        "attn_q": "q_proj",
+        "attn_k": "k_proj",
+        "attn_v": "v_proj",
+        "attn_dense": "o_proj",
+        "mlp_h_to_4h": "gate_proj",
+        "mlp_4h_to_h": "down_proj",
+        "mlp_gate": "up_proj",
+        "mlp_gate_up": "gate_up_proj",
+        "moe_h_to_4h": "w1",
+        "moe_4h_to_h": "w2",
+        "moe_gate": "w3",
+        "moe_router": "gate",
+    }
+
+
+def use_lora(
+    model,
+    lora_config: "LoraConfig",
+    trtllm_modules_to_hf_modules: Optional[Dict[str, str]] = None,
+):
+    """Use LoRA with the given model and configuration.
+
+    This function is a wrapper that delegates to the appropriate loading function
+    based on the LoRA checkpoint source.
+    """
+    if lora_config.lora_ckpt_source == "nemo":
+        from .lora_manager import load_nemo_lora
+        load_nemo_lora(model, lora_config)
+    elif lora_config.lora_ckpt_source == "hf":
+        from .lora_manager import load_hf_lora
+        load_hf_lora(model, lora_config, trtllm_modules_to_hf_modules)
+    else:
+        raise ValueError(
+            f"Unsupported lora_ckpt_source: {lora_config.lora_ckpt_source}")
+
+
+@dataclass
+class LoraConfig(DictConversion):
+    lora_dir: List[str] = field(default_factory=list)
+    lora_ckpt_source: str = "hf"
+    max_lora_rank: int = 64
+    lora_target_modules: List[str] = field(default_factory=list)
+    trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
+    max_loras: Optional[int] = None
+    max_cpu_loras: Optional[int] = None
+    swap_gate_up_proj_lora_b_weight: bool = True
+
+    def __post_init__(self):
+        assert self.lora_ckpt_source in [
+            "hf", "nemo"
+        ], (f"lora_ckpt_source must be one of 'hf' or 'nemo', got {self.lora_ckpt_source}"
+            )
+
+    @property
+    def missing_qkv_modules(self) -> List[str]:
+        return get_missing_qkv_modules_from_lora_modules(
+            self.lora_target_modules)

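A short usage sketch of the new helpers, with illustrative inputs (the commented results follow from the code above):

from tensorrt_llm.lora_helper import (LoraConfig,
                                      get_missing_qkv_modules_from_lora_modules)

# A BART-style adapter that only ships q and v: attn_k is reported as missing
# so the loader can fill it with a zero tensor.
print(get_missing_qkv_modules_from_lora_modules(["attn_q", "attn_v"]))  # ['attn_k']

# The same list is exposed as a property, and the new swap flag defaults to
# True (the pre-existing behavior) unless a model opts out.
cfg = LoraConfig(lora_target_modules=["attn_q", "attn_v"])
print(cfg.missing_qkv_modules)              # ['attn_k']
print(cfg.swap_gate_up_proj_lora_b_weight)  # True
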
tensorrt_llm/lora_manager.py

Lines changed: 13 additions & 10 deletions
@@ -241,6 +241,7 @@ class LoraConfig(DictConversion):
     trtllm_modules_to_hf_modules: Dict[str, str] = field(default_factory=dict)
     max_loras: int | None = None
     max_cpu_loras: int | None = None
+    swap_gate_up_proj_lora_b_weight: bool = True

     def __post_init__(self):
         assert self.lora_ckpt_source in ["hf", "nemo"], (

@@ -258,6 +259,7 @@ class LoraModelConfig:
     trtllm_modules_to_hf_modules: dict[str, str]
     hidden_size: int
     dtype: str
+    swap_gate_up_proj_lora_b_weight: bool = True


 class HfLoraLoader:

@@ -1026,16 +1028,17 @@ def load_from_hf(
     )
     hf_modules = set(hf_modules_to_trtllm_modules.keys())

-    def preprocess_lora_weights(lora_model):
+    def preprocess_lora_weights(lora_model, model_config):
         # Swap weights of gate_up_proj
-        for key, value in lora_model.items():
-            if "gate_up_proj.lora_B.weight" in key:
-                original_weights = value.contiguous().clone()
-                half_split = original_weights.shape[0] // 2
-                first_half = original_weights[:half_split, :]
-                second_half = original_weights[half_split:, :]
-                value = torch.cat((second_half, first_half), dim=0)
-                lora_model[key] = value
+        if getattr(model_config, "swap_gate_up_proj_lora_b_weight", True):
+            for key, value in lora_model.items():
+                if "gate_up_proj.lora_B.weight" in key:
+                    original_weights = value.contiguous().clone()
+                    half_split = original_weights.shape[0] // 2
+                    first_half = original_weights[:half_split, :]
+                    second_half = original_weights[half_split:, :]
+                    value = torch.cat((second_half, first_half), dim=0)
+                    lora_model[key] = value
         return lora_model

     def load_from_model_dir(uid, model_dir, hf_config):

@@ -1047,7 +1050,7 @@ def load_from_model_dir(uid, model_dir, hf_config):
         lora_model = load_state_dict(get_model_path(model_dir, "adapter_model"))
         if lora_model is None:
             raise ValueError(f"Failed to load adapter_model from {model_dir}")
-        lora_model = preprocess_lora_weights(lora_model)
+        lora_model = preprocess_lora_weights(lora_model, model_config)
         all_weights = get_all_hf_lora_weights(lora_model, hf_modules, component)
         rank = int(hf_config["r"])
         rs_lora = bool(hf_config.get("use_rslora", False))

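The guarded loop above is the behavioral core of the commit: for a fused gate_up_proj adapter, lora_B stacks two halves along dim 0, and the HF loader swaps them (presumably to match the order TensorRT-LLM expects); setting swap_gate_up_proj_lora_b_weight=False, as Phi4-MM now does, leaves the checkpoint order untouched. A self-contained sketch of the swap, with a toy tensor standing in for a real lora_B weight:

import torch

# Toy gate_up_proj.lora_B.weight: rows 0-1 are one half, rows 2-3 the other.
lora_b = torch.arange(8.0).reshape(4, 2)
half_split = lora_b.shape[0] // 2
# Same transform as preprocess_lora_weights: exchange the two halves along dim 0.
swapped = torch.cat((lora_b[half_split:, :], lora_b[:half_split, :]), dim=0)
print(swapped[:half_split].equal(lora_b[half_split:]))  # True
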
tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 4 additions & 2 deletions
@@ -198,15 +198,17 @@ def get_model_yaml_config(model_label: str,
         }
         if 'phi_4_multimodal_instruct' in model_label:
             lora_config['lora_config']['lora_target_modules'] = [
-                "attn_qkv", "attn_dense", "mlp_h_to_4h", "mlp_4h_to_h"
+                "attn_qkv", "attn_dense", "mlp_gate_up", "mlp_4h_to_h"
             ]
             lora_config['lora_config']['trtllm_modules_to_hf_modules'] = {
                 "attn_qkv": "qkv_proj",
                 "attn_dense": "o_proj",
-                "mlp_h_to_4h": "gate_up_proj",
+                "mlp_gate_up": "gate_up_proj",
                 "mlp_4h_to_h": "down_proj"
             }
             lora_config['lora_config']['max_lora_rank'] = 320
+            lora_config['lora_config'][
+                'swap_gate_up_proj_lora_b_weight'] = False
         base_config.update(lora_config)

         kv_cache_config = base_config.get('kv_cache_config', {})

tests/integration/defs/test_e2e.py

Lines changed: 3 additions & 3 deletions
@@ -2486,15 +2486,15 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
     }
     expected_keywords = {
         "image": [
-            ["image", "depicts", "mountain", "half", "rock"],
-            ["road", "car", "lane", "traffic", "bus"],
+            ["object", "mountain", "weather", "clear", "clouds"],
+            ["traffic", "road", "vehicles", "cars", "bus"],
         ],
         "audio": [
             ["what", "is", "the", "traffic", "sign", "in", "image"],
             ["what", "is", "shown", "in", "this", "image"],
         ],
         "image_audio": [
-            ["image", "depicts", "Grand", "rock", "scene"],
+            ["image", "depicts", "scenic", "famous", "landmark"],
         ],
     }
