[None][fix] Accommodate Phi3/4 to work with ModelOpt's FP8 ckpts in Torch (#6761)

moraxu · web-flow · commit 7334f9390c75 · 2025-08-19T09:22:46.000-07:00
Signed-off-by: Michal Guzek <mguzek@nvidia.com>
diff --git a/tensorrt_llm/_torch/models/modeling_phi3.py b/tensorrt_llm/_torch/models/modeling_phi3.py
@@ -222,31 +222,42 @@ def filter_weights(prefix: str, weights: dict):
                                               num_kv_heads * head_dim, :]
                         v_weight = qkv_weight[hidden_size +
                                               num_kv_heads * head_dim:, :]
-                        module.load_weights(weights=[
-                            {
-                                'weight': q_weight
-                            },
-                            {
-                                'weight': k_weight
-                            },
-                            {
-                                'weight': v_weight
-                            },
-                        ])
+
+                        # Get the scale factor for the fused QKV projection
+                        qkv_scale = module_weights.get('weight_scale', None)
+
+                        q_dict = {'weight': q_weight}
+                        if qkv_scale is not None:
+                            q_dict['weight_scale'] = qkv_scale
+
+                        k_dict = {'weight': k_weight}
+                        if qkv_scale is not None:
+                            k_dict['weight_scale'] = qkv_scale  # Use same scale
+
+                        v_dict = {'weight': v_weight}
+                        if qkv_scale is not None:
+                            v_dict['weight_scale'] = qkv_scale  # Use same scale
+
+                        module.load_weights(weights=[q_dict, k_dict, v_dict])
                     elif "mlp.gate_up_proj" in name:
                         # The weights need to be split correctly before sharding to support tp_size >1.
                         intermediate_size = self.config.intermediate_size
                         gate_up_weight = module_weights['weight'][:]
                         gate_weight = gate_up_weight[:intermediate_size, :]
                         up_weight = gate_up_weight[intermediate_size:, :]
-                        module.load_weights(weights=[
-                            {
-                                'weight': gate_weight
-                            },
-                            {
-                                'weight': up_weight
-                            },
-                        ])
+
+                        # Get the scale factors if they exist
+                        gate_up_scale = module_weights.get('weight_scale', None)
+
+                        gate_dict = {'weight': gate_weight}
+                        if gate_up_scale is not None:
+                            gate_dict['weight_scale'] = gate_up_scale
+
+                        up_dict = {'weight': up_weight}
+                        if gate_up_scale is not None:
+                            up_dict['weight_scale'] = gate_up_scale
+
+                        module.load_weights(weights=[gate_dict, up_dict])
                     else:
                         module.load_weights(weights=[module_weights])
                 else: