Commit 2c0b4f0

update quantize_linear_from_config to point to the custom op

Signed-off-by: Frida Hou <[email protected]>
1 parent f8ca1fb commit 2c0b4f0

2 files changed: +60 −4 lines changed
tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py
Lines changed: 7 additions & 4 deletions

@@ -29,7 +29,7 @@ def _insert_quantized_linear(
     quantization_impl: QuantizationImpl,
     is_quantized_graph: bool = False,
 ):
-    """Replaces the matmul node with a new quantized matmul node.
+    """Replaces the matmul node with a new custom quantized linear node.
 
     The state_dict is also updated to contain the sharded weights.
     """
@@ -72,14 +72,17 @@ def _insert_quantized_linear(
         partial(quantization_impl.load_hook, weight_name=param_name)
     )
 
-    node.target = quantization_impl.target_op()
-
     with gm.graph.inserting_before(node):
         scales = {}
         for scale_name in quantization_impl.scale_names():
             scales[scale_name] = gm.graph.create_node("get_attr", modname + "." + scale_name)
 
-        node.kwargs = {**node.kwargs, **scales}
+        custom_kwargs = quantization_impl.build_custom_kwargs_for_linear(
+            scales,
+        )
+
+        node.target = quantization_impl.custom_op()
+        node.kwargs = {**node.kwargs, **custom_kwargs}
 
 
 def _insert_quantized_bmm(
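
The net effect of this hunk is that the linear node is retargeted to the unified custom op and the raw scale nodes are routed through the impl-specific kwargs builder. For orientation, here is a minimal sketch of the kwargs the rewired node picks up per format, mirroring the build_custom_kwargs_for_linear overrides added in quantization_utils.py below; the "scales" entries stand in for the get_attr nodes built in the loop above, and this is illustrative, not code from the commit:

# Sketch only: extra kwargs attached to the rewired custom_quant_linear node.
# "scales" stands in for the dict of get_attr nodes created per scale_name above.
scales = {"input_scale": ..., "weight_scale": ..., "alpha": ...}  # placeholders

fp8_kwargs = dict(          # FP8 impl: one input scale, one weight scale
    input_scale=[scales["input_scale"]],
    weight_scale=[scales["weight_scale"]],
    input_zp=[],
    weight_zp=[],
    format_type=0,          # FORMAT_FP8
)

nvfp4_kwargs = dict(        # NVFP4 impl: cutlass-format weight scale plus fused alpha
    input_scale=[scales["input_scale"]],
    weight_scale=[scales["weight_scale"], scales["alpha"]],
    input_zp=[],
    weight_zp=[],
    format_type=1,          # FORMAT_NVFP4
)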

tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py
Lines changed: 53 additions & 0 deletions

@@ -27,6 +27,10 @@
 except ImportError:
     float4_sf_dtype = None
 
+# TODO: put the ENUMs in the same place and import it
+FORMAT_FP8 = 0
+FORMAT_NVFP4 = 1
+
 
 def modelopt_fp4_scale_to_cutlass_fp4_scale(modelopt_scale: torch.Tensor) -> torch.Tensor:
     """Converts the modelopt FP4 per-block weight scale to the cutlass format (padded and swizzled)."""
@@ -160,6 +164,18 @@ def shard_load_hook(
     def fuse_linear_weights(weights, **kwargs) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
         pass
 
+    @staticmethod
+    def custom_op():
+        """Unified custom kernel entry-point for quantized linear."""
+        return torch.ops.auto_deploy.custom_quant_linear
+
+    @staticmethod
+    def build_custom_kwargs_for_linear(
+        scale_getattrs: Dict[str, Node],
+    ) -> Dict[str, object]:
+        """Default: no extra kwargs. Each impl overrides to pass the right inputs/scales/zps/format."""
+        return {}
+
 
 class FP8QuantizationImpl(QuantizationImpl):
     @staticmethod
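
These two base-class hooks are the extension point introduced by this commit: custom_op() names the shared kernel entry point and build_custom_kwargs_for_linear() maps an impl's scale get_attr nodes into that op's kwargs. A rough sketch of how another implementation could plug in; Int8QuantizationImpl, FORMAT_INT8, and the scale names here are hypothetical and not part of this commit:

from typing import Dict, List

from torch.fx import Node

FORMAT_INT8 = 2  # hypothetical format tag, not defined in this commit


class Int8QuantizationImpl(QuantizationImpl):  # assumes the base class defined in this file
    @staticmethod
    def scale_names() -> List[str]:
        return ["input_scale", "weight_scale"]

    @staticmethod
    def build_custom_kwargs_for_linear(
        scale_getattrs: Dict[str, Node],
    ) -> Dict[str, object]:
        # Same kwargs contract as the FP8/NVFP4 overrides in this file.
        return dict(
            input_scale=[scale_getattrs["input_scale"]],
            weight_scale=[scale_getattrs["weight_scale"]],
            input_zp=[],
            weight_zp=[],
            format_type=FORMAT_INT8,
        )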
@@ -180,6 +196,20 @@ def scale_names() -> List[str]:
     def default_scales(original_weight_shape: Tuple) -> Dict[str, torch.Tensor]:
         return {"input_scale": torch.tensor(1.0), "weight_scale": torch.tensor(1.0)}
 
+    @staticmethod
+    def build_custom_kwargs_for_linear(
+        scale_getattrs: Dict[str, Node],
+    ) -> Dict[str, object]:
+        # FP8 custom op contract:
+        # input_scale=[tensor], weight_scale=[tensor], input_zp=[], weight_zp=[], format_type=FORMAT_FP8
+        return dict(
+            input_scale=[scale_getattrs["input_scale"]],
+            weight_scale=[scale_getattrs["weight_scale"]],
+            input_zp=[],
+            weight_zp=[],
+            format_type=FORMAT_FP8,
+        )
+
     @staticmethod
     def load_hook(state_dict, prefix, *args, weight_name):
         if weight_name in state_dict:
@@ -264,6 +294,29 @@ def default_scales(original_weight_shape: Tuple) -> Dict[str, torch.Tensor]:
             "alpha": torch.tensor(1.0 / 6.0),
         }
 
+    @staticmethod
+    def build_custom_kwargs_for_linear(
+        scale_getattrs: Dict[str, Node],
+    ) -> Dict[str, object]:
+        """
+        Contract:
+            custom_quant_linear(
+                x, Wq, bias,
+                input_scale=[s_in2],
+                weight_scale=[weight_scale_cutlass_uint8, alpha_fused],
+                input_zp=[],
+                weight_zp=[],
+                format_type=FORMAT_NVFP4
+            )
+        """
+        return dict(
+            input_scale=[scale_getattrs["input_scale"]],
+            weight_scale=[scale_getattrs["weight_scale"], scale_getattrs["alpha"]],
+            input_zp=[],
+            weight_zp=[],
+            format_type=FORMAT_NVFP4,
+        )
+
     @staticmethod
     def load_hook(state_dict, prefix, *args, weight_name):
         if weight_name in state_dict:
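
To sanity-check what the transform produced, one can scan the resulting graph for nodes retargeted to the unified op and read the format tag off their kwargs. A small sketch, assuming TensorRT-LLM's auto_deploy custom ops are registered and that gm is the GraphModule coming out of the transform pipeline:

import torch
from torch.fx import GraphModule


def summarize_quant_linears(gm: GraphModule) -> None:
    """Print every rewired quantized linear node and the format it was tagged with."""
    for node in gm.graph.nodes:
        if node.op == "call_function" and node.target == torch.ops.auto_deploy.custom_quant_linear:
            fmt = node.kwargs.get("format_type")
            n_weight_scales = len(node.kwargs.get("weight_scale", []))
            # Per this commit: FORMAT_FP8 (0) carries one weight scale,
            # FORMAT_NVFP4 (1) carries two (cutlass block scale + fused alpha).
            print(f"{node.name}: format_type={fmt}, weight_scales={n_weight_scales}")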
