Commit 97e8125

rename custom quant ops

Signed-off-by: Frida Hou <[email protected]>

rename torch custom op

Signed-off-by: Frida Hou <[email protected]>

1 parent 797f717 commit 97e8125

File tree

5 files changed (+17 lines, -19 lines)

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py

Lines changed: 8 additions & 8 deletions
@@ -162,8 +162,8 @@ def _dequantize_nvfp4(
     return vals.view(N, K).to(orig_dtype)


-@torch.library.custom_op("auto_deploy::torch_quant_linear_fp8", mutates_args=())
-def torch_quant_linear_fp8(
+@torch.library.custom_op("auto_deploy::torch_fake_quant_fp8_linear", mutates_args=())
+def torch_fake_quant_fp8_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
     bias: torch.Tensor,  # Optional, no default
@@ -198,8 +198,8 @@ def torch_quant_linear_fp8(
     return out.reshape(*input.shape[:-1], out_features)


-@torch_quant_linear_fp8.register_fake
-def torch_quant_linear_fp8(
+@torch_fake_quant_fp8_linear.register_fake
+def torch_fake_quant_fp8_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
     bias: torch.Tensor,
@@ -212,8 +212,8 @@ def torch_quant_linear_fp8(
     return torch.ops.aten.linear(input, w, bias)


-@torch.library.custom_op("auto_deploy::torch_quant_linear_fp4", mutates_args=())
-def torch_quant_linear_fp4(
+@torch.library.custom_op("auto_deploy::torch_fake_quant_fp4_linear", mutates_args=())
+def torch_fake_quant_fp4_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
     bias: torch.Tensor,  # Optional, no default
@@ -274,8 +274,8 @@ def torch_quant_linear_fp4(
     return out_2d.reshape(*input_shape[:-1], N)


-@torch_quant_linear_fp4.register_fake
-def torch_quant_linear_fp4(
+@torch_fake_quant_fp4_linear.register_fake
+def torch_fake_quant_fp4_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
     bias: torch.Tensor,
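
For context on the renamed entry points, here is a minimal hedged sketch of calling the FP8 op after this change, mirroring the updated call sites in test_quant.py further down. The tensor shapes, dtypes, and unit scales are illustrative assumptions, and the snippet assumes that importing the auto_deploy custom_ops package registers the ops under torch.ops.auto_deploy.

import torch

# Assumption: importing the package registers the custom ops (see the file path above).
import tensorrt_llm._torch.auto_deploy.custom_ops  # noqa: F401

# Illustrative tensors only; real shapes, dtypes, and scales depend on the model.
inp = torch.randn(4, 64, dtype=torch.half, device="cuda")
weight_fp8 = torch.randn(128, 64, device="cuda").to(torch.float8_e4m3fn)
bias = torch.randn(128, dtype=torch.half, device="cuda")
input_scale = torch.tensor(1.0, device="cuda")
weight_scale = torch.tensor(1.0, device="cuda")

# Positional layout mirrors the test below:
# (input, weight_quantized, bias, input_scale list, weight_scale list, input_zp list, weight_zp list)
out = torch.ops.auto_deploy.torch_fake_quant_fp8_linear(
    inp, weight_fp8, bias, [input_scale], [weight_scale], [], []
)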

tensorrt_llm/_torch/auto_deploy/transform/library/fusion.py

Lines changed: 3 additions & 3 deletions
@@ -102,7 +102,7 @@ class QuantizationFusionMixin:
     that share the same input activation (parent node).

     Subclasses must define:
-    - target_op: the torch op identifying the quantized linear (e.g., torch.ops.auto_deploy.torch_quant_linear_fp8)
+    - target_op: the torch op identifying the quantized linear
     - scale_groups: List[List[str]] describing how kwargs should be grouped, e.g.
         FP8 -> [["input_scale"], ["weight_scale"]]
         FP4 -> [["input_scale"], ["weight_scale", "alpha"]]
@@ -260,7 +260,7 @@ def _apply(

 @TransformRegistry.register("fuse_fp8_gemms")
 class FuseFP8Gemms(QuantizationFusionMixin, BaseTransform):
-    target_op = torch.ops.auto_deploy.torch_quant_linear_fp8
+    target_op = torch.ops.auto_deploy.torch_fake_quant_fp8_linear
     scale_groups = [["input_scale"], ["weight_scale"]]

     def fuse_rule(
@@ -298,7 +298,7 @@ def _apply(

 @TransformRegistry.register("fuse_fp4_gemms")
 class FuseFP4Gemms(QuantizationFusionMixin, BaseTransform):
-    target_op = torch.ops.auto_deploy.torch_quant_linear_fp4
+    target_op = torch.ops.auto_deploy.torch_fake_quant_fp4_linear
     scale_groups = [["input_scale"], ["weight_scale", "alpha"]]

     def fuse_rule(
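
As a hedged illustration of the mixin contract described in the docstring above: a hypothetical subclass only needs to point target_op at one of the renamed ops and declare its scale_groups. The sketch assumes it lives in fusion.py alongside FuseFP8Gemms/FuseFP4Gemms (so torch, TransformRegistry, BaseTransform, and QuantizationFusionMixin are already in scope); the class name and registry key are made-up placeholders, and fuse_rule is stubbed because its full signature is not shown in this diff.

@TransformRegistry.register("fuse_my_quant_gemms")  # placeholder registry key
class FuseMyQuantGemms(QuantizationFusionMixin, BaseTransform):
    # The renamed fake-quant linear op identifies which quantized GEMMs to fuse.
    target_op = torch.ops.auto_deploy.torch_fake_quant_fp8_linear
    # One kwarg group per scale kind, following the docstring's convention.
    scale_groups = [["input_scale"], ["weight_scale"]]

    def fuse_rule(self, *args, **kwargs):
        # Placeholder: the real fuse_rule signature and body are elided in this diff.
        raise NotImplementedError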

tensorrt_llm/_torch/auto_deploy/utils/node_utils.py

Lines changed: 2 additions & 2 deletions
@@ -249,8 +249,8 @@ def is_linear_op(node: Node, include_quantization: bool = False) -> bool:

     if include_quantization:
         lin_ops.update(QUANT_LINEAR_OPS)
-        lin_ops.update([torch.ops.auto_deploy.torch_quant_linear_fp8])
-        lin_ops.update([torch.ops.auto_deploy.torch_quant_linear_fp4])
+        lin_ops.update([torch.ops.auto_deploy.torch_fake_quant_fp8_linear])
+        lin_ops.update([torch.ops.auto_deploy.torch_fake_quant_fp4_linear])
     return is_op(node, lin_ops)

tensorrt_llm/_torch/auto_deploy/utils/quantization_utils.py

Lines changed: 2 additions & 2 deletions
@@ -155,7 +155,7 @@ def target_op():
     @staticmethod
     def custom_op():
         """Unified custom kernel entry-point for quantized linear."""
-        return torch.ops.auto_deploy.torch_quant_linear_fp8
+        return torch.ops.auto_deploy.torch_fake_quant_fp8_linear

     @staticmethod
     def quantize_weight(original_weight: torch.Tensor) -> torch.Tensor:
@@ -211,7 +211,7 @@ def target_op():
     @staticmethod
     def custom_op():
         """Unified custom kernel entry-point for quantized linear."""
-        return torch.ops.auto_deploy.torch_quant_linear_fp4
+        return torch.ops.auto_deploy.torch_fake_quant_fp4_linear

     @staticmethod
     def quantize_weight(original_weight: torch.Tensor) -> torch.Tensor:
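
A small hedged sketch of the pattern these two hunks touch: each quantization implementation exposes the unified entry point through a custom_op() staticmethod, so callers resolve the kernel without hard-coding op names. MyFP8Impl is a hypothetical stand-in (the real class names are not visible in this diff), and the snippet assumes the auto_deploy ops are already registered.

import torch


class MyFP8Impl:  # hypothetical stand-in for the FP8 implementation class
    @staticmethod
    def custom_op():
        """Unified custom kernel entry-point for quantized linear."""
        return torch.ops.auto_deploy.torch_fake_quant_fp8_linear


# Callers dispatch through the staticmethod instead of naming the op directly:
op = MyFP8Impl.custom_op()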

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_quant.py

Lines changed: 2 additions & 4 deletions
@@ -128,15 +128,14 @@ def test_quant_linear_fp8_matches_fused_op(bias):
         weight_scale=weight_scale,
     )

-    out_unified = torch.ops.auto_deploy.custom_quant_linear(
+    out_unified = torch.ops.auto_deploy.torch_fake_quant_fp8_linear(
         input,
         weight_fp8,
         bias,
         [torch.tensor(1.0, device="cuda")],
         [weight_scale],
         [],
         [],
-        format_type=FORMAT_FP8,
     )

     assert out_unified.shape == out_fused.shape
@@ -184,7 +183,7 @@ def test_quant_linear_nvfp4_matches_fused_op(bias):
         alpha=alpha_fused,
     )

-    out_unified = torch.ops.auto_deploy.custom_quant_linear(
+    out_unified = torch.ops.auto_deploy.torch_fake_quant_fp4_linear(
         x,
         weight_fp4,
         bias,
@@ -195,7 +194,6 @@ def test_quant_linear_nvfp4_matches_fused_op(bias):
         ],  # weight_scale list: [per-block vector, combined alpha]
         [],  # input_zp
         [],  # weight_zp
-        format_type=FORMAT_NVFP4,
     )

     assert out_unified.shape == out_fused.shape
