
Commit 0974df5

committed
minor updates: rabbit feedback, docstrings, code cleaning
Signed-off-by: Frida Hou <[email protected]>
1 parent f4463a5 commit 0974df5

File tree

11 files changed (+74, -158 lines)


tensorrt_llm/_torch/auto_deploy/custom_ops/torch_quant.py

Lines changed: 14 additions & 14 deletions
@@ -20,7 +20,7 @@ def _expect_single_scale(scales: List[Optional[torch.Tensor]], name: str) -> tor
     return scales[0]
 
 
-def _to_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+def _to_fp8_fake(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
     return (x / scale).to(torch.float8_e4m3fn)
 
 
@@ -84,7 +84,7 @@ def _cast_fp4(weight: torch.Tensor):
 
     sign_bit = (weight < 0).to(torch.uint8)
 
-    weight_abs = weight.abs_()
+    weight_abs = weight.abs()  # avoid in-place modification to input
     # Calculate the ordinal value based on the bounds
     ord = torch.searchsorted(e2m1_bounds.to(device), weight_abs, out_int32=True).to(torch.uint8)
     # All values equal to e2m1_bounds at odd indices are rounded up and even indices are rounded down
@@ -105,7 +105,7 @@ def _quantize_nvfp4(
         block_size (int): The size of each block for quantization.
         weights_scaling_factor_2 (torch.Tensor): The per-tensor scaling factor for the weights.
     Returns:
-        tuple: Contains quantized data, quantized per block scaling factor, and per block scaling factor.
+        tuple: Contains quantized data and quantized per block scaling factor
     """
 
     weights_scaling_factor, weights_scaling_factor_2 = _nvfp4_get_weights_scaling_factor(
@@ -160,11 +160,11 @@ def _dequantize_nvfp4(
 def torch_fake_quant_fp8_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
-    bias: torch.Tensor,  # Optional, no default
-    input_scale: List[torch.Tensor],  # Tensor?[] (REQUIRED: no default)
-    weight_scale: List[torch.Tensor],  # Tensor?[]
-    input_zp: List[torch.Tensor],  # Tensor?[]
-    weight_zp: List[torch.Tensor],  # Tensor?[]
+    bias: torch.Tensor,
+    input_scale: List[torch.Tensor],
+    weight_scale: List[torch.Tensor],
+    input_zp: List[torch.Tensor],
+    weight_zp: List[torch.Tensor],
 ) -> torch.Tensor:
     """
     Reference (eager) implementation for multiple quant formats via `format_type`.
@@ -180,7 +180,7 @@ def torch_fake_quant_fp8_linear(
     in_dtype = input.dtype
     out_features, in_features = weight_quantized.shape
 
-    input_fp8 = _to_fp8(input, s_in)
+    input_fp8 = _to_fp8_fake(input, s_in)
     input_deq = _from_fp8(input_fp8, s_in, in_dtype)
 
     weight_deq = _dequant_weight_fp8(weight_quantized, s_w, out_features, in_dtype)
@@ -209,11 +209,11 @@ def torch_fake_quant_fp8_linear(
 def torch_fake_quant_fp4_linear(
     input: torch.Tensor,
     weight_quantized: torch.Tensor,
-    bias: torch.Tensor,  # Optional, no default
-    input_scale: List[torch.Tensor],  # Tensor?[] (REQUIRED: no default)
-    weight_scale: List[torch.Tensor],  # Tensor?[]
-    input_zp: List[torch.Tensor],  # Tensor?[]
-    weight_zp: List[torch.Tensor],  # Tensor?[]
+    bias: torch.Tensor,
+    input_scale: List[torch.Tensor],
+    weight_scale: List[torch.Tensor],
+    input_zp: List[torch.Tensor],
+    weight_zp: List[torch.Tensor],
 ) -> torch.Tensor:
     """
     Reference (eager) implementation for multiple quant formats via `format_type`.
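
Editor's note: the renamed `_to_fp8_fake` implements only the quantize half of the fake-quant roundtrip used by the reference linear above. A minimal sketch of how it pairs with the dequantize step, assuming a `_from_fp8` helper whose body is not shown in this diff (only its call signature appears above):

    import torch

    def _to_fp8_fake(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        # quantize: scale down, then cast to FP8 (e4m3), as in the diff above
        return (x / scale).to(torch.float8_e4m3fn)

    def _from_fp8(x_fp8: torch.Tensor, scale: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
        # assumed inverse: cast back to the working dtype and rescale
        return x_fp8.to(dtype) * scale

    x = torch.randn(4, 8, dtype=torch.float16)
    s = x.abs().max().float() / torch.finfo(torch.float8_e4m3fn).max
    x_deq = _from_fp8(_to_fp8_fake(x, s), s, x.dtype)  # fake-quantized copy of x, same shape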

tensorrt_llm/_torch/auto_deploy/transform/library/fuse_quant.py

Lines changed: 11 additions & 24 deletions
@@ -9,13 +9,13 @@
 from ..interface import BaseTransform, SharedConfig, TransformInfo, TransformRegistry
 
 
+# with bias=None
 def _fp8_ref_pattern_1(
     x: torch.Tensor,
     w_fp8: torch.Tensor,
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
 ):
-    # Matches: torch_fake_quant_fp8_linear(input, weight_fp8, bias, [in_s], [w_s], [], [])
     return torch.ops.auto_deploy.torch_fake_quant_fp8_linear.default(
         x,
         w_fp8,
@@ -33,9 +33,6 @@ def _fp8_ref_repl_1(
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
 ):
-    # Map lists -> scalars for fused op
-    # in_s = input_scale[0]
-    # w_s = weight_scale[0]
     return torch.ops.auto_deploy.torch_quant_fp8_linear(
         x,
         w_fp8,
@@ -45,14 +42,14 @@ def _fp8_ref_repl_1(
     )
 
 
+# with bias!=None
 def _fp8_ref_pattern_2(
     x: torch.Tensor,
     w_fp8: torch.Tensor,
     bias: torch.Tensor,
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
 ):
-    # Matches: torch_fake_quant_fp8_linear(input, weight_fp8, bias, [in_s], [w_s], [], [])
     return torch.ops.auto_deploy.torch_fake_quant_fp8_linear.default(
         x,
         w_fp8,
@@ -71,9 +68,6 @@ def _fp8_ref_repl_2(
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
 ):
-    # Map lists -> scalars for fused op
-    # in_s = input_scale[0]
-    # w_s = weight_scale[0]
     return torch.ops.auto_deploy.torch_quant_fp8_linear(
         x,
         w_fp8,
@@ -83,15 +77,14 @@ def _fp8_ref_repl_2(
     )
 
 
-# NVFP4: reference (search) and fused (replacement)
+# NVFP4: with bias=None
 def _fp4_ref_pattern_1(
     x: torch.Tensor,
     w_fp4: torch.Tensor,
     input_scale: torch.Tensor,
     weight_scale: torch.Tensor,
     alpha: torch.Tensor,
 ):
-    # Matches: torch_fake_quant_fp4_linear(x, w_fp4, bias, [s_in2], [cutlass_scale, alpha], [], [])
     return torch.ops.auto_deploy.torch_fake_quant_fp4_linear(
         x,
         w_fp4,
@@ -120,6 +113,7 @@ def _fp4_ref_repl_1(
     )
 
 
+# with bias!=None
 def _fp4_ref_pattern_2(
     x: torch.Tensor,
     w_fp4: torch.Tensor,
@@ -128,7 +122,6 @@ def _fp4_ref_pattern_2(
     weight_scale: torch.Tensor,
     alpha: torch.Tensor,
 ):
-    # Matches: torch_fake_quant_fp4_linear(x, w_fp4, bias, [s_in2], [cutlass_scale, alpha], [], [])
     return torch.ops.auto_deploy.torch_fake_quant_fp4_linear(
         x,
         w_fp4,
@@ -162,10 +155,8 @@ def _register_quant_linear_patterns(patterns: ADPatternMatcherPass) -> None:
     """
     Register the FP8 and FP4 patterns with robust dummy args and minimal ignores.
     """
-    # Use harmless meta tensors; no dtype/device constraints during tracing.
-    # Shapes mirror your unit tests but can be arbitrary as long as tracing succeeds.
     x_fp8 = torch.randn(3, 16, device="meta", dtype=torch.float16)
-    w_fp8 = torch.randn(32, 16, device="meta", dtype=torch.float16)  # dtype not enforced in trace
+    w_fp8 = torch.randn(32, 16, device="meta", dtype=torch.float16)
    bias32 = torch.randn(32, device="meta", dtype=torch.float32)
     one = torch.tensor(1.0, device="meta", dtype=torch.float32)
 
@@ -189,47 +180,43 @@ def _register_quant_linear_patterns(patterns: ADPatternMatcherPass) -> None:
         replace_fn=_fp8_ref_repl_1,
         patterns=patterns,
         dummy_args=dummy_args_fp8,
-        # No special scalar_workaround or op_ignore_types needed here.
     )
     register_ad_pattern(
         search_fn=_fp8_ref_pattern_2,
         replace_fn=_fp8_ref_repl_2,
         patterns=patterns,
         dummy_args=dummy_args_fp8_2,
-        # No special scalar_workaround or op_ignore_types needed here.
     )
 
     # FP4 dummy args
     N = 32
     K_packed = 32  # weight is packed by 2 FP4 per byte
-    K_eff = 2 * K_packed  # <- effective K after repeat(1, 2) in the fake impl
+    K_eff = 2 * K_packed
 
-    x_fp4 = torch.randn(3, K_eff, device="meta", dtype=torch.float16)  # was 3 x 32, must be 3 x 64
+    x_fp4 = torch.randn(3, K_eff, device="meta", dtype=torch.float16)
     w_fp4 = torch.randint(0, 255, (N, K_packed), device="meta", dtype=torch.uint8)
 
     s_in2 = torch.tensor(0.01, device="meta", dtype=torch.float32)
     alpha = torch.tensor(1.2345, device="meta", dtype=torch.float32)
 
-    # Optional: give a realistic-length CUTLASS scale vector (one uint8 per 16-wide block)
-    # num_blocks = N * (K_eff // 16)
     cutlass_len = N * (K_eff // 16)  # 32 * (64/16) = 128
     cutlass_vec = torch.randint(0, 255, (cutlass_len,), device="meta", dtype=torch.uint8)
 
     dummy_args_fp4_1 = [
         x_fp4,
         w_fp4,
-        s_in2,  # input_scale list
+        s_in2,
         cutlass_vec,
-        alpha,  # weight_scale list: [per-block vec, alpha]
+        alpha,
     ]
 
     dummy_args_fp4_2 = [
         x_fp4,
         w_fp4,
         torch.randn(N, device="meta", dtype=torch.float16),  # bias
-        s_in2,  # input_scale list
+        s_in2,
         cutlass_vec,
-        alpha,  # weight_scale list: [per-block vec, alpha]
+        alpha,
     ]
 
     register_ad_pattern(
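
Editor's note: the FP4 dummy-argument shapes above follow directly from the uint8 packing. A short bookkeeping sketch using only values that appear in this diff:

    # Each uint8 weight byte holds two FP4 values, and the CUTLASS-style scale vector has
    # one entry per 16-element block, so the dummy shapes are mutually consistent:
    N = 32                        # output features
    K_packed = 32                 # packed uint8 weight columns
    K_eff = 2 * K_packed          # 64 effective input features (2 FP4 per byte)
    blocks_per_row = K_eff // 16  # 4 blocks of 16 FP4 values each
    cutlass_len = N * blocks_per_row
    assert cutlass_len == 128     # length of cutlass_vec in the dummy args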

tensorrt_llm/_torch/auto_deploy/transform/library/fusion.py

Lines changed: 1 addition & 3 deletions
@@ -115,11 +115,9 @@ class QuantizationFusionMixin:
         fused_buffers: Dict[str, Tensor] to register as buffers on the fused module
     """
 
-    # required class attributes in subclasses:
     target_op: Callable
     scale_groups: List[List[str]]
 
-    # required method in subclasses:
     def fuse_rule(
         self, weights: List[torch.Tensor], **scales
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
@@ -141,7 +139,7 @@ def split_output(tensor: torch.Tensor) -> Tuple[torch.Tensor, ...]:
             """Split the output tensor of the fused linear node to obtain the original outputs."""
             return tuple(t.contiguous() for t in torch.split(tensor, sizes_unfused, dim=-1))
 
-        # 2) Load scale buffers grouped by flattened scale names
+        # Load scale buffers grouped by flattened scale names
         flat_scale_names = list(chain.from_iterable(self.scale_groups))
         scales: Dict[str, List[torch.Tensor]] = {}
         for weight_key in keys_unfused:
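
Editor's note: with the explanatory comments removed, the subclass contract of QuantizationFusionMixin is carried only by the annotations above (`target_op`, `scale_groups`, `fuse_rule`). A hypothetical subclass might look roughly like this; the class name, scale-group names, and fusion rule below are illustrative assumptions, not code from this commit:

    from typing import Callable, Dict, List, Tuple
    import torch

    class FP8LinearFusionSketch(QuantizationFusionMixin):  # assumes the mixin from fusion.py is importable
        # fused op name taken from fuse_quant.py above; requires the auto_deploy ops to be registered
        target_op: Callable = torch.ops.auto_deploy.torch_quant_fp8_linear
        scale_groups: List[List[str]] = [["input_scale"], ["weight_scale"]]  # assumed buffer names

        def fuse_rule(
            self, weights: List[torch.Tensor], **scales
        ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
            # illustrative only: stack per-branch weights along the output dim and keep the
            # first scale of each group as the fused buffer
            fused_weight = torch.cat(weights, dim=0)
            fused_buffers = {name: group[0] for name, group in scales.items()}
            return fused_weight, fused_buffers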

tensorrt_llm/_torch/auto_deploy/transform/library/quantization.py

Lines changed: 1 addition & 3 deletions
@@ -79,7 +79,7 @@ def _insert_quantized_linear(
 
     custom_args = quantization_impl.build_custom_args_for_linear(scales)
 
-    node.target = quantization_impl.custom_op()
+    node.target = quantization_impl.target_op()
     node.args = (*node.args, *custom_args)
 
 
@@ -195,7 +195,6 @@ def _apply(
     impl = QuantizationImpl.create(quant_algo, is_bmm=False)
 
     for n in gm.graph.nodes:
-        # Only consider linear ops; skip if excluded
         if not is_linear_op(n, include_quantization=False):
             continue
         if should_skip_quantization(n, excluded):
@@ -236,7 +235,6 @@ def _apply(
     for n in gm.graph.nodes:
         if not is_bmm_op(n):
             continue
-        # Reuse common exclusion rule (supports Node or param-name string)
         if should_skip_quantization(n, excluded):
             continue
 
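
Editor's note: the `custom_op()` to `target_op()` rename does not change the rewrite itself; the FX node is simply pointed at the quantized custom op and the extra scale arguments are appended. A minimal standalone sketch of that step (the helper name and the lint/recompile calls are illustrative, not from this file):

    import torch.fx as fx

    def retarget_to_quantized(gm: fx.GraphModule, node: fx.Node, target_op, custom_args: tuple) -> None:
        # same two-line rewrite as _insert_quantized_linear in the diff above
        node.target = target_op
        node.args = (*node.args, *custom_args)
        gm.graph.lint()  # sanity-check the mutated graph
        gm.recompile()   # regenerate forward() so the new target takes effect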

tensorrt_llm/_torch/auto_deploy/utils/pattern_matcher.py

Lines changed: 3 additions & 11 deletions
@@ -34,17 +34,10 @@
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 
 
+# Copied from torch._dynamo.utils.detect_fake_mode but skip the same FakeMode assertion
+# In our use case, FakeMode of the inserted replacement pattern is different from the original
+# FakeMode from graph, which breaks this assertion
 def ad_detect_fake_mode(inputs: Any = None):
-    """
-    Attempts to "detect" what the current fake mode is. If there is one ambiently
-    available from TracingContext, we preferentially use that. Otherwise, we
-    heuristically detect the fake mode via the following sources, in order of
-    priority:
-
-    - Currently active fake mode on stack
-    - Fake mode associated with passed in tensors (inputs does not
-      have to be flattened)
-    """
     from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
 
     fake_modes = []
@@ -72,7 +65,6 @@ def ad_detect_fake_mode(inputs: Any = None):
     return None
 
 
-# Replace the function used as a context manager
 torch._dynamo.utils.detect_fake_mode = ad_detect_fake_mode
 
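
Editor's note: per the new comments, the upstream torch._dynamo.utils.detect_fake_mode gathers FakeTensorMode instances from ambient tracing state and from the (flattened) inputs and asserts they all agree; ad_detect_fake_mode drops that assertion because the replacement pattern's FakeMode can differ from the graph's. A simplified sketch of the input-scanning part only, using the public FakeTensor API (the actual function body is not shown in this diff):

    from typing import Any, Optional

    from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode
    from torch.utils._pytree import tree_flatten

    def detect_fake_mode_sketch(inputs: Any = None) -> Optional[FakeTensorMode]:
        flat_inputs, _ = tree_flatten(inputs)
        fake_modes = [t.fake_mode for t in flat_inputs if isinstance(t, FakeTensor)]
        # unlike the upstream helper, do not assert that every collected mode is identical;
        # just return the first one found (or None when no FakeTensor is present)
        return fake_modes[0] if fake_modes else None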
