
Commit 1daa8c3

[https://nvbugs/5340941][https://nvbugs/5375785] - fix: Wrap attentio… (#6355)
Signed-off-by: Jin Li <[email protected]>
1 parent f39d621 commit 1daa8c3

16 files changed (+412, -455 lines changed)


tensorrt_llm/_torch/attention_backend/flashinfer.py

Lines changed: 88 additions & 129 deletions
@@ -478,6 +478,82 @@ def update_quant_config(self, new_quant_config: Optional[QuantConfig]):
         self.has_fp8_kv_cache = self.quant_config.layer_quant_mode.has_fp8_kv_cache(
         )
 
+    def forward_impl(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        metadata: FlashInferAttentionMetadata,
+        attention_mask_type: int,
+        output: torch.Tensor,
+        attention_mask_data: Optional[torch.Tensor] = None,
+        attention_window_size: Optional[int] = None,
+    ) -> None:
+        # Query
+        q = q.view(-1, self.num_heads, self.head_dim)
+
+        # Key and Value
+        kv_cache = metadata.kv_cache_manager.get_buffers(self.layer_idx)
+
+        if k is not None and v is not None:
+            k = k.view(-1, self.num_kv_heads, self.head_dim)
+            v = v.view(-1, self.num_kv_heads, self.head_dim)
+
+            if self.has_fp8_kv_cache:
+                assert kv_cache.dtype == torch.float8_e4m3fn, (
+                    f"KV cache should have fp8 dtype, but get {kv_cache.dtype}")
+                k = k.to(torch.float8_e4m3fn)
+                v = v.to(torch.float8_e4m3fn)
+            assert k.dtype == v.dtype == kv_cache.dtype, (
+                f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}"
+            )
+
+            flashinfer.page.append_paged_kv_cache(
+                append_key=k,
+                append_value=v,
+                batch_indices=metadata.batch_indices,
+                positions=metadata.positions,
+                paged_kv_cache=kv_cache,
+                kv_indices=metadata.paged_kv_indices,
+                kv_indptr=metadata.paged_kv_indptr,
+                kv_last_page_len=metadata.paged_kv_last_page_len,
+                kv_layout=metadata.kv_layout)
+
+        num_contexts = metadata.num_contexts
+        num_generations = metadata.num_generations
+        num_ctx_tokens = metadata.num_ctx_tokens
+
+        def prefill_forward(plan_params: PlanParams, out: torch.Tensor):
+            wrapper = metadata.get_prefill_wrapper(plan_params)
+            wrapper.run(q[:num_ctx_tokens],
+                        kv_cache,
+                        out=out.view(-1, self.num_heads, self.head_dim))
+
+        def decode_forward(plan_params: PlanParams, out: torch.Tensor):
+            wrapper = metadata.get_decode_wrapper(plan_params)
+            wrapper.run(q[num_ctx_tokens:],
+                        kv_cache,
+                        out=out.view(-1, self.num_heads, self.head_dim))
+
+        # this will do nothing if the last forward pass had the same parameters
+        plan_params = metadata.plan(self.num_heads,
+                                    self.num_kv_heads,
+                                    self.head_dim,
+                                    q_dtype=q.dtype,
+                                    kv_dtype=kv_cache.dtype,
+                                    q_scaling=self.q_scaling,
+                                    attention_window_size=attention_window_size,
+                                    attention_mask_type=attention_mask_type,
+                                    attention_mask_data=attention_mask_data)
+
+        if num_contexts == 0:
+            decode_forward(plan_params, output)
+        elif num_generations == 0:
+            prefill_forward(plan_params, output)
+        else:
+            prefill_forward(plan_params, output[:num_ctx_tokens, :])
+            decode_forward(plan_params, output[num_ctx_tokens:, :])
+
     def forward(self,
                 q: torch.Tensor,
                 k: Optional[torch.Tensor],
@@ -487,6 +563,7 @@ def forward(self,
                 attention_window_size: Optional[int] = None,
                 attention_mask: AttentionMask = PredefinedAttentionMask.CAUSAL,
                 attention_mask_data: Optional[torch.Tensor] = None,
+                output: Optional[torch.Tensor] = None,
                 **kwargs) -> torch.Tensor:
         if attention_mask == CustomAttentionMask.CUSTOM:
             assert attention_mask_data is not None, "attention_mask_data is required for custom attention mask."
@@ -502,133 +579,15 @@ def forward(self,
         else:
             raise ValueError("Unexpected attention mask type")
 
-        return forward_pattern(q=q,
-                               k=k,
-                               v=v,
-                               num_heads=self.num_heads,
-                               head_dim=self.head_dim,
-                               num_kv_heads=self.num_kv_heads,
-                               layer_idx=self.layer_idx,
-                               has_fp8_kv_cache=self.has_fp8_kv_cache,
-                               attention_mask_type=attention_mask_type,
-                               q_scaling=self.q_scaling,
-                               attention_mask_data=attention_mask_data,
-                               attention_window_size=attention_window_size)
-
-
-@torch.library.custom_op("trtllm::flashinfer_forward", mutates_args=())
-def forward_pattern(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    num_heads: int,
-    head_dim: int,
-    num_kv_heads: int,
-    layer_idx: int,
-    has_fp8_kv_cache: bool,
-    attention_mask_type: int,
-    q_scaling: Optional[float] = None,
-    attention_mask_data: Optional[torch.Tensor] = None,
-    attention_window_size: Optional[int] = None,
-) -> torch.Tensor:
-    '''
-    Wrapping the flashinfer forward as a custom op is required to fix `torch.compile` graph breaks,
-    otherwise it will graph break when calling `metadata.num_contexts` since it convert tensor's sum directly to int.
-    '''
-    # torch.compile does not support custom object as arguments, so we have to use global function to get the metadata.
-    extra_attrs = get_model_extra_attrs()
-    if extra_attrs is not None:
-        metadata_ref = extra_attrs.get("attention_metadata", None)
-        metadata = metadata_ref() if metadata_ref is not None else None
-    else:
-        metadata = get_global_attrs().attention_metadata()
-
-    assert isinstance(
-        metadata,
-        FlashInferAttentionMetadata,
-    )
-
-    # Query
-    q = q.view(-1, num_heads, head_dim)
-
-    # Key and Value
-    kv_cache = metadata.kv_cache_manager.get_buffers(layer_idx)
-
-    if k is not None and v is not None:
-        k = k.view(-1, num_kv_heads, head_dim)
-        v = v.view(-1, num_kv_heads, head_dim)
-
-        if has_fp8_kv_cache:
-            assert kv_cache.dtype == torch.float8_e4m3fn, f"KV cache should have fp8 dtype, but get {kv_cache.dtype}"
-            k = k.to(torch.float8_e4m3fn)
-            v = v.to(torch.float8_e4m3fn)
-        assert k.dtype == v.dtype == kv_cache.dtype, f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}"
-
-        flashinfer.page.append_paged_kv_cache(
-            append_key=k,
-            append_value=v,
-            batch_indices=metadata.batch_indices,
-            positions=metadata.positions,
-            paged_kv_cache=kv_cache,
-            kv_indices=metadata.paged_kv_indices,
-            kv_indptr=metadata.paged_kv_indptr,
-            kv_last_page_len=metadata.paged_kv_last_page_len,
-            kv_layout=metadata.kv_layout)
-
-    num_contexts = metadata.num_contexts
-    num_generations = metadata.num_generations
-    num_ctx_tokens = metadata.num_ctx_tokens
-
-    def prefill_forward(plan_params: PlanParams):
-        wrapper = metadata.get_prefill_wrapper(plan_params)
-        output = wrapper.run(q[:num_ctx_tokens], kv_cache)
-        output = output.view(num_ctx_tokens, -1)
-        return output
-
-    def decode_forward(plan_params: PlanParams):
-        wrapper = metadata.get_decode_wrapper(plan_params)
-        output = wrapper.run(q[num_ctx_tokens:], kv_cache)
-        output = output.view(num_generations, -1)
+        if output is None:
+            output = torch.empty_like(q)
+
+        self.forward_impl(q=q,
+                          k=k,
+                          v=v,
+                          metadata=metadata,
+                          attention_mask_type=attention_mask_type,
+                          attention_mask_data=attention_mask_data,
+                          attention_window_size=attention_window_size,
+                          output=output)
         return output
-
-    # this will do nothing if the last forward pass had the same parameters
-    plan_params = metadata.plan(num_heads,
-                                num_kv_heads,
-                                head_dim,
-                                q_dtype=q.dtype,
-                                kv_dtype=kv_cache.dtype,
-                                q_scaling=q_scaling,
-                                attention_window_size=attention_window_size,
-                                attention_mask_type=attention_mask_type,
-                                attention_mask_data=attention_mask_data)
-
-    if num_contexts > 0:
-        ctx_output = prefill_forward(plan_params)
-
-    if num_generations > 0:
-        gen_output = decode_forward(plan_params)
-
-    if num_contexts > 0 and num_generations > 0:
-        output = torch.cat([ctx_output, gen_output], dim=0)
-    elif num_contexts > 0:
-        output = ctx_output
-    elif num_generations > 0:
-        output = gen_output
-
-    return output
-
-
-@forward_pattern.register_fake
-def _(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    num_heads: int,
-    head_dim: int,
-    num_kv_heads: int,
-    layer_idx: int,
-    has_fp8_kv_cache: bool,
-    attention_mask_type: int,
-    attention_mask_data: Optional[torch.Tensor],
-):
-    return torch.empty_like(q)
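
Note on the change above: the deleted module-level `trtllm::flashinfer_forward` custom op returned a freshly allocated tensor, while the new path preallocates `output` in `forward` and lets `forward_impl` write into it; the compilation passes further down now track `trtllm::attn_custom_op_inplace` instead. A minimal sketch of that general in-place custom-op pattern (the `demo::attn_inplace` name and the plain-SDPA kernel are illustrative stand-ins, not the TensorRT-LLM implementation):

```python
from typing import Optional

import torch


# Illustrative only: expose an attention kernel to torch.compile as an op that
# mutates a caller-provided `output` buffer instead of returning a new tensor.
@torch.library.custom_op("demo::attn_inplace", mutates_args=("output", ))
def attn_inplace(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                 output: torch.Tensor) -> None:
    # Stand-in eager kernel: plain SDPA written into the preallocated buffer.
    output.copy_(torch.nn.functional.scaled_dot_product_attention(q, k, v))


@attn_inplace.register_fake
def _(q, k, v, output) -> None:
    # Nothing to allocate at trace time; the op only writes into `output`.
    return None


def forward(q, k, v, output: Optional[torch.Tensor] = None) -> torch.Tensor:
    if output is None:
        output = torch.empty_like(q)
    attn_inplace(q, k, v, output)
    return output
```

With this shape the compiler only needs to know which argument the op mutates (compare the `inplace_info` hunk below), and no output allocation has to be described in the fake registration.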

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 40 additions & 32 deletions
@@ -2,7 +2,7 @@
 import os
 import weakref
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Optional, Tuple, Union
 
 import torch
 
@@ -11,8 +11,8 @@
 from tensorrt_llm.logger import logger
 from tensorrt_llm.models.modeling_utils import QuantConfig
 
-from ..utils import (Fp4QuantizedTensor, compute_swizzled_sf_shape,
-                     get_global_attrs, get_model_extra_attrs)
+from ..utils import (compute_swizzled_sf_shape, get_global_attrs,
+                     get_model_extra_attrs)
 from .interface import (AttentionBackend, AttentionInputType, AttentionMask,
                         AttentionMetadata, KVCacheParams, MLAParams,
                         PositionalEmbeddingParams, PredefinedAttentionMask,
@@ -263,6 +263,35 @@ def plan(
         self.spec_decoding_generation_lengths = spec_decoding_generation_lengths
         self.kwargs.update(kwargs)
 
+    def create_output(self, q: torch.Tensor, out_dtype: torch.dtype):
+        num_tokens = q.size(0)
+        attention_input_type = (AttentionInputType(self.attention_input_type)
+                                if self.attention_input_type is not None else
+                                AttentionInputType.mixed)
+        if out_dtype is None:
+            out_dtype = q.dtype
+        is_gen_only = attention_input_type == AttentionInputType.generation_only
+        v_head_size = self.head_size
+        if self.is_mla_enable:
+            v_head_size = self.kv_lora_rank if is_gen_only else self.v_head_dim
+        if out_dtype == torch.uint8:
+            num_nvfp4_elements_per_container = 2
+            scaling_vector_size = 16
+            size_per_token = self.num_heads * v_head_size
+            output = q.new_empty(
+                (num_tokens,
+                 size_per_token // num_nvfp4_elements_per_container),
+                dtype=torch.uint8)
+            # Create a sf (scaling factors) tensor for NVFP4 (use INT8 as the container dtype).
+            output_sf = q.new_empty(compute_swizzled_sf_shape(
+                num_tokens, size_per_token // scaling_vector_size),
+                                    dtype=torch.uint8)
+        else:
+            output = q.new_empty((num_tokens, self.num_heads * v_head_size),
+                                 dtype=out_dtype)
+            output_sf = None
+        return output, output_sf
+
     def run(
         self,
         q: torch.Tensor,
@@ -361,30 +390,7 @@ def run(
 
         if output is None:
             assert output_sf is None
-            num_tokens = q.size(0)
-            attention_input_type = (AttentionInputType(
-                self.attention_input_type) if self.attention_input_type
-                                    is not None else AttentionInputType.mixed)
-            if out_dtype is None:
-                out_dtype = q.dtype
-            is_gen_only = attention_input_type == AttentionInputType.generation_only
-            v_head_size = self.head_size if not self.is_mla_enable else self.kv_lora_rank if is_gen_only else self.v_head_dim
-            if out_dtype == torch.uint8:
-                num_nvfp4_elements_per_container = 2
-                scaling_vector_size = 16
-                size_per_token = self.num_heads * v_head_size
-                output = q.new_empty(
-                    (num_tokens,
-                     size_per_token // num_nvfp4_elements_per_container),
-                    dtype=torch.uint8)
-                # Create a sf (scaling factors) tensor for NVFP4 (use INT8 as the container dtype).
-                output_sf = q.new_empty(compute_swizzled_sf_shape(
-                    num_tokens, size_per_token // scaling_vector_size),
-                                        dtype=torch.uint8)
-            else:
-                output = q.new_empty((num_tokens, self.num_heads * v_head_size),
-                                     dtype=out_dtype)
-                output_sf = None
+            output, output_sf = self.create_output(q, out_dtype)
         else:
             # output is provided, expect output_sf be provided as well if has NVFP4 output.
             assert out_dtype is None or out_dtype != torch.uint8 or output_sf is not None
@@ -1089,10 +1095,11 @@ def forward(
         mla_context_paged_kv: Optional[torch.Tensor] = None,
         mla_context_kv_cache_block_offsets: Optional[torch.Tensor] = None,
         softmax_stats_tensor: Optional[torch.Tensor] = None,
+        enable_attn_nvfp4_output: bool = True,
         output: Optional[torch.Tensor] = None,
        output_sf: Optional[torch.Tensor] = None,
         **kwargs,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]:
         assert isinstance(
             metadata,
             TrtllmAttentionMetadata,
@@ -1111,7 +1118,8 @@ def forward(
                                   metadata)
 
         use_nvfp4_output = False
-        if self.has_nvfp4 and self.support_nvfp4_output():
+        if enable_attn_nvfp4_output and self.has_nvfp4 and self.support_nvfp4_output(
+        ):
             # Runtime check whether the NVFP4 output kernel is available.
             use_nvfp4_output = self.wrapper.is_nvfp4_output_kernel_available(
                 tokens_per_block=metadata.tokens_per_block,
@@ -1184,9 +1192,9 @@ def forward(
             update_kv_cache=not metadata.is_cross or k is not None,
             attention_mask=attention_mask)
 
-        if out_dtype == torch.uint8:
-            assert output_sf is not None
-            return Fp4QuantizedTensor(output, output_sf)
+        if use_nvfp4_output:
+            return output, output_sf
+
         return output
 
     @classmethod
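
For reference, the NVFP4 branch of the new `create_output` helper packs two 4-bit values into each `uint8` container and allocates one scale factor per 16 elements. A standalone sketch of that shape math, ignoring the swizzling that `compute_swizzled_sf_shape` applies on top (the helper below is illustrative, not part of the codebase):

```python
def nvfp4_output_shapes(num_tokens: int, num_heads: int, v_head_size: int):
    """Unswizzled shape math behind create_output when out_dtype is torch.uint8."""
    num_nvfp4_elements_per_container = 2  # two FP4 values per uint8 byte
    scaling_vector_size = 16              # one scale factor per 16 FP4 elements
    size_per_token = num_heads * v_head_size
    packed = (num_tokens, size_per_token // num_nvfp4_elements_per_container)
    scale_factors = (num_tokens, size_per_token // scaling_vector_size)
    return packed, scale_factors


# e.g. 32 heads x 128 head size = 4096 elements per token
# -> (8, 2048) packed uint8 payload and (8, 256) scale factors before swizzling
print(nvfp4_output_shapes(num_tokens=8, num_heads=32, v_head_size=128))
```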

tensorrt_llm/_torch/compilation/piecewise_optimizer.py

Lines changed: 2 additions & 3 deletions
@@ -246,17 +246,16 @@ def piecewise_optimizer(
         if node.op in ("output", "placeholder"):
             continue
         if (not stop_partition and is_call_function(node, [
-                torch.ops.trtllm.attention_inplace.default,
+                torch.ops.trtllm.attn_custom_op_inplace.default,
                 torch.ops.trtllm.mla_custom_op_inplace.default,
                 torch.ops.aten.index.Tensor,
                 torch.ops.aten.cumsum.default,
         ])):
             idx += 1
             node_to_graph_id[node] = idx
             exclude_modules_id.append(idx)
-            if node.target != torch.ops.trtllm.attention_inplace.default and node.target != torch.ops.trtllm.mla_custom_op_inplace.default:
+            if node.target != torch.ops.trtllm.attn_custom_op_inplace.default and node.target != torch.ops.trtllm.mla_custom_op_inplace.default:
                 # We only know it is safe to continue splitting after attention
-                # since attention_inplace will not produce any new tensor
                 stop_partition = True
         else:
             idx += 1

tensorrt_llm/_torch/compilation/utils.py

Lines changed: 1 addition & 2 deletions
@@ -49,9 +49,8 @@ def inplace_info():
             1: "input",
             2: "residual"
         },
-        torch.ops.trtllm.attention_inplace.default: {
+        torch.ops.trtllm.attn_custom_op_inplace.default: {
             1: "output",
-            2: "output_sf"
         },
         torch.ops.trtllm.mla_custom_op_inplace.default: {
             1: "output"
