
Commit dc74ec3

nvchenghaoz and Fridah-nv authored and committed
[Fix] Fix the torch backend (#108)
* attention matcher with torch._inductor pattern matcher, matching repeat kv, sdpa and group attention, update unit tests

  Signed-off-by: Frida Hou <[email protected]>

* Fix the torch backend Attention

  Signed-off-by: nvchenghaoz <[email protected]>

* Revert "attention matcher with torch._inductor pattern matcher, matching repeat kv, sdpa and group attention, update unit tests"

  This reverts commit 5743fb3.

---------

Signed-off-by: Frida Hou <[email protected]>
Signed-off-by: nvchenghaoz <[email protected]>
Co-authored-by: Frida Hou <[email protected]>
1 parent ad5fd3b commit dc74ec3

File tree

1 file changed: +6, -8 lines changed


tensorrt_llm/_torch/auto_deploy/custom_ops/torch_backend_attention.py

Lines changed: 6 additions & 8 deletions
@@ -103,7 +103,7 @@ def _torch_generate_mha(
     # Apply sinks if provided (following the model file pattern)
     if sinks is not None:
         # Concatenate sinks to attention scores
-        sinks = sinks.reshape(-1, 1, 1).expand(-1, attn_scores.shape[-2], -1)
+        sinks = sinks.reshape(-1, 1, 1)
         attn_weights = torch.cat([attn_scores, sinks], dim=-1)
         attn_weights = torch.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
         # Use only the non-sink portion for computing output (ignore sinks)
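For context, a minimal standalone sketch of the sink pattern this hunk touches in the generate path. The shapes (attn_scores as [n_heads, 1, kv_len], sinks as [n_heads]) and the v tensor are assumptions for illustration, not the file's actual signature; the point is that torch.cat only needs the non-concatenated dims to line up, so with a single query token the reshape alone suffices and the dropped expand over the query dimension was a no-op.

import torch

n_heads, kv_len, v_head_dim = 8, 16, 4
attn_scores = torch.randn(n_heads, 1, kv_len)   # assumed: one query token per head
sinks = torch.randn(n_heads)                    # assumed: one sink logit per head
v = torch.randn(n_heads, kv_len, v_head_dim)    # assumed value tensor

# Append one extra "sink" logit column per head before the softmax.
sink_col = sinks.reshape(-1, 1, 1)                          # [n_heads, 1, 1]
attn_weights = torch.cat([attn_scores, sink_col], dim=-1)   # [n_heads, 1, kv_len + 1]
attn_weights = torch.softmax(attn_weights, dim=-1)

# The sink column absorbs probability mass but is dropped for the output.
attn_out = torch.matmul(attn_weights[..., :-1], v)          # [n_heads, 1, v_head_dim]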
@@ -202,9 +202,7 @@ def _torch_context_mha(
         ) # [seq_len_i, kv_seq_len]

         # Sliding window mask: allow attention only if 0 <= pos_diff < sliding_window_size
-        sliding_window_mask = (pos_diff < 0) | (
-            pos_diff >= sliding_window_size
-        ) # [seq_len_i, kv_seq_len]
+        sliding_window_mask = pos_diff >= sliding_window_size

         # Combine causal and sliding window masks
         combined_mask = causal_mask | sliding_window_mask
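As a quick illustration of why the pos_diff < 0 term could be dropped: negative offsets (future keys) are already excluded by the causal mask, so the sliding-window check only needs the upper bound. A self-contained sketch with assumed position tensors (the real code derives positions from the KV-cache layout):

import torch

seq_len_i, kv_seq_len, sliding_window_size = 4, 6, 3
q_pos = torch.arange(kv_seq_len - seq_len_i, kv_seq_len).unsqueeze(1)  # [seq_len_i, 1]
k_pos = torch.arange(kv_seq_len).unsqueeze(0)                          # [1, kv_seq_len]
pos_diff = q_pos - k_pos                                               # [seq_len_i, kv_seq_len]

causal_mask = pos_diff < 0                              # mask future keys
sliding_window_mask = pos_diff >= sliding_window_size   # mask keys outside the window
combined_mask = causal_mask | sliding_window_mask

attn_scores = torch.randn(seq_len_i, kv_seq_len)
attn_scores = attn_scores.masked_fill(combined_mask, float("-inf"))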
@@ -219,14 +217,14 @@ def _torch_context_mha(
         # Apply sinks if provided (following the model file pattern)
         if sinks is not None:
             # Concatenate sinks to attention scores
-            sinks = sinks.reshape(1, -1, 1, 1).expand(
-                attn_scores.shape[0], -1, attn_scores.shape[-2], -1
+            new_sinks = sinks.reshape(1, -1, 1, 1).expand(
+                attn_scores.shape[0], -1, attn_scores.shape[2], 1
             )
-            attn_weights = torch.cat([attn_scores, sinks], dim=-1)
+            attn_weights = torch.cat([attn_scores, new_sinks], dim=-1)
             attn_weights = torch.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
             # Use only the non-sink portion for computing output (ignore sinks)
             attn_out = torch.matmul(
-                attn_weights[..., : -sinks.size(-1)], v_seq_t
+                attn_weights[..., : -new_sinks.size(-1)], v_seq_t
             ) # [1, n_heads, seq_len_i, v_head_dim]
         else:
             attn_weights = torch.softmax(attn_scores, dim=-1, dtype=torch.float32).to(q.dtype)
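The context-phase hunk is the same pattern in 4D: the expanded tensor is renamed to new_sinks so the original sinks argument is no longer reassigned, and exactly new_sinks.size(-1) (one) column is sliced off before the value matmul. A hedged standalone sketch follows; the shapes [1, n_heads, seq_len_i, kv_seq_len] for attn_scores and [1, n_heads, kv_seq_len, v_head_dim] for v_seq_t are assumptions inferred from the inline comments, not taken from the file.

import torch

n_heads, seq_len_i, kv_seq_len, v_head_dim = 8, 4, 6, 16
attn_scores = torch.randn(1, n_heads, seq_len_i, kv_seq_len)  # assumed shape
sinks = torch.randn(n_heads)                                  # assumed: one sink logit per head
v_seq_t = torch.randn(1, n_heads, kv_seq_len, v_head_dim)     # assumed value layout

# One sink column per head, broadcast over batch and query positions;
# the last dim stays 1 so only a single extra score column is appended.
new_sinks = sinks.reshape(1, -1, 1, 1).expand(
    attn_scores.shape[0], -1, attn_scores.shape[2], 1
)                                                           # [1, n_heads, seq_len_i, 1]
attn_weights = torch.cat([attn_scores, new_sinks], dim=-1)  # [..., kv_seq_len + 1]
attn_weights = torch.softmax(attn_weights, dim=-1, dtype=torch.float32)

# Drop the sink column again before multiplying with the values.
attn_out = torch.matmul(
    attn_weights[..., : -new_sinks.size(-1)], v_seq_t
)                                                           # [1, n_heads, seq_len_i, v_head_dim]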
