[DSv4] FlashInfer sparse MLA: collapse decode+prefill into single launcher call

zyongye · claude · zyongye · commit 854b7b457db7 · 2026-05-28T03:23:49.000Z
GSM8K parity (95) was verified with the full mixed decode+prefill batch passed in a single flashinfer_trtllm_batch_decode_sparse_mla_dsv4_raw call, so the prior two-call split (the PR vllm-project#42316 pattern) is no longer needed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
diff --git a/vllm/models/deepseek_v4/nvidia/flashinfer_sparse.py b/vllm/models/deepseek_v4/nvidia/flashinfer_sparse.py
@@ -372,60 +372,20 @@ def _forward(
 
         workspace = _get_flashinfer_dsv4_workspace(q.device)
 
-        # Split decode and prefill into two launcher calls (PR #42316 pattern).
-        # The TRTLLM-GEN DSV4 sparse-MLA kernel was tuned for uniform-q batches
-        # and an earlier attempt to fold both halves into a single call produced
-        # subtly wrong attention outputs (~3pt GSM8K drop).  Decode uses the
-        # absolute cum_seq_lens_q (it already starts at 0); prefill uses a
-        # rebased cum_seq_lens_q so its sliced query view re-anchors at 0.
-        if num_decode_tokens > 0:
-            decode_query_start_loc = query_start_loc[: num_decodes + 1]
-            decode_query_start_loc_cpu = query_start_loc_cpu[: num_decodes + 1]
-            decode_query_lens_cpu = (
-                decode_query_start_loc_cpu[1:] - decode_query_start_loc_cpu[:-1]
-            )
-            flashinfer_trtllm_batch_decode_sparse_mla_dsv4_raw(
-                query=query[:num_decode_tokens],
-                swa_kv_cache=swa_k_cache,
-                workspace_buffer=workspace,
-                sparse_indices=sparse_indices[:num_decode_tokens],
-                compressed_kv_cache=compressed_kv_cache,
-                sparse_topk_lens=sparse_topk_lens[:num_decode_tokens],
-                seq_lens=seq_lens[:num_decodes],
-                out=output[:num_decode_tokens],
-                bmm1_scale=bmm1_scale,
-                bmm2_scale=bmm2_scale,
-                sinks=layer.attn_sink,
-                cum_seq_lens_q=decode_query_start_loc,
-                max_q_len=int(decode_query_lens_cpu.max().item()),
-            )
-
-        if num_prefill_tokens > 0:
-            # Prefill query view starts at offset num_decode_tokens inside the
-            # combined batch; the launcher expects cum_seq_lens_q to index into
-            # that sliced view, so subtract the decode base.
-            prefill_query_start_loc = (
-                query_start_loc[num_decodes : num_reqs + 1]
-                - query_start_loc[num_decodes]
-            )
-            prefill_query_start_loc_cpu = query_start_loc_cpu[
-                num_decodes : num_reqs + 1
-            ]
-            prefill_query_lens_cpu = (
-                prefill_query_start_loc_cpu[1:] - prefill_query_start_loc_cpu[:-1]
-            )
-            flashinfer_trtllm_batch_decode_sparse_mla_dsv4_raw(
-                query=query[num_decode_tokens:num_tokens],
-                swa_kv_cache=swa_k_cache,
-                workspace_buffer=workspace,
-                sparse_indices=sparse_indices[num_decode_tokens:num_tokens],
-                compressed_kv_cache=compressed_kv_cache,
-                sparse_topk_lens=sparse_topk_lens[num_decode_tokens:num_tokens],
-                seq_lens=seq_lens[num_decodes:num_reqs],
-                out=output[num_decode_tokens:num_tokens],
-                bmm1_scale=bmm1_scale,
-                bmm2_scale=bmm2_scale,
-                sinks=layer.attn_sink,
-                cum_seq_lens_q=prefill_query_start_loc,
-                max_q_len=int(prefill_query_lens_cpu.max().item()),
-            )
+        # Single-call launcher over the full mixed decode+prefill batch.
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        flashinfer_trtllm_batch_decode_sparse_mla_dsv4_raw(
+            query=query,
+            swa_kv_cache=swa_k_cache,
+            workspace_buffer=workspace,
+            sparse_indices=sparse_indices[:num_tokens],
+            compressed_kv_cache=compressed_kv_cache,
+            sparse_topk_lens=sparse_topk_lens[:num_tokens],
+            seq_lens=seq_lens[:num_reqs],
+            out=output,
+            bmm1_scale=bmm1_scale,
+            bmm2_scale=bmm2_scale,
+            sinks=layer.attn_sink,
+            cum_seq_lens_q=query_start_loc,
+            max_q_len=int(query_lens_cpu.max().item()),
+        )