Commit b5e383c

[gpt-oss] raise error for flashinfer backend without trtllm (vllm-project#24482)
Signed-off-by: Chen Zhang <[email protected]>
1 parent: 9a16130

File tree: 1 file changed, +10 -2 lines

vllm/v1/attention/backends/flashinfer.py

Lines changed: 10 additions & 2 deletions
@@ -216,7 +216,11 @@ def __init__(self, kv_cache_spec: AttentionSpec, layer_names: list[str],
         self.window_left = self.global_hyperparameters.window_left
         self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
         self.has_sinks = self.global_hyperparameters.has_sinks
-
+        if self.has_sinks and not supports_trtllm_attention()[0]:
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         # Preparing persistent buffers (device-side)
         self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                            dtype=torch.int32,
@@ -408,7 +412,11 @@ def build(self,
            self.q_data_type,
            is_prefill=False,
            has_sinks=self.has_sinks)
-
+        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
             q_data_type=self.q_data_type,
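
Both hunks apply the same fail-fast pattern: detect an unsupported combination (attention sinks requested while TRT-LLM attention kernels are unavailable) up front and raise `NotImplementedError` with an actionable message, rather than letting attention run without the sink terms. The sketch below illustrates that pattern in isolation; `supports_trtllm_attention` here is a hypothetical stand-in stub (it only mimics the tuple shape implied by `supports_trtllm_attention()[0]` in the diff), and `SinkAwareBackend` is not a vLLM class.

```python
# Minimal sketch of the capability-guard pattern used in this commit.
# supports_trtllm_attention() below is a hypothetical stand-in; the real
# vLLM helper inspects the GPU and environment, which is not reproduced here.
from typing import Optional


def supports_trtllm_attention() -> tuple[bool, Optional[str]]:
    """Stand-in: report whether TRT-LLM attention kernels can be used."""
    # Hard-coded "unsupported" for illustration (e.g. a pre-Blackwell GPU).
    return False, "pre-Blackwell GPU"


class SinkAwareBackend:
    """Toy backend that refuses configurations it cannot serve correctly."""

    def __init__(self, has_sinks: bool) -> None:
        self.has_sinks = has_sinks
        supported, reason = supports_trtllm_attention()
        if self.has_sinks and not supported:
            # Fail fast at construction time instead of silently computing
            # attention without the sink terms.
            raise NotImplementedError(
                "FlashInfer backend currently does not support attention "
                "sinks, please use trtllm on blackwell or flash attention "
                f"on earlier GPUs (reason: {reason}).")


if __name__ == "__main__":
    try:
        SinkAwareBackend(has_sinks=True)
    except NotImplementedError as exc:
        print(f"caught expected error: {exc}")
```

The commit also repeats the check in `build()` because the prefill/decode TRT-LLM decision is made per batch there, so a configuration that passed the constructor-time check can still need to be rejected at metadata-build time.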
