
Commit fa4311d

Authored by nopperl, Alnusjaponica, and tdoublep
[V1] v1 engine + full CUDA graph support for PLaMo2 (vllm-project#23998)
Signed-off-by: Hemmi Shinichi <[email protected]>
Signed-off-by: nopperl <[email protected]>
Co-authored-by: Hemmi Shinichi <[email protected]>
Co-authored-by: Thomas Parnell <[email protected]>
1 parent 6d80ae8 commit fa4311d

6 files changed (+350, -126 lines)

docs/models/supported_models.md
Lines changed: 1 addition & 1 deletion

@@ -395,7 +395,7 @@ th {
 | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | |
 | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ |
-| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | |
+| `Plamo2ForCausalLM` | PLaMo2 | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc. | | ✅︎ | ✅︎ |
 | `QWenLMHeadModel` | Qwen | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2ForCausalLM` | QwQ, Qwen2 | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. | ✅︎ | ✅︎ | ✅︎ |
 | `Qwen2MoeForCausalLM` | Qwen2MoE | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. | ✅︎ | ✅︎ | ✅︎ |
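With the V1 column now checked, PLaMo2 can be exercised on the V1 engine via standard offline inference. A minimal sketch (the explicit `VLLM_USE_V1` opt-in and the sampling settings are illustrative assumptions, not part of this commit):

```python
# Sketch: run PLaMo2 on the vLLM V1 engine (assumes a CUDA-capable GPU).
import os

os.environ["VLLM_USE_V1"] = "1"  # opt in to the V1 engine explicitly

from vllm import LLM, SamplingParams

llm = LLM(model="pfnet/plamo-2-1b", trust_remote_code=True)
params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["PLaMo2 is a hybrid Mamba/attention model that"], params)
print(outputs[0].outputs[0].text)
```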

docs/usage/v1_guide.md
Lines changed: 1 addition & 1 deletion

@@ -110,7 +110,7 @@ Models using selective state-space mechanisms instead of standard transformer at
 Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.
 
 Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`, `Plamo2ForCausalLM`).
 
 Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).

tests/models/language/generation/test_hybrid.py
Lines changed: 3 additions & 2 deletions

@@ -25,8 +25,7 @@
 
 HYBRID_MODELS = [
     "ai21labs/Jamba-tiny-dev",
-    # skipping until vLLM implementation issues are resolved
-    # "pfnet/plamo-2-1b",
+    "pfnet/plamo-2-1b",
     "Zyphra/Zamba2-1.2B-instruct",
     "hmellor/tiny-random-BambaForCausalLM",
     "ibm-granite/granite-4.0-tiny-preview",
@@ -37,6 +36,7 @@
 V1_SUPPORTED_MODELS = [
     "state-spaces/mamba-130m-hf",
     "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
     "yujiepan/mamba2-codestral-v0.1-tiny-random",
     "Zyphra/Zamba2-1.2B-instruct",
     "hmellor/tiny-random-BambaForCausalLM",
@@ -47,6 +47,7 @@
 
 FULL_CUDA_GRAPH_MODELS = [
     "ai21labs/Jamba-tiny-dev",
+    "pfnet/plamo-2-1b",
     "Zyphra/Zamba2-1.2B-instruct",
 ]
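The new `FULL_CUDA_GRAPH_MODELS` entry means the suite also runs PLaMo2 with full CUDA graph capture and compares it against eager execution. A rough sketch of what such a check looks like (the `full_cuda_graph` knob name and the exact comparison are assumptions; the real test uses the suite's runner fixtures and a logprob-closeness check):

```python
# Sketch: the shape of a full-CUDA-graph smoke test for a hybrid model.
# Assumption: compilation_config accepts a "full_cuda_graph" flag; the exact
# knob may differ across vLLM versions. In practice each engine would run in
# its own process to free GPU memory between the two configurations.
from vllm import LLM, SamplingParams

MODEL = "pfnet/plamo-2-1b"  # newly added to FULL_CUDA_GRAPH_MODELS above
params = SamplingParams(temperature=0.0, max_tokens=32)
prompts = ["The capital of Japan is"]

baseline = LLM(model=MODEL, trust_remote_code=True, enforce_eager=True)
expected = baseline.generate(prompts, params)[0].outputs[0].text

graphed = LLM(model=MODEL, trust_remote_code=True,
              compilation_config={"full_cuda_graph": True})
actual = graphed.generate(prompts, params)[0].outputs[0].text

# The real suite compares logprobs for closeness rather than exact strings.
assert actual == expected
```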
tests/models/registry.py
Lines changed: 0 additions & 2 deletions

@@ -287,8 +287,6 @@ def check_available_online(
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         max_transformers_version="4.53",
-                                         transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings",  # noqa: E501
                                          trust_remote_code=True),
     "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                        max_transformers_version="4.53",

vllm/config/compilation.py
Lines changed: 1 addition & 0 deletions

@@ -340,6 +340,7 @@ class CompilationConfig:
         "vllm.mamba_mixer",
         "vllm.short_conv",
         "vllm.linear_attention",
+        "vllm.plamo2_mamba_mixer",
     ]
 
     def compute_hash(self) -> str:
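`splitting_ops` is the list of custom ops at which vLLM cuts the piecewise-compiled graph; registering `vllm.plamo2_mamba_mixer` makes PLaMo2's Mamba mixer a graph boundary by default, which is what allows CUDA graph capture around it. A sketch of overriding the list explicitly (normally unnecessary once this default lands; `vllm.unified_attention` is assumed from vLLM's default list):

```python
# Sketch: overriding splitting_ops via compilation_config.
# In piecewise compilation, the captured graph is cut at these custom ops,
# so kernels such as the PLaMo2 Mamba mixer run at piece boundaries.
from vllm import LLM

llm = LLM(
    model="pfnet/plamo-2-1b",
    trust_remote_code=True,
    compilation_config={
        # Redundant with the new default added in this commit; shown only
        # to illustrate the knob.
        "splitting_ops": [
            "vllm.unified_attention",
            "vllm.plamo2_mamba_mixer",
        ],
    },
)
```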
