Commit c328ae1

FEAT:support MiniCPM4 Series (#3609)
1 parent b139700 commit c328ae1

File tree

3 files changed: +100 -0 lines changed

xinference/model/llm/llm_family.json

Lines changed: 47 additions & 0 deletions
@@ -6142,6 +6142,53 @@
             "</s>"
         ]
     },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "minicpm4",
+        "model_lang": [
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "none"
+                ],
+                "model_id": "openbmb/MiniCPM4-0.5B"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "none"
+                ],
+                "model_id": "openbmb/MiniCPM4-8B"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4bit"
+                ],
+                "model_id": "mlx-community/MiniCPM4-8B-4bit"
+            }
+        ],
+        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+        "stop_token_ids": [
+            2,
+            73440
+        ],
+        "stop": [
+            "</s>",
+            "<|im_end|>"
+        ]
+    },
     {
         "version": 1,
         "context_length": 32768,

xinference/model/llm/llm_family_modelscope.json

Lines changed: 50 additions & 0 deletions
@@ -4277,6 +4277,56 @@
             "</s>"
         ]
     },
+    {
+        "version": 1,
+        "context_length": 32768,
+        "model_name": "minicpm4",
+        "model_lang": [
+            "zh"
+        ],
+        "model_ability": [
+            "chat"
+        ],
+        "model_description": "MiniCPM4 series are highly efficient large language models (LLMs) designed explicitly for end-side devices, which achieves this efficiency through systematic innovation in four key dimensions: model architecture, training data, training algorithms, and inference systems.",
+        "model_specs": [
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": "0_5",
+                "quantizations": [
+                    "none"
+                ],
+                "model_id": "OpenBMB/MiniCPM4-0.5B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "pytorch",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "none"
+                ],
+                "model_id": "OpenBMB/MiniCPM4-8B",
+                "model_hub": "modelscope"
+            },
+            {
+                "model_format": "mlx",
+                "model_size_in_billions": 8,
+                "quantizations": [
+                    "4bit"
+                ],
+                "model_id": "mlx-community/MiniCPM4-8B-4bit",
+                "model_hub": "modelscope"
+            }
+        ],
+        "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+        "stop_token_ids": [
+            2,
+            73440
+        ],
+        "stop": [
+            "</s>",
+            "<|im_end|>"
+        ]
+    },
     {
         "version": 1,
         "context_length": 32768,

xinference/model/llm/vllm/core.py

Lines changed: 3 additions & 0 deletions
@@ -252,6 +252,9 @@ class VLLMGenerateConfig(TypedDict, total=False):
 if VLLM_INSTALLED and vllm.__version__ >= "0.8.5":
     VLLM_SUPPORTED_CHAT_MODELS.append("qwen3")
 
+if VLLM_INSTALLED and vllm.__version__ >= "0.9.1":
+    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")
+
 
 class VLLMModel(LLM):
     def __init__(
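The gate mirrors the existing qwen3 line: "minicpm4" is only advertised to the vLLM backend when an installed vLLM reports version 0.9.1 or newer, and the check compares version strings directly. For reference only (not part of the commit), here is a sketch of the same guard written with packaging's Version, which compares releases numerically rather than lexicographically (as plain strings, "0.10.0" < "0.9.1"):

# Reference sketch only, not part of the commit: the same guard using
# packaging.version so that e.g. vLLM "0.10.0" still satisfies ">= 0.9.1".
from packaging.version import Version

import vllm  # assumed installed, as in the VLLM_INSTALLED branch of core.py

VLLM_SUPPORTED_CHAT_MODELS: list[str] = []  # stand-in for the list defined in core.py

if Version(vllm.__version__) >= Version("0.9.1"):
    VLLM_SUPPORTED_CHAT_MODELS.append("minicpm4")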
