
Commit 4c631db

Merge branch 'main' into lwilkinson/dbo-prefill
Signed-off-by: Tyler Michael Smith <[email protected]>
2 parents 356ddcb + b6a136b commit 4c631db


49 files changed (+2759, -973 lines)

docs/contributing/benchmarks.md

Lines changed: 2 additions & 2 deletions
@@ -680,7 +680,7 @@ vllm bench serve \
     --save-result \
     --result-dir ~/vllm_benchmark_results \
     --save-detailed \
-    --endpoint /v1/chat/completion
+    --endpoint /v1/chat/completions
 ```
 
 ##### Videos (ShareGPT4Video)
@@ -707,7 +707,7 @@ vllm bench serve \
     --save-result \
     --result-dir ~/vllm_benchmark_results \
     --save-detailed \
-    --endpoint /v1/chat/completion
+    --endpoint /v1/chat/completions
 ```
 
 ##### Synthetic Random Images (random-mm)
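The corrected path is the OpenAI-compatible chat endpoint that `vllm serve` exposes. As a minimal client-side sketch of that endpoint (assuming a server already running on `localhost:8000`; the model name is a placeholder):

```python
# Minimal sketch: call the /v1/chat/completions endpoint the benchmark targets.
# Assumes `vllm serve <model>` is already running on localhost:8000; the model
# name below is a placeholder for whatever model is being served.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-VL-7B-Instruct",  # placeholder: use the served model
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(response.choices[0].message.content)
```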

docs/features/disagg_prefill.md

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,12 @@ Now supports 5 types of connectors:
     --kv-transfer-config '{"kv_connector":"MultiConnector","kv_role":"kv_both","kv_connector_extra_config":{"connectors":[{"kv_connector":"NixlConnector","kv_role":"kv_both"},{"kv_connector":"SharedStorageConnector","kv_role":"kv_both","kv_connector_extra_config":{"shared_storage_path":"local_storage"}}]}}'
     ```
 
+    For NixlConnector, you may also specify one or more NIXL backends, for example:
+
+    ```bash
+    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_buffer_device":"cuda", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}}'
+    ```
+
 - **OffloadingConnector**: enable offloading of KV data to CPU memory, customizing the CPU block size (in tokens) and number of blocks to allocate (per worker):
 
     ```bash
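The same connector settings can also be passed through the offline Python API rather than the CLI flag. This is a sketch only, assuming `KVTransferConfig` accepts the fields shown (as in vLLM's disaggregated-prefill examples); the model name is a placeholder:

```python
# Sketch: the NixlConnector settings above, expressed via the offline Python API.
# Assumes vllm.config.KVTransferConfig accepts these fields; the model name is a
# placeholder.
from vllm import LLM
from vllm.config import KVTransferConfig

kv_config = KVTransferConfig(
    kv_connector="NixlConnector",
    kv_role="kv_both",
    kv_buffer_device="cuda",
    kv_connector_extra_config={"backend": ["UCX", "GDS"]},
)

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          kv_transfer_config=kv_config)
print(llm.generate(["Hello, my name is"])[0].outputs[0].text)
```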

docs/features/tool_calling.md

Lines changed: 9 additions & 0 deletions
@@ -319,6 +319,15 @@ Supported models:
 
 Flags: `--tool-call-parser glm45`
 
+### Qwen3-Coder Models (`qwen3_xml`)
+
+Supported models:
+
+* `Qwen/Qwen3-480B-A35B-Instruct`
+* `Qwen/Qwen3-Coder-30B-A3B-Instruct`
+
+Flags: `--tool-call-parser qwen3_xml`
+
 ### Models with Pythonic Tool Calls (`pythonic`)
 
 A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models.
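As a minimal client-side sketch of exercising the new parser: it assumes a server started with `vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct --enable-auto-tool-choice --tool-call-parser qwen3_xml` on the default port, and the tool schema is illustrative only.

```python
# Client sketch for the qwen3_xml parser. Assumes the server was started with:
#   vllm serve Qwen/Qwen3-Coder-30B-A3B-Instruct \
#       --enable-auto-tool-choice --tool-call-parser qwen3_xml
# The tool schema below is illustrative only.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_temperature",
        "description": "Get the current temperature for a location.",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string"}},
            "required": ["location"],
        },
    },
}]

response = client.chat.completions.create(
    model="Qwen/Qwen3-Coder-30B-A3B-Instruct",
    messages=[{"role": "user", "content": "What's the temperature in Paris?"}],
    tools=tools,
)
print(response.choices[0].message.tool_calls)
```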

docs/serving/expert_parallel_deployment.md

Lines changed: 1 addition & 1 deletion
@@ -193,7 +193,7 @@ For production deployments requiring strict SLA guarantees for time-to-first-tok
 
 1. **Install gdrcopy/ucx/nixl**: For maximum performance, run the [install_gdrcopy.sh](gh-file:tools/install_gdrcopy.sh) script to install `gdrcopy` (e.g., `install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "12.8" "x64"`). You can find available OS versions [here](https://developer.download.nvidia.com/compute/redist/gdrcopy/CUDA%2012.8/). If `gdrcopy` is not installed, things will still work with a plain `pip install nixl`, just with lower performance. `nixl` and `ucx` are installed as dependencies via pip.
 
-2. **Configure Both Instances**: Add this flag to both prefill and decode instances `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}`
+2. **Configure Both Instances**: Add this flag to both prefill and decode instances: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'`. Note that you may also specify one or more NIXL backends, for example: `--kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both", "kv_connector_extra_config":{"backend":["UCX", "GDS"]}}'`
 
 3. **Client Orchestration**: Use the client-side script below to coordinate prefill/decode operations. We are actively working on routing solutions.
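Because the flag embeds JSON inside shell quoting, building the value programmatically avoids unbalanced braces and quotes. A small sketch (the `vllm serve` invocations in the comments are placeholders):

```python
# Sketch: build the --kv-transfer-config value programmatically so the JSON and
# the shell quoting stay balanced. The serve commands in the comments are
# placeholders.
import json
import shlex

config = {
    "kv_connector": "NixlConnector",
    "kv_role": "kv_both",
    "kv_connector_extra_config": {"backend": ["UCX", "GDS"]},
}

flag = "--kv-transfer-config " + shlex.quote(json.dumps(config))
# Pass the same flag to both instances, e.g.:
#   vllm serve <model> --port 8100 ... <flag>   # prefill
#   vllm serve <model> --port 8200 ... <flag>   # decode
print(flag)
```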

tests/entrypoints/openai/tool_parsers/test_hermes_tool_parser.py

Lines changed: 203 additions & 6 deletions
@@ -5,6 +5,11 @@
 
 import pytest
 
+from vllm.entrypoints.openai.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
+    Hermes2ProToolParser)
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+
 from ....utils import RemoteOpenAIServer
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@@ -37,7 +42,7 @@
             },
             "unit": {
                 "type": "string",
-                "enum": ["celsius", "fahrenheit"]
+                "enum": ["celsius", "fahrenheit"],
             },
         },
         "required": ["location"],
@@ -75,7 +80,7 @@
     "user",
     "content":
     "Hi! Do you have any detailed information about the product id "
-    "7355608 and inserted true?"
+    "7355608 and inserted true?",
 }]
 
 
@@ -144,8 +149,8 @@ async def test_streaming_tool_call():
             if tool_chunk.function.name:
                 tool_call_chunks[index]["name"] += tool_chunk.function.name
             if tool_chunk.function.arguments:
-                tool_call_chunks[index][
-                    "arguments"] += tool_chunk.function.arguments
+                tool_call_chunks[index]["arguments"] += (
+                    tool_chunk.function.arguments)
 
     assert len(tool_call_chunks) == 1
     reconstructed_tool_call = tool_call_chunks[0]
@@ -234,8 +239,8 @@ async def test_streaming_product_tool_call():
             if tool_chunk.function.name:
                 tool_call_chunks[index]["name"] += tool_chunk.function.name
             if tool_chunk.function.arguments:
-                tool_call_chunks[index][
-                    "arguments"] += tool_chunk.function.arguments
+                tool_call_chunks[index]["arguments"] += (
+                    tool_chunk.function.arguments)
 
     assert len(tool_call_chunks) == 1
     reconstructed_tool_call = tool_call_chunks[0]
@@ -258,3 +263,195 @@ async def test_streaming_product_tool_call():
     print("\n[Streaming Product Test Passed]")
     print(f"Reconstructed Tool Call: {reconstructed_tool_call['name']}")
     print(f"Reconstructed Arguments: {arguments}")
+
+
+@pytest.fixture
+def qwen_tokenizer() -> AnyTokenizer:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+
+    return get_tokenizer("Qwen/Qwen3-32B")
+
+
+@pytest.fixture
+def hermes_parser(qwen_tokenizer: AnyTokenizer) -> Hermes2ProToolParser:
+    return Hermes2ProToolParser(qwen_tokenizer)
+
+
+@pytest.fixture
+def any_chat_request() -> ChatCompletionRequest:
+    return ChatCompletionRequest(
+        seed=42,
+        model="Qwen/Qwen3-32B",
+        messages=[],
+    )
+
+
+def test_hermes_parser_streaming_just_forward_text(
+    qwen_tokenizer: AnyTokenizer,
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = (
+        """This is some prior text that has nothing to do with tool calling."""
+    )
+    tokens = qwen_tokenizer.encode(text)
+    previous_text = ""
+    delta_messages = []
+    for token in tokens:
+        delta_text = qwen_tokenizer.decode([token])
+        current_text = previous_text + delta_text
+        delta = hermes_parser.extract_tool_calls_streaming(
+            previous_text=previous_text,
+            current_text=current_text,
+            delta_text=delta_text,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=any_chat_request,
+        )
+        previous_text = current_text
+        delta_messages.append(delta)
+
+    for delta in delta_messages:
+        assert delta is not None
+        assert not delta.tool_calls
+
+    print(delta_messages)
+    assert "".join([delta.content for delta in delta_messages]) == text
+
+
+def test_hermes_parser_streaming_failure_case_bug_19056(
+    qwen_tokenizer: AnyTokenizer,
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = """<tool_call>
+{"name": "final_answer", "arguments": {"trigger": true}}
+</tool_call>"""
+    tokens = qwen_tokenizer.encode(text)
+    previous_text = ""
+    delta_messages = []
+    for token in tokens:
+        text = qwen_tokenizer.decode([token])
+        current_text = previous_text + text
+        delta = hermes_parser.extract_tool_calls_streaming(
+            previous_text=previous_text,
+            current_text=current_text,
+            delta_text=text,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=any_chat_request,
+        )
+        previous_text = current_text
+        if delta is not None:
+            delta_messages.append(delta)
+
+    assert delta_messages[0].tool_calls[0].function.name == "final_answer"
+    tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
+                             for delta in delta_messages)
+    assert tool_call_args == '{"trigger": true}'
+
+
+def test_hermes_parser_streaming(
+    qwen_tokenizer: AnyTokenizer,
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = '<tool_call>\
+{"name": "get_current_temperature",\
+"arguments": {"location":\
+"San Francisco, California, United States", "unit": "celsius"}}\
+</tool_call>'
+
+    tokens = qwen_tokenizer.encode(text)
+    previous_text = ""
+    delta_messages = []
+    for token in tokens:
+        text = qwen_tokenizer.decode([token])
+        current_text = previous_text + text
+        delta = hermes_parser.extract_tool_calls_streaming(
+            previous_text=previous_text,
+            current_text=current_text,
+            delta_text=text,
+            previous_token_ids=[],
+            current_token_ids=[],
+            delta_token_ids=[],
+            request=any_chat_request,
+        )
+        previous_text = current_text
+        if delta is not None:
+            delta_messages.append(delta)
+    print(delta_messages)
+    assert (delta_messages[0].tool_calls[0].function.name ==
+            "get_current_temperature")
+    tool_call_args = "".join(delta.tool_calls[0].function.arguments or ""
+                             for delta in delta_messages)
+    assert tool_call_args == (
+        '{"location":"San Francisco, California, United States", '
+        '"unit": "celsius"}')
+
+
+def test_hermes_parser_non_streaming_no_tool_call(
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = """This is not a tool call."""
+    tool_call = hermes_parser.extract_tool_calls(
+        model_output=text,
+        request=any_chat_request,
+    )
+
+    assert tool_call is not None
+    assert not tool_call.tools_called
+
+
+def test_hermes_parser_non_streaming_tool_call_between_tags(
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = """<tool_call>
+{"name": "final_answer", "arguments": {"trigger": true}}
+</tool_call>"""
+    tool_call = hermes_parser.extract_tool_calls(
+        model_output=text,
+        request=any_chat_request,
+    )
+
+    assert tool_call is not None
+    assert tool_call.tools_called
+    assert tool_call.tool_calls[0].function.name == "final_answer"
+    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
+
+
+def test_hermes_parser_non_streaming_tool_call_until_eos(
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    text = """<tool_call>
+{"name": "final_answer", "arguments": {"trigger": true}}"""
+    tool_call = hermes_parser.extract_tool_calls(
+        model_output=text,
+        request=any_chat_request,
+    )
+
+    assert tool_call is not None
+    assert tool_call.tools_called
+    assert tool_call.tool_calls[0].function.name == "final_answer"
+    assert tool_call.tool_calls[0].function.arguments == '{"trigger": true}'
+
+
+def test_hermes_parser_non_streaming_tool_call_invalid_json(
+    hermes_parser: Hermes2ProToolParser,
+    any_chat_request: ChatCompletionRequest,
+) -> None:
+    # Missing closing brace to trigger exception
+    text = """<tool_call>
+{"name": "final_answer", "arguments": {"trigger": true}"""
+    tool_call = hermes_parser.extract_tool_calls(
+        model_output=text,
+        request=any_chat_request,
+    )
+
+    assert tool_call is not None
+    assert not tool_call.tools_called
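For readers who want to poke at the parser outside pytest, here is a standalone sketch that mirrors the non-streaming tests above, using the same classes they import (it downloads the `Qwen/Qwen3-32B` tokenizer on first use):

```python
# Standalone sketch mirroring the new non-streaming tests: run the Hermes parser
# directly, outside pytest. Downloads the Qwen/Qwen3-32B tokenizer on first use.
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.tool_parsers.hermes_tool_parser import (
    Hermes2ProToolParser)
from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer("Qwen/Qwen3-32B")
parser = Hermes2ProToolParser(tokenizer)
request = ChatCompletionRequest(model="Qwen/Qwen3-32B", messages=[])

model_output = ('<tool_call>\n'
                '{"name": "final_answer", "arguments": {"trigger": true}}\n'
                '</tool_call>')
result = parser.extract_tool_calls(model_output=model_output, request=request)
print(result.tools_called, result.tool_calls[0].function.arguments)
```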

tests/kernels/attention/test_attention_selector.py

Lines changed: 1 addition & 2 deletions
@@ -67,7 +67,6 @@ def generate_params()
     return params
 
 
-@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
 @pytest.mark.parametrize("device, name, use_mla, block_size",
                          generate_params())
 def test_env(
@@ -189,7 +188,7 @@ def test_env(
             # FlashMLA only supports block_size == 64
             pytest.skip("FlashMLA only supports block_size 64")
         else:
-            from vllm.attention.backends.flashmla import (
+            from vllm.v1.attention.backends.mla.flashmla import (  # noqa: E501
                 is_flashmla_supported)
            is_supported, _ = is_flashmla_supported()
            if not is_supported:
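A small sketch of the same capability probe outside the test, using the v1 import path introduced above; treating the second tuple element as a human-readable reason is an assumption based on how the test unpacks the return value:

```python
# Sketch: probe FlashMLA support via the v1 import path used in the test.
# Treating the second tuple element as a human-readable reason is an assumption.
from vllm.v1.attention.backends.mla.flashmla import is_flashmla_supported

is_supported, reason = is_flashmla_supported()
if is_supported:
    print("FlashMLA is supported on this device (requires block_size 64).")
else:
    print(f"FlashMLA unavailable: {reason}")
```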

tests/lora/test_layers.py

Lines changed: 14 additions & 12 deletions
@@ -164,8 +164,8 @@ def populate_loras(
                 weight=layer_weights,
                 generate_embeddings_tensor=generate_embeddings_tensor,
             )
-            sublora.lora_b = sublora.lora_b[:, (sublora_len *
-                                                i):(sublora_len * (i + 1))]
+            sublora.lora_b = sublora.lora_b[(sublora_len *
+                                             i):(sublora_len * (i + 1)), :]
             sublora.optimize()
             subloras.append(sublora)
 
@@ -304,9 +304,9 @@ def create_random_embedding_layer():
             result = embedding(input_)
             after_a = F.embedding(
                 input_,
-                lora.lora_a,
+                lora.lora_a.T,
             )
-            result += (after_a @ lora.lora_b)
+            result += (after_a @ lora.lora_b.T)
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
@@ -445,9 +445,9 @@ def create_random_embedding_layer():
             result = expanded_embedding(input_)
             after_a = F.embedding(
                 original_input_,
-                lora.lora_a,
+                lora.lora_a.T,
             )
-            result += (after_a @ lora.lora_b)
+            result += (after_a @ lora.lora_b.T)
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
@@ -575,7 +575,7 @@ def _pretest():
                                              lm_head=linear,
                                              embedding_bias=None)
             result[:, vocab_size + embeddings_tensor_len:] = float("-inf")
-            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
         logits_processor.org_vocab_size = vocab_size
@@ -692,9 +692,10 @@ def create_random_linear_replicated_layer():
 
         expected_results: list[torch.Tensor] = []
         for input_, lora_id in zip(inputs, prompt_mapping):
+
             lora = lora_dict[lora_id]
             result = linear(input_)[0]
-            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
@@ -817,7 +818,7 @@ def create_random_linear_parallel_layer():
         for input_, lora_id in zip(inputs, prompt_mapping):
             lora = lora_dict[lora_id]
             result = linear(input_)[0]
-            result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling
+            result += input_ @ lora.lora_a.T @ lora.lora_b.T * lora.scaling
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
 
@@ -965,9 +966,10 @@ class FakeConfig:
             result = linear(input_)[0]
             subloras = sublora_dict[lora_id]
             for i, sublora in enumerate(subloras):
-                result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] *
-                       (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b *
-                                    sublora.scaling)
+                result[:, sublora.lora_b.shape[0] * i:sublora.lora_b.shape[0] *
+                       (i + 1)] += (
+                           input_ @ sublora.lora_a.T @ sublora.lora_b.T *
+                           sublora.scaling)
             expected_results.append(result)
         expected_result = torch.cat(expected_results)
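A short sketch of the LoRA arithmetic these updated expectations encode, assuming `lora_a` is now stored as `[rank, in_features]` and `lora_b` as `[out_features, rank]` (the layout implied by the added transposes); the dimensions are arbitrary:

```python
# Sketch of the LoRA delta the updated expectations compute: with lora_a stored
# as [rank, in_features] and lora_b as [out_features, rank], the delta applied
# to the base output is x @ lora_a.T @ lora_b.T scaled by the LoRA scaling.
import torch

in_features, out_features, rank, scaling = 16, 32, 4, 0.5
x = torch.randn(2, in_features)
lora_a = torch.randn(rank, in_features)
lora_b = torch.randn(out_features, rank)

delta = x @ lora_a.T @ lora_b.T * scaling
print(delta.shape)  # torch.Size([2, 32])
```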
