
Commit 83b316a

xq25478, venkywonka, hexiao.xq, and LinPoly authored and committed
[feat]: support logit_bias (NVIDIA#5354)
Signed-off-by: xq25478 <[email protected]>
Signed-off-by: Venky Ganesh <[email protected]>
Signed-off-by: hexiao.xq <[email protected]>
Co-authored-by: Venky Ganesh <[email protected]>
Co-authored-by: hexiao.xq <[email protected]>
Co-authored-by: Pengyun Lin <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent f63ece4 commit 83b316a
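
What the feature enables, as a hedged client-side sketch (not part of this commit): per-token additive biases passed through the OpenAI-compatible chat endpoint. The server URL and model name below are placeholders.

import openai

# Assumption: a trtllm-serve (or compatible) endpoint at this placeholder URL.
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

resp = client.chat.completions.create(
    model="my-model",  # placeholder model name
    messages=[{"role": "user", "content": "Tell me a fact about Paris"}],
    logit_bias={"1000": 2.0, "2000": -2.0},  # token id (as string) -> additive bias
    max_tokens=20,
)
print(resp.choices[0].message.content)

Keys follow the OpenAI convention of stringified token ids; positive values push the sampler toward a token, negative values away from it.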

File tree

5 files changed: 132 additions, 9 deletions


tensorrt_llm/sampling_params.py

Lines changed: 50 additions & 1 deletion
@@ -2,7 +2,7 @@
 import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field, fields
-from typing import List, NamedTuple, Optional, Tuple, Union
+from typing import Dict, List, NamedTuple, Optional, Tuple, Union

 import torch
 from pydantic import BaseModel
@@ -108,6 +108,55 @@ def __call__(
         pass  # noqa


+class LogitBiasLogitsProcessor(LogitsProcessor):
+    def __init__(self, logit_bias: Dict[str, float]) -> None:
+        super().__init__()
+        self.logit_bias = logit_bias
+        self.tokens_to_adjust = self.process_logit_bias(logit_bias)
+        if not self.tokens_to_adjust:
+            raise ValueError("Empty logit_bias provided - no tokens to adjust")
+
+    def process_logit_bias(self, logit_bias: Dict[str, float]) -> Dict[int, float]:
+        valid = {}
+        invalid = {}
+
+        for k, v in logit_bias.items():
+            try:
+                token_id = int(k)
+                valid[token_id] = v
+            except (ValueError, TypeError):
+                invalid[k] = v
+
+        if invalid:
+            raise ValueError(
+                f"Invalid token_ids in logit_bias: {list(invalid.keys())}. "
+                f"All keys must be integers."
+            )
+        return valid
+
+    def __call__(
+        self,
+        req_id: int,
+        logits: torch.Tensor,
+        token_ids: List[List[int]],
+        stream_ptr: Optional[int],
+        client_id: Optional[int],
+    ) -> None:
+        vocab_size = logits.size(-1)
+        token_ids_list = list(self.tokens_to_adjust.keys())
+        bias_values = torch.tensor(list(self.tokens_to_adjust.values()), device=logits.device)
+
+        invalid_token_ids = [tid for tid in token_ids_list if tid >= vocab_size]
+        if invalid_token_ids:
+            raise ValueError(
+                f"Token ID(s) {invalid_token_ids} exceed vocabulary size (vocab_size={vocab_size})"
+            )
+
+        stream = None if stream_ptr is None else torch.cuda.ExternalStream(stream_ptr)
+        with torch.cuda.stream(stream):
+            logits[:, :, token_ids_list] += bias_values
+
+
 @dataclass(slots=True, kw_only=True)
 class AdditionalModelOutput:
     """An additional output to gather from the model.

tensorrt_llm/serve/openai_protocol.py

Lines changed: 10 additions & 7 deletions
@@ -16,6 +16,8 @@
 from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
 from tensorrt_llm.llmapi import GuidedDecodingParams, SamplingParams

+from ..sampling_params import LogitBiasLogitsProcessor
+

 class OpenAIBaseModel(BaseModel):
     # OpenAI API does not allow extra fields & allow to initialize by both alias and field name
@@ -248,6 +250,10 @@ def to_sampling_params(self) -> SamplingParams:
                 self.response_format),
             detokenize=self.detokenize,

+            # logits_bias
+            logits_processor=None if not self.logit_bias else
+            LogitBiasLogitsProcessor(self.logit_bias),
+
             # completion-extra-params
             add_special_tokens=self.add_special_tokens,

@@ -539,6 +545,10 @@ def to_sampling_params(self) -> SamplingParams:
             guided_decoding=_response_format_to_guided_decoding_params(
                 self.response_format),

+            # logits_bias
+            logits_processor=None if not self.logit_bias else
+            LogitBiasLogitsProcessor(self.logit_bias),
+
             # chat-completion-extra-params
             add_special_tokens=self.add_special_tokens,

@@ -574,13 +584,6 @@ def check_logprobs(cls, data):
             raise ValueError("top_logprobs is not supported")
         return data

-    @model_validator(mode="before")
-    @classmethod
-    def verify_logit_processor(cls, data):
-        if data.get("logit_bias"):
-            raise ValueError("logit bias is not supported")
-        return data
-
     @model_validator(mode="before")
     @classmethod
     def check_suffix(cls, data):
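
Note the deleted verify_logit_processor validator: logit_bias was previously rejected outright with "logit bias is not supported". After this change, malformed input fails inside LogitBiasLogitsProcessor instead, which the server surfaces as a 400 (see the BadRequestError tests below). A small sketch of that error path, using only the class added in this commit:

from tensorrt_llm.sampling_params import LogitBiasLogitsProcessor

try:
    LogitBiasLogitsProcessor({"invalid_token": 1.0})  # non-integer key
except ValueError as err:
    print(err)  # Invalid token_ids in logit_bias: ['invalid_token']. ...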

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ l0_a10:
   - test_e2e.py::test_openai_misc_example[pytorch]
   - test_e2e.py::test_openai_reasoning[pytorch]
   - test_e2e.py::test_openai_completions_example[pytorch]
-  - test_e2e.py::test_openai_chat_example[pytorch]
+  - test_e2e.py::test_openai_chat_example[pytorch] TIMEOUT (90)
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
 - condition:
     ranges:

tests/unittest/llmapi/apps/_test_openai_chat.py

Lines changed: 38 additions & 0 deletions
@@ -521,3 +521,41 @@ def test_stop_reason(client: openai.OpenAI, model_name: str, backend: str):
     )
     assert resp.choices[0].finish_reason == "stop"
     assert resp.choices[0].stop_reason == "two"
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_with_logit_bias(async_client: openai.AsyncOpenAI,
+                                               model_name: str):
+    """Test logit_bias in chat completions"""
+    logit_bias = {
+        "1000": 2.0,
+        "2000": -2.0,
+    }
+
+    chat_completion = await async_client.chat.completions.create(
+        model=model_name,
+        messages=[{
+            "role": "user",
+            "content": "Tell me a fact about Paris"
+        }],
+        max_tokens=20,
+        logit_bias=logit_bias,
+        temperature=0.0,
+    )
+    assert chat_completion.choices[0].message.content
+
+
+@pytest.mark.asyncio
+async def test_chat_completion_with_invalid_logit_bias(
+        async_client: openai.AsyncOpenAI, model_name: str):
+    """Test with invalid token IDs (non-integer keys)"""
+    with pytest.raises(openai.BadRequestError):
+        await async_client.chat.completions.create(
+            model=model_name,
+            messages=[{
+                "role": "user",
+                "content": "Tell me a fact about Paris"
+            }],
+            logit_bias={"invalid_token": 1.0},  # Non-integer key
+            max_tokens=5,
+        )

tests/unittest/llmapi/apps/_test_openai_completions.py

Lines changed: 33 additions & 0 deletions
@@ -368,3 +368,36 @@ async def test_completion_streaming(async_client: openai.AsyncOpenAI,
             tokens.extend(chunk.choices[0].token_ids)

     assert tokens == single_output
+
+
+@pytest.mark.asyncio
+async def test_completion_with_logit_bias(async_client: openai.AsyncOpenAI,
+                                          model_name: str):
+    """Test logit_bias with valid token IDs"""
+    logit_bias = {
+        "1000": 80,
+        "2000": -80,
+    }
+
+    completion = await async_client.completions.create(
+        model=model_name,
+        prompt="The capital of France is",
+        max_tokens=10,
+        logit_bias=logit_bias,
+        temperature=0.0,
+    )
+
+    assert completion.choices[0].text
+
+
+@pytest.mark.asyncio
+async def test_completion_with_invalid_logit_bias(
+        async_client: openai.AsyncOpenAI, model_name: str):
+    """Test with invalid token IDs (non-integer keys)"""
+    with pytest.raises(openai.BadRequestError):
+        await async_client.completions.create(
+            model=model_name,
+            prompt="Hello world",
+            logit_bias={"invalid_token": 1.0},  # Non-integer key
+            max_tokens=5,
+        )
