Commit e080294

[TRTLLM-7918][feat] Revert "Support kvcache reuse for phi4mm (#7563)" (#7722)
Signed-off-by: Wanli Jiang <[email protected]>
1 parent 965a3da commit e080294

3 files changed (+7, -25 lines)

docs/source/models/supported-models.md

Lines changed: 2 additions & 2 deletions
@@ -45,13 +45,13 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | Model Architecture/Feature | Overlap Scheduler | CUDA Graph | Chunked Prefill | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Logits Post Processor | EPD Disaggregated Serving | Modality |
 | ---------------------------------- | ----------------- | ---------- | --------------- | ------------- | ---------------- | -------------- | --------------------- | ------------------------- | -------- |
-| Gemma3ForConditionalGeneration | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No | L + I |
+| Gemma3ForConditionalGeneration | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No | L + I |
 | HCXVisionForCausalLM | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I |
 | LlavaLlamaModel (VILA) | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I + V |
 | LlavaNextForConditionalGeneration | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I |
 | Llama4ForConditionalGeneration | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I |
 | Mistral3ForConditionalGeneration | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I |
-| Phi4MMForCausalLM | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + A |
+| Phi4MMForCausalLM | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I + A |
 | Qwen2VLForConditionalGeneration | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V |
 | Qwen2_5_VLForConditionalGeneration | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V |

docs/source/reference/multimodal-feature-support-matrix.md

Lines changed: 1 addition & 1 deletion
@@ -8,6 +8,6 @@
 | LLaVA-NeXT | Yes | Yes | Yes | Yes |
 | Llama 4 | Yes | Yes | No | No |
 | Mistral-Small-3.1 | Yes | Yes | No | No |
-| Phi-4-multimodal | Yes | Yes | Yes | No |
+| Phi-4-multimodal | Yes | Yes | No | No |
 | Qwen2-VL | Yes | Yes | Yes | Yes |
 | Qwen2.5-VL | Yes | Yes | Yes | Yes |

tensorrt_llm/_torch/models/modeling_phi4mm.py

Lines changed: 4 additions & 22 deletions
@@ -19,8 +19,8 @@
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 
 from ...executor.request import LoRARequest
-from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
-                       InputProcessor, MultimodalPlaceholderMetadata,
+from ...inputs import (ExtraProcessedInputs, InputProcessor,
+                       MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor)
 from ...logger import logger
@@ -29,8 +29,7 @@
 from ..attention_backend import AttentionMetadata
 from ..model_config import ModelConfig
 from .modeling_auto import AutoModelForCausalLM
-from .modeling_multimodal_utils import (find_uncached_mm_embeds,
-                                        fuse_input_embeds)
+from .modeling_multimodal_utils import fuse_input_embeds
 from .modeling_utils import register_auto_model
 
 # Special token ids from the original Phi-4-multimodal-instruct implementation
@@ -390,7 +389,7 @@ def forward(self, multimodal_params: List[MultimodalParams],
         return self._encoding_batch_request(multimodal_params, mm_token_ids)
 
 
-class Phi4MMInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
+class Phi4MMInputProcessor(InputProcessor):
 
     def __init__(self,
                  model_path: str,
@@ -416,20 +415,6 @@ def __init__(self,
             trust_remote_code=trust_remote_code,
             use_fast=self.use_fast)
 
-    def get_mm_token_ids(self) -> Optional[torch.Tensor]:
-        return torch.tensor([_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID],
-                            dtype=torch.int32,
-                            device=self.device)
-
-    def get_num_tokens_per_image(
-        self,
-        *,
-        image: Image.Image,
-        **kwargs,
-    ):
-        data = self.processor.image_processor.preprocess(image)
-        return data["num_img_tokens"][0]
-
     @torch.inference_mode()
     def __call__(
         self, inputs: TextPrompt, sampling_params: SamplingParams
@@ -604,9 +589,6 @@ def forward(
             multimodal_param.multimodal_data["multimodal_embedding"]
             for multimodal_param in multimodal_params
         ]
-        mm_embedding = find_uncached_mm_embeds(
-            mm_embedding, multimodal_params[:num_context_requests])
-
         input_ids, input_embeds = fuse_input_embeds(
             self.llm.model.embed_tokens,
             input_ids,
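
Taken together, these changes undo the KV-cache-reuse plumbing for Phi-4-multimodal: the input processor no longer reports its image/audio placeholder token ids or per-image token counts, and the forward path no longer calls find_uncached_mm_embeds to trim each request's multimodal embedding down to the portion not already covered by reused cache blocks, which is why the support matrices above flip back to "No". Below is a minimal sketch of that trimming idea, assuming per-request placeholder positions and a cached-prefix length; the names and shapes are illustrative, not the actual TensorRT-LLM API.

import torch

def keep_uncached_mm_rows(mm_embedding: torch.Tensor,
                          mm_token_positions: torch.Tensor,
                          num_cached_tokens: int) -> torch.Tensor:
    # mm_embedding:       [num_mm_tokens, hidden] encoder output for image/audio tokens
    # mm_token_positions: [num_mm_tokens] prompt positions of those placeholder tokens
    # num_cached_tokens:  length of the prompt prefix recovered from reused KV blocks
    # Rows whose prompt position falls inside the cached prefix are already
    # represented in the KV cache, so only the remaining rows need to be fused
    # with the text embeddings.
    keep = mm_token_positions >= num_cached_tokens
    return mm_embedding[keep]

# Example: 5 placeholder tokens at prompt positions 3..7, 6 prompt tokens reused from cache.
emb = torch.randn(5, 8)
pos = torch.tensor([3, 4, 5, 6, 7])
print(keep_uncached_mm_rows(emb, pos, 6).shape)  # torch.Size([2, 8])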
