1919from tensorrt_llm .inputs .multimodal import MultimodalParams
2020
2121from ...executor .request import LoRARequest
22- from ...inputs import (BaseMultimodalInputProcessor , ExtraProcessedInputs ,
23- InputProcessor , MultimodalPlaceholderMetadata ,
22+ from ...inputs import (ExtraProcessedInputs , InputProcessor ,
23+ MultimodalPlaceholderMetadata ,
2424 MultimodalPlaceholderPlacement , TextPrompt ,
2525 register_input_processor )
2626from ...logger import logger
2929from ..attention_backend import AttentionMetadata
3030from ..model_config import ModelConfig
3131from .modeling_auto import AutoModelForCausalLM
32- from .modeling_multimodal_utils import (find_uncached_mm_embeds ,
33- fuse_input_embeds )
32+ from .modeling_multimodal_utils import fuse_input_embeds
3433from .modeling_utils import register_auto_model
3534
3635# Special token ids from the original Phi-4-multimodal-instruct implementation
@@ -390,7 +389,7 @@ def forward(self, multimodal_params: List[MultimodalParams],
390389 return self ._encoding_batch_request (multimodal_params , mm_token_ids )
391390
392391
393- class Phi4MMInputProcessor (BaseMultimodalInputProcessor , InputProcessor ):
392+ class Phi4MMInputProcessor (InputProcessor ):
394393
395394 def __init__ (self ,
396395 model_path : str ,
@@ -416,20 +415,6 @@ def __init__(self,
416415 trust_remote_code = trust_remote_code ,
417416 use_fast = self .use_fast )
418417
419- def get_mm_token_ids (self ) -> Optional [torch .Tensor ]:
420- return torch .tensor ([_IMAGE_SPECIAL_TOKEN_ID , _AUDIO_SPECIAL_TOKEN_ID ],
421- dtype = torch .int32 ,
422- device = self .device )
423-
424- def get_num_tokens_per_image (
425- self ,
426- * ,
427- image : Image .Image ,
428- ** kwargs ,
429- ):
430- data = self .processor .image_processor .preprocess (image )
431- return data ["num_img_tokens" ][0 ]
432-
433418 @torch .inference_mode ()
434419 def __call__ (
435420 self , inputs : TextPrompt , sampling_params : SamplingParams
@@ -604,9 +589,6 @@ def forward(
604589 multimodal_param .multimodal_data ["multimodal_embedding" ]
605590 for multimodal_param in multimodal_params
606591 ]
607- mm_embedding = find_uncached_mm_embeds (
608- mm_embedding , multimodal_params [:num_context_requests ])
609-
610592 input_ids , input_embeds = fuse_input_embeds (
611593 self .llm .model .embed_tokens ,
612594 input_ids ,
0 commit comments