Skip to content

Commit 0e219cd

Browse files
authored
[Bugfix] Fix GLM4.1V multimodal processor with compatibility for Transformers v4.56 (vllm-project#24822)
Signed-off-by: Isotr0py <[email protected]>
1 parent 72c99f2 commit 0e219cd

File tree

6 files changed

+118
-70
lines changed

6 files changed

+118
-70
lines changed

examples/offline_inference/vision_language.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1716,6 +1716,13 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
17161716
}
17171717

17181718

1719+
MODELS_NEED_VIDEO_METADATA = [
1720+
"glm4_1v",
1721+
"glm4_5v",
1722+
"glm4_5v_fp8",
1723+
]
1724+
1725+
17191726
def get_multi_modal_input(args):
17201727
"""
17211728
return {
@@ -1740,12 +1747,13 @@ def get_multi_modal_input(args):
17401747

17411748
if args.modality == "video":
17421749
# Input video and question
1750+
needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
17431751
video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
17441752
metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
17451753
vid_questions = ["Why is this video funny?"]
17461754

17471755
return {
1748-
"data": [(video, metadata)] if args.model_type == "glm4_1v" else video,
1756+
"data": ([(video, metadata)] if needs_metadata else video),
17491757
"questions": vid_questions,
17501758
}
17511759

tests/models/multimodal/processing/test_common.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,14 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
3232
# Ensure video metadata is included
3333
if "video" in mm_data:
3434
video = mm_data["video"]
35+
num_frames = len(video)
3536
mm_data["video"] = (video, {
36-
"total_num_frames": len(video),
37-
"fps": len(video),
37+
"total_num_frames": num_frames,
38+
"fps": num_frames,
3839
"duration": 1,
39-
"video_backend": "opencv"
40+
"frames_indices": [i for i in range(num_frames)],
41+
"video_backend": "opencv",
42+
"do_sample_frames": True,
4043
})
4144
return mm_data
4245

tests/models/multimodal/processing/test_glm4_1v.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,19 @@
1212

1313
@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
1414
@pytest.mark.parametrize("expected_toks_per_frame", [299])
15-
@pytest.mark.parametrize("num_frames", [32, 128])
16-
@pytest.mark.parametrize("fps, expected_grid_t", [(1, 5), (2, 10)])
15+
@pytest.mark.parametrize(
16+
"num_frames, fps, expected_grid_t",
17+
[
18+
# pre-sampled fixed frames (unexpected behavior,
19+
# but we still expect it to work without errors)
20+
(32, 1, 16),
21+
(32, 2, 16),
22+
(128, 1, 64),
23+
(128, 2, 64),
24+
# post-sampled frames (expected behavior)
25+
(-1, 1, 5),
26+
(-1, 2, 10),
27+
])
1728
def test_processor_override(
1829
model_id: str,
1930
expected_toks_per_frame: int,
@@ -80,7 +91,7 @@ def test_video_loader_consistency(
8091

8192
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
8293
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
83-
video_bytes, requested_fps=fps)
94+
video_bytes, fps=fps)
8495

8596
# pre-sampled loader shouldn't read all frames
8697
assert len(dynamic_video) < len(static_video)

vllm/assets/video.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ def video_to_pil_images_list(path: str,
7676
return [Image.fromarray(frame) for frame in frames]
7777

7878

79-
def video_get_metadata(path: str) -> dict[str, Any]:
79+
def video_get_metadata(path: str, num_frames: int = -1) -> dict[str, Any]:
8080
cap = cv2.VideoCapture(path)
8181
if not cap.isOpened():
8282
raise ValueError(f"Could not open video file {path}")
@@ -85,11 +85,18 @@ def video_get_metadata(path: str) -> dict[str, Any]:
8585
fps = cap.get(cv2.CAP_PROP_FPS)
8686
duration = total_frames / fps if fps > 0 else 0
8787

88+
if num_frames == -1 or num_frames > total_frames:
89+
num_frames = total_frames
90+
8891
metadata = {
89-
"total_num_frames": total_frames,
92+
"total_num_frames": num_frames,
9093
"fps": fps,
9194
"duration": duration,
92-
"video_backend": "opencv"
95+
"video_backend": "opencv",
96+
"frames_indices": list(range(num_frames)),
97+
# extra field used to control hf processor's video
98+
# sampling behavior
99+
"do_sample_frames": num_frames == total_frames,
93100
}
94101
return metadata
95102

@@ -126,7 +133,7 @@ def np_ndarrays(self) -> npt.NDArray:
126133

127134
@property
128135
def metadata(self) -> dict[str, Any]:
129-
ret = video_get_metadata(self.video_path)
136+
ret = video_get_metadata(self.video_path, self.num_frames)
130137
return ret
131138

132139
def get_audio(self, sampling_rate: Optional[float] = None) -> npt.NDArray:

vllm/model_executor/models/glm4_1v.py

Lines changed: 50 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@
3636
import torch.nn as nn
3737
import torch.nn.functional as F
3838
from einops import rearrange
39+
from packaging.version import Version
3940
from transformers import BatchFeature
41+
from transformers import __version__ as TRANSFORMERS_VERSION
4042
from transformers.models.glm4v.configuration_glm4v import Glm4vVisionConfig
4143
from transformers.models.glm4v.image_processing_glm4v import (
4244
Glm4vImageProcessor, smart_resize)
@@ -1001,28 +1003,32 @@ def _get_video_second_idx(self, metadata: dict[str, Any],
10011003
max_frame_idx = meta_frames - 1
10021004
duration = metadata.get("duration",
10031005
round(max_frame_idx / video_fps) + 1)
1004-
if duration <= video_processor.max_duration:
1005-
n = int(math.floor(duration * video_processor.fps))
1006-
frame_indices = [
1007-
min(
1008-
max_frame_idx,
1009-
int(math.ceil(i * video_fps / video_processor.fps)),
1010-
) for i in range(n)
1011-
]
1006+
do_sample_frames = metadata["do_sample_frames"]
1007+
if not do_sample_frames:
1008+
frame_indices = metadata["frames_indices"]
10121009
else:
1013-
num_samples = int(video_processor.max_duration *
1014-
video_processor.fps)
1015-
if num_samples >= meta_frames:
1016-
frame_indices = list(range(meta_frames))
1017-
else:
1018-
target_seconds = np.linspace(0,
1019-
duration,
1020-
num_samples,
1021-
endpoint=True)
1010+
if duration <= video_processor.max_duration:
1011+
n = int(math.floor(duration * video_processor.fps))
10221012
frame_indices = [
1023-
min(max_frame_idx, int(math.ceil(t * video_fps)))
1024-
for t in target_seconds
1013+
min(
1014+
max_frame_idx,
1015+
int(math.ceil(i * video_fps / video_processor.fps)),
1016+
) for i in range(n)
10251017
]
1018+
else:
1019+
num_samples = int(video_processor.max_duration *
1020+
video_processor.fps)
1021+
if num_samples >= meta_frames:
1022+
frame_indices = list(range(meta_frames))
1023+
else:
1024+
target_seconds = np.linspace(0,
1025+
duration,
1026+
num_samples,
1027+
endpoint=True)
1028+
frame_indices = [
1029+
min(max_frame_idx, int(math.ceil(t * video_fps)))
1030+
for t in target_seconds
1031+
]
10261032

10271033
seen, uniq = set(), []
10281034
for idx in frame_indices:
@@ -1139,7 +1145,9 @@ def _get_dummy_videos(
11391145
"fps": 2.0,
11401146
"duration": num_frames / 2.0,
11411147
"total_num_frames": num_frames,
1148+
"frames_indices": [i for i in range(num_frames)],
11421149
"video_backend": "opencv",
1150+
"do_sample_frames": False,
11431151
}
11441152
video_item = (video.copy(), video_metadata)
11451153
video_items.append(video_item)
@@ -1172,34 +1180,37 @@ def _call_hf_processor(
11721180
for item in mm_data.pop("videos", []):
11731181
video_array, metadata = item
11741182

1175-
if metadata["video_backend"] == "opencv_dynamic":
1176-
mm_kwargs["do_sample_frames"] = False
1177-
1178-
elif metadata["total_num_frames"] != len(video_array):
1179-
logger.warning(
1180-
"Total frames in metadata "
1181-
"(%s) does not match the length of "
1182-
"video array %s. This can "
1183-
"be because the video is resampled "
1184-
"in advance. This may cause "
1185-
"a divergence with HF implementation.",
1186-
metadata["total_num_frames"],
1187-
len(video_array),
1188-
)
1189-
metadata["total_num_frames"] = len(video_array)
1183+
# don't update mm_kwargs inplace
1184+
video_mm_kwargs = dict(**mm_kwargs)
1185+
video_mm_kwargs["do_sample_frames"] = metadata.get(
1186+
"do_sample_frames", True)
11901187

11911188
video_mm_data = dict()
11921189
video_mm_data["videos"] = [[video_array]]
1193-
video_mm_data["video_metadata"] = [[VideoMetadata(**metadata)]]
1190+
1191+
# backward compatibility for Transformers 4.55
1192+
unuse_metadata = ["do_sample_frames"]
1193+
if not hasattr(
1194+
VideoMetadata,
1195+
"frames_indices") and "frames_indices" in metadata:
1196+
unuse_metadata.append("frames_indices")
1197+
1198+
video_mm_data["video_metadata"] = [[
1199+
VideoMetadata(
1200+
**{
1201+
k: metadata[k]
1202+
for k in metadata if k not in unuse_metadata
1203+
})
1204+
]]
11941205

11951206
video_outputs = super()._call_hf_processor(
11961207
prompt="<|begin_of_video|><|video|><|end_of_video|>",
11971208
mm_data=video_mm_data,
1198-
mm_kwargs=mm_kwargs,
1209+
mm_kwargs=video_mm_kwargs,
11991210
tok_kwargs=tok_kwargs,
12001211
)
1201-
if "do_sample_frames" in mm_kwargs and not mm_kwargs[
1202-
"do_sample_frames"]:
1212+
if not video_mm_kwargs["do_sample_frames"] and Version(
1213+
TRANSFORMERS_VERSION) < Version("4.56.0"):
12031214
# Transformers v4.55 has incorrect timestamps issue for
12041215
# skip sampling. We construct the placeholder manually to
12051216
# get placeholders with correct timestamps.
@@ -1218,6 +1229,7 @@ def _call_hf_processor(
12181229
prompt = prompt.replace(
12191230
"<|begin_of_video|><|video|><|end_of_video|>",
12201231
video_placeholder,
1232+
1,
12211233
)
12221234

12231235
video_grid_thw_lst.append(video_outputs["video_grid_thw"])

vllm/multimodal/video.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -121,14 +121,6 @@ def load_bytes(
121121
original_fps = cap.get(cv2.CAP_PROP_FPS)
122122
duration = total_frames_num / original_fps if original_fps > 0 else 0
123123

124-
# Use transformers transformers.video_utils.VideoMetadata format
125-
metadata = {
126-
"total_num_frames": total_frames_num,
127-
"fps": original_fps,
128-
"duration": duration,
129-
"video_backend": "opencv"
130-
}
131-
132124
# resample video to target num_frames
133125
full_read = num_frames == -1 or total_frames_num < num_frames
134126
if full_read:
@@ -159,6 +151,20 @@ def load_bytes(
159151
assert i == num_frames, (f"Expected reading {num_frames} frames, "
160152
f"but only loaded {i} frames from video.")
161153

154+
# Use transformers transformers.video_utils.VideoMetadata format
155+
# NOTE(Isotr0py): For models like Qwen3-VL/GLM4.5V, this metadata
156+
# can cause incorrect timestamp calculation without num_frames=-1.
157+
metadata = {
158+
"total_num_frames": num_frames,
159+
"fps": original_fps,
160+
"duration": duration,
161+
"video_backend": "opencv",
162+
"frames_indices": list(range(num_frames)),
163+
# extra field used to control hf processor's video
164+
# sampling behavior
165+
"do_sample_frames": num_frames == total_frames_num,
166+
}
167+
162168
return frames, metadata
163169

164170

@@ -170,7 +176,7 @@ def load_bytes(
170176
cls,
171177
data: bytes,
172178
num_frames: int = -1,
173-
requested_fps: int = 2,
179+
fps: int = 2,
174180
max_duration: int = 300,
175181
**kwargs,
176182
) -> tuple[npt.NDArray, dict[str, Any]]:
@@ -185,14 +191,6 @@ def load_bytes(
185191
original_fps = cap.get(cv2.CAP_PROP_FPS)
186192
duration = total_frames_num / original_fps if original_fps > 0 else 0
187193

188-
# Use transformers transformers.video_utils.VideoMetadata format
189-
metadata = {
190-
"total_num_frames": total_frames_num,
191-
"fps": original_fps,
192-
"duration": duration,
193-
"video_backend": "opencv_dynamic"
194-
}
195-
196194
# resample video to target num_frames
197195
max_frame_idx = total_frames_num - 1
198196
duration = duration or round(max_frame_idx / original_fps) + 1
@@ -201,14 +199,13 @@ def load_bytes(
201199
# https://github.com/huggingface/transformers/blob/v4.55.4/src/transformers/models/glm4v/video_processing_glm4v.py#L103-L140
202200
frame_indices: Union[range, list[int]]
203201
if duration <= max_duration:
204-
n = int(math.floor(duration * requested_fps))
202+
n = int(math.floor(duration * fps))
205203
frame_indices = sorted({
206-
min(max_frame_idx,
207-
int(math.ceil(i * original_fps / requested_fps)))
204+
min(max_frame_idx, int(math.ceil(i * original_fps / fps)))
208205
for i in range(n)
209206
})
210207
else:
211-
num_samples = int(max_duration * requested_fps)
208+
num_samples = int(max_duration * fps)
212209
if num_samples >= total_frames_num:
213210
frame_indices = range(total_frames_num)
214211
else:
@@ -241,6 +238,16 @@ def load_bytes(
241238
f"Expected reading {len(frame_indices)} frames, "
242239
f"but only loaded {i} frames from video.")
243240

241+
# Use transformers transformers.video_utils.VideoMetadata format
242+
metadata = {
243+
"total_num_frames": total_frames_num,
244+
"fps": original_fps,
245+
"duration": duration,
246+
"video_backend": "opencv_dynamic",
247+
"frames_indices": list(frame_indices),
248+
"do_sample_frames": False,
249+
}
250+
244251
return frames, metadata
245252

246253

0 commit comments

Comments
 (0)