Skip to content

Commit 641b4bb

Browse files
committed
Fix EVS: the video_embeds path requires second_per_grid_ts
Signed-off-by: zitian.zhao <[email protected]>
1 parent 6c317a6 commit 641b4bb

File tree

1 file changed

+16
-1
lines changed

1 file changed

+16
-1
lines changed

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
225225
- hidden_size must match the hidden size of language model backbone.
226226
- video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
227227
format
228+
- second_per_grid_ts: The video time interval (in seconds) for each
229+
grid along the temporal dimension in the 3D position IDs. Returned
230+
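when `videos` is not `None`. Optional here (defaults to `None`), but required when EVS pruning (`video_pruning_rate > 0`) is enabled.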
228231
"""
229232

230233
type: Literal["video_embeds"]
@@ -239,6 +242,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
239242
TensorShape("nv", 3),
240243
]
241244

245+
second_per_grid_ts: Annotated[
246+
torch.Tensor | None,
247+
TensorShape("nv"),
248+
] = None
249+
242250

243251
Qwen2_5_VLVideoInputs: TypeAlias = (
244252
Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
@@ -1350,6 +1358,7 @@ def _parse_and_validate_video_input(
13501358
type="video_embeds",
13511359
video_embeds=video_embeds,
13521360
video_grid_thw=video_grid_thw,
1361+
second_per_grid_ts=second_per_grid_ts,
13531362
)
13541363

13551364
def _process_image_input(
@@ -1471,7 +1480,13 @@ def _postprocess_video_embeds_evs(
14711480

14721481
# Cast to long to match the original code
14731482
# https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
1474-
second_per_grid_ts = video_input["second_per_grid_ts"].long()
1483+
second_per_grid_ts = video_input.get("second_per_grid_ts")
1484+
if second_per_grid_ts is None:
1485+
raise ValueError(
1486+
"second_per_grid_ts is required when video_pruning_rate > 0 "
1487+
"is enabled for video inputs, including the video_embeds path."
1488+
)
1489+
second_per_grid_ts = second_per_grid_ts.long()
14751490
tokens_per_second = self.config.vision_config.tokens_per_second
14761491

14771492
video_embeds_out = []

0 commit comments

Comments
 (0)