@@ -225,6 +225,9 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
225225 - hidden_size must match the hidden size of language model backbone.
226226 - video_grid_thw shape: (num_videos, 3) in (grid_t, grid_h, grid_w)
227227 format
228+ - second_per_grid_ts: The video time interval (in seconds) for each
229+ grid along the temporal dimension in the 3D position IDs. Optional
230+ (defaults to `None`), but required when video_pruning_rate > 0.
228231 """
229232
230233 type : Literal ["video_embeds" ]
@@ -239,6 +242,11 @@ class Qwen2_5_VLVideoEmbeddingInputs(TensorSchema):
239242 TensorShape ("nv" , 3 ),
240243 ]
241244
245+ second_per_grid_ts : Annotated [
246+ torch .Tensor | None ,
247+ TensorShape ("nv" ),
248+ ] = None
249+
242250
243251Qwen2_5_VLVideoInputs : TypeAlias = (
244252 Qwen2_5_VLVideoPixelInputs | Qwen2_5_VLVideoEmbeddingInputs
@@ -1350,6 +1358,7 @@ def _parse_and_validate_video_input(
13501358 type = "video_embeds" ,
13511359 video_embeds = video_embeds ,
13521360 video_grid_thw = video_grid_thw ,
1361+ second_per_grid_ts = second_per_grid_ts ,
13531362 )
13541363
13551364 def _process_image_input (
@@ -1471,7 +1480,13 @@ def _postprocess_video_embeds_evs(
14711480
14721481 # Cast to long to match the original code
14731482 # https://github.com/huggingface/transformers/blob/41980ce93e775f6c88500c51c8db7946fc6a2add/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py#L491 # noqa
1474- second_per_grid_ts = video_input ["second_per_grid_ts" ].long ()
1483+ second_per_grid_ts = video_input .get ("second_per_grid_ts" )
1484+ if second_per_grid_ts is None :
1485+ raise ValueError (
1486+ "second_per_grid_ts is required when video_pruning_rate > 0 "
1487+ "is enabled for video inputs, including the video_embeds path."
1488+ )
1489+ second_per_grid_ts = second_per_grid_ts .long ()
14751490 tokens_per_second = self .config .vision_config .tokens_per_second
14761491
14771492 video_embeds_out = []
0 commit comments