 from typing import Any, Union
 
 import os
+import random
+import tarfile
 import datasets
+
+import numpy as np
 import pandas as pd
-from transformers.image_utils import load_image
+
 from tqdm import tqdm
+from typing import Literal
+from itertools import zip_longest
 from transformers import set_seed
+from huggingface_hub import hf_hub_download
+from transformers.image_utils import load_image
+from transformers.video_utils import load_video
 
 from .registry import register_evaluator
 from .text_evaluator import TextEvaluator
 from .utils import get_ignore_parameters_flag
 
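+# default number of frames sampled from each video for visual-video-text evaluation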
+DEF_VIDEO_FRAMES_AMOUNT = 10
+
 
 def preprocess_fn(example):
     return {
         "prompts": example["instruction"],
         "images": load_image(example["image_url"]),
30+ "videos" : None ,
1931 }
2032
2133
22- def prepare_default_data (num_samples = None ):
34+ def prepare_default_data_image (num_samples = None ):
2335 DATASET_NAME = "ucla-contextual/contextual_test"
2436 NUM_SAMPLES = 24 if num_samples is None else num_samples
2537 set_seed (42 )
@@ -31,6 +43,49 @@ def prepare_default_data(num_samples=None):
     )
 
 
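+# builds the default video evaluation set: NUM_SAMPLES clips from LLaVA-Video-178K,
+# each paired with the first open-ended question recorded for that video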
+def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
+    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
+    SUBSET = "30_60_s_academic_v0_1"
+    NUM_SAMPLES = 24 if num_samples is None else num_samples
+
+    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
+                                                    split="open_ended",
+                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
+    questions_per_video = {val['video']: val for val in questions_per_video_set}
+
+    # 30_60_s_academic_v0_1_videos_10.tar.gz is the most lightweight chunk of the subset
+    # (the archive contains 56 videos), see:
+    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
+    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
+                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
+                                      repo_type="dataset")
+
+    video_samples = []
+    extract_dir = "./videos"
+    os.makedirs(extract_dir, exist_ok=True)
+    with tarfile.open(videos_arc_path, "r:gz") as tar:
+        all_videos = tar.getnames()
+
+        random.seed(42)
+        video_samples = random.sample(all_videos, NUM_SAMPLES)
+        for sample in video_samples:
+            tar.extract(sample, path=extract_dir)
+
+    # if num_frames < total_num_frames, sample every (total_num_frames / num_frames)-th frame;
+    # otherwise take every frame
+    def default_sample_indices_fn(metadata, **kwargs):
+        total_num_frames = metadata.total_num_frames
+        if num_frames < total_num_frames:
+            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+        return np.arange(0, total_num_frames, dtype=int)
+
+    data = []
+    for video_rel_path in video_samples:
+        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
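+        # load_video returns the decoded frames plus metadata; keep only the frames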
+        data.append({'prompts': questions_per_video[video_rel_path]['conversations'][0]['value'], "images": None, 'videos': video_tensor[0]})
+
+    return data
+
+
 def fix_phi3_v_eos_token_id(model_type, tokenizer):
     """
     phi3_v configs aren't consistent. Override the default
@@ -44,7 +99,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
         return dict()
 
 
-@register_evaluator("visual-text")
+@register_evaluator("visual-text", "visual-video-text")
 class VisualTextEvaluator(TextEvaluator):
     def __init__(
         self,
@@ -60,8 +115,12 @@ def __init__(
         gen_answer_fn=None,
         generation_config=None,
         seqs_per_request=None,
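+        # task_type selects the default dataset (image vs. video); frames_num caps
+        # the frames sampled per video and falls back to DEF_VIDEO_FRAMES_AMOUNT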
+        task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
+        frames_num: int | None = None,
     ) -> None:
         self.processor = processor
+        self.is_image_input = (task_type == "visual-text")
+        self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
         super().__init__(
             base_model=base_model,
             tokenizer=tokenizer,
@@ -124,15 +183,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(
-            model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
+            model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
         ):
 
             from optimum.intel.openvino.modeling_visual_language import \
                 MODEL_TYPE_TO_CLS_MAPPING
             preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
                 model.config.model_type
             ].preprocess_inputs
-            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
+            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
             tokens = model.generate(
                 **inputs,
                 **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +219,29 @@ def default_gen_answer(
         if isinstance(self.test_data, dict):
             assert "prompts" in self.test_data
             assert "images" in self.test_data
+            assert "videos" in self.test_data
             data = dict(self.test_data)
             data = pd.DataFrame.from_dict(data)
         else:
-            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+            input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
+            data = pd.DataFrame.from_dict(input_data)
 
         prompt_data = data["prompts"]
         image_data = data["images"]
+        videos_data = data["videos"]
 
         answers = []
         prompts = prompt_data.values
         images = image_data.values
+        videos = videos_data.values
 
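+        # the DataFrame columns have equal length, so zip_longest behaves like zip here;
+        # it pads with None rather than truncating if the columns ever diverge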
-        for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
+        for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
             answers.append(
                 gen_answer_fn(
                     model,
                     p,
                     i,
+                    v,
                     self.processor,
                     self.tokenizer,
                     self.max_new_tokens,
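
For context, a minimal usage sketch of the new task type (the `model`, `tokenizer`, and `processor` objects are hypothetical placeholders; the keyword arguments follow the signature above):

    evaluator = VisualTextEvaluator(
        base_model=model,
        tokenizer=tokenizer,
        processor=processor,
        task_type="visual-video-text",  # use the LLaVA-Video-178K default data
        frames_num=8,                   # frames per clip; None -> DEF_VIDEO_FRAMES_AMOUNT
    )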