11from typing import Any , Union
22
33import os
4+ import random
5+ import tarfile
46import datasets
7+
8+ import numpy as np
59import pandas as pd
6- from transformers . image_utils import load_image
10+
711from tqdm import tqdm
12+ from typing import Literal
13+ from itertools import zip_longest
814from transformers import set_seed
15+ from huggingface_hub import hf_hub_download
16+ from transformers .image_utils import load_image
17+ from transformers .video_utils import load_video
918
1019from .registry import register_evaluator
1120from .text_evaluator import TextEvaluator
1221from .utils import get_ignore_parameters_flag
1322
23+ DEF_VIDEO_FRAMES_AMOUNT = 10
24+
1425
def preprocess_fn(example):
    """Map a raw dataset record onto the evaluator's expected columns.

    The image is loaded eagerly from its URL; ``videos`` is always ``None``
    for this image-only dataset.
    """
    prompt = example["instruction"]
    image = load_image(example["image_url"])
    return {"prompts": prompt, "images": image, "videos": None}
2032
2133
22- def prepare_default_data (num_samples = None ):
34+ def prepare_default_data_image (num_samples = None ):
2335 DATASET_NAME = "ucla-contextual/contextual_test"
2436 NUM_SAMPLES = 24 if num_samples is None else num_samples
2537 set_seed (42 )
@@ -31,6 +43,50 @@ def prepare_default_data(num_samples=None):
3143 )
3244
3345
def prepare_default_data_video(num_samples: int | None = None, num_frames: int = DEF_VIDEO_FRAMES_AMOUNT):
    """Build the default video evaluation set from LLaVA-Video-178K.

    Downloads one lightweight video archive from the dataset repo, extracts
    ``NUM_SAMPLES`` randomly chosen videos (fixed seed, so the selection is
    reproducible), decodes frames from each, and pairs every video with its
    first open-ended QA prompt from the subset annotations.

    Requires network access to the Hugging Face Hub. Side effect: extracts
    videos into ``./videos`` under the current working directory.

    :param num_samples: number of videos to sample; defaults to 24 when None.
    :param num_frames: upper bound on decoded frames per video.
    :return: list of ``{'prompts': str, 'images': None, 'videos': <frames>}``
        dicts, one per sampled video.
    """
    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
    SUBSET = "30_60_s_academic_v0_1"
    NUM_SAMPLES = 24 if num_samples is None else num_samples

    # Open-ended QA annotations for this subset only.
    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
                                                    split="open_ended",
                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
    # Index annotations by relative video path for O(1) lookup below.
    questions_per_video = {val['video']: val for val in questions_per_video_set}

    # 30_60_s_academic_v0_1_videos_10.tar.gz - just the most lightweight chunk among subset
    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
    # the archive contains 56 videos
    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
                                      repo_type="dataset")

    video_samples = []
    extract_dir = "./videos"
    os.makedirs(extract_dir, exist_ok=True)
    with tarfile.open(videos_arc_path, "r:gz") as tar:
        all_videos = tar.getnames()

        # Fixed seed so the same videos are picked on every run.
        random.seed(42)
        video_samples = random.sample(all_videos, NUM_SAMPLES)
        # NOTE(review): extract() without a filter trusts member paths from
        # the archive; acceptable for this curated dataset, but consider
        # filter="data" (Python 3.12+) to harden against path traversal.
        for sample in video_samples:
            tar.extract(sample, path=extract_dir)

    # if num_frames < total_num_frames, sample each total_num_frames/num_frames frames or sample all frames
    def default_sample_indices_fn(metadata, **kwargs):
        # Evenly spaced indices via a float step; np.arange truncates to int,
        # so approximately num_frames indices come back.
        total_num_frames = metadata.total_num_frames
        if num_frames < total_num_frames:
            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
        return np.arange(0, total_num_frames, dtype=int)

    data = []
    for video_rel_path in video_samples:
        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
        # Strip the "<image>\n" placeholder token from the question text.
        prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("<image>\n", "")
        # load_video returns a tuple — presumably (frames, metadata); only
        # the first element (decoded frames) is kept. TODO confirm.
        data.append({'prompts': prompt, "images": None, 'videos': video_tensor[0]})

    return data
88+
89+
3490def fix_phi3_v_eos_token_id (model_type , tokenizer ):
3591 """
3692 phi3_v configs aren't consistent. Override the default
@@ -44,7 +100,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
44100 return dict ()
45101
46102
47- @register_evaluator ("visual-text" )
103+ @register_evaluator ("visual-text" , "visual-video-text" )
48104class VisualTextEvaluator (TextEvaluator ):
49105 def __init__ (
50106 self ,
@@ -60,8 +116,12 @@ def __init__(
60116 gen_answer_fn = None ,
61117 generation_config = None ,
62118 seqs_per_request = None ,
119+ task_type : Literal ['visual-text' , 'visual-video-text' ] = "visual-text" ,
120+ frames_num : int | None = None ,
63121 ) -> None :
64122 self .processor = processor
123+ self .is_image_input = (task_type == "visual-text" )
124+ self .frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
65125 super ().__init__ (
66126 base_model = base_model ,
67127 tokenizer = tokenizer ,
@@ -124,15 +184,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
124184
125185 def _generate_data (self , model , gen_answer_fn = None , generation_config = None ):
126186 def default_gen_answer (
127- model , prompt , image , processor , tokenizer , max_new_tokens , crop_question
187+ model , prompt , image , video , processor , tokenizer , max_new_tokens , crop_question
128188 ):
129189
130190 from optimum .intel .openvino .modeling_visual_language import \
131191 MODEL_TYPE_TO_CLS_MAPPING
132192 preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING [
133193 model .config .model_type
134194 ].preprocess_inputs
135- inputs = preprocess_inputs (prompt , image , processor , tokenizer , config = model .config )
195+ inputs = preprocess_inputs (prompt , image , processor , tokenizer , config = model .config , video = video )
136196 tokens = model .generate (
137197 ** inputs ,
138198 ** fix_phi3_v_eos_token_id (model .config .model_type , tokenizer ),
@@ -160,24 +220,29 @@ def default_gen_answer(
160220 if isinstance (self .test_data , dict ):
161221 assert "prompts" in self .test_data
162222 assert "images" in self .test_data
223+ assert "videos" in self .test_data
163224 data = dict (self .test_data )
164225 data = pd .DataFrame .from_dict (data )
165226 else :
166- data = pd .DataFrame .from_dict (prepare_default_data (self .num_samples ))
227+ input_data = prepare_default_data_image (self .num_samples ) if self .is_image_input else prepare_default_data_video (self .num_samples , self .frames_num )
228+ data = pd .DataFrame .from_dict (input_data )
167229
168230 prompt_data = data ["prompts" ]
169231 image_data = data ["images" ]
232+ videos_data = data ["videos" ]
170233
171234 answers = []
172235 prompts = prompt_data .values
173236 images = image_data .values
237+ videos = videos_data .values
174238
175- for p , i in tqdm (zip (prompts , images ), desc = "Evaluate pipeline" ):
239+ for p , i , v in tqdm (zip_longest (prompts , images , videos ), desc = "Evaluate pipeline" ):
176240 answers .append (
177241 gen_answer_fn (
178242 model ,
179243 p ,
180244 i ,
245+ v ,
181246 self .processor ,
182247 self .tokenizer ,
183248 self .max_new_tokens ,
0 commit comments