@@ -1,25 +1,35 @@
 from typing import Any, Union
 
 import os
+import random
+import tarfile
 import datasets
+
+import numpy as np
 import pandas as pd
-from transformers.image_utils import load_image
+
 from tqdm import tqdm
+from typing import Literal
+from itertools import zip_longest
 from transformers import set_seed
+from transformers.image_utils import load_image
 
 from .registry import register_evaluator
 from .text_evaluator import TextEvaluator
 from .utils import get_ignore_parameters_flag
 
+DEF_VIDEO_FRAMES_AMOUNT = 10
+
 
 def preprocess_fn(example):
     return {
         "prompts": example["instruction"],
         "images": load_image(example["image_url"]),
+        "videos": None,
     }
 
 
-def prepare_default_data(num_samples=None):
+def prepare_default_data_image(num_samples=None):
     DATASET_NAME = "ucla-contextual/contextual_test"
     NUM_SAMPLES = 24 if num_samples is None else num_samples
     set_seed(42)
@@ -31,6 +41,53 @@ def prepare_default_data(num_samples=None):
     )
 
 
+def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
+    from huggingface_hub import hf_hub_download
+    from transformers.video_utils import load_video
+
+    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
+    SUBSET = "30_60_s_academic_v0_1"
+    NUM_SAMPLES = 24 if num_samples is None else num_samples
+
+    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
+                                                    split="open_ended",
+                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
+    questions_per_video = {val['video']: val for val in questions_per_video_set}
+
+    # 30_60_s_academic_v0_1_videos_10.tar.gz is the most lightweight chunk of the subset:
+    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
+    # the archive contains 56 videos
+    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
+                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
+                                      repo_type="dataset")
+
+    video_samples = []
+    extract_dir = "./videos"
+    os.makedirs(extract_dir, exist_ok=True)
+    with tarfile.open(videos_arc_path, "r:gz") as tar:
+        # keep regular files only: tar listings may also contain directory entries
+        all_videos = [member.name for member in tar.getmembers() if member.isfile()]
+
+        random.seed(42)  # nosec
+        video_samples = random.sample(all_videos, NUM_SAMPLES)  # nosec
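+        # note: NUM_SAMPLES must not exceed the 56 videos in this archive,
+        # otherwise random.sample raises ValueError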
+        for sample in video_samples:
+            tar.extract(sample, path=extract_dir)
+
+    # if num_frames < total_num_frames, sample every (total_num_frames / num_frames)-th frame; otherwise keep all frames
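+    # e.g. num_frames=10 with total_num_frames=100 selects indices 0, 10, 20, ..., 90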
+    def default_sample_indices_fn(metadata, **kwargs):
+        total_num_frames = metadata.total_num_frames
+        if num_frames < total_num_frames:
+            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+        return np.arange(0, total_num_frames, dtype=int)
+
+    data = []
+    for video_rel_path in video_samples:
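+        # load_video returns a (frames, metadata) pair; only the decoded frames are kept below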
+        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
+        prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("<image>\n", "")
+        data.append({"prompts": prompt, "images": None, "videos": video_tensor[0]})
+
+    return data
+
+
 def fix_phi3_v_eos_token_id(model_type, tokenizer):
     """
     phi3_v configs aren't consistent. Override the default
@@ -44,7 +101,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
     return dict()
 
 
-@register_evaluator("visual-text")
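+# one evaluator serves both tasks; task_type (below) selects image vs. video defaults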
+@register_evaluator("visual-text", "visual-video-text")
 class VisualTextEvaluator(TextEvaluator):
     def __init__(
         self,
@@ -60,8 +117,12 @@ def __init__(
         gen_answer_fn=None,
         generation_config=None,
         seqs_per_request=None,
+        task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
+        frames_num: int | None = None,
     ) -> None:
         self.processor = processor
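+        # "visual-text" keeps the original image-only flow; "visual-video-text" switches the default data to video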
+        self.is_image_input = (task_type == "visual-text")
+        self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
         super().__init__(
             base_model=base_model,
             tokenizer=tokenizer,
@@ -124,15 +185,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(
-            model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
+            model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
         ):
 
             from optimum.intel.openvino.modeling_visual_language import \
                 MODEL_TYPE_TO_CLS_MAPPING
             preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
                 model.config.model_type
             ].preprocess_inputs
-            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
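+            # the video is threaded through as an extra keyword; image samples arrive with video=None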
+            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
             tokens = model.generate(
                 **inputs,
                 **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +221,29 @@ def default_gen_answer(
         if isinstance(self.test_data, dict):
             assert "prompts" in self.test_data
             assert "images" in self.test_data
+            assert "videos" in self.test_data
             data = dict(self.test_data)
             data = pd.DataFrame.from_dict(data)
         else:
-            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+            input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
+            data = pd.DataFrame.from_dict(input_data)
 
         prompt_data = data["prompts"]
         image_data = data["images"]
+        videos_data = data["videos"]
 
         answers = []
         prompts = prompt_data.values
         images = image_data.values
+        videos = videos_data.values
 
-        for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
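+        # zip_longest pads any missing column with None, so each row carries either an image or a video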
+        for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
             answers.append(
                 gen_answer_fn(
                     model,
                     p,
                     i,
+                    v,
                     self.processor,
                     self.tokenizer,
                     self.max_new_tokens,