
Commit 01ae501

[wwb] Add possibility to check video inputs for VLM
1 parent df1c52d commit 01ae501

5 files changed (+128, -20 lines)

tools/who_what_benchmark/README.md

Lines changed: 13 additions & 1 deletion
````diff
@@ -52,7 +52,7 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai
 
 > **NOTE**: use --verbose option for debug to see the outputs with the largest difference.
 
-### Compare Visual Language Models (VLMs)
+### Compare Visual Language Models (VLMs) with image inputs
 ```sh
 # Export FP16 model to OpenVINO
 optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8
@@ -64,6 +64,18 @@ wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv -
 wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
 ```
 
+### Compare Visual Language Models (VLMs) with video inputs
+```sh
+# Export the model with 8-bit quantized weights to OpenVINO
+optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct
+# Collect the references and save the mapping in the .csv file.
+# Reference images will be stored in the "reference" subfolder under the same path as the .csv.
+wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf
+# Compute the metric
+# Target images will be stored in the "target" subfolder under the same path as the .csv.
+wwb --target-model qwen2-vl-7b-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --genai
+```
+
 ### Compare Text-to-image models
 ```sh
 # Export model with 8-bit quantized weights to OpenVINO
````

tools/who_what_benchmark/tests/test_cli_vlm.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -112,3 +112,14 @@ def test_vlm_basic(model_id, model_type, tmp_path):
 )
 def test_vlm_nanollava(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
     run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [
+        ("katuni4ka/tiny-random-qwen2vl", "visual-text"),
+        ("katuni4ka/tiny-random-llava-next-video", "visual-text"),
+    ],
+)
+def test_vlm_video(model_id, model_type, tmp_path):
+    run_test(model_id, model_type, None, None, tmp_path)
```
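The new cases can be selected by test name with pytest; a minimal sketch of a local run (the path and `-k` expression are illustrative, assuming the package's test dependencies are installed):

```python
# Run only the new video-capable VLM tests from this file.
# Assumption: who_what_benchmark and its test requirements are installed.
import pytest

pytest.main([
    "tools/who_what_benchmark/tests/test_cli_vlm.py",
    "-k", "test_vlm_video",  # matches both parametrized model IDs
    "-v",
])
```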

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -24,7 +24,7 @@ def __init__(self, model, model_dir, model_type):
         self.model = model
         self.model_type = model_type
 
-        if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
+        if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
             try:
                 self.config = AutoConfig.from_pretrained(model_dir)
             except Exception:
@@ -321,7 +321,7 @@ def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **k
     return GenAIModelWrapper(
         pipeline,
         model_dir,
-        "visual-text"
+        kwargs.get("model_type", "visual-text")
     )
 
 
@@ -641,7 +641,8 @@ def load_model(
         return load_text2image_model(
             model_id, device, ov_options, use_hf, use_genai, **kwargs
        )
-    elif model_type == "visual-text":
+    elif model_type == "visual-text" or model_type == "visual-video-text":
+        kwargs["model_type"] = model_type
         return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
    elif model_type == "image-to-image":
         return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
```
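The key detail above is that `visual-video-text` reuses the visual-text loading path while the originally requested type is threaded through `kwargs`, so the GenAI wrapper is tagged with the task that was actually asked for. A stripped-down sketch of that dispatch pattern (stub loaders, not the real implementation):

```python
# Illustration of the model_type plumbing added in load_model();
# the loader body is a stub, only the kwargs threading mirrors the diff.
def load_visual_text_model(model_id, **kwargs):
    # The wrapper records whichever type the caller asked for,
    # falling back to "visual-text" when none was passed.
    return {"model_id": model_id, "model_type": kwargs.get("model_type", "visual-text")}


def load_model(model_type, model_id, **kwargs):
    if model_type in ("visual-text", "visual-video-text"):
        kwargs["model_type"] = model_type  # preserve the original task type
        return load_visual_text_model(model_id, **kwargs)
    raise ValueError(f"Unsupported model type: {model_type}")


print(load_model("visual-video-text", "Qwen/Qwen2-VL-7B-Instruct"))
# {'model_id': 'Qwen/Qwen2-VL-7B-Instruct', 'model_type': 'visual-video-text'}
```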

tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py

Lines changed: 73 additions & 7 deletions
```diff
@@ -1,25 +1,35 @@
 from typing import Any, Union
 
 import os
+import random
+import tarfile
 import datasets
+
+import numpy as np
 import pandas as pd
-from transformers.image_utils import load_image
+
 from tqdm import tqdm
+from typing import Literal
+from itertools import zip_longest
 from transformers import set_seed
+from transformers.image_utils import load_image
 
 from .registry import register_evaluator
 from .text_evaluator import TextEvaluator
 from .utils import get_ignore_parameters_flag
 
+DEF_VIDEO_FRAMES_AMOUNT = 10
+
 
 def preprocess_fn(example):
     return {
         "prompts": example["instruction"],
         "images": load_image(example["image_url"]),
+        "videos": None,
     }
 
 
-def prepare_default_data(num_samples=None):
+def prepare_default_data_image(num_samples=None):
     DATASET_NAME = "ucla-contextual/contextual_test"
     NUM_SAMPLES = 24 if num_samples is None else num_samples
     set_seed(42)
@@ -31,6 +41,53 @@ def prepare_default_data(num_samples=None):
     )
 
 
+def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
+    from huggingface_hub import hf_hub_download
+    from transformers.video_utils import load_video
+
+    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
+    SUBSET = "30_60_s_academic_v0_1"
+    NUM_SAMPLES = 24 if num_samples is None else num_samples
+
+    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
+                                                    split="open_ended",
+                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
+    questions_per_video = {val['video']: val for val in questions_per_video_set}
+
+    # 30_60_s_academic_v0_1_videos_10.tar.gz - just the most lightweight chunk among subset
+    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
+    # the archive contains 56 videos
+    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
+                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
+                                      repo_type="dataset")
+
+    video_samples = []
+    extract_dir = "./videos"
+    os.makedirs(extract_dir, exist_ok=True)
+    with tarfile.open(videos_arc_path, "r:gz") as tar:
+        all_videos = tar.getnames()
+
+        random.seed(42)  # nosec
+        video_samples = random.sample(all_videos, NUM_SAMPLES)  # nosec
+        for sample in video_samples:
+            tar.extract(sample, path=extract_dir)
+
+    # if num_frames < total_num_frames, sample each total_num_frames/num_frames frames or sample all frames
+    def default_sample_indices_fn(metadata, **kwargs):
+        total_num_frames = metadata.total_num_frames
+        if num_frames < total_num_frames:
+            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+        return np.arange(0, total_num_frames, dtype=int)
+
+    data = []
+    for video_rel_path in video_samples:
+        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
+        prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("<image>\n", "")
+        data.append({'prompts': prompt, "images": None, 'videos': video_tensor[0]})
+
+    return data
+
+
 def fix_phi3_v_eos_token_id(model_type, tokenizer):
     """
     phi3_v configs aren't consistent. Override the default
@@ -44,7 +101,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
     return dict()
 
 
-@register_evaluator("visual-text")
+@register_evaluator("visual-text", "visual-video-text")
 class VisualTextEvaluator(TextEvaluator):
     def __init__(
         self,
@@ -60,8 +117,12 @@ def __init__(
         gen_answer_fn=None,
         generation_config=None,
         seqs_per_request=None,
+        task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
+        frames_num: int | None = None,
     ) -> None:
         self.processor = processor
+        self.is_image_input = (task_type == "visual-text")
+        self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
         super().__init__(
             base_model=base_model,
             tokenizer=tokenizer,
@@ -124,15 +185,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(
-            model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
+            model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
         ):
 
             from optimum.intel.openvino.modeling_visual_language import \
                 MODEL_TYPE_TO_CLS_MAPPING
             preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
                 model.config.model_type
             ].preprocess_inputs
-            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
+            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
             tokens = model.generate(
                 **inputs,
                 **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +221,29 @@ def default_gen_answer(
         if isinstance(self.test_data, dict):
             assert "prompts" in self.test_data
             assert "images" in self.test_data
+            assert "videos" in self.test_data
             data = dict(self.test_data)
             data = pd.DataFrame.from_dict(data)
         else:
-            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+            input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
+            data = pd.DataFrame.from_dict(input_data)
 
         prompt_data = data["prompts"]
         image_data = data["images"]
+        videos_data = data["videos"]
 
         answers = []
         prompts = prompt_data.values
         images = image_data.values
+        videos = videos_data.values
 
-        for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
+        for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
             answers.append(
                 gen_answer_fn(
                     model,
                     p,
                     i,
+                    v,
                     self.processor,
                     self.tokenizer,
                     self.max_new_tokens,
```
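The frame-sampling rule in `default_sample_indices_fn` is the main knob for video inputs: it keeps `num_frames` evenly spaced frames when the clip is longer than that, otherwise all frames. A standalone check of the arithmetic (frame counts are illustrative):

```python
import numpy as np


def sample_indices(total_num_frames: int, num_frames: int = 10) -> np.ndarray:
    # Same arithmetic as default_sample_indices_fn in the diff above:
    # evenly spaced indices with step total/num when subsampling is needed.
    if num_frames < total_num_frames:
        return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
    return np.arange(0, total_num_frames, dtype=int)


print(sample_indices(300, 10))  # 10 evenly spaced indices: 0, 30, ..., 270
print(sample_indices(8, 10))    # shorter clip: every frame, 0..7
```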

tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 27 additions & 9 deletions
```diff
@@ -62,10 +62,11 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
+        choices=["text", "text-to-image", "visual-text", "visual-video-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
         default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
-        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt "
+        "visual-text - for Visual Language Models with image inputs, visual-video-text - for Visual Language Models with video inputs, "
+        "image-to-image - for image generation based on image and prompt "
         "image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to query",
     )
     parser.add_argument(
@@ -266,6 +267,14 @@ def parse_args():
         default=None,
         help="Config option assistant_confidence_threshold for Speculative decoding.",
     )
+    parser.add_argument(
+        "--video-frames-num",
+        type=int,
+        default=None,
+        help="The number of frames to take from the video input; frames are sampled evenly across the entire length. "
+        "Applicable to Visual Language Models with video inputs.",
+    )
+
     return parser.parse_args()
 
 
@@ -507,15 +516,22 @@ def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, genera
     return image
 
 
-def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
-    image_data = ov.Tensor(np.array(image)[None])
+def genai_gen_visual_text(model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question):
+    kwargs = {
+        "do_sample": False,
+        "max_new_tokens": max_new_tokens
+    }
+    if image is not None:
+        kwargs['image'] = ov.Tensor(np.array(image)[None])
+    if video is not None:
+        kwargs['videos'] = [ov.Tensor(np.array(video))]
+
     out = model.generate(
         prompt,
         **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
-        image=image_data,
-        do_sample=False,
-        max_new_tokens=max_new_tokens
+        **kwargs
     )
+
     return out.texts[0]
 
 
@@ -588,7 +604,7 @@ def create_evaluator(base_model, args):
             is_genai=args.genai,
             seed=args.seed,
         )
-    elif task == "visual-text":
+    elif task == "visual-text" or task == "visual-video-text":
         processor, config = load_processor(args)
         tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else load_tokenizer(args)
         if config and is_model_with_automatic_crop(config) and args.hf:
@@ -605,6 +621,8 @@ def create_evaluator(base_model, args):
             gen_answer_fn=genai_gen_visual_text if args.genai else None,
             processor=processor,
             crop_question=crop_question,
+            task_type=task,
+            frames_num=args.video_frames_num
         )
     elif task == "image-to-image":
         return EvaluatorCLS(
@@ -840,7 +858,7 @@ def main():
         evaluator.dump_predictions(os.path.join(args.output, "target.csv"))
 
     if args.verbose and (args.target_model or args.target_data):
-        if args.model_type == "text" or args.model_type == "visual-text":
+        if args.model_type in ["text", "visual-text", "visual-video-text"]:
             print_text_results(evaluator)
         elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
             print_image_results(evaluator)
```
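On the GenAI side, the only difference between the two tasks is which input gets attached to `generate()`: a single batched `image=` tensor for `visual-text`, or a `videos=` list for `visual-video-text`. A minimal sketch of that kwargs-building branch, with plain numpy arrays standing in for `ov.Tensor` and an assumed `(frames, H, W, C)` layout:

```python
# Sketch of the branching in genai_gen_visual_text(); ov.Tensor wrapping is
# replaced by plain numpy arrays so the example runs without OpenVINO.
import numpy as np


def build_generate_kwargs(image=None, video=None, max_new_tokens=128):
    kwargs = {"do_sample": False, "max_new_tokens": max_new_tokens}
    if image is not None:
        # Single image: add a leading batch dimension, as the diff does.
        kwargs["image"] = np.array(image)[None]
    if video is not None:
        # Video: a list with one array of sampled frames per clip
        # (assumed (frames, H, W, C) layout).
        kwargs["videos"] = [np.array(video)]
    return kwargs


frames = np.zeros((10, 224, 224, 3), dtype=np.uint8)  # 10 sampled frames
print(sorted(build_generate_kwargs(video=frames).keys()))
# ['do_sample', 'max_new_tokens', 'videos']
```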
