
Commit 36c1196

[wwb] Add possibility to check video inputs for VLM
1 parent eb64c87 commit 36c1196

5 files changed: +131 additions, -25 deletions


tools/who_what_benchmark/README.md

Lines changed: 13 additions & 1 deletion
@@ -52,7 +52,7 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai
 
 > **NOTE**: use --verbose option for debug to see the outputs with the largest difference.
 
-### Compare Visual Language Models (VLMs)
+### Compare Visual Language Models with image inputs (VLMs)
 ```sh
 # Export FP16 model to OpenVINO
 optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8
@@ -64,6 +64,18 @@ wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv -
 wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
 ```
 
+### Compare Visual Language Models with video inputs (VLMs)
+```sh
+# Export the model with 8-bit quantized weights to OpenVINO
+optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct
+# Collect the references and save the mapping in the .csv file.
+# Reference images will be stored in the "reference" subfolder under the same path as the .csv.
+wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf
+# Compute the metric
+# Target images will be stored in the "target" subfolder under the same path as the .csv.
+wwb --target-model qwen2-vl-7b-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --genai
+```
+
 ### Compare Text-to-image models
 ```sh
 # Export model with 8-bit quantized weights to OpenVINO
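Editor's note: for readers who script their benchmarks, below is a minimal sketch of the same two-step video workflow driven from Python via `subprocess`; it is not part of the commit, and the checkpoint name, `gt.csv` path, and frame count are illustrative values. The `--video-frames-num` flag comes from the `wwb.py` hunk later in this commit.

```python
# Hedged sketch, not part of the commit: the README's video workflow invoked via subprocess.
# Model name, paths and frame count are placeholders.
import subprocess


def run_wwb(*args):
    subprocess.run(["wwb", *args], check=True)


# 1) Collect references with the original Hugging Face model.
run_wwb("--base-model", "Qwen/Qwen2-VL-7B-Instruct",
        "--gt-data", "qwen_video_test/gt.csv",
        "--model-type", "visual-video-text",
        "--video-frames-num", "8",  # optional; the evaluator defaults to 10 frames per clip
        "--hf")

# 2) Score the exported OpenVINO model against those references.
run_wwb("--target-model", "qwen2-vl-7b-Instruct",
        "--gt-data", "qwen_video_test/gt.csv",
        "--model-type", "visual-video-text",
        "--genai")
```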

tools/who_what_benchmark/tests/test_cli_vlm.py

Lines changed: 16 additions & 5 deletions
@@ -9,16 +9,16 @@
 logger = logging.getLogger(__name__)
 
 
-def run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
+def run_test(model_id, model_type, optimum_threshold, genai_threshold, convertion_task, tmp_path):
     if sys.platform == 'darwin':
         pytest.xfail("Ticket 173169")
     GT_FILE = tmp_path / "gt.csv"
     MODEL_PATH = tmp_path / model_id.replace("/", "_")
 
+    task_args = ["--task", convertion_task] if convertion_task else []
     result = subprocess.run(["optimum-cli", "export",
                              "openvino", "-m", model_id,
-                             MODEL_PATH, "--task",
-                             "image-text-to-text",
+                             MODEL_PATH, *task_args,
                              "--trust-remote-code"],
                             capture_output=True,
                             text=True,
@@ -100,7 +100,7 @@ def run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
     ],
 )
 def test_vlm_basic(model_id, model_type, tmp_path):
-    run_test(model_id, model_type, None, None, tmp_path)
+    run_test(model_id, model_type, None, None, "image-text-to-text", tmp_path)
 
 
 @pytest.mark.nanollava
@@ -111,4 +111,15 @@ def test_vlm_basic(model_id, model_type, tmp_path):
     ],
 )
 def test_vlm_nanollava(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
-    run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
+    run_test(model_id, model_type, optimum_threshold, genai_threshold, "image-text-to-text", tmp_path)
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type", "threshold"),
+    [
+        ("katuni4ka/tiny-random-qwen2vl", "visual-text", 0.8),
+        ("katuni4ka/tiny-random-llava-next-video", "visual-text", 0.8),
+    ],
+)
+def test_vlm_video(model_id, model_type, threshold, tmp_path):
+    run_test(model_id, model_type, threshold, threshold, None, tmp_path)
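Editor's note on the new `convertion_task` parameter: the snippet below is an illustrative, self-contained sketch (not taken from the test file) of how an optional conversion task expands into the exporter command line, so that omitting it lets `optimum-cli` infer the task from the model.

```python
# Illustrative sketch only, not from the test file: splicing an optional --task
# into the optimum-cli argv, mirroring how run_test builds its export command.
def build_export_cmd(model_id, model_path, convertion_task=None):
    task_args = ["--task", convertion_task] if convertion_task else []
    return ["optimum-cli", "export", "openvino",
            "-m", model_id, str(model_path),
            *task_args, "--trust-remote-code"]


print(build_export_cmd("katuni4ka/tiny-random-qwen2vl", "qwen2vl-ov"))
print(build_export_cmd("llava-hf/llava-v1.6-mistral-7b-hf", "llava-ov",
                       convertion_task="image-text-to-text"))
```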

tools/who_what_benchmark/whowhatbench/model_loaders.py

Lines changed: 4 additions & 3 deletions
@@ -24,7 +24,7 @@ def __init__(self, model, model_dir, model_type):
         self.model = model
         self.model_type = model_type
 
-        if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
+        if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
             try:
                 self.config = AutoConfig.from_pretrained(model_dir)
             except Exception:
@@ -321,7 +321,7 @@ def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **k
     return GenAIModelWrapper(
         pipeline,
         model_dir,
-        "visual-text"
+        kwargs.get("model_type", "visual-text")
     )
 
 
@@ -641,7 +641,8 @@ def load_model(
         return load_text2image_model(
             model_id, device, ov_options, use_hf, use_genai, **kwargs
         )
-    elif model_type == "visual-text":
+    elif model_type == "visual-text" or model_type == "visual-video-text":
+        kwargs["model_type"] = model_type
        return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
    elif model_type == "image-to-image":
        return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
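Editor's note: the change above only threads the requested type through to the wrapper. The toy sketch below uses simplified stand-ins (not the project's real classes) to show the effect: one visual-text loading path now serves both model types, and downstream code can still tell them apart via `wrapper.model_type`.

```python
# Toy stand-ins, not the real whowhatbench classes; the point is only how model_type flows.
class Wrapper:
    def __init__(self, model, model_dir, model_type):
        self.model_type = model_type


def load_visual_text(model_dir, **kwargs):
    pipeline = object()  # placeholder for the actual VLM pipeline
    return Wrapper(pipeline, model_dir, kwargs.get("model_type", "visual-text"))


def load_model(model_type, model_dir, **kwargs):
    if model_type in ("visual-text", "visual-video-text"):
        kwargs["model_type"] = model_type
        return load_visual_text(model_dir, **kwargs)
    raise ValueError(f"unsupported model type: {model_type}")


assert load_model("visual-video-text", "qwen2-vl-7b").model_type == "visual-video-text"
assert load_model("visual-text", "llava-int8").model_type == "visual-text"
```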

tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py

Lines changed: 71 additions & 7 deletions
@@ -1,25 +1,37 @@
 from typing import Any, Union
 
 import os
+import random
+import tarfile
 import datasets
+
+import numpy as np
 import pandas as pd
-from transformers.image_utils import load_image
+
 from tqdm import tqdm
+from typing import Literal
+from itertools import zip_longest
 from transformers import set_seed
+from huggingface_hub import hf_hub_download
+from transformers.image_utils import load_image
+from transformers.video_utils import load_video
 
 from .registry import register_evaluator
 from .text_evaluator import TextEvaluator
 from .utils import get_ignore_parameters_flag
 
+DEF_VIDEO_FRAMES_AMOUNT = 10
+
 
 def preprocess_fn(example):
     return {
         "prompts": example["instruction"],
         "images": load_image(example["image_url"]),
+        "videos": None,
     }
 
 
-def prepare_default_data(num_samples=None):
+def prepare_default_data_image(num_samples=None):
     DATASET_NAME = "ucla-contextual/contextual_test"
     NUM_SAMPLES = 24 if num_samples is None else num_samples
     set_seed(42)
@@ -31,6 +43,49 @@ def prepare_default_data(num_samples=None):
     )
 
 
+def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
+    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
+    SUBSET = "30_60_s_academic_v0_1"
+    NUM_SAMPLES = 24 if num_samples is None else num_samples
+
+    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
+                                                    split="open_ended",
+                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
+    questions_per_video = {val['video']: val for val in questions_per_video_set}
+
+    # 30_60_s_academic_v0_1_videos_10.tar.gz is the most lightweight chunk of this subset;
+    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
+    # the archive contains 56 videos
+    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
+                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
+                                      repo_type="dataset")
+
+    video_samples = []
+    extract_dir = "./videos"
+    os.makedirs(extract_dir, exist_ok=True)
+    with tarfile.open(videos_arc_path, "r:gz") as tar:
+        all_videos = tar.getnames()
+
+        random.seed(42)
+        video_samples = random.sample(all_videos, NUM_SAMPLES)
+        for sample in video_samples:
+            tar.extract(sample, path=extract_dir)
+
+    # if num_frames < total_num_frames, sample every (total_num_frames / num_frames)-th frame; otherwise take all frames
+    def default_sample_indices_fn(metadata, **kwargs):
+        total_num_frames = metadata.total_num_frames
+        if num_frames < total_num_frames:
+            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+        return np.arange(0, total_num_frames, dtype=int)
+
+    data = []
+    for video_rel_path in video_samples:
+        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
+        data.append({'prompts': questions_per_video[video_rel_path]['conversations'][0]['value'], "images": None, 'videos': video_tensor[0]})
+
+    return data
+
+
 def fix_phi3_v_eos_token_id(model_type, tokenizer):
     """
     phi3_v configs aren't consistent. Override the default
@@ -44,7 +99,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
     return dict()
 
 
-@register_evaluator("visual-text")
+@register_evaluator("visual-text", "visual-video-text")
 class VisualTextEvaluator(TextEvaluator):
     def __init__(
         self,
@@ -60,8 +115,12 @@ def __init__(
         gen_answer_fn=None,
         generation_config=None,
         seqs_per_request=None,
+        task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
+        frames_num: int | None = None,
     ) -> None:
         self.processor = processor
+        self.is_image_input = (task_type == "visual-text")
+        self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
         super().__init__(
             base_model=base_model,
             tokenizer=tokenizer,
@@ -124,15 +183,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(
-            model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
+            model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
         ):
 
             from optimum.intel.openvino.modeling_visual_language import \
                 MODEL_TYPE_TO_CLS_MAPPING
             preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
                 model.config.model_type
             ].preprocess_inputs
-            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
+            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
             tokens = model.generate(
                 **inputs,
                 **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +219,29 @@ def default_gen_answer(
         if isinstance(self.test_data, dict):
             assert "prompts" in self.test_data
             assert "images" in self.test_data
+            assert "videos" in self.test_data
             data = dict(self.test_data)
             data = pd.DataFrame.from_dict(data)
         else:
-            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+            input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
+            data = pd.DataFrame.from_dict(input_data)
 
         prompt_data = data["prompts"]
         image_data = data["images"]
+        videos_data = data["videos"]
 
         answers = []
        prompts = prompt_data.values
        images = image_data.values
+        videos = videos_data.values
 
-        for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
+        for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
            answers.append(
                gen_answer_fn(
                    model,
                    p,
                    i,
+                    v,
                    self.processor,
                    self.tokenizer,
                    self.max_new_tokens,
tools/who_what_benchmark/whowhatbench/wwb.py

Lines changed: 27 additions & 9 deletions
@@ -62,10 +62,11 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
+        choices=["text", "text-to-image", "visual-text", "visual-video-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
         default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
-        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt "
+        "visual-text - for Visual Language Models with image inputs, visual-video-text - for Visual Language Models with video inputs, "
+        "image-to-image - for image generation based on image and prompt "
         "image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to query",
     )
     parser.add_argument(
@@ -266,6 +267,14 @@ def parse_args():
         default=None,
         help="Config option assistant_confidence_threshold for Speculative decoding.",
     )
+    parser.add_argument(
+        "--video-frames-num",
+        type=int,
+        default=None,
+        help="Number of frames to take from the video as input; the frames are sampled evenly across the entire length. "
+        "Applicable to Visual Language Models with video inputs.",
+    )
+
     return parser.parse_args()
 
 
@@ -507,15 +516,22 @@ def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, genera
     return image
 
 
-def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
-    image_data = ov.Tensor(np.array(image)[None])
+def genai_gen_visual_text(model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question):
+    kwargs = {
+        "do_sample": False,
+        "max_new_tokens": max_new_tokens
+    }
+    if image is not None:
+        kwargs['image'] = ov.Tensor(np.array(image)[None])
+    if video is not None:
+        kwargs['videos'] = [ov.Tensor(np.array(video))]
+
     out = model.generate(
         prompt,
         **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
-        image=image_data,
-        do_sample=False,
-        max_new_tokens=max_new_tokens
+        **kwargs
     )
+
     return out.texts[0]
 
 
@@ -588,7 +604,7 @@ def create_evaluator(base_model, args):
             is_genai=args.genai,
             seed=args.seed,
         )
-    elif task == "visual-text":
+    elif task == "visual-text" or task == "visual-video-text":
         processor, config = load_processor(args)
         tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else load_tokenizer(args)
         if config and is_model_with_automatic_crop(config) and args.hf:
@@ -605,6 +621,8 @@ def create_evaluator(base_model, args):
             gen_answer_fn=genai_gen_visual_text if args.genai else None,
             processor=processor,
             crop_question=crop_question,
+            task_type=task,
+            frames_num=args.video_frames_num
         )
     elif task == "image-to-image":
         return EvaluatorCLS(
@@ -840,7 +858,7 @@ def main():
         evaluator.dump_predictions(os.path.join(args.output, "target.csv"))
 
     if args.verbose and (args.target_model or args.target_data):
-        if args.model_type == "text" or args.model_type == "visual-text":
+        if args.model_type in ["text", "visual-text", "visual-video-text"]:
             print_text_results(evaluator)
         elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
             print_image_results(evaluator)
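Editor's note: to make the new branching in `genai_gen_visual_text` concrete, here is a hedged sketch of how a single image versus a sampled clip end up wrapped for the GenAI pipeline; the resolution, dtype, and frame count are assumptions for the sake of the example, not values from the commit.

```python
# Hedged illustration of the tensor wrapping in genai_gen_visual_text above;
# the 224x224 resolution, uint8 dtype, and 10-frame clip are assumptions.
import numpy as np
import openvino as ov

image = np.zeros((224, 224, 3), dtype=np.uint8)      # one HWC image
clip = np.zeros((10, 224, 224, 3), dtype=np.uint8)   # 10 sampled frames, frames-first

image_kwargs = {"image": ov.Tensor(np.array(image)[None])}  # prepend a batch axis -> 1 x H x W x C
video_kwargs = {"videos": [ov.Tensor(np.array(clip))]}      # a list with one clip tensor

print(image_kwargs["image"].shape)      # batched single image
print(video_kwargs["videos"][0].shape)  # frames-first clip
```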
