14 changes: 13 additions & 1 deletion tools/who_what_benchmark/README.md
@@ -52,7 +52,7 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai

> **NOTE**: use the `--verbose` option for debugging to see the outputs with the largest difference.

### Compare Visual Language Models (VLMs)
### Compare Visual Language Models with image inputs (VLMs)
```sh
# Export model with 8-bit quantized weights to OpenVINO
optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8
@@ -64,6 +64,18 @@ wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv -
wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
```

### Compare Visual Language Models with video inputs (VLMs)
```sh
# Export model with 8-bit quantized weights to OpenVINO
optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct
# Collect the references and save the mapping in the .csv file.
# Reference images will be stored in the "reference" subfolder under the same path as the .csv file.
wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf
# Compute the metric
# Target images will be stored in the "target" subfolder under the same path as the .csv file.
wwb --target-model qwen2-vl-7b-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --genai
```
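
The number of sampled frames can be capped with the `--video-frames-num` option added in this PR (see the `wwb.py` changes below); a usage sketch for the reference-collection step above — by default 10 evenly spaced frames are taken per video:
```sh
# Optionally control how many frames are sampled evenly from each input video (PR default: 10)
wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf --video-frames-num 8
```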

### Compare Text-to-image models
```sh
# Export model with 8-bit quantized weights to OpenVINO
21 changes: 16 additions & 5 deletions tools/who_what_benchmark/tests/test_cli_vlm.py
@@ -9,16 +9,16 @@
logger = logging.getLogger(__name__)


def run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
def run_test(model_id, model_type, optimum_threshold, genai_threshold, conversion_task, tmp_path):
if sys.platform == 'darwin':
pytest.xfail("Ticket 173169")
GT_FILE = tmp_path / "gt.csv"
MODEL_PATH = tmp_path / model_id.replace("/", "_")

task = ["--task", convertion_task] if convertion_task else []
result = subprocess.run(["optimum-cli", "export",
"openvino", "-m", model_id,
MODEL_PATH, "--task",
"image-text-to-text",
MODEL_PATH, *task,
"--trust-remote-code"],
capture_output=True,
text=True,
@@ -100,7 +100,7 @@ def run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
],
)
def test_vlm_basic(model_id, model_type, tmp_path):
run_test(model_id, model_type, None, None, tmp_path)
run_test(model_id, model_type, None, None, "image-text-to-text", tmp_path)


@pytest.mark.nanollava
@@ -111,4 +111,15 @@ def test_vlm_basic(model_id, model_type, tmp_path):
],
)
def test_vlm_nanollava(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
run_test(model_id, model_type, optimum_threshold, genai_threshold, "image-text-to-text", tmp_path)


@pytest.mark.parametrize(
("model_id", "model_type", "threshold"),
[
("katuni4ka/tiny-random-qwen2vl", "visual-text", 0.8),
("katuni4ka/tiny-random-llava-next-video", "visual-text", 0.8),
],
)
def test_vlm_video(model_id, model_type, threshold, tmp_path):
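# conversion_task=None omits --task, letting optimum-cli auto-detect the export task from the model config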
run_test(model_id, model_type, threshold, threshold, None, tmp_path)
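
A sketch of exercising only the new video cases locally, assuming a standard pytest setup and the file path from the diff header above:
```sh
# Select the new video-input VLM tests by name
python -m pytest tools/who_what_benchmark/tests/test_cli_vlm.py -k test_vlm_video -v
```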
7 changes: 4 additions & 3 deletions tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -24,7 +24,7 @@ def __init__(self, model, model_dir, model_type):
self.model = model
self.model_type = model_type

if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
try:
self.config = AutoConfig.from_pretrained(model_dir)
except Exception:
@@ -321,7 +321,7 @@ def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **k
return GenAIModelWrapper(
pipeline,
model_dir,
"visual-text"
kwargs.get("model_type", "visual-text")
)


@@ -641,7 +641,8 @@ def load_model(
return load_text2image_model(
model_id, device, ov_options, use_hf, use_genai, **kwargs
)
elif model_type == "visual-text":
elif model_type == "visual-text" or model_type == "visual-video-text":
kwargs["model_type"] = model_type
return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
elif model_type == "image-to-image":
return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
79 changes: 72 additions & 7 deletions tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
@@ -1,25 +1,37 @@
from typing import Any, Union

import os
import random
import tarfile
import datasets

import numpy as np
import pandas as pd
from transformers.image_utils import load_image

from tqdm import tqdm
from typing import Literal
from itertools import zip_longest
from transformers import set_seed
from huggingface_hub import hf_hub_download
from transformers.image_utils import load_image
from transformers.video_utils import load_video

from .registry import register_evaluator
from .text_evaluator import TextEvaluator
from .utils import get_ignore_parameters_flag

DEF_VIDEO_FRAMES_AMOUNT = 10


def preprocess_fn(example):
return {
"prompts": example["instruction"],
"images": load_image(example["image_url"]),
"videos": None,
}


def prepare_default_data(num_samples=None):
def prepare_default_data_image(num_samples=None):
DATASET_NAME = "ucla-contextual/contextual_test"
NUM_SAMPLES = 24 if num_samples is None else num_samples
set_seed(42)
@@ -31,6 +43,50 @@ def prepare_default_data(num_samples=None):
)


def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
SUBSET = "30_60_s_academic_v0_1"
NUM_SAMPLES = 24 if num_samples is None else num_samples

questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
split="open_ended",
data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
questions_per_video = {val['video']: val for val in questions_per_video_set}

# 30_60_s_academic_v0_1_videos_10.tar.gz is the most lightweight chunk of this subset
# https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
# the archive contains 56 videos
videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
repo_type="dataset")

video_samples = []
extract_dir = "./videos"
os.makedirs(extract_dir, exist_ok=True)
with tarfile.open(videos_arc_path, "r:gz") as tar:
all_videos = tar.getnames()

random.seed(42)
video_samples = random.sample(all_videos, NUM_SAMPLES)
for sample in video_samples:
tar.extract(sample, path=extract_dir)

# if num_frames < total_num_frames, sample every (total_num_frames / num_frames)-th frame; otherwise take all frames
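# e.g. total_num_frames=300, num_frames=10 -> indices 0, 30, 60, ..., 270 (every 30th frame)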
def default_sample_indices_fn(metadata, **kwargs):
total_num_frames = metadata.total_num_frames
if num_frames < total_num_frames:
return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
return np.arange(0, total_num_frames, dtype=int)

data = []
for video_rel_path in video_samples:
video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("<image>\n", "")
data.append({'prompts': prompt, "images": None, 'videos': video_tensor[0]})

return data


def fix_phi3_v_eos_token_id(model_type, tokenizer):
"""
phi3_v configs aren't consistent. Override the default
@@ -44,7 +100,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
return dict()


@register_evaluator("visual-text")
@register_evaluator("visual-text", "visual-video-text")
class VisualTextEvaluator(TextEvaluator):
def __init__(
self,
@@ -60,8 +116,12 @@ def __init__(
gen_answer_fn=None,
generation_config=None,
seqs_per_request=None,
task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
frames_num: int | None = None,
) -> None:
self.processor = processor
self.is_image_input = (task_type == "visual-text")
self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
super().__init__(
base_model=base_model,
tokenizer=tokenizer,
@@ -124,15 +184,15 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):

def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
def default_gen_answer(
model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
):

from optimum.intel.openvino.modeling_visual_language import \
MODEL_TYPE_TO_CLS_MAPPING
preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
model.config.model_type
].preprocess_inputs
inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
tokens = model.generate(
**inputs,
**fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
Expand Down Expand Up @@ -160,24 +220,29 @@ def default_gen_answer(
if isinstance(self.test_data, dict):
assert "prompts" in self.test_data
assert "images" in self.test_data
assert "videos" in self.test_data
data = dict(self.test_data)
data = pd.DataFrame.from_dict(data)
else:
data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
data = pd.DataFrame.from_dict(input_data)

prompt_data = data["prompts"]
image_data = data["images"]
videos_data = data["videos"]

answers = []
prompts = prompt_data.values
images = image_data.values
videos = videos_data.values

for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
answers.append(
gen_answer_fn(
model,
p,
i,
v,
self.processor,
self.tokenizer,
self.max_new_tokens,
36 changes: 27 additions & 9 deletions tools/who_what_benchmark/whowhatbench/wwb.py
@@ -62,10 +62,11 @@ def parse_args():
parser.add_argument(
"--model-type",
type=str,
choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
choices=["text", "text-to-image", "visual-text", "visual-video-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
default="text",
help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
"visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt "
"visual-text - for Visual Language Models with image inpust, visual-video-text - for Visual Language Models with video inputs, "
"image-to-image - for image generation based on image and prompt "
"image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to query",
)
parser.add_argument(
@@ -266,6 +267,14 @@ def parse_args():
default=None,
help="Config option assistant_confidence_threshold for Speculative decoding.",
)
parser.add_argument(
"--video-frames-num",
type=int,
default=None,
help="The number of frames that will be taken from video for input, the frames will be taken evenly across the entire length, "
"applicable for Visual Language Models with video inputs",
)

return parser.parse_args()


@@ -507,15 +516,22 @@ def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, genera
return image


def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
image_data = ov.Tensor(np.array(image)[None])
def genai_gen_visual_text(model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question):
kwargs = {
"do_sample": False,
"max_new_tokens": max_new_tokens
}
if image is not None:
kwargs['image'] = ov.Tensor(np.array(image)[None])
if video is not None:
kwargs['videos'] = [ov.Tensor(np.array(video))]

out = model.generate(
prompt,
**fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
image=image_data,
do_sample=False,
max_new_tokens=max_new_tokens
**kwargs
)

return out.texts[0]


@@ -588,7 +604,7 @@ def create_evaluator(base_model, args):
is_genai=args.genai,
seed=args.seed,
)
elif task == "visual-text":
elif task == "visual-text" or task == "visual-video-text":
processor, config = load_processor(args)
tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else load_tokenizer(args)
if config and is_model_with_automatic_crop(config) and args.hf:
Expand All @@ -605,6 +621,8 @@ def create_evaluator(base_model, args):
gen_answer_fn=genai_gen_visual_text if args.genai else None,
processor=processor,
crop_question=crop_question,
task_type=task,
frames_num=args.video_frames_num
)
elif task == "image-to-image":
return EvaluatorCLS(
@@ -840,7 +858,7 @@ def main():
evaluator.dump_predictions(os.path.join(args.output, "target.csv"))

if args.verbose and (args.target_model or args.target_data):
if args.model_type == "text" or args.model_type == "visual-text":
if args.model_type in ["text", "visual-text", "visual-video-text"]:
print_text_results(evaluator)
elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
print_image_results(evaluator)