diff --git a/tools/who_what_benchmark/README.md b/tools/who_what_benchmark/README.md
index d2903f2905..3b10c763ad 100644
--- a/tools/who_what_benchmark/README.md
+++ b/tools/who_what_benchmark/README.md
@@ -52,7 +52,7 @@ wwb --target-model phi-3-openvino --gt-data gt.csv --model-type text --genai
 
 > **NOTE**: use --verbose option for debug to see the outputs with the largest difference.
 
-### Compare Visual Language Models (VLMs)
+### Compare Visual Language Models with image inputs (VLMs)
 ```sh
 # Export FP16 model to OpenVINO
 optimum-cli export openvino -m llava-hf/llava-v1.6-mistral-7b-hf --weight-format int8 llava-int8
@@ -64,6 +64,18 @@ wwb --base-model llava-hf/llava-v1.6-mistral-7b-hf --gt-data llava_test/gt.csv -
 wwb --target-model llava-int8 --gt-data llava_test/gt.csv --model-type visual-text --genai
 ```
 
+### Compare Visual Language Models with video inputs (VLMs)
+```sh
+# Export model with 8-bit quantized weights to OpenVINO
+optimum-cli export openvino -m Qwen/Qwen2-VL-7B-Instruct --weight-format int8 qwen2-vl-7b-Instruct
+# Collect the references and save the mapping in the .csv file.
+# Reference data will be stored in the "reference" subfolder next to the .csv file.
+wwb --base-model Qwen/Qwen2-VL-7B-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --hf
+# Compute the metric
+# Target data will be stored in the "target" subfolder next to the .csv file.
+wwb --target-model qwen2-vl-7b-Instruct --gt-data qwen_video_test/gt.csv --model-type visual-video-text --genai
+```
+
 ### Compare Text-to-image models
 ```sh
 # Export model with 8-bit quantized weights to OpenVINO
diff --git a/tools/who_what_benchmark/tests/test_cli_vlm.py b/tools/who_what_benchmark/tests/test_cli_vlm.py
index e215d901cc..b23aba6efa 100644
--- a/tools/who_what_benchmark/tests/test_cli_vlm.py
+++ b/tools/who_what_benchmark/tests/test_cli_vlm.py
@@ -112,3 +112,14 @@ def test_vlm_basic(model_id, model_type, tmp_path):
 )
 def test_vlm_nanollava(model_id, model_type, optimum_threshold, genai_threshold, tmp_path):
     run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
+
+
+@pytest.mark.parametrize(
+    ("model_id", "model_type"),
+    [
+        ("katuni4ka/tiny-random-qwen2vl", "visual-text"),
+        ("katuni4ka/tiny-random-llava-next-video", "visual-text"),
+    ],
+)
+def test_vlm_video(model_id, model_type, tmp_path):
+    run_test(model_id, model_type, None, None, tmp_path)
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index 6fe5b44abb..166929d4b5 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -24,7 +24,7 @@ def __init__(self, model, model_dir, model_type):
         self.model = model
         self.model_type = model_type
 
-        if model_type in ["text", "visual-text", "text-embedding", "text-reranking"]:
+        if model_type in ["text", "visual-text", "visual-video-text", "text-embedding", "text-reranking"]:
             try:
                 self.config = AutoConfig.from_pretrained(model_dir)
             except Exception:
@@ -321,7 +321,7 @@ def load_visual_text_genai_pipeline(model_dir, device="CPU", ov_config=None, **k
     return GenAIModelWrapper(
         pipeline,
         model_dir,
-        "visual-text"
+        kwargs.get("model_type", "visual-text")
     )
 
 
@@ -641,7 +641,8 @@ def load_model(
         return load_text2image_model(
             model_id, device, ov_options, use_hf, use_genai, **kwargs
         )
-    elif model_type == "visual-text":
+    elif model_type == "visual-text" or model_type == "visual-video-text":
+        kwargs["model_type"] = model_type
         return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
     elif model_type == "image-to-image":
         return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai, **kwargs)
diff --git a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
index 13f9d1f6d1..b23dda45e6 100644
--- a/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
+++ b/tools/who_what_benchmark/whowhatbench/visualtext_evaluator.py
@@ -1,25 +1,37 @@
 from typing import Any, Union
 
 import os
+import random
+import tarfile
 import datasets
+
+import numpy as np
 import pandas as pd
-from transformers.image_utils import load_image
+
 from tqdm import tqdm
+from typing import Literal
+from itertools import zip_longest
 from transformers import set_seed
+from huggingface_hub import hf_hub_download
+from transformers.image_utils import load_image
+from transformers.video_utils import load_video
 
 from .registry import register_evaluator
 from .text_evaluator import TextEvaluator
 from .utils import get_ignore_parameters_flag
 
+DEF_VIDEO_FRAMES_AMOUNT = 10
+
 
 def preprocess_fn(example):
     return {
         "prompts": example["instruction"],
         "images": load_image(example["image_url"]),
+        "videos": None,
     }
 
 
-def prepare_default_data(num_samples=None):
+def prepare_default_data_image(num_samples=None):
     DATASET_NAME = "ucla-contextual/contextual_test"
     NUM_SAMPLES = 24 if num_samples is None else num_samples
     set_seed(42)
@@ -31,6 +43,50 @@
     )
 
 
+def prepare_default_data_video(num_samples=None, num_frames=DEF_VIDEO_FRAMES_AMOUNT):
+    DATASET_NAME = "lmms-lab/LLaVA-Video-178K"
+    SUBSET = "30_60_s_academic_v0_1"
+    NUM_SAMPLES = 24 if num_samples is None else num_samples
+
+    questions_per_video_set = datasets.load_dataset(DATASET_NAME, SUBSET,
+                                                    split="open_ended",
+                                                    data_files={"open_ended": f"{SUBSET}/30_60_s_academic_oe_v0_1_qa_processed.json"})
+    questions_per_video = {val['video']: val for val in questions_per_video_set}
+
+    # 30_60_s_academic_v0_1_videos_10.tar.gz is the most lightweight chunk of the subset
+    # https://huggingface.co/datasets/lmms-lab/LLaVA-Video-178K/tree/main/30_60_s_academic_v0_1
+    # the archive contains 56 videos
+    videos_arc_path = hf_hub_download(repo_id="lmms-lab/LLaVA-Video-178K",
+                                      filename=f"{SUBSET}/{SUBSET}_videos_10.tar.gz",
+                                      repo_type="dataset")
+
+    video_samples = []
+    extract_dir = "./videos"
+    os.makedirs(extract_dir, exist_ok=True)
+    with tarfile.open(videos_arc_path, "r:gz") as tar:
+        all_videos = tar.getnames()
+
+        random.seed(42)
+        video_samples = random.sample(all_videos, NUM_SAMPLES)
+        for sample in video_samples:
+            tar.extract(sample, path=extract_dir)
+
+    # if num_frames < total_num_frames, sample every (total_num_frames / num_frames)-th frame; otherwise take all frames
+    def default_sample_indices_fn(metadata, **kwargs):
+        total_num_frames = metadata.total_num_frames
+        if num_frames < total_num_frames:
+            return np.arange(0, total_num_frames, total_num_frames / num_frames, dtype=int)
+        return np.arange(0, total_num_frames, dtype=int)
+
+    data = []
+    for video_rel_path in video_samples:
+        video_tensor = load_video(os.path.join(extract_dir, video_rel_path), backend="opencv", sample_indices_fn=default_sample_indices_fn)
+        prompt = questions_per_video[video_rel_path]['conversations'][0]['value'].replace("\n", "")
+        data.append({'prompts': prompt, "images": None, 'videos': video_tensor[0]})
+
+    return data
+
+
 def fix_phi3_v_eos_token_id(model_type, tokenizer):
     """
     phi3_v configs aren't consistent. Override the default
@@ -44,7 +100,7 @@ def fix_phi3_v_eos_token_id(model_type, tokenizer):
     return dict()
 
 
-@register_evaluator("visual-text")
+@register_evaluator("visual-text", "visual-video-text")
 class VisualTextEvaluator(TextEvaluator):
     def __init__(
         self,
@@ -60,8 +116,12 @@ def __init__(
         gen_answer_fn=None,
         generation_config=None,
         seqs_per_request=None,
+        task_type: Literal['visual-text', 'visual-video-text'] = "visual-text",
+        frames_num: int | None = None,
     ) -> None:
         self.processor = processor
+        self.is_image_input = (task_type == "visual-text")
+        self.frames_num = frames_num or DEF_VIDEO_FRAMES_AMOUNT
         super().__init__(
             base_model=base_model,
             tokenizer=tokenizer,
@@ -124,7 +184,7 @@ def worst_examples(self, top_k: int = 5, metric="similarity"):
 
     def _generate_data(self, model, gen_answer_fn=None, generation_config=None):
         def default_gen_answer(
-            model, prompt, image, processor, tokenizer, max_new_tokens, crop_question
+            model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question
         ):
 
             from optimum.intel.openvino.modeling_visual_language import \
@@ -132,7 +192,7 @@ def default_gen_answer(
             preprocess_inputs = MODEL_TYPE_TO_CLS_MAPPING[
                 model.config.model_type
             ].preprocess_inputs
-            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config)
+            inputs = preprocess_inputs(prompt, image, processor, tokenizer, config=model.config, video=video)
             tokens = model.generate(
                 **inputs,
                 **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
@@ -160,24 +220,29 @@ def default_gen_answer(
         if isinstance(self.test_data, dict):
             assert "prompts" in self.test_data
             assert "images" in self.test_data
+            assert "videos" in self.test_data
             data = dict(self.test_data)
             data = pd.DataFrame.from_dict(data)
         else:
-            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+            input_data = prepare_default_data_image(self.num_samples) if self.is_image_input else prepare_default_data_video(self.num_samples, self.frames_num)
+            data = pd.DataFrame.from_dict(input_data)
 
         prompt_data = data["prompts"]
         image_data = data["images"]
+        videos_data = data["videos"]
 
         answers = []
         prompts = prompt_data.values
         images = image_data.values
+        videos = videos_data.values
 
-        for p, i in tqdm(zip(prompts, images), desc="Evaluate pipeline"):
+        for p, i, v in tqdm(zip_longest(prompts, images, videos), desc="Evaluate pipeline"):
             answers.append(
                 gen_answer_fn(
                     model,
                     p,
                     i,
+                    v,
                     self.processor,
                     self.tokenizer,
                     self.max_new_tokens,
diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py
index ab6af99d20..44354e97ec 100644
--- a/tools/who_what_benchmark/whowhatbench/wwb.py
+++ b/tools/who_what_benchmark/whowhatbench/wwb.py
@@ -62,10 +62,11 @@ def parse_args():
     parser.add_argument(
         "--model-type",
         type=str,
-        choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
+        choices=["text", "text-to-image", "visual-text", "visual-video-text", "image-to-image", "image-inpainting", "text-embedding", "text-reranking"],
        default="text",
         help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, "
-        "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt "
+        "visual-text - for Visual Language Models with image inputs, visual-video-text - for Visual Language Models with video inputs, "
+        "image-to-image - for image generation based on image and prompt "
         "image-inpainting - for image generation based on image, mask and prompt, text-reranking - for reranking a list of texts based on relevance to query",
     )
     parser.add_argument(
@@ -266,6 +267,14 @@
         default=None,
         help="Config option assistant_confidence_threshold for Speculative decoding.",
     )
 
+    parser.add_argument(
+        "--video-frames-num",
+        type=int,
+        default=None,
+        help="The number of frames to take from the input video; frames are sampled evenly across the entire video. "
+        "Applicable to Visual Language Models with video inputs.",
+    )
+
     return parser.parse_args()
 
@@ -507,15 +516,22 @@ def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, genera
     return image
 
 
-def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question):
-    image_data = ov.Tensor(np.array(image)[None])
+def genai_gen_visual_text(model, prompt, image, video, processor, tokenizer, max_new_tokens, crop_question):
+    kwargs = {
+        "do_sample": False,
+        "max_new_tokens": max_new_tokens
+    }
+    if image is not None:
+        kwargs['image'] = ov.Tensor(np.array(image)[None])
+    if video is not None:
+        kwargs['videos'] = [ov.Tensor(np.array(video))]
+
     out = model.generate(
         prompt,
         **fix_phi3_v_eos_token_id(model.config.model_type, tokenizer),
-        image=image_data,
-        do_sample=False,
-        max_new_tokens=max_new_tokens
+        **kwargs
     )
+
     return out.texts[0]
 
 
@@ -588,7 +604,7 @@
             is_genai=args.genai,
             seed=args.seed,
         )
-    elif task == "visual-text":
+    elif task == "visual-text" or task == "visual-video-text":
         processor, config = load_processor(args)
         tokenizer = processor.tokenizer if hasattr(processor, "tokenizer") else load_tokenizer(args)
         if config and is_model_with_automatic_crop(config) and args.hf:
@@ -605,6 +621,8 @@
             gen_answer_fn=genai_gen_visual_text if args.genai else None,
             processor=processor,
             crop_question=crop_question,
+            task_type=task,
+            frames_num=args.video_frames_num
         )
     elif task == "image-to-image":
         return EvaluatorCLS(
@@ -840,7 +858,7 @@
         evaluator.dump_predictions(os.path.join(args.output, "target.csv"))
 
     if args.verbose and (args.target_model or args.target_data):
-        if args.model_type == "text" or args.model_type == "visual-text":
+        if args.model_type in ["text", "visual-text", "visual-video-text"]:
             print_text_results(evaluator)
         elif "text-to-image" in args.model_type or "image-to-image" in args.model_type:
             print_image_results(evaluator)