From 4952d71e0b31021ffdb3a88f568e93273dc72068 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Thu, 19 Dec 2024 10:49:15 +0100 Subject: [PATCH 01/13] Create whisper_evaluator.py --- .../custom_evaluators/whisper_evaluator.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py new file mode 100644 index 0000000000..fbf49e400f --- /dev/null +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -0,0 +1,151 @@ +""" +Copyright (c) 2024 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +import re + +import openvino_genai as ov_genai + +from ...representation import CharacterRecognitionPrediction +from ...utils import UnsupportedPackage, extract_image_representations +from .base_custom_evaluator import BaseCustomEvaluator + +try: + import inflect +except ImportError as import_err: + inflect = UnsupportedPackage("inflect", import_err.msg) + + +class WhisperEvaluator(BaseCustomEvaluator): + def __init__(self, dataset_config, pipe, orig_config): + super().__init__(dataset_config, None, orig_config) + self.pipe = pipe + if hasattr(self.pipe, 'adapter'): + self.adapter_type = self.pipe.adapter.__provider__ + + @classmethod + def from_configs(cls, config, delayed_model_loading=False, orig_config=None): + dataset_config = config['datasets'] + + framework = config['launchers'][0]['framework'] + if framework == 'openvino': + pipe = GenAI_WhisperPipeline(config) + else: + pipe = TransformersAsrPipeline(config) + return cls(dataset_config, pipe, orig_config) + + def _process(self, output_callback, calculate_metrics, progress_reporter, metric_config, csv_file): + for batch_id, (batch_input_ids, batch_annotation, batch_inputs, batch_identifiers) in enumerate(self.dataset): + batch_inputs = self.preprocessor.process(batch_inputs, batch_annotation) + batch_inputs_extr, batch_meta = extract_image_representations(batch_inputs) + + batch_raw_prediction, batch_prediction = self.pipe.predict( + batch_identifiers, batch_inputs_extr, batch_meta + ) + metrics_result = self._get_metrics_result(batch_input_ids, batch_annotation, batch_prediction, + calculate_metrics) + if output_callback: + output_callback(batch_raw_prediction[0], metrics_result=metrics_result, + element_identifiers=batch_identifiers, dataset_indices=batch_input_ids) + self._update_progress(progress_reporter, metric_config, batch_id, len(batch_prediction), csv_file) + + def release(self): + pass + + +def normalize_transcription(engine, text): + # Convert numbers to words + tokens = 
(engine.number_to_words(token) if token.isdigit() else token for token in text.split()) + # Remove punctuation except for apostrophes that are in the middle of words + text = re.sub(r"\b'\b|[^\w\s]", '', ' '.join(tokens)) + # Remove leading, trailing, and multiple consecutive spaces, and convert to uppercase + return ' '.join(text.upper().split()) + + +class WhisperPipeline: + def __init__(self, config): + if isinstance(inflect,UnsupportedPackage): + UnsupportedPackage("inflect", inflect.msg).raise_error(self.__class__.__name__) + self.engine = inflect.engine() + self.pipeline = self._initialize_pipeline(config) + + def _initialize_pipeline(self, config): + raise NotImplementedError + + def _get_predictions(self, data, identifiers, input_meta): + raise NotImplementedError + + def predict(self, identifiers, input_data, input_meta, encoder_callback=None): + predictions = [] + outputs = [] + for data in input_data: + transcription = self._get_predictions(data, identifiers, input_meta) + prediction_text = normalize_transcription(self.engine, transcription) + predictions.append(prediction_text) + outputs.append(CharacterRecognitionPrediction(identifiers[0], predictions[0])) + return [], outputs + + +class GenAI_WhisperPipeline(WhisperPipeline): + def _initialize_pipeline(self, config): + models_dirs = config.get('_models', []) + device = config.get('_device', 'CPU') + model_dir = models_dirs[0] + pipeline = ov_genai.WhisperPipeline(str(model_dir), device=device) + return pipeline + + def _get_predictions(self, data, identifiers, input_meta): + return self.pipeline.generate(data[0]).texts[0] + + +class TransformersAsrPipeline(WhisperPipeline): + def _initialize_pipeline(self, config): + try: + from transformers import ( # pylint: disable=C0415 + AutoModelForSpeechSeq2Seq, AutoProcessor) + from transformers.pipelines.automatic_speech_recognition import \ + AutomaticSpeechRecognitionPipeline # pylint: disable=C0415 + except ImportError as import_err: + 
UnsupportedPackage("transformers", import_err.msg).raise_error(self.__class__.__name__) + + try: + import torch # pylint: disable=C0415 + except ImportError as import_err: + UnsupportedPackage("torch", import_err.msg).raise_error(self.__class__.__name__) + + model_id = config.get('model_id') + device = "cpu" + + # The following code is based on the implementation found at: + # https://huggingface.co/openai/whisper-large-v3 + torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 + model = AutoModelForSpeechSeq2Seq.from_pretrained( + model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True + ).to(device) + + processor = AutoProcessor.from_pretrained(model_id) + + pipeline = AutomaticSpeechRecognitionPipeline( + model=model, + tokenizer=processor.tokenizer, + feature_extractor=processor.feature_extractor, + torch_dtype=torch_dtype, + device=device, + ) + return pipeline + + def _get_predictions(self, data, identifiers, input_meta): + sampling_rate = input_meta[0].get('sample_rate') + sample = {'path': identifiers[0], 'array': data[0], 'sampling_rate': sampling_rate} + return self.pipeline(sample)["text"] From 75b57b8f5b90a6c7211e072b11871a125b13b7bd Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Thu, 19 Dec 2024 13:44:23 +0100 Subject: [PATCH 02/13] Add OptimumIntelPipeline to whisper_evaluator.py --- .../custom_evaluators/whisper_evaluator.py | 76 ++++++++++++------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index fbf49e400f..53de578d66 100644 --- a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -13,9 +13,13 @@ See the License for the specific language governing permissions and limitations under the 
License. """ +import os import re import openvino_genai as ov_genai +from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor +from transformers.pipelines.automatic_speech_recognition import \ + AutomaticSpeechRecognitionPipeline from ...representation import CharacterRecognitionPrediction from ...utils import UnsupportedPackage, extract_image_representations @@ -28,21 +32,29 @@ class WhisperEvaluator(BaseCustomEvaluator): + VALID_PIPELINE_CLASSES = [ + "GenAI_WhisperPipeline", + "TransformersAsrPipeline", + "OptimumIntelPipeline" + ] + def __init__(self, dataset_config, pipe, orig_config): super().__init__(dataset_config, None, orig_config) self.pipe = pipe - if hasattr(self.pipe, 'adapter'): + if hasattr(self.pipe, "adapter"): self.adapter_type = self.pipe.adapter.__provider__ @classmethod def from_configs(cls, config, delayed_model_loading=False, orig_config=None): - dataset_config = config['datasets'] + dataset_config = config["datasets"] + pipeline_class_name = config["pipeline_class"] + + if pipeline_class_name not in cls.VALID_PIPELINE_CLASSES: + raise ValueError(f"Invalid pipeline class name: {pipeline_class_name}. 
" + f"Must be one of {cls.VALID_PIPELINE_CLASSES}") - framework = config['launchers'][0]['framework'] - if framework == 'openvino': - pipe = GenAI_WhisperPipeline(config) - else: - pipe = TransformersAsrPipeline(config) + pipeline_class = globals()[pipeline_class_name] + pipe = pipeline_class(config) return cls(dataset_config, pipe, orig_config) def _process(self, output_callback, calculate_metrics, progress_reporter, metric_config, csv_file): @@ -68,15 +80,13 @@ def normalize_transcription(engine, text): # Convert numbers to words tokens = (engine.number_to_words(token) if token.isdigit() else token for token in text.split()) # Remove punctuation except for apostrophes that are in the middle of words - text = re.sub(r"\b'\b|[^\w\s]", '', ' '.join(tokens)) + text = re.sub(r"\b'\b|[^\w\s]", "", " ".join(tokens)) # Remove leading, trailing, and multiple consecutive spaces, and convert to uppercase - return ' '.join(text.upper().split()) + return " ".join(text.upper().split()) class WhisperPipeline: def __init__(self, config): - if isinstance(inflect,UnsupportedPackage): - UnsupportedPackage("inflect", inflect.msg).raise_error(self.__class__.__name__) self.engine = inflect.engine() self.pipeline = self._initialize_pipeline(config) @@ -99,9 +109,8 @@ def predict(self, identifiers, input_data, input_meta, encoder_callback=None): class GenAI_WhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): - models_dirs = config.get('_models', []) - device = config.get('_device', 'CPU') - model_dir = models_dirs[0] + model_dir = config.get("_models", [None])[0] + device = config.get("_device", "CPU") pipeline = ov_genai.WhisperPipeline(str(model_dir), device=device) return pipeline @@ -109,26 +118,41 @@ def _get_predictions(self, data, identifiers, input_meta): return self.pipeline.generate(data[0]).texts[0] -class TransformersAsrPipeline(WhisperPipeline): +class OptimumIntelPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: - from 
transformers import ( # pylint: disable=C0415 - AutoModelForSpeechSeq2Seq, AutoProcessor) - from transformers.pipelines.automatic_speech_recognition import \ - AutomaticSpeechRecognitionPipeline # pylint: disable=C0415 + from optimum.intel.openvino import \ + OVModelForSpeechSeq2Seq # pylint: disable=C0415 except ImportError as import_err: - UnsupportedPackage("transformers", import_err.msg).raise_error(self.__class__.__name__) + UnsupportedPackage("optimum.intel.openvino", import_err.msg).raise_error(self.__class__.__name__) + + device = config.get("_device", "CPU") + model_dir = config.get("_models", [None])[0] + ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_dir)).to(device) + ov_processor = AutoProcessor.from_pretrained(str(model_dir)) + + pipeline = AutomaticSpeechRecognitionPipeline( + model=ov_model, + tokenizer=ov_processor.tokenizer, + feature_extractor=ov_processor.feature_extractor + ) + return pipeline + def _get_predictions(self, data, identifiers, input_meta): + sampling_rate = input_meta[0].get("sample_rate") + sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} + return self.pipeline(sample)["text"] + + +class TransformersAsrPipeline(WhisperPipeline): + def _initialize_pipeline(self, config): try: import torch # pylint: disable=C0415 except ImportError as import_err: UnsupportedPackage("torch", import_err.msg).raise_error(self.__class__.__name__) - model_id = config.get('model_id') + model_id = config.get("model_id") device = "cpu" - - # The following code is based on the implementation found at: - # https://huggingface.co/openai/whisper-large-v3 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True @@ -146,6 +170,6 @@ def _initialize_pipeline(self, config): return pipeline def _get_predictions(self, data, identifiers, input_meta): - sampling_rate = 
input_meta[0].get('sample_rate') - sample = {'path': identifiers[0], 'array': data[0], 'sampling_rate': sampling_rate} + sampling_rate = input_meta[0].get("sample_rate") + sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} return self.pipeline(sample)["text"] From b31af0fea21a63b4fa76dd31eee7ad6a33e3e94b Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Thu, 19 Dec 2024 13:46:50 +0100 Subject: [PATCH 03/13] Update whisper_evaluator.py --- .../custom_evaluators/whisper_evaluator.py | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index 53de578d66..a13239be69 100644 --- a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -118,32 +118,6 @@ def _get_predictions(self, data, identifiers, input_meta): return self.pipeline.generate(data[0]).texts[0] -class OptimumIntelPipeline(WhisperPipeline): - def _initialize_pipeline(self, config): - try: - from optimum.intel.openvino import \ - OVModelForSpeechSeq2Seq # pylint: disable=C0415 - except ImportError as import_err: - UnsupportedPackage("optimum.intel.openvino", import_err.msg).raise_error(self.__class__.__name__) - - device = config.get("_device", "CPU") - model_dir = config.get("_models", [None])[0] - ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_dir)).to(device) - ov_processor = AutoProcessor.from_pretrained(str(model_dir)) - - pipeline = AutomaticSpeechRecognitionPipeline( - model=ov_model, - tokenizer=ov_processor.tokenizer, - feature_extractor=ov_processor.feature_extractor - ) - return pipeline - - def _get_predictions(self, data, identifiers, input_meta): - sampling_rate = input_meta[0].get("sample_rate") - 
sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} - return self.pipeline(sample)["text"] - - class TransformersAsrPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: @@ -173,3 +147,31 @@ def _get_predictions(self, data, identifiers, input_meta): sampling_rate = input_meta[0].get("sample_rate") sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} return self.pipeline(sample)["text"] + + +class OptimumIntelPipeline(WhisperPipeline): + def _initialize_pipeline(self, config): + try: + from optimum.intel.openvino import \ + OVModelForSpeechSeq2Seq # pylint: disable=C0415 + except ImportError as import_err: + UnsupportedPackage("optimum.intel.openvino", import_err.msg).raise_error(self.__class__.__name__) + + device = config.get("_device", "CPU") + model_dir = config.get("_models", [None])[0] + ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_dir)) + ov_processor = AutoProcessor.from_pretrained(str(model_dir)) + + pipeline = AutomaticSpeechRecognitionPipeline( + model=ov_model, + tokenizer=ov_processor.tokenizer, + feature_extractor=ov_processor.feature_extractor, + device=device, + ) + return pipeline + + def _get_predictions(self, data, identifiers, input_meta): + sampling_rate = input_meta[0].get("sample_rate") + sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} + return self.pipeline(sample)["text"] + From 1d3c287ce27f5a440de3169ae6a01e4c84379c69 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Thu, 19 Dec 2024 15:12:15 +0100 Subject: [PATCH 04/13] Update OptimumIntelPipeline --- .../evaluators/custom_evaluators/whisper_evaluator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index a13239be69..8bfdea4c57 100644 --- 
a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -165,8 +165,7 @@ def _initialize_pipeline(self, config): pipeline = AutomaticSpeechRecognitionPipeline( model=ov_model, tokenizer=ov_processor.tokenizer, - feature_extractor=ov_processor.feature_extractor, - device=device, + feature_extractor=ov_processor.feature_extractor ) return pipeline From 74a98828068d7ef3438d3ae301c2747c4d88b44b Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Fri, 20 Dec 2024 12:11:45 +0100 Subject: [PATCH 05/13] Update naming, avoid errors for long audio --- .../custom_evaluators/whisper_evaluator.py | 46 ++++++++++++------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index 8bfdea4c57..04a17b108a 100644 --- a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -13,18 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -import os import re -import openvino_genai as ov_genai -from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor -from transformers.pipelines.automatic_speech_recognition import \ - AutomaticSpeechRecognitionPipeline - from ...representation import CharacterRecognitionPrediction from ...utils import UnsupportedPackage, extract_image_representations from .base_custom_evaluator import BaseCustomEvaluator +try: + from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor +except ImportError as import_err: + AutoModelForSpeechSeq2Seq = UnsupportedPackage("transformers", import_err.msg) + AutoProcessor = UnsupportedPackage("transformers", import_err.msg) + +try: + from transformers.pipelines.automatic_speech_recognition import \ + AutomaticSpeechRecognitionPipeline +except ImportError as import_err: + AutomaticSpeechRecognitionPipeline = UnsupportedPackage("transformers", import_err.msg) + try: import inflect except ImportError as import_err: @@ -33,9 +39,9 @@ class WhisperEvaluator(BaseCustomEvaluator): VALID_PIPELINE_CLASSES = [ - "GenAI_WhisperPipeline", - "TransformersAsrPipeline", - "OptimumIntelPipeline" + "GenAIWhisperPipeline", + "HFWhisperPipeline", + "OptimumWhisperPipeline" ] def __init__(self, dataset_config, pipe, orig_config): @@ -48,6 +54,8 @@ def __init__(self, dataset_config, pipe, orig_config): def from_configs(cls, config, delayed_model_loading=False, orig_config=None): dataset_config = config["datasets"] pipeline_class_name = config["pipeline_class"] + if 'device' in config['launchers'][0]: + config["_device"] = config['launchers'][0]['device'] if pipeline_class_name not in cls.VALID_PIPELINE_CLASSES: raise ValueError(f"Invalid pipeline class name: {pipeline_class_name}. 
" @@ -107,18 +115,23 @@ def predict(self, identifiers, input_data, input_meta, encoder_callback=None): return [], outputs -class GenAI_WhisperPipeline(WhisperPipeline): +class GenAIWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): + try: + import openvino_genai as ov_genai # pylint: disable=C0415 + except ImportError as import_err: + UnsupportedPackage("openvino_genai", import_err.msg).raise_error(self.__class__.__name__) + model_dir = config.get("_models", [None])[0] device = config.get("_device", "CPU") pipeline = ov_genai.WhisperPipeline(str(model_dir), device=device) return pipeline def _get_predictions(self, data, identifiers, input_meta): - return self.pipeline.generate(data[0]).texts[0] + return self.pipeline.generate(data[0], return_timestamps=True).texts[0] -class TransformersAsrPipeline(WhisperPipeline): +class HFWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: import torch # pylint: disable=C0415 @@ -146,10 +159,10 @@ def _initialize_pipeline(self, config): def _get_predictions(self, data, identifiers, input_meta): sampling_rate = input_meta[0].get("sample_rate") sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} - return self.pipeline(sample)["text"] + return self.pipeline(sample, return_timestamps=True)["text"] -class OptimumIntelPipeline(WhisperPipeline): +class OptimumWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: from optimum.intel.openvino import \ @@ -159,7 +172,7 @@ def _initialize_pipeline(self, config): device = config.get("_device", "CPU") model_dir = config.get("_models", [None])[0] - ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_dir)) + ov_model = OVModelForSpeechSeq2Seq.from_pretrained(str(model_dir)).to(device) ov_processor = AutoProcessor.from_pretrained(str(model_dir)) pipeline = AutomaticSpeechRecognitionPipeline( @@ -172,5 +185,4 @@ def _initialize_pipeline(self, config): def _get_predictions(self, data, 
identifiers, input_meta): sampling_rate = input_meta[0].get("sample_rate") sample = {"path": identifiers[0], "array": data[0], "sampling_rate": sampling_rate} - return self.pipeline(sample)["text"] - + return self.pipeline(sample, return_timestamps=True)["text"] From a67c9f3a8adc88c813d0ac5e4d053ac00d7f9f26 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Fri, 20 Dec 2024 12:13:01 +0100 Subject: [PATCH 06/13] Create test_whisper_evaluator.py --- .../tests/test_whisper_evaluator.py | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 tools/accuracy_checker/tests/test_whisper_evaluator.py diff --git a/tools/accuracy_checker/tests/test_whisper_evaluator.py b/tools/accuracy_checker/tests/test_whisper_evaluator.py new file mode 100644 index 0000000000..6e32d047ca --- /dev/null +++ b/tools/accuracy_checker/tests/test_whisper_evaluator.py @@ -0,0 +1,88 @@ +""" +Copyright (c) 2024 Intel Corporation + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from accuracy_checker.evaluators.custom_evaluators.whisper_evaluator import ( + GenAIWhisperPipeline, OptimumWhisperPipeline, HFWhisperPipeline, + WhisperEvaluator, normalize_transcription) +from datasets import load_dataset +from optimum.exporters.openvino.convert import export_tokenizer +from optimum.intel.openvino import OVModelForSpeechSeq2Seq +from transformers import AutoTokenizer,AutoProcessor + + +def export_model(model_id, output_dir): + tokenizer = AutoTokenizer.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + base_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id) + + base_model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + processor.save_pretrained(output_dir) + export_tokenizer(tokenizer, output_dir) + +model_name = "openai/whisper-tiny" +model_dir = Path("/tmp/whisper-tiny") + +# Export the model +export_model(model_name, model_dir) + +# Load a single sample from the dataset +dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) +sample = next(iter(dataset)) +ground_truth = sample["text"] +input_data = [sample["audio"]["array"]] +input_meta = [{"sample_rate": sample["audio"]["sampling_rate"]}] +identifiers = [sample["id"]] +# print(ground_truth) + +class TestWhisperEvaluator: + def test_hf_whisper_pipeline(self): + config = {"model_id": model_name} + pipeline = HFWhisperPipeline(config) + evaluator = WhisperEvaluator(None, pipeline, None) + + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + assert isinstance(result, str) + # print(result) + + def test_genai_whisper_pipeline(self): + config = {"_models": [model_dir], "_device": "CPU"} + pipeline = GenAIWhisperPipeline(config) + evaluator = WhisperEvaluator(None, pipeline, None) + + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + 
assert isinstance(result, str) + # print(result) + + def test_optimum_whisper_pipeline(self): + config = {"_models": [model_dir], "_device": "CPU"} + pipeline = OptimumWhisperPipeline(config) + evaluator = WhisperEvaluator(None, pipeline, None) + + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + assert isinstance(result, str) + # print(result) + + +def test_normalize_transcription(): + engine = MagicMock() + engine.number_to_words.side_effect = lambda x: "one" if x == "1" else x + text = "This is a test 1" + result = normalize_transcription(engine, text) + assert result == "THIS IS A TEST ONE" From 6bd2668a9f91e724cf289f6f9cd3b34fee7513a1 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Fri, 20 Dec 2024 13:10:39 +0100 Subject: [PATCH 07/13] Add datasets to requirements-test.in --- tools/accuracy_checker/requirements-test.in | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/accuracy_checker/requirements-test.in b/tools/accuracy_checker/requirements-test.in index 514358bc9c..24d51fb3dd 100644 --- a/tools/accuracy_checker/requirements-test.in +++ b/tools/accuracy_checker/requirements-test.in @@ -7,3 +7,4 @@ pytest-mock~=2.0 # will not include atomicwrites and thus will not work on Windows. # So as a workaround, make the atomicwrites dependency unconditional. 
atomicwrites +datasets From 8f82fcd78c4741d17ecdaf3b9b86a026973db05e Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Fri, 20 Dec 2024 13:21:05 +0100 Subject: [PATCH 08/13] Add inflect to requirements-extra.in --- tools/accuracy_checker/requirements-extra.in | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/accuracy_checker/requirements-extra.in b/tools/accuracy_checker/requirements-extra.in index 9dbc18c62b..748216c415 100644 --- a/tools/accuracy_checker/requirements-extra.in +++ b/tools/accuracy_checker/requirements-extra.in @@ -48,3 +48,6 @@ lmdb>=1.2.1 # pandas datasets support pandas>=1.1.5,<2.1 + +# word-based representations of numbers +inflect>=7.4.0 From d99c20c8ef39fc9ded0d732d664b7b1f3b4799f2 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Fri, 20 Dec 2024 14:08:32 +0100 Subject: [PATCH 09/13] Add cleanup test_whisper_evaluator.py --- .../tests/test_whisper_evaluator.py | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/tools/accuracy_checker/tests/test_whisper_evaluator.py b/tools/accuracy_checker/tests/test_whisper_evaluator.py index 6e32d047ca..1742191e28 100644 --- a/tools/accuracy_checker/tests/test_whisper_evaluator.py +++ b/tools/accuracy_checker/tests/test_whisper_evaluator.py @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +import os from pathlib import Path from unittest.mock import MagicMock @@ -23,8 +24,7 @@ from datasets import load_dataset from optimum.exporters.openvino.convert import export_tokenizer from optimum.intel.openvino import OVModelForSpeechSeq2Seq -from transformers import AutoTokenizer,AutoProcessor - +from transformers import AutoTokenizer, AutoProcessor def export_model(model_id, output_dir): tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -36,49 +36,54 @@ def export_model(model_id, output_dir): processor.save_pretrained(output_dir) export_tokenizer(tokenizer, output_dir) -model_name = "openai/whisper-tiny" +model_id = "openai/whisper-tiny" model_dir = Path("/tmp/whisper-tiny") # Export the model -export_model(model_name, model_dir) - -# Load a single sample from the dataset -dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) -sample = next(iter(dataset)) -ground_truth = sample["text"] -input_data = [sample["audio"]["array"]] -input_meta = [{"sample_rate": sample["audio"]["sampling_rate"]}] -identifiers = [sample["id"]] -# print(ground_truth) +export_model(model_id, model_dir) class TestWhisperEvaluator: + @classmethod + def setup_class(cls): + # Load a single sample from the dataset + dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) + sample = next(iter(dataset)) + cls.input_data = [sample["audio"]["array"]] + cls.input_meta = [{"sample_rate": sample["audio"]["sampling_rate"]}] + cls.identifiers = [sample["id"]] + + @classmethod + def teardown_class(cls): + if model_dir.exists(): + for item in model_dir.iterdir(): + if item.is_file(): + item.unlink() + model_dir.rmdir() + + def test_hf_whisper_pipeline(self): - config = {"model_id": model_name} + config = {"model_id": model_id} pipeline = HFWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = 
evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) assert isinstance(result, str) - # print(result) def test_genai_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = GenAIWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) assert isinstance(result, str) - # print(result) def test_optimum_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = OptimumWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) + result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) assert isinstance(result, str) - # print(result) - def test_normalize_transcription(): engine = MagicMock() @@ -86,3 +91,4 @@ def test_normalize_transcription(): text = "This is a test 1" result = normalize_transcription(engine, text) assert result == "THIS IS A TEST ONE" + From f870aed66a727c9ea7bdaf6f04e81f2ee9c70484 Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Sat, 21 Dec 2024 07:29:19 +0100 Subject: [PATCH 10/13] Cleanup of test_whisper_evaluator.py --- .../tests/test_whisper_evaluator.py | 77 ++++++++----------- 1 file changed, 34 insertions(+), 43 deletions(-) diff --git a/tools/accuracy_checker/tests/test_whisper_evaluator.py b/tools/accuracy_checker/tests/test_whisper_evaluator.py index 1742191e28..8cae64f111 100644 --- a/tools/accuracy_checker/tests/test_whisper_evaluator.py +++ b/tools/accuracy_checker/tests/test_whisper_evaluator.py @@ -15,80 +15,71 @@ """ import os from pathlib import Path -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest 
from accuracy_checker.evaluators.custom_evaluators.whisper_evaluator import ( - GenAIWhisperPipeline, OptimumWhisperPipeline, HFWhisperPipeline, + GenAIWhisperPipeline, HFWhisperPipeline, OptimumWhisperPipeline, WhisperEvaluator, normalize_transcription) from datasets import load_dataset from optimum.exporters.openvino.convert import export_tokenizer from optimum.intel.openvino import OVModelForSpeechSeq2Seq -from transformers import AutoTokenizer, AutoProcessor - -def export_model(model_id, output_dir): - tokenizer = AutoTokenizer.from_pretrained(model_id) - processor = AutoProcessor.from_pretrained(model_id) - base_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id) - - base_model.save_pretrained(output_dir) - tokenizer.save_pretrained(output_dir) - processor.save_pretrained(output_dir) - export_tokenizer(tokenizer, output_dir) +from transformers import AutoProcessor, AutoTokenizer model_id = "openai/whisper-tiny" model_dir = Path("/tmp/whisper-tiny") -# Export the model -export_model(model_id, model_dir) +def setup_module(module): + global input_data, input_meta, identifiers -class TestWhisperEvaluator: - @classmethod - def setup_class(cls): - # Load a single sample from the dataset - dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) - sample = next(iter(dataset)) - cls.input_data = [sample["audio"]["array"]] - cls.input_meta = [{"sample_rate": sample["audio"]["sampling_rate"]}] - cls.identifiers = [sample["id"]] - - @classmethod - def teardown_class(cls): - if model_dir.exists(): - for item in model_dir.iterdir(): - if item.is_file(): - item.unlink() - model_dir.rmdir() + dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) + sample = next(iter(dataset)) + input_data = [sample["audio"]["array"]] + input_meta = [{"sample_rate": sample["audio"]["sampling_rate"]}] + identifiers = [sample["id"]] + +def teardown_module(module): 
+ if model_dir.exists(): + for item in model_dir.iterdir(): + if item.is_file(): + item.unlink() + model_dir.rmdir() +def test_optimum_convert_model_to_ir(): + tokenizer = AutoTokenizer.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + base_model = OVModelForSpeechSeq2Seq.from_pretrained(model_id) + + model_dir.mkdir(parents=True, exist_ok=True) + base_model.save_pretrained(model_dir) + tokenizer.save_pretrained(model_dir) + processor.save_pretrained(model_dir) + export_tokenizer(tokenizer, model_dir) + assert base_model.__class__.__module__.startswith('optimum.intel.openvino') +class TestWhisperEvaluator: def test_hf_whisper_pipeline(self): config = {"model_id": model_id} pipeline = HFWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) assert isinstance(result, str) + @pytest.mark.dependency(depends=["test_base_model"]) def test_genai_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = GenAIWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) assert isinstance(result, str) + @pytest.mark.dependency(depends=["test_base_model"]) def test_optimum_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = OptimumWhisperPipeline(config) evaluator = WhisperEvaluator(None, pipeline, None) - result = evaluator.pipe._get_predictions(self.input_data, self.identifiers, self.input_meta) + result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) assert isinstance(result, str) - -def test_normalize_transcription(): - engine = MagicMock() - engine.number_to_words.side_effect = 
lambda x: "one" if x == "1" else x - text = "This is a test 1" - result = normalize_transcription(engine, text) - assert result == "THIS IS A TEST ONE" - From 52639a07fb9819ce0037f6fc2c462aa4ffe5138e Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Tue, 7 Jan 2025 14:04:20 +0100 Subject: [PATCH 11/13] Skip tests if modules not available --- .../tests/test_whisper_evaluator.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/tools/accuracy_checker/tests/test_whisper_evaluator.py b/tools/accuracy_checker/tests/test_whisper_evaluator.py index 8cae64f111..9f18df178a 100644 --- a/tools/accuracy_checker/tests/test_whisper_evaluator.py +++ b/tools/accuracy_checker/tests/test_whisper_evaluator.py @@ -1,5 +1,5 @@ """ -Copyright (c) 2024 Intel Corporation +Copyright (c) 2024-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,18 +20,23 @@ import pytest from accuracy_checker.evaluators.custom_evaluators.whisper_evaluator import ( GenAIWhisperPipeline, HFWhisperPipeline, OptimumWhisperPipeline, - WhisperEvaluator, normalize_transcription) + WhisperEvaluator) from datasets import load_dataset -from optimum.exporters.openvino.convert import export_tokenizer -from optimum.intel.openvino import OVModelForSpeechSeq2Seq -from transformers import AutoProcessor, AutoTokenizer + +AutoProcessor = pytest.importorskip("transformers", reason="transformers is not available").AutoProcessor +AutoTokenizer = pytest.importorskip("transformers", reason="transformers is not available").AutoTokenizer +export_tokenizer = pytest.importorskip("optimum.exporters.openvino.convert", reason="optimum.exporters.openvino.convert is not available").export_tokenizer +OVModelForSpeechSeq2Seq = pytest.importorskip("optimum.intel.openvino", reason="optimum.intel.openvino is not available").OVModelForSpeechSeq2Seq + model_id = "openai/whisper-tiny" model_dir = 
Path("/tmp/whisper-tiny") def setup_module(module): + # Setup code here global input_data, input_meta, identifiers + # Load a single sample from the dataset dataset = load_dataset("openslr/librispeech_asr", "clean", split="validation", streaming=True, trust_remote_code=True) sample = next(iter(dataset)) input_data = [sample["audio"]["array"]] @@ -39,6 +44,7 @@ def setup_module(module): identifiers = [sample["id"]] def teardown_module(module): + # Cleanup code here if model_dir.exists(): for item in model_dir.iterdir(): if item.is_file(): @@ -55,6 +61,7 @@ def test_optimum_convert_model_to_ir(): tokenizer.save_pretrained(model_dir) processor.save_pretrained(model_dir) export_tokenizer(tokenizer, model_dir) + assert base_model.__class__.__module__.startswith('optimum.intel.openvino') class TestWhisperEvaluator: @@ -66,7 +73,7 @@ def test_hf_whisper_pipeline(self): result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) assert isinstance(result, str) - @pytest.mark.dependency(depends=["test_base_model"]) + @pytest.mark.dependency(depends=["test_optimum_convert_model_to_ir"]) def test_genai_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = GenAIWhisperPipeline(config) @@ -75,7 +82,7 @@ def test_genai_whisper_pipeline(self): result = evaluator.pipe._get_predictions(input_data, identifiers, input_meta) assert isinstance(result, str) - @pytest.mark.dependency(depends=["test_base_model"]) + @pytest.mark.dependency(depends=["test_optimum_convert_model_to_ir"]) def test_optimum_whisper_pipeline(self): config = {"_models": [model_dir], "_device": "CPU"} pipeline = OptimumWhisperPipeline(config) From 74e46d3ea9e686f7092c2f5e29e8dd20e06ffd5e Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Tue, 7 Jan 2025 14:06:32 +0100 Subject: [PATCH 12/13] Update copyright --- .../evaluators/custom_evaluators/whisper_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index 04a17b108a..677eace354 100644 --- a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -1,5 +1,5 @@ """ -Copyright (c) 2024 Intel Corporation +Copyright (c) 2024-2025 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 08a89a9674f7ceec4a405c67e9e287a799a00e6c Mon Sep 17 00:00:00 2001 From: Piotr Wolnowski Date: Tue, 7 Jan 2025 15:29:42 +0100 Subject: [PATCH 13/13] Pylint fixes --- .../custom_evaluators/whisper_evaluator.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py index 677eace354..01a8fd81a6 100644 --- a/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py +++ b/tools/accuracy_checker/accuracy_checker/evaluators/custom_evaluators/whisper_evaluator.py @@ -1,5 +1,5 @@ """ -Copyright (c) 2024-2025 Intel Corporation +Copyright (c) 2024 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -26,8 +26,7 @@ AutoProcessor = UnsupportedPackage("transformers", import_err.msg) try: - from transformers.pipelines.automatic_speech_recognition import \ - AutomaticSpeechRecognitionPipeline + from transformers.pipelines.automatic_speech_recognition import AutomaticSpeechRecognitionPipeline except ImportError as import_err: AutomaticSpeechRecognitionPipeline = UnsupportedPackage("transformers", import_err.msg) @@ -119,8 +118,8 @@ class GenAIWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: import openvino_genai as ov_genai # pylint: disable=C0415 - except ImportError as import_err: - UnsupportedPackage("openvino_genai", import_err.msg).raise_error(self.__class__.__name__) + except ImportError as import_error: + UnsupportedPackage("openvino_genai", import_error.msg).raise_error(self.__class__.__name__) model_dir = config.get("_models", [None])[0] device = config.get("_device", "CPU") @@ -135,8 +134,8 @@ class HFWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: import torch # pylint: disable=C0415 - except ImportError as import_err: - UnsupportedPackage("torch", import_err.msg).raise_error(self.__class__.__name__) + except ImportError as import_error: + UnsupportedPackage("torch", import_error.msg).raise_error(self.__class__.__name__) model_id = config.get("model_id") device = "cpu" @@ -165,10 +164,9 @@ def _get_predictions(self, data, identifiers, input_meta): class OptimumWhisperPipeline(WhisperPipeline): def _initialize_pipeline(self, config): try: - from optimum.intel.openvino import \ - OVModelForSpeechSeq2Seq # pylint: disable=C0415 - except ImportError as import_err: - UnsupportedPackage("optimum.intel.openvino", import_err.msg).raise_error(self.__class__.__name__) + from optimum.intel.openvino import OVModelForSpeechSeq2Seq # pylint: disable=C0415 + except ImportError as import_error: + UnsupportedPackage("optimum.intel.openvino", import_error.msg).raise_error(self.__class__.__name__) device = 
config.get("_device", "CPU") model_dir = config.get("_models", [None])[0]