diff --git a/tests/generation/test_continuous_batching.py b/tests/generation/test_continuous_batching.py
index 8ecc6ba42af2..ff2022879422 100644
--- a/tests/generation/test_continuous_batching.py
+++ b/tests/generation/test_continuous_batching.py
@@ -25,7 +25,6 @@
     require_kernels,
     require_read_token,
     require_torch_accelerator,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -315,36 +314,47 @@ def test_continuous_batching_parity_gemma_sdpa(self) -> None:
     # GPT-OSS is not compatible with SDPA because it has an attention sink. TODO: is this fixable?
 
     # Flash attention test
-    @require_torch_gpu
+    @require_torch_accelerator
    @require_kernels
     @slow
     def test_continuous_batching_parity_llama_flash(self) -> None:
         expected_outputs = Expectations({
             ("cuda", (9, 0)): {
                 "req_1": " 3 bolts of blue fiber and 1.5 bolts of white fiber. The total number of bolts is 4.5 bolts. The total number of bolts is 4.5 bolts.",
-            }
+            },
+            ("xpu", None): {
+                "req_1": " 3 bolts of blue fiber and 1.5 bolts of white fiber. The total number of bolts is 4.5 bolts. The total number of bolts is 4.5 bolts.",
+            },
         }).get_expectation()  # fmt: skip
         self._continuous_batching_parity("meta-llama/Llama-3.1-8B", "paged|flash_attention_2", expected_outputs)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_kernels
     @slow
     def test_continuous_batching_parity_gemma_flash(self) -> None:
         expected_outputs = Expectations({
             ("cuda", (9, 0)): {
                 "req_1": " \n \n 2 + 1 = 3 bolts \n \n \n \n \n \n \n \n \n \n \n \n \n ",
-            }
+            },
+            ("xpu", None): {
+                "req_0": "\n\n**$128**\n\n**Here's how to solve it:**\n\n* **Eggs eaten:** 3\n* **Eggs left:** 16 - 3 = 1",
+                "req_1": "\n\n**Answer:** 3 bolts\n\n**Solution:**\n\n* **White fiber:** The robe needs half as much white fiber as blue fiber, so it needs 2 bolts / 2 =",
+            },
         }).get_expectation()  # fmt: skip
         self._continuous_batching_parity("google/gemma-2-2b-it", "paged|flash_attention_2", expected_outputs)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_kernels
     @slow
     def test_continuous_batching_parity_qwen_flash(self) -> None:
-        expected_outputs = {}
+        expected_outputs = Expectations({
+            ("xpu", None): {
+                "req_1": " 3.5 bolts.\n\nLet's break it down step by step:\n\n- Blue fiber: 2 bolts\n- White fiber: half of 2 bolts = 1 bolt\n\nTotal = ",
+            },
+        }).get_expectation()  # fmt: skip
         self._continuous_batching_parity("Qwen/Qwen3-4B-Instruct-2507", "paged|flash_attention_2", expected_outputs)
 
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_kernels
     @slow
     def test_continuous_batching_parity_gpt_oss_flash(self) -> None:
diff --git a/tests/generation/test_paged_attention.py b/tests/generation/test_paged_attention.py
index 0cb13eb1dc23..d44355c599de 100644
--- a/tests/generation/test_paged_attention.py
+++ b/tests/generation/test_paged_attention.py
@@ -4,7 +4,7 @@
 from parameterized import parameterized
 
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
-from transformers.testing_utils import require_flash_attn, require_torch_gpu, slow
+from transformers.testing_utils import require_flash_attn, require_torch_accelerator, slow
 
 
 _TEST_PROMPTS = [
@@ -26,7 +26,7 @@
 
 @slow
 @require_flash_attn
-@require_torch_gpu
+@require_torch_accelerator
 class TestBatchGeneration(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py
index cb118b2267aa..f2a77e696afd 100644
--- a/tests/models/gemma2/test_modeling_gemma2.py
+++ b/tests/models/gemma2/test_modeling_gemma2.py
@@ -33,7 +33,6 @@
     require_torch,
     require_torch_accelerator,
     require_torch_large_accelerator,
-    require_torch_large_gpu,
     run_test_using_subprocess,
     slow,
     torch_device,
@@ -172,16 +171,25 @@ def test_model_2b_pipeline_bf16_flex_attention(self):
 
     @require_read_token
     @require_flash_attn
-    @require_torch_large_gpu
+    @require_torch_large_accelerator
     @mark.flash_attn_test
     @slow
     def test_model_9b_flash_attn(self):
         # See https://github.com/huggingface/transformers/issues/31953 --- flash attn was generating garbage for gemma2, especially in long context
         model_id = "google/gemma-2-9b"
-        EXPECTED_TEXTS = [
-            'Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many people died in the United States. I have found a few sites that say 500,000 but I am not sure if that is correct. I have also found a site that says 675,000 but I am not sure if that is correct either. I am trying to find out how many people died in the United States. I have found a few',
-            "Hi today I'm going to be talking about the history of the United States. The United States of America is a country in North America. It is the third largest country in the world by total area and the third most populous country with over 320 million people. The United States is a federal republic composed of 50 states and a federal district. The 48 contiguous states and the district of Columbia are in central North America between Canada and Mexico. The state of Alaska is in the",
-        ]  # fmt: skip
+        # fmt: off
+        EXPECTED_TEXTS = Expectations(
+            {
+                (None, None): ['Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many people died in the United States. I have found a few sites that say 500,000 but I am not sure if that is correct. I have also found a site that says 675,000 but I am not sure if that is correct either. I am trying to find out how many people died in the United States. I have found a few',
+                    "Hi today I'm going to be talking about the history of the United States. The United States of America is a country in North America. It is the third largest country in the world by total area and the third most populous country with over 320 million people. The United States is a federal republic composed of 50 states and a federal district. The 48 contiguous states and the district of Columbia are in central North America between Canada and Mexico. The state of Alaska is in the",
+                ],
+                ("xpu", None): ['Hello I am doing a project on the 1918 flu pandemic and I am trying to find out how many people died in the United States. I have found a few sites that say 500,000 but I am not sure if that is correct. I have also found a site that says 675,000 but I am not sure if that is correct either. I am trying to find out how many people died in the United States. I have found a few',
+                    "Hi today I'm going to be talking about the history of the United States. The United States of America is a country in North America. It is the third largest country in the world by total area and the third most populous country with over 320 million people. The United States is a federal republic consisting of 50 states and a federal district. The 48 contiguous states and the district of Columbia are in central North America between Canada and Mexico. The state of Alaska is in the",
+                ],
+            }
+        )
+        # fmt: on
+        EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation()
 
         model = AutoModelForCausalLM.from_pretrained(
             model_id, attn_implementation="flash_attention_2", dtype="float16"
@@ -192,7 +200,7 @@ def test_model_9b_flash_attn(self):
         output = model.generate(**inputs, max_new_tokens=100, do_sample=False)
 
         output_text = tokenizer.batch_decode(output, skip_special_tokens=False)
-        self.assertEqual(output_text, EXPECTED_TEXTS)
+        self.assertEqual(output_text, EXPECTED_TEXT)
 
     @pytest.mark.torch_export_test
     @slow
diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py
index 17e9d9991bd4..785e543400a8 100644
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -440,7 +440,7 @@ def test_automodelforcausallm(self):
         self.assertIsInstance(for_causal_lm, Gemma3ForConditionalGeneration)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_from_config(self):
diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py
index ef1f0d5dc80a..d1d247cace2e 100644
--- a/tests/models/glm4v/test_modeling_glm4v.py
+++ b/tests/models/glm4v/test_modeling_glm4v.py
@@ -29,7 +29,7 @@
     require_deterministic_for_xpu,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -512,7 +512,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Glm4vForConditionalGeneration.from_pretrained(
             "THUDM/GLM-4.1V-9B-Thinking",
@@ -547,7 +547,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Glm4vForConditionalGeneration.from_pretrained(
             "THUDM/GLM-4.1V-9B-Thinking",
diff --git a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
index c0c1cfee45cb..8572dc1e229f 100644
--- a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
+++ b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py
@@ -27,7 +27,7 @@
     cleanup,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_first,
     slow,
     torch_device,
@@ -434,7 +434,7 @@ def test_small_model_integration_test_with_video(self):
 
     @run_first
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Glm4vMoeForConditionalGeneration.from_pretrained(
             "zai-org/GLM-4.5V",
diff --git a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py
index 47ede51be516..4d8b787dcafc 100644
--- a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py
+++ b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py
@@ -30,7 +30,7 @@
 from transformers.testing_utils import (
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -235,7 +235,7 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids_and_fa
         pass
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     @unittest.skip(
@@ -356,7 +356,7 @@ def test_config_requires_mamba_or_attention_layers(self):
 
 # TODO (@alex-jw-brooks) - update this once the model(s) are out
 @unittest.skip(reason="GraniteMoeHybrid models are not yet released")
-@require_torch_gpu
+@require_torch_accelerator
 class GraniteMoeHybridIntegrationTest(unittest.TestCase):
     @slow
     def test_model_logits(self):
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
index 4d958aff7007..4767dfae2556 100644
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -36,7 +36,7 @@
     require_bitsandbytes,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_torch_multi_accelerator,
     slow,
     torch_device,
@@ -645,7 +645,7 @@ def test_integration_test_4bit_batch2(self):
 
     @pytest.mark.flash_attn_test
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     def test_flash_attn_2_eager_equivalence(self):
         # Create inputs
diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
index 105fba5e596b..cb77fa242905 100644
--- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
+++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py
@@ -33,7 +33,6 @@
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     require_vision,
     slow,
     torch_device,
@@ -467,7 +466,7 @@ def test_model_parallelism(self):
         pass
 
     # TODO: ydshieh
-    @require_torch_gpu
+    @require_torch_accelerator
     @slow
     @unittest.skip(reason="_update_causal_mask is not implemented yet which fails this test")
     def test_sdpa_can_dispatch_on_flash(self):
diff --git a/tests/models/longcat_flash/test_modeling_longcat_flash.py b/tests/models/longcat_flash/test_modeling_longcat_flash.py
index 981d0edb9702..85c203bae44f 100644
--- a/tests/models/longcat_flash/test_modeling_longcat_flash.py
+++ b/tests/models/longcat_flash/test_modeling_longcat_flash.py
@@ -25,7 +25,7 @@
     require_flash_attn,
     require_large_cpu_ram,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -285,7 +285,7 @@ def _prepare_config_headdim(config, requested_dim):
         return config
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_bitsandbytes
     @mark.flash_attn_test
     @slow
diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py
index c25d84e650bb..96af2264b47e 100644
--- a/tests/models/mimi/test_modeling_mimi.py
+++ b/tests/models/mimi/test_modeling_mimi.py
@@ -28,7 +28,7 @@
     is_torch_available,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -291,7 +291,7 @@ def test_identity_shortcut(self):
         self.model_tester.create_and_check_model_forward(config, inputs_dict)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     @is_flaky()
diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py
index 5f74e15cd3a6..8ac641273d17 100644
--- a/tests/models/modernbert/test_modeling_modernbert.py
+++ b/tests/models/modernbert/test_modeling_modernbert.py
@@ -27,7 +27,6 @@
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -344,14 +343,14 @@ def test_model_from_pretrained(self):
         self.assertIsNotNone(model)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         self.skipTest(reason="ModernBert flash attention does not support right padding")
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_conversion(self):
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index baba4181f730..8aa6b5f07e09 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -40,7 +40,6 @@
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
-    require_torch_gpu,
     slow,
     torch_device,
 )
@@ -282,7 +281,7 @@ def test_greedy_generate_stereo_outputs(self):
         self.model_tester.audio_channels = original_audio_channels
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
@@ -362,7 +361,7 @@ def test_flash_attn_2_inference_equivalence(self):
         _ = model_fa(dummy_input, **other_inputs)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
@@ -899,7 +898,7 @@ def test_greedy_generate_stereo_outputs(self):
         self.model_tester.audio_channels = original_audio_channels
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_conversion(self):
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index 564cb661e527..6a5a0729c0fe 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -41,7 +41,6 @@
     require_torch,
     require_torch_accelerator,
     require_torch_fp16,
-    require_torch_gpu,
     require_torchaudio,
     slow,
     torch_device,
@@ -291,7 +290,7 @@ def test_greedy_generate_stereo_outputs(self):
         self.model_tester.audio_channels = original_audio_channels
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_inference_equivalence
@@ -373,7 +372,7 @@ def test_flash_attn_2_inference_equivalence(self):
         _ = model_fa(dummy_input, **other_inputs)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_inference_equivalence_right_padding
@@ -902,7 +901,7 @@ def test_greedy_generate_stereo_outputs(self):
         self.model_tester.audio_channels = original_audio_channels
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_conversion(self):
diff --git a/tests/models/pixtral/test_image_processing_pixtral.py b/tests/models/pixtral/test_image_processing_pixtral.py
index 537e210d64c6..a4bb23225cde 100644
--- a/tests/models/pixtral/test_image_processing_pixtral.py
+++ b/tests/models/pixtral/test_image_processing_pixtral.py
@@ -19,7 +19,13 @@
 from packaging import version
 
 from transformers.image_utils import load_image
-from transformers.testing_utils import require_torch, require_torch_gpu, require_vision, slow, torch_device
+from transformers.testing_utils import (
+    require_torch,
+    require_torch_accelerator,
+    require_vision,
+    slow,
+    torch_device,
+)
 from transformers.utils import is_torch_available, is_torchvision_available, is_vision_available
 
 from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
@@ -261,7 +267,7 @@ def test_slow_fast_equivalence_batched(self):
         )
 
     @slow
-    @require_torch_gpu
+    @require_torch_accelerator
     @require_vision
     @pytest.mark.torch_compile_test
     def test_can_compile_fast_image_processor(self):
diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
index a2a131fee64f..0b3af9e1f072 100644
--- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
+++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py
@@ -37,7 +37,7 @@
     cleanup,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -831,7 +831,7 @@ def test_small_model_integration_test_w_audio(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
index 8f4459e36933..fd8cc883142b 100644
--- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
+++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py
@@ -35,7 +35,7 @@
     require_cv2,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -592,7 +592,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -611,7 +611,8 @@ def test_small_model_integration_test_batch_flashatt2(self):
 
         expected_decoded_text = Expectations({
             ("cuda", None): "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
-            ("rocm", (9, 4)): "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in"
+            ("rocm", (9, 4)): "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
+            ("xpu", None): "system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in",
         }).get_expectation()  # fmt: skip
 
         # Since the test is to generate twice the same text, we just test twice against the expected decoded text
@@ -621,7 +622,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -654,6 +655,10 @@ def test_small_model_integration_test_batch_wo_image_flashatt2(self):
                 'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
                 "system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\nI am Qwen, a large language model created by Alibaba Cloud. I am designed to answer a wide range of questions and provide information on various topics",
             ],
+            ("xpu", None): [
+                'system\nYou are a helpful assistant.\nuser\nWhat kind of dog is this?\nassistant\nThe dog in the picture appears to be a Labrador Retriever. Labradors are known for their friendly and energetic nature, which is evident in',
+                'system\nYou are a helpful assistant.\nuser\nWho are you?\nassistant\niclaim to be a large language model created by Alibaba Cloud. I am called Qwen.',
+            ],
         }).get_expectation()  # fmt: skip
 
         decoded_text = self.processor.batch_decode(output, skip_special_tokens=True)
diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
index 61d33463fb41..2559f1cbee52 100644
--- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
+++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py
@@ -23,7 +23,7 @@
     is_flaky,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     run_first,
     run_test_using_subprocess,
     slow,
@@ -67,7 +67,7 @@ def is_pipeline_test_to_skip(
         return True
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_equivalence_right_padding(self):
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index 5f90620ca703..5852b474fc4f 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -34,7 +34,7 @@
     backend_empty_cache,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -583,7 +583,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -611,7 +611,7 @@ def test_small_model_integration_test_batch_flashatt2(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen2VLForConditionalGeneration.from_pretrained(
diff --git a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
index df0d53bf404e..e483b17af9b3 100644
--- a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
+++ b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py
@@ -37,7 +37,7 @@
     cleanup,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -853,7 +853,7 @@ def test_small_model_integration_test_w_audio(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
index fcbb38260b8e..17f8a049efcb 100644
--- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
+++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -26,10 +26,11 @@
     is_torch_available,
 )
 from transformers.testing_utils import (
+    Expectations,
     cleanup,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     slow,
     torch_device,
 )
@@ -546,7 +547,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
@@ -568,18 +569,29 @@ def test_small_model_integration_test_batch_flashatt2(self):
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=30, do_sample=False)
 
-        EXPECTED_DECODED_TEXT = [
-            "user\nWhat kind of dog is this?\nassistant\nThis is a Pallas's cat, also known as the manul. It's a wild cat species native to the grasslands and montane regions",
-            "user\nWhat kind of dog is this?\nassistant\nBased on the image provided, there is no dog present. The animals in the picture are two cats.\n\nHere are some observations about the cats in the"
-        ]  # fmt: skip
+        # fmt: off
+        EXPECTED_DECODED_TEXTS = Expectations(
+            {
+                (None, None): ["user\nWhat kind of dog is this?\nassistant\nThis is a Pallas's cat, also known as the manul. It's a wild cat species native to the grasslands and montane regions",
+                    "user\nWhat kind of dog is this?\nassistant\nBased on the image provided, there is no dog present. The animals in the picture are two cats.\n\nHere are some observations about the cats in the"
+                ],
+                ("xpu", None): ["user\nWhat kind of dog is this?\nassistant\nThis is a Pallas's cat, also known as the manul. It's a small wild cat native to the grasslands and steppes",
+                    'user\nWhat kind of dog is this?\nassistant\nBased on the image provided, there is no dog present. The animals in the picture are two cats.\n\nHere is a description of the scene:\n-'
+                ],
+            }
+        )
+        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+        # fmt: on
+
+        DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
+
         self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
+            DECODED_TEXT,
             EXPECTED_DECODED_TEXT,
         )
 
     @slow
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py
index 7b634ca7acb2..f45165cbf200 100644
--- a/tests/models/siglip2/test_modeling_siglip2.py
+++ b/tests/models/siglip2/test_modeling_siglip2.py
@@ -27,7 +27,7 @@
     is_flaky,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_vision,
     slow,
     torch_device,
@@ -90,7 +90,7 @@ def test_sdpa_can_dispatch_composite_models(self):
         self.assertTrue(model_eager.config._attn_implementation == "eager")
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_equivalence(self):
@@ -599,7 +599,7 @@ def test_model_from_pretrained(self):
         self.assertIsNotNone(model)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     def test_flash_attn_2_inference_equivalence_right_padding(self):
         self.skipTest("Siglip2 does not support right padding")
@@ -743,6 +743,11 @@ def test_inference(self):
                 [ 9.4241, 10.1828, 6.3366], [ 2.4371, 3.1062, 4.5530],
                 [-12.3173, -13.7240, -13.4580], [ 1.1502, 1.1716, -1.9623]
             ],
+            ("xpu", 3): [
+                [ 1.0195, -0.0280, -1.4468], [ -4.5395, -6.2269, -1.5667], [ 4.1757, 5.0358, 3.5159],
+                [ 9.4264, 10.1879, 6.3353], [ 2.4409, 3.1058, 4.5491], [-12.3230, -13.7355, -13.4632],
+                [ 1.1520, 1.1687, -1.9647]
+            ],
         })
         EXPECTED_LOGITS_PER_TEXT = torch.tensor(expected_logits_per_texts.get_expectation()).to(torch_device)
         # fmt: on
diff --git a/tests/models/t5gemma/test_modeling_t5gemma.py b/tests/models/t5gemma/test_modeling_t5gemma.py
index 2c46dd06cbb5..5897f08601f2 100644
--- a/tests/models/t5gemma/test_modeling_t5gemma.py
+++ b/tests/models/t5gemma/test_modeling_t5gemma.py
@@ -26,7 +26,6 @@
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
-    require_torch_gpu,
     torch_device,
 )
 
@@ -1273,7 +1272,7 @@ def test_flex_attention_with_grads(self):
         _ = model(**dummy_inputs)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     def test_generate_beyond_sliding_window_with_flash_attn(self):
         config, input_ids, _, attention_mask, _, _ = self.model_tester.prepare_config_and_inputs()
diff --git a/tests/models/video_llama_3/test_modeling_video_llama_3.py b/tests/models/video_llama_3/test_modeling_video_llama_3.py
index 0f7bd7d456ad..36f64f641944 100644
--- a/tests/models/video_llama_3/test_modeling_video_llama_3.py
+++ b/tests/models/video_llama_3/test_modeling_video_llama_3.py
@@ -40,7 +40,7 @@
     backend_empty_cache,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     set_config_for_less_flaky_test,
     set_model_for_less_flaky_test,
     slow,
@@ -925,7 +925,7 @@ def test_small_model_integration_test_batch_different_resolutions(self):
         self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_flashatt2(self):
         model = VideoLlama3ForConditionalGeneration.from_pretrained(
@@ -942,17 +942,26 @@ def test_small_model_integration_test_batch_flashatt2(self):
         # it should not matter whether two images are the same size or not
         output = model.generate(**inputs, max_new_tokens=20, do_sample=False, repetition_penalty=None)
 
-        EXPECTED_DECODED_TEXT = [
-            'user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
-            'user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
-        ]  # fmt: skip
-        self.assertEqual(
-            self.processor.batch_decode(output, skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
+        # fmt: off
+        EXPECTED_DECODED_TEXTS = Expectations(
+            {
+                (None, None): ['user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
+                    'user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
+                ],
+                ("xpu", 3): ['user\n\nDescribe the image.\nassistant\nThe image captures a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
+                    'user\n\nDescribe the image.\nassistant\nThe image depicts a vibrant nighttime scene on a bustling city street. A woman in a striking red dress',
+                ],
+            }
         )
+        # fmt: on
+        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+
+        DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
+
+        self.assertEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @pytest.mark.flash_attn_test
     def test_small_model_integration_test_batch_wo_image_flashatt2(self):
         model = VideoLlama3ForConditionalGeneration.from_pretrained(
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 2c07a74d7d55..987c80ba30cd 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -29,7 +29,7 @@
     is_flaky,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_vision,
     slow,
     torch_device,
@@ -349,7 +349,7 @@ def check_hidden_states_output(inputs_dict, config, model_class):
             check_hidden_states_output(inputs_dict, config, model_class)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     @is_flaky()
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index caf2f4dcba59..a547904f8b9a 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -26,7 +26,7 @@
     is_flaky,
     require_flash_attn,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_vision,
     slow,
     torch_device,
@@ -262,7 +262,7 @@ def test_model_from_pretrained(self):
         self.assertIsNotNone(model)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @slow
     @is_flaky()
diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py
index c2767583c6cd..59ad60e05521 100644
--- a/tests/models/wav2vec2/test_modeling_wav2vec2.py
+++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py
@@ -34,7 +34,7 @@
     require_flash_attn,
     require_pyctcdecode,
     require_torch,
-    require_torch_gpu,
+    require_torch_accelerator,
     require_torchaudio,
     require_torchcodec,
     run_test_in_subprocess,
@@ -1808,7 +1808,7 @@ def run_model(lang):
         assert run_model(lang) == TRANSCRIPTIONS[lang]
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     def test_inference_ctc_fa2(self):
         model_fa = Wav2Vec2ForCTC.from_pretrained(
@@ -1830,7 +1830,7 @@ def test_inference_ctc_fa2(self):
         self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     def test_inference_ctc_fa2_batched(self):
         model_fa = Wav2Vec2ForCTC.from_pretrained(
diff --git a/tests/models/xlstm/test_modeling_xlstm.py b/tests/models/xlstm/test_modeling_xlstm.py
index 918734c7f8ac..1a718f44cb71 100644
--- a/tests/models/xlstm/test_modeling_xlstm.py
+++ b/tests/models/xlstm/test_modeling_xlstm.py
@@ -18,7 +18,13 @@
 from parameterized import parameterized
 
 from transformers import AutoTokenizer, is_torch_available, xLSTMConfig
-from transformers.testing_utils import require_read_token, require_torch, require_torch_gpu, slow, torch_device
+from transformers.testing_utils import (
+    require_read_token,
+    require_torch,
+    require_torch_accelerator,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -324,7 +330,7 @@ def test_batched_equivalence_without_cache(self):
         individual_output = tokenizer.batch_decode(individual_gen, skip_special_tokens=True)[0]
         self.assertEqual(individual_output[:100], batched_output[index_gen][:100])
 
-    @require_torch_gpu
+    @require_torch_accelerator
     def test_xlstm_block_train_vs_eval_equivalence(self):
         # Based on https://github.com/sustcsonglin/flash-linear-attention/issues/63
         # Credit to zhixuan-lin
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index c1773dd69c23..21a05d34332f 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -3365,7 +3365,7 @@ def test_flash_attn_2_fp32_ln(self):
         _ = model(dummy_input, attention_mask=dummy_attention_mask)
 
     @require_flash_attn
-    @require_torch_gpu
+    @require_torch_accelerator
     @mark.flash_attn_test
     @pytest.mark.torch_compile_test
     @slow