Commit 5c88a1d

_supports_flash_attn = False on vision encoder
1 parent 98dfce2

3 files changed: +2, −30 lines

src/transformers/models/lightonocr/modeling_lightonocr.py

1 addition & 1 deletion

@@ -156,7 +156,7 @@ class LightOnOCRVisionPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     supports_gradient_checkpointing = True
     _supports_attention_backend = True
-    _supports_flash_attn = True
+    _supports_flash_attn = False
     _supports_sdpa = True
     _supports_flex_attn = True
     _no_split_modules = ["LightOnOCRVisionAttentionLayer"]

src/transformers/models/lightonocr/modular_lightonocr.py

1 addition & 1 deletion

@@ -417,7 +417,7 @@ class LightOnOCRVisionPreTrainedModel(PreTrainedModel):
     main_input_name = "pixel_values"
     supports_gradient_checkpointing = True
     _supports_attention_backend = True
-    _supports_flash_attn = True
+    _supports_flash_attn = False
    _supports_sdpa = True
     _supports_flex_attn = True
     _no_split_modules = ["LightOnOCRVisionAttentionLayer"]
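The same one-line change lands in modular_lightonocr.py because, in transformers' modular system, modeling_lightonocr.py is auto-generated from the modular file; editing only the generated file would be overwritten on the next regeneration (typically run via utils/modular_model_converter.py).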

tests/models/lightonocr/test_modeling_lightonocr.py

0 additions & 28 deletions

@@ -412,34 +412,6 @@ def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
     def test_flash_attn_2_fp32_ln(self):
         pass
 
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_eager_matches_fa2_generate(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_eager_matches_sdpa_generate(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_flash_attn_2_from_config(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_flash_attn_2_inference_equivalence(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_flash_attn_2_inference_equivalence_right_padding(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_sdpa_can_dispatch_on_flash(self):
-        pass
-
-    @unittest.skip("Pixtral does not support attention interfaces.")
-    def test_flex_attention_with_grads(self):
-        pass
-
     def test_initialization(self):
         """
         Test that model initializes correctly with proper weight initialization.