Add LightOnOCR documentation and test improvements

baptiste-aubertin · baptiste-aubertin · commit 0b34715817c0 · 2025-10-18T18:32:14.000+02:00
Add model documentation page with config and class references. Update toctree to include LightOnOCR entry. Clean up test formatting and add vision/text models to private model exceptions.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1102,6 +1102,8 @@
         title: LayoutXLM
       - local: model_doc/lfm2_vl
         title: LFM2-VL
+      - local: model_doc/lightonocr
+        title: LightOnOCR
       - local: model_doc/lilt
         title: LiLT
       - local: model_doc/llama4
diff --git a/docs/source/en/model_doc/lightonocr.md b/docs/source/en/model_doc/lightonocr.md
@@ -0,0 +1,61 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# LightOnOCR
+
+LightOnOCR is a multimodal model designed for optical character recognition (OCR) tasks. It combines a vision encoder for processing document images with a text decoder for generating text sequences.
+
+The model architecture consists of:
+- **Vision Encoder**: Processes document images into visual embeddings
+- **Text Decoder**: Generates text sequences from the visual embeddings
+
+You can use LightOnOCR for various document understanding tasks, including text extraction, document question answering, and structured information extraction.
+
+## LightOnOCRConfig
+
+[[autodoc]] LightOnOCRConfig
+
+## LightOnOCRTextConfig
+
+[[autodoc]] LightOnOCRTextConfig
+
+## LightOnOCRVisionConfig
+
+[[autodoc]] LightOnOCRVisionConfig
+
+## LightOnOCRProcessor
+
+[[autodoc]] LightOnOCRProcessor
+    - __call__
+
+## LightOnOCRText
+
+[[autodoc]] LightOnOCRText
+    - forward
+
+## LightOnOCRVision
+
+[[autodoc]] LightOnOCRVision
+    - forward
+
+## LightOnOCRModel
+
+[[autodoc]] LightOnOCRModel
+    - forward
+
+## LightOnOCRForConditionalGeneration
+
+[[autodoc]] LightOnOCRForConditionalGeneration
+    - forward
diff --git a/tests/models/lightonocr/test_modeling_lightonocr.py b/tests/models/lightonocr/test_modeling_lightonocr.py
@@ -183,11 +183,7 @@ class LightOnOCRForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
         if is_torch_available()
         else ()
     )
-    pipeline_model_mapping = (
-        {"image-text-to-text": LightOnOCRForConditionalGeneration}
-        if is_torch_available()
-        else {}
-    )
+    pipeline_model_mapping = {"image-text-to-text": LightOnOCRForConditionalGeneration} if is_torch_available() else {}
 
     _is_composite = True
     test_head_masking = False
@@ -286,7 +282,7 @@ def test_forward_pass_with_image_sizes(self):
             ]
 
             num_patches = (self.model_tester.image_size // self.model_tester.patch_size) ** 2
-            num_image_tokens = num_patches // (config.spatial_merge_size ** 2)
+            num_image_tokens = num_patches // (config.spatial_merge_size**2)
 
             input_ids = ids_tensor([batch_size, 10 + num_image_tokens], config.text_config.vocab_size - 1) + 1
             input_ids[:, :num_image_tokens] = config.image_token_id
@@ -316,9 +312,7 @@ def test_model_outputs_equivalence(self):
 
             # Check that outputs are deterministic
             if hasattr(outputs1, "last_hidden_state") and hasattr(outputs2, "last_hidden_state"):
-                self.assertTrue(
-                    torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5)
-                )
+                self.assertTrue(torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5))
 
     @unittest.skip(
         "LightOnOCR uses complex attention patterns with sliding windows, skipping gradient checkpointing test"
@@ -501,7 +495,7 @@ def test_model_forward_with_images(self):
 
         # Calculate number of image tokens
         num_patches = (image_size // 14) ** 2  # patch_size = 14
-        num_image_tokens = num_patches // (config.spatial_merge_size ** 2)
+        num_image_tokens = num_patches // (config.spatial_merge_size**2)
 
         seq_len = num_image_tokens + 10
         input_ids = torch.randint(0, config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1
diff --git a/utils/check_repo.py b/utils/check_repo.py
@@ -96,6 +96,8 @@
     "Phi4MultimodalVisionModel",
     "Glm4vVisionModel",
     "Glm4vMoeVisionModel",
+    "LightOnOCRText",
+    "LightOnOCRVision",
     "EvollaSaProtPreTrainedModel",
     "BltLocalEncoder",  # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM.
     "BltLocalDecoder",  # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM.
@@ -182,6 +184,8 @@
         "Qwen2_5_VLTextModel",  # Building part of bigger (tested) model
         "InternVLVisionModel",  # Building part of bigger (tested) model
         "JanusVisionModel",  # Building part of bigger (tested) model
+        "LightOnOCRText",  # Building part of bigger (tested) model. Tested implicitly through LightOnOCRForConditionalGeneration.
+        "LightOnOCRVision",  # Building part of bigger (tested) model. Tested implicitly through LightOnOCRForConditionalGeneration.
         "TimesFmModel",  # Building part of bigger (tested) model
         "CsmDepthDecoderForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.
         "CsmDepthDecoderModel",  # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.