Skip to content

Commit 0b34715

Browse files
Add LightOnOCR documentation and test improvements
Add model documentation page with config and class references. Update toctree to include LightOnOCR entry. Clean up test formatting and add vision/text models to private model exceptions.
1 parent 2e482e9 commit 0b34715

File tree

4 files changed

+71
-10
lines changed

4 files changed

+71
-10
lines changed

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,8 @@
11021102
title: LayoutXLM
11031103
- local: model_doc/lfm2_vl
11041104
title: LFM2-VL
1105+
- local: model_doc/lightonocr
1106+
title: LightOnOCR
11051107
- local: model_doc/lilt
11061108
title: LiLT
11071109
- local: model_doc/llama4
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
4+
License. You may obtain a copy of the License at
5+
6+
http://www.apache.org/licenses/LICENSE-2.0
7+
8+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
9+
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10+
specific language governing permissions and limitations under the License.
11+
12+
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13+
rendered properly in your Markdown viewer.
14+
-->
15+
16+
# LightOnOCR
17+
18+
LightOnOCR is a multimodal model designed for optical character recognition (OCR) tasks. It combines a vision encoder for processing document images with a text decoder for generating text sequences.
19+
20+
The model architecture consists of:
21+
- **Vision Encoder**: Processes document images into visual embeddings
22+
- **Text Decoder**: Generates text sequences from the visual embeddings
23+
24+
You can use LightOnOCR for various document understanding tasks including text extraction, document question answering, and structured information extraction.
25+
26+
## LightOnOCRConfig
27+
28+
[[autodoc]] LightOnOCRConfig
29+
30+
## LightOnOCRTextConfig
31+
32+
[[autodoc]] LightOnOCRTextConfig
33+
34+
## LightOnOCRVisionConfig
35+
36+
[[autodoc]] LightOnOCRVisionConfig
37+
38+
## LightOnOCRProcessor
39+
40+
[[autodoc]] LightOnOCRProcessor
41+
- __call__
42+
43+
## LightOnOCRText
44+
45+
[[autodoc]] LightOnOCRText
46+
- forward
47+
48+
## LightOnOCRVision
49+
50+
[[autodoc]] LightOnOCRVision
51+
- forward
52+
53+
## LightOnOCRModel
54+
55+
[[autodoc]] LightOnOCRModel
56+
- forward
57+
58+
## LightOnOCRForConditionalGeneration
59+
60+
[[autodoc]] LightOnOCRForConditionalGeneration
61+
- forward

tests/models/lightonocr/test_modeling_lightonocr.py

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -183,11 +183,7 @@ class LightOnOCRForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
183183
if is_torch_available()
184184
else ()
185185
)
186-
pipeline_model_mapping = (
187-
{"image-text-to-text": LightOnOCRForConditionalGeneration}
188-
if is_torch_available()
189-
else {}
190-
)
186+
pipeline_model_mapping = {"image-text-to-text": LightOnOCRForConditionalGeneration} if is_torch_available() else {}
191187

192188
_is_composite = True
193189
test_head_masking = False
@@ -286,7 +282,7 @@ def test_forward_pass_with_image_sizes(self):
286282
]
287283

288284
num_patches = (self.model_tester.image_size // self.model_tester.patch_size) ** 2
289-
num_image_tokens = num_patches // (config.spatial_merge_size ** 2)
285+
num_image_tokens = num_patches // (config.spatial_merge_size**2)
290286

291287
input_ids = ids_tensor([batch_size, 10 + num_image_tokens], config.text_config.vocab_size - 1) + 1
292288
input_ids[:, :num_image_tokens] = config.image_token_id
@@ -316,9 +312,7 @@ def test_model_outputs_equivalence(self):
316312

317313
# Check that outputs are deterministic
318314
if hasattr(outputs1, "last_hidden_state") and hasattr(outputs2, "last_hidden_state"):
319-
self.assertTrue(
320-
torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5)
321-
)
315+
self.assertTrue(torch.allclose(outputs1.last_hidden_state, outputs2.last_hidden_state, atol=1e-5))
322316

323317
@unittest.skip(
324318
"LightOnOCR uses complex attention patterns with sliding windows, skipping gradient checkpointing test"
@@ -501,7 +495,7 @@ def test_model_forward_with_images(self):
501495

502496
# Calculate number of image tokens
503497
num_patches = (image_size // 14) ** 2 # patch_size = 14
504-
num_image_tokens = num_patches // (config.spatial_merge_size ** 2)
498+
num_image_tokens = num_patches // (config.spatial_merge_size**2)
505499

506500
seq_len = num_image_tokens + 10
507501
input_ids = torch.randint(0, config.vocab_size - 1, (batch_size, seq_len), device=torch_device) + 1

utils/check_repo.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696
"Phi4MultimodalVisionModel",
9797
"Glm4vVisionModel",
9898
"Glm4vMoeVisionModel",
99+
"LightOnOCRText",
100+
"LightOnOCRVision",
99101
"EvollaSaProtPreTrainedModel",
100102
"BltLocalEncoder", # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM.
101103
"BltLocalDecoder", # Building part of bigger (tested) model. Tested implicitly through BLTForCausalLM.
@@ -182,6 +184,8 @@
182184
"Qwen2_5_VLTextModel", # Building part of bigger (tested) model
183185
"InternVLVisionModel", # Building part of bigger (tested) model
184186
"JanusVisionModel", # Building part of bigger (tested) model
187+
"LightOnOCRText", # Building part of bigger (tested) model. Tested implicitly through LightOnOCRForConditionalGeneration.
188+
"LightOnOCRVision", # Building part of bigger (tested) model. Tested implicitly through LightOnOCRForConditionalGeneration.
185189
"TimesFmModel", # Building part of bigger (tested) model
186190
"CsmDepthDecoderForCausalLM", # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.
187191
"CsmDepthDecoderModel", # Building part of bigger (tested) model. Tested implicitly through CsmForConditionalGenerationIntegrationTest.

0 commit comments

Comments
 (0)