huggingface · baptiste-aubertin · Oct 15, 2025 · Oct 15, 2025 · Oct 15, 2025 · Oct 16, 2025
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
@@ -1072,6 +1072,8 @@
         title: LayoutXLM
       - local: model_doc/lfm2_vl
         title: LFM2-VL
+      - local: model_doc/lightonocr
+        title: LightOnOCR
       - local: model_doc/lilt
         title: LiLT
       - local: model_doc/llama4

diff --git a/docs/source/en/model_doc/lightonocr.md b/docs/source/en/model_doc/lightonocr.md
@@ -0,0 +1,66 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+*This model was released on {release_date} and added to Hugging Face Transformers on 2025-11-18.*
+
+# LightOnOCR
+
+
+**LightOnOCR** is a compact, end-to-end vision–language model for Optical Character Recognition (OCR) and document understanding. It achieves state-of-the-art accuracy in its weight class while being several times faster and cheaper than larger general-purpose VLMs.
+
+📝 **[Read the full blog post](https://huggingface.co/blog/lightonai/lightonocr/)** | 📓 **[Finetuning notebook](https://colab.research.google.com/drive/1WjbsFJZ4vOAAlKtcCauFLn_evo5UBRNa?usp=sharing)**
+
+**Model Overview**
+
+LightOnOCR combines a Vision Transformer encoder (Pixtral-based) with a lightweight text decoder (Qwen3-based) distilled from high-quality open VLMs. It is optimized for document parsing tasks, producing accurate, layout-aware text extraction from high-resolution pages.
+
+
+
+
+## LightOnOCRConfig
+
+[[autodoc]] LightOnOCRConfig
+
+## LightOnOCRTextConfig
+
+[[autodoc]] LightOnOCRTextConfig
+
+## LightOnOCRVisionConfig
+
+[[autodoc]] LightOnOCRVisionConfig
+
+## LightOnOCRProcessor
+
+[[autodoc]] LightOnOCRProcessor
+    - __call__
+
+## LightOnOCRTextModel
+
+[[autodoc]] LightOnOCRTextModel
+    - forward
+
+## LightOnOCRVisionModel
+
+[[autodoc]] LightOnOCRVisionModel
+    - forward
+
+## LightOnOCRModel
+
+[[autodoc]] LightOnOCRModel
+    - forward
+
+## LightOnOCRForConditionalGeneration
+
+[[autodoc]] LightOnOCRForConditionalGeneration
+    - forward
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
@@ -229,6 +229,7 @@
         ("lfm2_moe", "Lfm2MoeConfig"),
         ("lfm2_vl", "Lfm2VlConfig"),
         ("lightglue", "LightGlueConfig"),
+        ("lightonocr", "LightOnOCRConfig"),
         ("lilt", "LiltConfig"),
         ("llama", "LlamaConfig"),
         ("llama4", "Llama4Config"),
@@ -665,6 +666,7 @@
         ("lfm2_moe", "Lfm2Moe"),
         ("lfm2_vl", "Lfm2Vl"),
         ("lightglue", "LightGlue"),
+        ("lightonocr", "LightOnOCR"),
         ("lilt", "LiLT"),
         ("llama", "LLaMA"),
         ("llama2", "Llama2"),

diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
@@ -229,6 +229,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("lfm2_moe", "Lfm2MoeModel"),
         ("lfm2_vl", "Lfm2VlModel"),
         ("lightglue", "LightGlueForKeypointMatching"),
+        ("lightonocr", "LightOnOCRModel"),
         ("lilt", "LiltModel"),
         ("llama", "LlamaModel"),
         ("llama4", "Llama4ForConditionalGeneration"),
@@ -1004,6 +1005,7 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin):
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
         ("kosmos-2.5", "Kosmos2_5ForConditionalGeneration"),
         ("lfm2_vl", "Lfm2VlForConditionalGeneration"),
+        ("lightonocr", "LightOnOCRForConditionalGeneration"),
         ("llama4", "Llama4ForConditionalGeneration"),
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),

diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
@@ -96,6 +96,7 @@
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
         ("lfm2_vl", "Lfm2VlProcessor"),
+        ("lightonocr", "LightOnOCRProcessor"),
         ("llama4", "Llama4Processor"),
         ("llava", "LlavaProcessor"),
         ("llava_next", "LlavaNextProcessor"),

diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
@@ -368,6 +368,13 @@
         ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
         ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
         ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
+        (
+            "lightonocr",
+            (
+                "Qwen2Tokenizer",
+                "Qwen2TokenizerFast" if is_tokenizers_available() else None,
+            ),
+        ),
         ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
         (
             "llama",

diff --git a/src/transformers/models/lightonocr/__init__.py b/src/transformers/models/lightonocr/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_lightonocr import *
+    from .modeling_lightonocr import *
+    from .processing_lightonocr import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)