feat: ImageContent validation (#305)

anakin87 · web-flow · commit 106aa002ea91 · 2025-05-16T14:05:51.000+02:00
* initial work + tests

* mime type validation + tests

* small fixes

* wording

* add validate parameter

* remove import
diff --git a/haystack_experimental/components/image_converters/image_utils.py b/haystack_experimental/components/image_converters/image_utils.py
@@ -10,6 +10,8 @@
 from haystack.dataclasses import ByteStream
 from haystack.lazy_imports import LazyImport
 
+from haystack_experimental.dataclasses.image_content import MIME_TO_FORMAT
+
 with LazyImport("Run 'pip install pypdf pdf2image'") as pypdf_and_pdf2image_import:
     import pdf2image
     from pypdf import PdfReader
@@ -23,41 +25,6 @@
 logger = logging.getLogger(__name__)
 
 
-# NOTE: We have to rely on this since our util functions are using the bytestream object.
-#      We could change this to use the file path instead, where the file extension is used to determine the format.
-# This is a mapping of image formats to their MIME types.
-# from PIL import Image
-# Image.init()  # <- Must force all plugins to initialize to get this mapping
-# print(Image.MIME)
-FORMAT_TO_MIME = {
-    "BMP": "image/bmp",
-    "DIB": "image/bmp",
-    "PCX": "image/x-pcx",
-    "EPS": "application/postscript",
-    "GIF": "image/gif",
-    "PNG": "image/png",
-    "JPEG2000": "image/jp2",
-    "ICNS": "image/icns",
-    "ICO": "image/x-icon",
-    "JPEG": "image/jpeg",
-    "MPEG": "video/mpeg",
-    "TIFF": "image/tiff",
-    "MPO": "image/mpo",
-    "PALM": "image/palm",
-    "PDF": "application/pdf",
-    "PPM": "image/x-portable-anymap",
-    "PSD": "image/vnd.adobe.photoshop",
-    "SGI": "image/sgi",
-    "TGA": "image/x-tga",
-    "WEBP": "image/webp",
-    "XBM": "image/xbm",
-    "XPM": "image/xpm",
-}
-MIME_TO_FORMAT = {v: k for k, v in FORMAT_TO_MIME.items()}
-# Adding some common MIME types that are not in the PIL mapping
-MIME_TO_FORMAT["image/jpg"] = "JPEG"
-
-
 def encode_image_to_base64(
     bytestream: ByteStream,
     size: Optional[Tuple[int, int]] = None,
diff --git a/haystack_experimental/dataclasses/image_content.py b/haystack_experimental/dataclasses/image_content.py
@@ -14,14 +14,46 @@
 from haystack.lazy_imports import LazyImport
 from haystack.utils import is_in_jupyter
 
-from haystack_experimental.components.image_converters.image_utils import MIME_TO_FORMAT
-
 with LazyImport("The 'show' method requires the 'PIL' library. Run 'pip install pillow'") as pillow_import:
     from PIL import Image
 
 logger = logging.getLogger(__name__)
 
-IMAGE_MIME_TYPES = {key for key in MIME_TO_FORMAT.keys() if key != "application/pdf"}
+# NOTE: We rely on this static mapping because our util functions operate on ByteStream objects.
+#       We could instead work with file paths, using the file extension to determine the format.
+# This is a mapping of image formats to their MIME types, obtained from PIL as follows:
+# from PIL import Image
+# Image.init()  # <- Must force all plugins to initialize to get this mapping
+# print(Image.MIME)
+FORMAT_TO_MIME = {
+    "BMP": "image/bmp",
+    "DIB": "image/bmp",
+    "PCX": "image/x-pcx",
+    "EPS": "application/postscript",
+    "GIF": "image/gif",
+    "PNG": "image/png",
+    "JPEG2000": "image/jp2",
+    "ICNS": "image/icns",
+    "ICO": "image/x-icon",
+    "JPEG": "image/jpeg",
+    "MPEG": "video/mpeg",
+    "TIFF": "image/tiff",
+    "MPO": "image/mpo",
+    "PALM": "image/palm",
+    "PDF": "application/pdf",
+    "PPM": "image/x-portable-anymap",
+    "PSD": "image/vnd.adobe.photoshop",
+    "SGI": "image/sgi",
+    "TGA": "image/x-tga",
+    "WEBP": "image/webp",
+    "XBM": "image/xbm",
+    "XPM": "image/xpm",
+}
+MIME_TO_FORMAT = {v: k for k, v in FORMAT_TO_MIME.items()}
+# Adding some common MIME types that are not in the PIL mapping
+MIME_TO_FORMAT["image/jpg"] = "JPEG"
+
+IMAGE_MIME_TYPES = set(MIME_TO_FORMAT.keys())
 
 
 @dataclass
@@ -35,31 +67,42 @@ class ImageContent:
         If not provided, the MIME type is guessed from the base64 string, which can be slow and not always reliable.
     :param detail: Optional detail level of the image (only supported by OpenAI). One of "auto", "high", or "low".
     :param meta: Optional metadata for the image.
+    :param validate: If True (default), a validation process is performed:
+        - Check whether the base64 string is valid;
+        - Guess the MIME type if not provided;
+        - Check if the MIME type is a valid image MIME type.
+        Set to False to skip validation and speed up initialization.
     """
 
     base64_image: str
     mime_type: Optional[str] = None
     detail: Optional[Literal["auto", "high", "low"]] = None
     meta: Dict[str, Any] = field(default_factory=dict)
+    validate: bool = True
 
     def __post_init__(self):
+        if not self.validate:
+            return
+
+        try:
+            decoded_image = base64.b64decode(self.base64_image, validate=True)
+        except Exception as e:
+            raise ValueError("The base64 string is not valid") from e
+
         # mime_type is an important information, so we try to guess it if not provided
         if not self.mime_type:
-            try:
-                # Attempt to decode the string as base64
-                decoded_image = base64.b64decode(self.base64_image)
-
-                guess = filetype.guess(decoded_image)
-                if guess:
-                    self.mime_type = guess.mime
-                else:
-                    msg = (
-                        "Failed to guess the MIME type of the image. Omitting the MIME type may result in "
-                        "processing errors or incorrect handling of the image by LLM providers."
-                    )
-                    logger.warning(msg)
-            except:
-                pass
+            guess = filetype.guess(decoded_image)
+            if guess:
+                self.mime_type = guess.mime
+            else:
+                msg = (
+                    "Failed to guess the MIME type of the image. Omitting the MIME type may result in "
+                    "processing errors or incorrect handling of the image by LLM providers."
+                )
+                logger.warning(msg)
+
+        if self.mime_type and self.mime_type not in IMAGE_MIME_TYPES:
+            raise ValueError(f"{self.mime_type} is not a valid image MIME type.")
 
     def __repr__(self) -> str:
         """
@@ -161,7 +204,7 @@ def from_url(
             Additional metadata for the image.
 
         :raises ValueError:
-            If the URL does not point to an image.
+            If the URL does not point to an image or if it points to a PDF file.
 
         :returns:
             An ImageContent object.
@@ -176,6 +219,10 @@ def from_url(
             msg = f"The URL does not point to an image. The MIME type of the URL is {bytestream.mime_type}."
             raise ValueError(msg)
 
+        if bytestream.mime_type == "application/pdf":
+            raise ValueError("PDF files are not supported. "
+                             "For PDF to ImageContent conversion, use the `PDFToImageContent` component.")
+
         converter = ImageFileToImageContent(size=size, detail=detail)
         result = converter.run(sources=[bytestream], meta=[meta] if meta else None)
         return result["image_contents"][0]
diff --git a/test/conftest.py b/test/conftest.py
@@ -47,6 +47,10 @@ def spying_tracer() -> Generator[SpyingTracer, None, None]:
     # Make sure to disable tracing after the test to avoid affecting other tests
     tracing.disable_tracing()
 
+@pytest.fixture()
+def base64_image_string():
+    return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII="
+
 def load_and_resume_pipeline_state(pipeline, output_directory: Path, component: str, data: Dict = None) -> Dict:
     """
     Utility function to load and resume pipeline state from a breakpoint file.
diff --git a/test/dataclasses/test_chat_message.py b/test/dataclasses/test_chat_message.py
@@ -335,22 +335,22 @@ def test_from_user_fails_if_text_and_content_parts():
     with pytest.raises(ValueError):
         ChatMessage.from_user(text="text", content_parts=[TextContent(text="text")])
 
-def test_from_user_with_content_parts():
-    content_parts = [TextContent(text="text"), ImageContent(base64_image="base64_string")]
+def test_from_user_with_content_parts(base64_image_string):
+    content_parts = [TextContent(text="text"), ImageContent(base64_image=base64_image_string)]
     message = ChatMessage.from_user(content_parts=content_parts)
 
     assert message.role == ChatRole.USER
     assert message._content == content_parts
 
-    content_parts = ["text", ImageContent(base64_image="base64_string")]
+    content_parts = ["text", ImageContent(base64_image=base64_image_string)]
     message = ChatMessage.from_user(content_parts=content_parts)
 
     assert message.role == ChatRole.USER
-    assert message._content == [TextContent(text="text"), ImageContent(base64_image="base64_string")]
+    assert message._content == [TextContent(text="text"), ImageContent(base64_image=base64_image_string)]
 
-def test_from_user_with_content_parts_fails_if_no_textual_parts():
+def test_from_user_with_content_parts_fails_if_no_textual_parts(base64_image_string):
     with pytest.raises(ValueError):
-        ChatMessage.from_user(content_parts=[ImageContent(base64_image="base64_string")])
+        ChatMessage.from_user(content_parts=[ImageContent(base64_image=base64_image_string)])
 
 def test_from_user_with_content_parts_fails_unsupported_parts():
     with pytest.raises(ValueError):
@@ -396,16 +396,16 @@ def test_from_tool_with_valid_content():
     assert not message.images
     assert not message.image
 
-def test_serde():
+def test_serde(base64_image_string):
     # the following message is created just for testing purposes and does not make sense in a real use case
 
     role = ChatRole.ASSISTANT
 
     text_content = TextContent(text="Hello")
     tool_call = ToolCall(id="123", tool_name="mytool", arguments={"a": 1})
     tool_call_result = ToolCallResult(result="result", origin=tool_call, error=False)
-    image_content = ImageContent(base64_image="base64_string", mime_type="image/png", detail="auto",
-                                 meta={"key": "value"})
+    image_content = ImageContent(base64_image=base64_image_string, mime_type="image/png", detail="auto",
+                                 meta={"key": "value"}, validate=True)
     meta = {"some": "info"}
 
     message = ChatMessage(_role=role, _content=[text_content, tool_call, tool_call_result, image_content], _meta=meta)
@@ -424,10 +424,11 @@ def test_serde():
             },
             {
                 "image": {
-                    "base64_image": "base64_string",
+                    "base64_image": base64_image_string,
                     "mime_type": "image/png",
                     "detail": "auto",
                     "meta": {"key": "value"},
+                    "validate": True,
                 }
             },
         ],
@@ -447,13 +448,13 @@ def test_to_openai_dict_format_user_message():
     message = ChatMessage.from_user("I have a question")
     assert message.to_openai_dict_format() == {"role": "user", "content": "I have a question"}
 
-def test_to_openai_dict_format_multimodal_user_message():
+def test_to_openai_dict_format_multimodal_user_message(base64_image_string):
     message = ChatMessage.from_user(content_parts=[TextContent("I have a question"),
-                                                   ImageContent(base64_image="base64_string")])
+                                                   ImageContent(base64_image=base64_image_string)])
     assert message.to_openai_dict_format() == {"role": "user",
                                                "content": [{"type": "text", "text": "I have a question"},
                                                             {"type": "image_url", "image_url":
-                                                            {"url": "data:image/jpeg;base64,base64_string"}}]}
+                                                            {"url": f"data:image/png;base64,{base64_image_string}"}}]}
 
 def test_to_openai_dict_format_assistant_message():
     message = ChatMessage.from_assistant(text="I have an answer", meta={"finish_reason": "stop"})
diff --git a/test/dataclasses/test_image_content.py b/test/dataclasses/test_image_content.py
@@ -13,13 +13,53 @@
 from haystack_experimental.dataclasses.image_content import ImageContent
 
 
-def test_image_content_init():
-    image_content = ImageContent(base64_image="base64_string", mime_type="image/png", detail="auto",
+def test_image_content_init(base64_image_string):
+    image_content = ImageContent(base64_image=base64_image_string, mime_type="image/png", detail="auto",
                                  meta={"key": "value"})
-    assert image_content.base64_image == "base64_string"
+    assert image_content.base64_image == base64_image_string
     assert image_content.mime_type == "image/png"
     assert image_content.detail == "auto"
     assert image_content.meta == {"key": "value"}
+    assert image_content.validate
+
+def test_image_content_init_with_invalid_base64_string():
+    with pytest.raises(ValueError):
+        ImageContent(base64_image="invalid_base64_string")
+
+def test_image_content_init_with_invalid_base64_string_and_validate_false():
+    image_content = ImageContent(base64_image="invalid_base64_string", validate=False)
+    assert image_content.base64_image == "invalid_base64_string"
+    assert image_content.mime_type is None
+    assert image_content.detail is None
+    assert image_content.meta == {}
+    assert not image_content.validate
+
+def test_image_content_init_with_invalid_mime_type(test_files_path, base64_image_string):
+    with pytest.raises(ValueError):
+        ImageContent(base64_image=base64_image_string, mime_type="text/xml")
+
+    with open(test_files_path / "docx" / "sample_docx.docx", "rb") as docx_file:
+        docx_base64 = base64.b64encode(docx_file.read()).decode("utf-8")
+    with pytest.raises(ValueError):
+        ImageContent(base64_image=docx_base64)
+
+def test_image_content_init_with_invalid_mime_type_and_validate_false(test_files_path, base64_image_string):
+    image_content = ImageContent(base64_image=base64_image_string, mime_type="text/xml", validate=False)
+    assert image_content.base64_image == base64_image_string
+    assert image_content.mime_type == "text/xml"
+    assert image_content.detail is None
+    assert image_content.meta == {}
+    assert not image_content.validate
+
+    with open(test_files_path / "docx" / "sample_docx.docx", "rb") as docx_file:
+        docx_base64 = base64.b64encode(docx_file.read()).decode("utf-8")
+    image_content = ImageContent(base64_image=docx_base64, validate=False)
+    assert image_content.base64_image == docx_base64
+    assert image_content.mime_type is None
+    assert image_content.detail is None
+    assert image_content.meta == {}
+    assert not image_content.validate
+
 
 def test_image_content_mime_type_guessing(test_files_path):
     image_path = test_files_path / "images" / "apple.jpg"
@@ -28,10 +68,6 @@ def test_image_content_mime_type_guessing(test_files_path):
     image_content = ImageContent(base64_image=base64_image)
     assert image_content.mime_type == "image/jpeg"
 
-    # do not guess mime type if base64 decoding fails
-    image_content = ImageContent(base64_image="base64_string")
-    assert image_content.mime_type is None
-
     # do not guess mime type if mime type is provided
     image_content = ImageContent(base64_image=base64_image, mime_type="image/png")
     assert image_content.mime_type == "image/png"
diff --git a/test/test_files/docx/sample_docx.docx b/test/test_files/docx/sample_docx.docx
diff --git a/test/utils/test_jinja_chat_extension.py b/test/utils/test_jinja_chat_extension.py