Skip to content

Commit 106aa00

Browse files
authored
feat: ImageContent validation (#305)
* initial work + tests
* mime type validation + tests
* small fixes
* wording
* add validate parameter
* rm import
1 parent fb3cbdd commit 106aa00

File tree

7 files changed

+161
-101
lines changed

7 files changed

+161
-101
lines changed

haystack_experimental/components/image_converters/image_utils.py

Lines changed: 2 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from haystack.dataclasses import ByteStream
1111
from haystack.lazy_imports import LazyImport
1212

13+
from haystack_experimental.dataclasses.image_content import MIME_TO_FORMAT
14+
1315
with LazyImport("Run 'pip install pypdf pdf2image'") as pypdf_and_pdf2image_import:
1416
import pdf2image
1517
from pypdf import PdfReader
@@ -23,41 +25,6 @@
2325
logger = logging.getLogger(__name__)
2426

2527

26-
# NOTE: We have to rely on this since our util functions are using the bytestream object.
27-
# We could change this to use the file path instead, where the file extension is used to determine the format.
28-
# This is a mapping of image formats to their MIME types.
29-
# from PIL import Image
30-
# Image.init() # <- Must force all plugins to initialize to get this mapping
31-
# print(Image.MIME)
32-
FORMAT_TO_MIME = {
33-
"BMP": "image/bmp",
34-
"DIB": "image/bmp",
35-
"PCX": "image/x-pcx",
36-
"EPS": "application/postscript",
37-
"GIF": "image/gif",
38-
"PNG": "image/png",
39-
"JPEG2000": "image/jp2",
40-
"ICNS": "image/icns",
41-
"ICO": "image/x-icon",
42-
"JPEG": "image/jpeg",
43-
"MPEG": "video/mpeg",
44-
"TIFF": "image/tiff",
45-
"MPO": "image/mpo",
46-
"PALM": "image/palm",
47-
"PDF": "application/pdf",
48-
"PPM": "image/x-portable-anymap",
49-
"PSD": "image/vnd.adobe.photoshop",
50-
"SGI": "image/sgi",
51-
"TGA": "image/x-tga",
52-
"WEBP": "image/webp",
53-
"XBM": "image/xbm",
54-
"XPM": "image/xpm",
55-
}
56-
MIME_TO_FORMAT = {v: k for k, v in FORMAT_TO_MIME.items()}
57-
# Adding some common MIME types that are not in the PIL mapping
58-
MIME_TO_FORMAT["image/jpg"] = "JPEG"
59-
60-
6128
def encode_image_to_base64(
6229
bytestream: ByteStream,
6330
size: Optional[Tuple[int, int]] = None,

haystack_experimental/dataclasses/image_content.py

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,46 @@
1414
from haystack.lazy_imports import LazyImport
1515
from haystack.utils import is_in_jupyter
1616

17-
from haystack_experimental.components.image_converters.image_utils import MIME_TO_FORMAT
18-
1917
with LazyImport("The 'show' method requires the 'PIL' library. Run 'pip install pillow'") as pillow_import:
2018
from PIL import Image
2119

2220
logger = logging.getLogger(__name__)
2321

24-
IMAGE_MIME_TYPES = {key for key in MIME_TO_FORMAT.keys() if key != "application/pdf"}
22+
# NOTE: We have to rely on this since our util functions are using the bytestream object.
23+
# We could change this to use the file path instead, where the file extension is used to determine the format.
24+
# This is a mapping of image formats to their MIME types.
25+
# from PIL import Image
26+
# Image.init() # <- Must force all plugins to initialize to get this mapping
27+
# print(Image.MIME)
28+
FORMAT_TO_MIME = {
29+
"BMP": "image/bmp",
30+
"DIB": "image/bmp",
31+
"PCX": "image/x-pcx",
32+
"EPS": "application/postscript",
33+
"GIF": "image/gif",
34+
"PNG": "image/png",
35+
"JPEG2000": "image/jp2",
36+
"ICNS": "image/icns",
37+
"ICO": "image/x-icon",
38+
"JPEG": "image/jpeg",
39+
"MPEG": "video/mpeg",
40+
"TIFF": "image/tiff",
41+
"MPO": "image/mpo",
42+
"PALM": "image/palm",
43+
"PDF": "application/pdf",
44+
"PPM": "image/x-portable-anymap",
45+
"PSD": "image/vnd.adobe.photoshop",
46+
"SGI": "image/sgi",
47+
"TGA": "image/x-tga",
48+
"WEBP": "image/webp",
49+
"XBM": "image/xbm",
50+
"XPM": "image/xpm",
51+
}
52+
MIME_TO_FORMAT = {v: k for k, v in FORMAT_TO_MIME.items()}
53+
# Adding some common MIME types that are not in the PIL mapping
54+
MIME_TO_FORMAT["image/jpg"] = "JPEG"
55+
56+
IMAGE_MIME_TYPES = set(MIME_TO_FORMAT.keys())
2557

2658

2759
@dataclass
@@ -35,31 +67,42 @@ class ImageContent:
3567
If not provided, the MIME type is guessed from the base64 string, which can be slow and not always reliable.
3668
:param detail: Optional detail level of the image (only supported by OpenAI). One of "auto", "high", or "low".
3769
:param meta: Optional metadata for the image.
70+
:param validate: If True (default), a validation process is performed:
71+
- Check whether the base64 string is valid;
72+
- Guess the MIME type if not provided;
73+
- Check if the MIME type is a valid image MIME type.
74+
Set to False to skip validation and speed up initialization.
3875
"""
3976

4077
base64_image: str
4178
mime_type: Optional[str] = None
4279
detail: Optional[Literal["auto", "high", "low"]] = None
4380
meta: Dict[str, Any] = field(default_factory=dict)
81+
validate: bool = True
4482

4583
def __post_init__(self):
84+
if not self.validate:
85+
return
86+
87+
try:
88+
decoded_image = base64.b64decode(self.base64_image, validate=True)
89+
except Exception as e:
90+
raise ValueError("The base64 string is not valid") from e
91+
4692
# mime_type is important information, so we try to guess it if not provided
4793
if not self.mime_type:
48-
try:
49-
# Attempt to decode the string as base64
50-
decoded_image = base64.b64decode(self.base64_image)
51-
52-
guess = filetype.guess(decoded_image)
53-
if guess:
54-
self.mime_type = guess.mime
55-
else:
56-
msg = (
57-
"Failed to guess the MIME type of the image. Omitting the MIME type may result in "
58-
"processing errors or incorrect handling of the image by LLM providers."
59-
)
60-
logger.warning(msg)
61-
except:
62-
pass
94+
guess = filetype.guess(decoded_image)
95+
if guess:
96+
self.mime_type = guess.mime
97+
else:
98+
msg = (
99+
"Failed to guess the MIME type of the image. Omitting the MIME type may result in "
100+
"processing errors or incorrect handling of the image by LLM providers."
101+
)
102+
logger.warning(msg)
103+
104+
if self.mime_type and self.mime_type not in IMAGE_MIME_TYPES:
105+
raise ValueError(f"{self.mime_type} is not a valid image MIME type.")
63106

64107
def __repr__(self) -> str:
65108
"""
@@ -161,7 +204,7 @@ def from_url(
161204
Additional metadata for the image.
162205
163206
:raises ValueError:
164-
If the URL does not point to an image.
207+
If the URL does not point to an image or if it points to a PDF file.
165208
166209
:returns:
167210
An ImageContent object.
@@ -176,6 +219,10 @@ def from_url(
176219
msg = f"The URL does not point to an image. The MIME type of the URL is {bytestream.mime_type}."
177220
raise ValueError(msg)
178221

222+
if bytestream.mime_type == "application/pdf":
223+
raise ValueError("PDF files are not supported. "
224+
"For PDF to ImageContent conversion, use the `PDFToImageContent` component.")
225+
179226
converter = ImageFileToImageContent(size=size, detail=detail)
180227
result = converter.run(sources=[bytestream], meta=[meta] if meta else None)
181228
return result["image_contents"][0]

test/conftest.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,10 @@ def spying_tracer() -> Generator[SpyingTracer, None, None]:
4747
# Make sure to disable tracing after the test to avoid affecting other tests
4848
tracing.disable_tracing()
4949

50+
@pytest.fixture()
51+
def base64_image_string():
52+
return "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+ip1sAAAAASUVORK5CYII="
53+
5054
def load_and_resume_pipeline_state(pipeline, output_directory: Path, component: str, data: Dict = None) -> Dict:
5155
"""
5256
Utility function to load and resume pipeline state from a breakpoint file.

test/dataclasses/test_chat_message.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -335,22 +335,22 @@ def test_from_user_fails_if_text_and_content_parts():
335335
with pytest.raises(ValueError):
336336
ChatMessage.from_user(text="text", content_parts=[TextContent(text="text")])
337337

338-
def test_from_user_with_content_parts():
339-
content_parts = [TextContent(text="text"), ImageContent(base64_image="base64_string")]
338+
def test_from_user_with_content_parts(base64_image_string):
339+
content_parts = [TextContent(text="text"), ImageContent(base64_image=base64_image_string)]
340340
message = ChatMessage.from_user(content_parts=content_parts)
341341

342342
assert message.role == ChatRole.USER
343343
assert message._content == content_parts
344344

345-
content_parts = ["text", ImageContent(base64_image="base64_string")]
345+
content_parts = ["text", ImageContent(base64_image=base64_image_string)]
346346
message = ChatMessage.from_user(content_parts=content_parts)
347347

348348
assert message.role == ChatRole.USER
349-
assert message._content == [TextContent(text="text"), ImageContent(base64_image="base64_string")]
349+
assert message._content == [TextContent(text="text"), ImageContent(base64_image=base64_image_string)]
350350

351-
def test_from_user_with_content_parts_fails_if_no_textual_parts():
351+
def test_from_user_with_content_parts_fails_if_no_textual_parts(base64_image_string):
352352
with pytest.raises(ValueError):
353-
ChatMessage.from_user(content_parts=[ImageContent(base64_image="base64_string")])
353+
ChatMessage.from_user(content_parts=[ImageContent(base64_image=base64_image_string)])
354354

355355
def test_from_user_with_content_parts_fails_unsupported_parts():
356356
with pytest.raises(ValueError):
@@ -396,16 +396,16 @@ def test_from_tool_with_valid_content():
396396
assert not message.images
397397
assert not message.image
398398

399-
def test_serde():
399+
def test_serde(base64_image_string):
400400
# the following message is created just for testing purposes and does not make sense in a real use case
401401

402402
role = ChatRole.ASSISTANT
403403

404404
text_content = TextContent(text="Hello")
405405
tool_call = ToolCall(id="123", tool_name="mytool", arguments={"a": 1})
406406
tool_call_result = ToolCallResult(result="result", origin=tool_call, error=False)
407-
image_content = ImageContent(base64_image="base64_string", mime_type="image/png", detail="auto",
408-
meta={"key": "value"})
407+
image_content = ImageContent(base64_image=base64_image_string, mime_type="image/png", detail="auto",
408+
meta={"key": "value"}, validate=True)
409409
meta = {"some": "info"}
410410

411411
message = ChatMessage(_role=role, _content=[text_content, tool_call, tool_call_result, image_content], _meta=meta)
@@ -424,10 +424,11 @@ def test_serde():
424424
},
425425
{
426426
"image": {
427-
"base64_image": "base64_string",
427+
"base64_image": base64_image_string,
428428
"mime_type": "image/png",
429429
"detail": "auto",
430430
"meta": {"key": "value"},
431+
"validate": True,
431432
}
432433
},
433434
],
@@ -447,13 +448,13 @@ def test_to_openai_dict_format_user_message():
447448
message = ChatMessage.from_user("I have a question")
448449
assert message.to_openai_dict_format() == {"role": "user", "content": "I have a question"}
449450

450-
def test_to_openai_dict_format_multimodal_user_message():
451+
def test_to_openai_dict_format_multimodal_user_message(base64_image_string):
451452
message = ChatMessage.from_user(content_parts=[TextContent("I have a question"),
452-
ImageContent(base64_image="base64_string")])
453+
ImageContent(base64_image=base64_image_string)])
453454
assert message.to_openai_dict_format() == {"role": "user",
454455
"content": [{"type": "text", "text": "I have a question"},
455456
{"type": "image_url", "image_url":
456-
{"url": "_string"}}]}
457+
{"url": f"data:image/png;base64,{base64_image_string}"}}]}
457458

458459
def test_to_openai_dict_format_assistant_message():
459460
message = ChatMessage.from_assistant(text="I have an answer", meta={"finish_reason": "stop"})

test/dataclasses/test_image_content.py

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,53 @@
1313
from haystack_experimental.dataclasses.image_content import ImageContent
1414

1515

16-
def test_image_content_init():
17-
image_content = ImageContent(base64_image="base64_string", mime_type="image/png", detail="auto",
16+
def test_image_content_init(base64_image_string):
17+
image_content = ImageContent(base64_image=base64_image_string, mime_type="image/png", detail="auto",
1818
meta={"key": "value"})
19-
assert image_content.base64_image == "base64_string"
19+
assert image_content.base64_image == base64_image_string
2020
assert image_content.mime_type == "image/png"
2121
assert image_content.detail == "auto"
2222
assert image_content.meta == {"key": "value"}
23+
assert image_content.validate
24+
25+
def test_image_content_init_with_invalid_base64_string():
26+
with pytest.raises(ValueError):
27+
ImageContent(base64_image="invalid_base64_string")
28+
29+
def test_image_content_init_with_invalid_base64_string_and_validate_false():
30+
image_content = ImageContent(base64_image="invalid_base64_string", validate=False)
31+
assert image_content.base64_image == "invalid_base64_string"
32+
assert image_content.mime_type is None
33+
assert image_content.detail is None
34+
assert image_content.meta == {}
35+
assert not image_content.validate
36+
37+
def test_image_content_init_with_invalid_mime_type(test_files_path, base64_image_string):
38+
with pytest.raises(ValueError):
39+
ImageContent(base64_image=base64_image_string, mime_type="text/xml")
40+
41+
with open(test_files_path / "docx" / "sample_docx.docx", "rb") as docx_file:
42+
docx_base64 = base64.b64encode(docx_file.read()).decode("utf-8")
43+
with pytest.raises(ValueError):
44+
ImageContent(base64_image=docx_base64)
45+
46+
def test_image_content_init_with_invalid_mime_type_and_validate_false(test_files_path, base64_image_string):
47+
image_content = ImageContent(base64_image=base64_image_string, mime_type="text/xml", validate=False)
48+
assert image_content.base64_image == base64_image_string
49+
assert image_content.mime_type == "text/xml"
50+
assert image_content.detail is None
51+
assert image_content.meta == {}
52+
assert not image_content.validate
53+
54+
with open(test_files_path / "docx" / "sample_docx.docx", "rb") as docx_file:
55+
docx_base64 = base64.b64encode(docx_file.read()).decode("utf-8")
56+
image_content = ImageContent(base64_image=docx_base64, validate=False)
57+
assert image_content.base64_image == docx_base64
58+
assert image_content.mime_type is None
59+
assert image_content.detail is None
60+
assert image_content.meta == {}
61+
assert not image_content.validate
62+
2363

2464
def test_image_content_mime_type_guessing(test_files_path):
2565
image_path = test_files_path / "images" / "apple.jpg"
@@ -28,10 +68,6 @@ def test_image_content_mime_type_guessing(test_files_path):
2868
image_content = ImageContent(base64_image=base64_image)
2969
assert image_content.mime_type == "image/jpeg"
3070

31-
# do not guess mime type if base64 decoding fails
32-
image_content = ImageContent(base64_image="base64_string")
33-
assert image_content.mime_type is None
34-
3571
# do not guess mime type if mime type is provided
3672
image_content = ImageContent(base64_image=base64_image, mime_type="image/png")
3773
assert image_content.mime_type == "image/png"
12.9 KB
Binary file not shown.

0 commit comments

Comments
 (0)