diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 206ab7a46875..77baa27c7a95 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
 
 ```python
 from vllm import LLM
-    
+
 # Default white background (no configuration needed)
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-    
+
 # Custom black background for dark theme
 llm = LLM(
     model="llava-hf/llava-1.5-7b-hf",
     media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
 )
-    
+
 # Custom brand color background (e.g., blue)
 llm = LLM(
-    model="llava-hf/llava-1.5-7b-hf", 
+    model="llava-hf/llava-1.5-7b-hf",
     media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
 )
 ```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
 
 ## Online Serving
 
-Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs can also include an optional user-provided UUID that uniquely identifies each media item; the UUID is used to cache the processed media results across requests.
 
 !!! important
     A chat template is **required** to use Chat Completions API.
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
             # NOTE: The prompt formatting with the image token `<image>` is not needed
             # since the prompt will be processed automatically by the API server.
             {"type": "text", "text": "What’s in this image?"},
-            {"type": "image_url", "image_url": {"url": image_url}},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                },
+                "uuid": image_url  # Optional
+            },
         ],
     }],
 )
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
         "role": "user",
         "content": [
             {"type": "text", "text": "What are the animals in these images?"},
-            {"type": "image_url", "image_url": {"url": image_url_duck}},
-            {"type": "image_url", "image_url": {"url": image_url_lion}},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_duck
+                },
+                "uuid": image_url_duck  # Optional
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_lion
+                },
+                "uuid": image_url_lion  # Optional
+            },
         ],
     }],
 )
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
                     "video_url": {
                         "url": video_url
                     },
+                    "uuid": video_url  # Optional
                 },
             ],
         }],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
                         "data": audio_base64,
                         "format": "wav"
                     },
+                    "uuid": audio_url  # Optional
                 },
             ],
         }],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
                     "audio_url": {
                         "url": audio_url
                     },
+                    "uuid": audio_url  # Optional
                 },
             ],
         }],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
     model = "llava-hf/llava-1.5-7b-hf"
     embeds = {
         "type": "image_embeds",
-        "image_embeds": f"{base64_image_embedding}"
+        "image_embeds": f"{base64_image_embedding}",
+        "uuid": image_url  # Optional
     }
 
     # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
             "image_embeds": f"{base64_image_embedding}" , # Required
             "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
         },
+        "uuid": image_url  # Optional
     }
     model = "openbmb/MiniCPM-V-2_6"
     embeds = {
@@ -714,6
+737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se "image_embeds": f"{base64_image_embedding}" , # Required "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 }, + "uuid": image_url # Optional } chat_completion = client.chat.completions.create( messages=[ diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 9d61754059e2..29a3b40d2d86 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + } + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_uuid( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_incorrect_uuid_format( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + "incorrect_uuid_key": image_url, + }, + "also_incorrect_uuid_key": image_url, + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 18db1027c004..5149ca346050 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -21,7 +21,7 @@ resolve_chat_template_content_format, resolve_hf_chat_template) from vllm.entrypoints.llm import apply_hf_chat_template -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input( assert isinstance(image_data, list) and len(image_data) == image_count +def _assert_mm_uuids( + mm_uuids: Optional[MultiModalUUIDDict], + media_count: int, + expected_uuids: list[Optional[str]], + modality: str = "image", +) -> None: + if len(expected_uuids) > 0: + assert mm_uuids is not None + assert modality in mm_uuids + + image_uuids = mm_uuids.get(modality) + assert image_uuids is not None + + assert isinstance(image_uuids, + list) and len(image_uuids) == media_count + + assert image_uuids == expected_uuids + else: + assert mm_uuids is None + + ModalityType = Literal["image", "video", "audio"] MultiModalDataCounts = Mapping[ModalityType, int] @@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_single_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +def test_parse_chat_messages_single_image_with_bad_uuid_format( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + "uuid": image_uuid, + }, + "bad_uuid_key": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2]) def test_parse_chat_messages_empty_system( @@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system( mistral_tokenizer, ): # Test string format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system( ] # Test openai format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_images( @@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_already_in_prompt( @@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( "What's in <|image_1|> and how does it compare to <|image_2|>?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_one_already_in_prompt( @@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( "other one?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_across_messages( @@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + 
conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -569,28 +849,99 @@ def test_parse_chat_messages_multiple_images_across_messages( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) -def test_parse_chat_messages_context_text_format( +def test_parse_chat_messages_multiple_images_with_uuids_across_messages( phi3v_model_config, phi3v_tokenizer, + image_url, ): - conversation, mm_data = parse_chat_messages( + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( [ { - "role": "user", - "content": [{ - "type": "text", - "text": "What's in this text?" - }], + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], }, { "role": "assistant", "content": "Some stuff." }, { - "role": "user", - "content": "What about this one?" + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What about this one?" + }, + ], + }, + ], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\nWhat's in this image?" + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "<|image_2|>\nWhat about this one?" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) + + +def test_parse_chat_messages_context_text_format( + phi3v_model_config, + phi3v_tokenizer, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What about this one?" 
}, ], phi3v_model_config, @@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format( }], }, ] + assert mm_data is None + assert mm_uuids is None def test_parse_chat_messages_rejects_too_many_images_in_one_message( @@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_interleave( @@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave( "Do they have differences?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages_futures( + conversation, mm_data, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async( "Do they have differences?", }] _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "I need you to compare this image", + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "and this one" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Do they have differences?" 
+ }, + ], + }], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + }] + _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_images_multiple_messages_interleave( @@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501 + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Be accurate." + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + ], + }, + ], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "What's on this image?\n<|image_1|>\nBe accurate.", + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What's on this image?\n<|image_2|>" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( @@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( video_url, audio_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=[None, None]) + _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + +def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + } + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", None]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_images_interleave_with_placeholders( @@ -1081,7 +1791,7 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1100,6 +1810,7 @@ def test_mllama_single_image( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) assert conversation == [{ "role": "user", @@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) assert conversation == [{ "role": "user", @@ -1227,7 +1939,7 @@ def get_conversation(is_hf: bool): # Now parse with vLLMs chat utils & apply the template vllm_conversation = get_conversation(is_hf=False) - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( vllm_conversation, model_config, tokenizer_group, @@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config, }], }] - conversation_with_thinking, _ = parse_chat_messages( + conversation_with_thinking, _, _ = parse_chat_messages( messages, mistral_model_config, mistral_tokenizer, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 80e2c44a0251..b53dbfb3a26a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -41,7 +41,8 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalUUIDDict) from vllm.multimodal.utils import MediaConnector # yapf: disable from vllm.transformers_utils.chat_templates import ( @@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): @@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): """ type: Required[Literal["image_embeds"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. 
+ """ class VideoURL(TypedDict, total=False): @@ -97,6 +108,11 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class PILImage(BaseModel): @@ -118,6 +134,11 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): """ image_pil: Required[PILImage] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): @@ -131,6 +152,11 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """ image_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): @@ -155,6 +181,11 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): """ video_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomThinkCompletionContentParam(TypedDict, total=False): @@ -567,6 +598,7 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T]](list) + self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property def model_config(self) -> ModelConfig: @@ -591,10 +623,15 @@ def mm_registry(self): def mm_processor(self): return self.mm_registry.create_processor(self.model_config) - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + def add( + self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. + + An optional uuid can be added which serves as a unique identifier of the + media. 
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -602,9 +639,35 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]: self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) + self._uuids_by_modality[modality].append(uuid) return self.model_cls.get_placeholder_str(modality, num_items) + def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]: + if not self._items_by_modality: + return None + mm_uuids = {} + uuids_by_modality = dict(self._uuids_by_modality) + if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: + raise ValueError( + "Mixing raw image and embedding inputs is not allowed" + ) + + if "image_embeds" in uuids_by_modality: + image_embeds_uuids = uuids_by_modality["image_embeds"] + if len(image_embeds_uuids) > 1: + raise ValueError( + "Only one message can have {'type': 'image_embeds'}" + ) + mm_uuids["image"] = uuids_by_modality["image_embeds"] + if "image" in uuids_by_modality: + mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images + if "audio" in uuids_by_modality: + mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios + if "video" in uuids_by_modality: + mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos + return mm_uuids + @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError @@ -697,29 +760,35 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @@ -734,49 +803,55 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image = self._connector.fetch_image(image_url) - placeholder = self._tracker.add("image", image) + placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) for k, v in image_embeds.items() } - placeholder = self._tracker.add("image_embeds", embeds) + placeholder = self._tracker.add("image_embeds", embeds, uuid) if 
isinstance(image_embeds, str): embedding = self._connector.fetch_image_embedding(image_embeds) - placeholder = self._tracker.add("image_embeds", embedding) + placeholder = self._tracker.add("image_embeds", embedding, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: - placeholder = self._tracker.add("image", image_pil) + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: + placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio = self._connector.fetch_audio(audio_url) - placeholder = self._tracker.add("audio", audio) + placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -790,14 +865,16 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image_coro = self._connector.fetch_image_async(image_url) - placeholder = self._tracker.add("image", image_coro) + placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() @@ -812,33 +889,37 @@ def parse_image_embeds( embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) - placeholder = self._tracker.add("image_embeds", future) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) - placeholder = self._tracker.add("image", future) + placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) - placeholder = self._tracker.add("audio", audio_coro) + placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> 
None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video_async(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1177,30 +1258,36 @@ def _parse_chat_message_content_part( else: return str_content + # For media items, if a user has provided one, use it. Otherwise, insert + # a placeholder empty uuid. + uuid = part.get("uuid", None) + if uuid is not None: + uuid = str(uuid) + modality = None if part_type == "image_pil": image_content = cast(Image.Image, content) - mm_parser.parse_image_pil(image_content) + mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): str_content = cast(str, content) - mm_parser.parse_image(str_content) + mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) - mm_parser.parse_image_embeds(content) + mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": str_content = cast(str, content) - mm_parser.parse_audio(str_content) + mm_parser.parse_audio(str_content, uuid) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content) + mm_parser.parse_input_audio(dict_content, uuid) modality = "audio" elif part_type == "video_url": str_content = cast(str, content) - mm_parser.parse_video(str_content) + mm_parser.parse_video(str_content, uuid) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -1288,7 +1375,11 @@ def parse_chat_messages( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: +) -> tuple[ + list[ConversationMessage], + Optional[MultiModalDataDict], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -1308,7 +1399,7 @@ def parse_chat_messages( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def parse_chat_messages_futures( @@ -1316,7 +1407,11 @@ def parse_chat_messages_futures( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: +) -> tuple[ + list[ConversationMessage], + Awaitable[Optional[MultiModalDataDict]], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -1336,7 +1431,7 @@ def parse_chat_messages_futures( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def apply_hf_chat_template( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9b2ad808eb03..d33fd0ec0b49 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -796,7 +796,7 @@ def 
chat( # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( msgs, model_config, tokenizer, @@ -826,6 +826,9 @@ def chat( if mm_data is not None: prompt["multi_modal_data"] = mm_data + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = mm_processor_kwargs diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1a2236de4fa4..d6e8d93a57e1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -929,7 +929,7 @@ async def _preprocess_chat( tokenizer, model_config=model_config, ) - conversation, mm_data_future = parse_chat_messages_futures( + conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, model_config, tokenizer, @@ -1006,6 +1006,10 @@ async def _preprocess_chat( prompt_token_ids=prompt_inputs["prompt_token_ids"]) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data + + if mm_uuids is not None: + engine_prompt["multi_modal_uuids"] = mm_uuids + if request.mm_processor_kwargs is not None: engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 094fcf021b61..ec82be831e0d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -276,13 +276,23 @@ def _process_multimodal( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input async def _process_multimodal_async( self, @@ -310,13 +320,23 @@ async def _process_multimodal_async( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input def _process_embeds( self, @@ -953,3 +973,15 @@ async def preprocess_async( def clear_cache(self) -> None: if self.mm_processor_cache is not None: self.mm_processor_cache.clear_cache() + + +# Helper function to validate that a nested dictionary contains +# only strings or list of strings as the leaf values. 
+def contains_only_strings(obj: object): + if isinstance(obj, str): + return True + if isinstance(obj, list): + return all(isinstance(x, str) for x in obj) + if isinstance(obj, dict): + return all(contains_only_strings(v) for v in obj.values()) + return False diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 739396a4932c..453da1a51d98 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -174,9 +174,10 @@ def apply( mm_items = self._to_mm_items(mm_data) tokenization_kwargs = tokenization_kwargs or {} - mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else - self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs)) + mm_hashes = self._hash_mm_items(mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data)