diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 206ab7a46875..77baa27c7a95 100644
--- a/docs/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
 
 ```python
 from vllm import LLM
-    
+
 # Default white background (no configuration needed)
 llm = LLM(model="llava-hf/llava-1.5-7b-hf")
-    
+
 # Custom black background for dark theme
 llm = LLM(
     model="llava-hf/llava-1.5-7b-hf",
     media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
 )
-    
+
 # Custom brand color background (e.g., blue)
 llm = LLM(
-    model="llava-hf/llava-1.5-7b-hf", 
+    model="llava-hf/llava-1.5-7b-hf",
     media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
 )
 ```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
 
 ## Online Serving
 
-Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
+Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs can also include an optional user-provided UUID that uniquely identifies each media item; the UUID is used to cache the processed media results across requests.
 
 !!! important
     A chat template is **required** to use Chat Completions API.
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
             # NOTE: The prompt formatting with the image token `<image>` is not needed
             # since the prompt will be processed automatically by the API server.
             {"type": "text", "text": "What’s in this image?"},
-            {"type": "image_url", "image_url": {"url": image_url}},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                },
+                "uuid": image_url  # Optional
+            },
         ],
     }],
 )
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
         "role": "user",
         "content": [
             {"type": "text", "text": "What are the animals in these images?"},
-            {"type": "image_url", "image_url": {"url": image_url_duck}},
-            {"type": "image_url", "image_url": {"url": image_url_lion}},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_duck
+                },
+                "uuid": image_url_duck  # Optional
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_lion
+                },
+                "uuid": image_url_lion  # Optional
+            },
         ],
     }],
 )
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
                     "video_url": {
                         "url": video_url
                     },
+                    "uuid": video_url  # Optional
                 },
             ],
         }],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
                         "data": audio_base64,
                         "format": "wav"
                     },
+                    "uuid": audio_url  # Optional
                 },
             ],
         }],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
                     "audio_url": {
                         "url": audio_url
                     },
+                    "uuid": audio_url  # Optional
                 },
             ],
         }],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
     model = "llava-hf/llava-1.5-7b-hf"
     embeds = {
         "type": "image_embeds",
-        "image_embeds": f"{base64_image_embedding}"
+        "image_embeds": f"{base64_image_embedding}",
+        "uuid": image_url  # Optional
     }
 
     # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
             "image_embeds": f"{base64_image_embedding}" , # Required
             "image_grid_thw": f"{base64_image_grid_thw}"  # Required by Qwen/Qwen2-VL-2B-Instruct
         },
+        "uuid": image_url  # Optional
     }
     model = "openbmb/MiniCPM-V-2_6"
     embeds = {
@@ -714,6
+737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se "image_embeds": f"{base64_image_embedding}" , # Required "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6 }, + "uuid": image_url # Optional } chat_completion = client.chat.completions.create( messages=[ diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 9d61754059e2..29a3b40d2d86 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + } + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_uuid( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_url + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "image_urls", + [TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))], + indirect=True) +async def test_completions_with_image_with_incorrect_uuid_format( + client: openai.AsyncOpenAI, + model_name: str, + image_urls: list[str], +): + for image_url in image_urls: + chat_completion = await client.chat.completions.create( + messages=[ + { + "role": "system", + "content": "You are a helpful assistant." 
+ }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "Describe this image.", + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + "incorrect_uuid_key": image_url, + }, + "also_incorrect_uuid_key": image_url, + }, + ], + }, + ], + model=model_name, + ) + assert chat_completion.choices[0].message.content is not None + assert isinstance(chat_completion.choices[0].message.content, str) + assert len(chat_completion.choices[0].message.content) > 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 18db1027c004..5149ca346050 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -21,7 +21,7 @@ resolve_chat_template_content_format, resolve_hf_chat_template) from vllm.entrypoints.llm import apply_hf_chat_template -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalDataDict, MultiModalUUIDDict from vllm.multimodal.utils import (encode_audio_base64, encode_image_base64, encode_video_base64) from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -179,6 +179,27 @@ def _assert_mm_data_is_image_input( assert isinstance(image_data, list) and len(image_data) == image_count +def _assert_mm_uuids( + mm_uuids: Optional[MultiModalUUIDDict], + media_count: int, + expected_uuids: list[Optional[str]], + modality: str = "image", +) -> None: + if len(expected_uuids) > 0: + assert mm_uuids is not None + assert modality in mm_uuids + + image_uuids = mm_uuids.get(modality) + assert image_uuids is not None + + assert isinstance(image_uuids, + list) and len(image_uuids) == media_count + + assert image_uuids == expected_uuids + else: + assert mm_uuids is None + + ModalityType = Literal["image", "video", "audio"] MultiModalDataCounts = Mapping[ModalityType, int] @@ -201,7 +222,7 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -228,6 +249,260 @@ def test_parse_chat_messages_single_image( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_single_image_with_uuid( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +def test_parse_chat_messages_single_image_with_bad_uuid_format( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + "uuid": image_uuid, + }, + "bad_uuid_key": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_images_with_uuids( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_data, mm_uuids = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid1, + }, + { + "type": "image_url", + "image_url": { + "url": image_url, + }, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in the image?", + }] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_single_image_with_uuid_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in the image?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[image_uuid]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid1 = "my_uuid_1" + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid1, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" + }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid1, image_uuid2]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + image_uuid2 = "my_uuid_2" + + conversation, mm_future, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + { + "type": "image_pil", + "image_pil": ImageAsset("cherry_blossom").pil_image, + "uuid": image_uuid2, + }, + { + "type": "text", + "text": "What's in these images?" 
+ }, + ], + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?", + }] + _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, image_uuid2]) def test_parse_chat_messages_empty_system( @@ -235,7 +510,7 @@ def test_parse_chat_messages_empty_system( mistral_tokenizer, ): # Test string format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -265,7 +540,7 @@ def test_parse_chat_messages_empty_system( ] # Test openai format - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( [ { "role": "system", @@ -307,7 +582,7 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -334,6 +609,7 @@ async def test_parse_chat_messages_single_image_async( "content": "<|image_1|>\nWhat's in the image?" }] _assert_mm_data_is_image_input(await mm_future, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) def test_parse_chat_messages_multiple_images( @@ -341,7 +617,7 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -374,6 +650,7 @@ def test_parse_chat_messages_multiple_images( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -382,7 +659,7 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures( + conversation, mm_future, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -415,6 +692,7 @@ async def test_parse_chat_messages_multiple_images_async( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(await mm_future, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_already_in_prompt( @@ -422,7 +700,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -458,6 +736,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( "What's in <|image_1|> and how does it compare to <|image_2|>?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_placeholder_one_already_in_prompt( @@ -465,7 +744,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -503,6 +782,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( "other one?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_across_messages( @@ -510,7 +790,7 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + 
conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -569,28 +849,99 @@ def test_parse_chat_messages_multiple_images_across_messages( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) -def test_parse_chat_messages_context_text_format( +def test_parse_chat_messages_multiple_images_with_uuids_across_messages( phi3v_model_config, phi3v_tokenizer, + image_url, ): - conversation, mm_data = parse_chat_messages( + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( [ { - "role": "user", - "content": [{ - "type": "text", - "text": "What's in this text?" - }], + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], }, { "role": "assistant", "content": "Some stuff." }, { - "role": "user", - "content": "What about this one?" + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "What about this one?" + }, + ], + }, + ], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "<|image_1|>\nWhat's in this image?" + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "<|image_2|>\nWhat about this one?" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) + + +def test_parse_chat_messages_context_text_format( + phi3v_model_config, + phi3v_tokenizer, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What about this one?" 
}, ], phi3v_model_config, @@ -621,6 +972,8 @@ def test_parse_chat_messages_context_text_format( }], }, ] + assert mm_data is None + assert mm_uuids is None def test_parse_chat_messages_rejects_too_many_images_in_one_message( @@ -736,7 +1089,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -762,6 +1115,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "<|image_1|>\n<|image_2|>\nWhat's in these images?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) def test_parse_chat_messages_multiple_images_interleave( @@ -769,7 +1123,7 @@ def test_parse_chat_messages_multiple_images_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -813,6 +1167,7 @@ def test_parse_chat_messages_multiple_images_interleave( "Do they have differences?", }] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) @pytest.mark.asyncio @@ -821,7 +1176,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages_futures( + conversation, mm_data, mm_uuids = parse_chat_messages_futures( [{ "role": "user", @@ -865,6 +1220,63 @@ async def test_parse_chat_messages_multiple_images_interleave_async( "Do they have differences?", }] _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +@pytest.mark.asyncio +async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "I need you to compare this image", + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "and this one" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Do they have differences?" 
+ }, + ], + }], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [{ + "role": + "user", + "content": + "I need you to compare this image\n<|image_1|>\nand this one\n<|image_2|>\n" # noqa: E501 + "Do they have differences?", + }] + _assert_mm_data_is_image_input(await mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_images_multiple_messages_interleave( @@ -872,7 +1284,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -935,6 +1347,81 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( }, ] _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) + + +def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501 + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + image_url, +): + image_uuid = str(hash(image_url)) + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + { + "type": "text", + "text": "Be accurate." + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": image_uuid, + }, + ], + }, + ], + phi3v_model_config_mm_interleaved, + phi3v_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": "user", + "content": "What's on this image?\n<|image_1|>\nBe accurate.", + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": "user", + "content": "What's on this image?\n<|image_2|>" + }, + ] + _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[image_uuid, image_uuid]) def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( @@ -944,7 +1431,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( video_url, audio_url, ): - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [ { "role": @@ -1030,6 +1517,229 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ] _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=[None, None]) + _assert_mm_uuids(mm_uuids, 1, modality="video", expected_uuids=[None]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) + + +def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + "uuid": "audio_123", + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", "image_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, + 1, + modality="audio", + expected_uuids=["audio_123"]) + + +def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_messages_interleave( # noqa: E501 + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + image_url, + video_url, + audio_url, +): + conversation, mm_data, mm_uuids = parse_chat_messages( + [ + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + "uuid": "image_123", + }, + { + "type": "text", + "text": "Now listen to this audio" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + } + }, + ], + }, + { + "role": "assistant", + "content": "Some stuff." + }, + { + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's on this image?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "And what's in the video?" + }, + { + "type": "video_url", + "video_url": { + "url": video_url + }, + "uuid": "video_123", + }, + ], + }, + ], + qwen25omni_model_config_mm_interleaved, + qwen25omni_tokenizer, + content_format="string", + ) + + assert conversation == [ + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 + }, + { + "role": "assistant", + "content": "Some stuff." 
+ }, + { + "role": + "user", + "content": + "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" + "And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", + }, + ] + + _assert_mm_data_inputs(mm_data, {"image": 2, "video": 1, "audio": 1}) + _assert_mm_uuids(mm_uuids, + 2, + modality="image", + expected_uuids=["image_123", None]) + _assert_mm_uuids(mm_uuids, + 1, + modality="video", + expected_uuids=["video_123"]) + _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) def test_parse_chat_messages_multiple_images_interleave_with_placeholders( @@ -1081,7 +1791,7 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1100,6 +1810,7 @@ def test_mllama_single_image( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 1) + _assert_mm_uuids(mm_uuids, 1, expected_uuids=[None]) assert conversation == [{ "role": "user", @@ -1121,7 +1832,7 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( [{ "role": "user", @@ -1147,6 +1858,7 @@ def test_mllama_interleaved_images( content_format="openai", ) _assert_mm_data_is_image_input(mm_data, 2) + _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) assert conversation == [{ "role": "user", @@ -1227,7 +1939,7 @@ def get_conversation(is_hf: bool): # Now parse with vLLMs chat utils & apply the template vllm_conversation = get_conversation(is_hf=False) - conversation, _ = parse_chat_messages( + conversation, _, _ = parse_chat_messages( vllm_conversation, model_config, tokenizer_group, @@ -1518,7 +2230,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config, }], }] - conversation_with_thinking, _ = parse_chat_messages( + conversation_with_thinking, _, _ = parse_chat_messages( messages, mistral_model_config, mistral_tokenizer, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 80e2c44a0251..b53dbfb3a26a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -41,7 +41,8 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalUUIDDict) from vllm.multimodal.utils import MediaConnector # yapf: disable from vllm.transformers_utils.chat_templates import ( @@ -72,6 +73,11 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): type: Required[Literal["audio_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): @@ -83,6 +89,11 @@ class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): """ type: Required[Literal["image_embeds"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. 
+ """ class VideoURL(TypedDict, total=False): @@ -97,6 +108,11 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] """The type of the content part.""" + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class PILImage(BaseModel): @@ -118,6 +134,11 @@ class CustomChatCompletionContentPILImageParam(TypedDict, total=False): """ image_pil: Required[PILImage] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): @@ -131,6 +152,11 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """ image_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): @@ -155,6 +181,11 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): """ video_url: Required[str] + uuid: Optional[str] + """ + User-provided UUID of a media. User must guarantee that it is properly + generated and unique for different medias. + """ class CustomThinkCompletionContentParam(TypedDict, total=False): @@ -567,6 +598,7 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._tokenizer = tokenizer self._items_by_modality = defaultdict[str, list[_T]](list) + self._uuids_by_modality = defaultdict[str, list[Optional[str]]](list) @property def model_config(self) -> ModelConfig: @@ -591,10 +623,15 @@ def mm_registry(self): def mm_processor(self): return self.mm_registry.create_processor(self.model_config) - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + def add( + self, modality: ModalityStr, item: _T, uuid: Optional[str] = None + ) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. + + An optional uuid can be added which serves as a unique identifier of the + media. 
""" input_modality = modality.replace("_embeds", "") num_items = len(self._items_by_modality[modality]) + 1 @@ -602,9 +639,35 @@ def add(self, modality: ModalityStr, item: _T) -> Optional[str]: self.mm_processor.validate_num_items(input_modality, num_items) self._items_by_modality[modality].append(item) + self._uuids_by_modality[modality].append(uuid) return self.model_cls.get_placeholder_str(modality, num_items) + def all_mm_uuids(self) -> Optional[MultiModalUUIDDict]: + if not self._items_by_modality: + return None + mm_uuids = {} + uuids_by_modality = dict(self._uuids_by_modality) + if "image" in uuids_by_modality and "image_embeds" in uuids_by_modality: + raise ValueError( + "Mixing raw image and embedding inputs is not allowed" + ) + + if "image_embeds" in uuids_by_modality: + image_embeds_uuids = uuids_by_modality["image_embeds"] + if len(image_embeds_uuids) > 1: + raise ValueError( + "Only one message can have {'type': 'image_embeds'}" + ) + mm_uuids["image"] = uuids_by_modality["image_embeds"] + if "image" in uuids_by_modality: + mm_uuids["image"] = uuids_by_modality["image"] # UUIDs of images + if "audio" in uuids_by_modality: + mm_uuids["audio"] = uuids_by_modality["audio"] # UUIDs of audios + if "video" in uuids_by_modality: + mm_uuids["video"] = uuids_by_modality["video"] # UUIDs of videos + return mm_uuids + @abstractmethod def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError @@ -697,29 +760,35 @@ def mm_placeholder_storage(self) -> dict[str, list]: return dict(self._placeholder_storage) @abstractmethod - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: raise NotImplementedError @abstractmethod - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: raise NotImplementedError @abstractmethod - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: raise NotImplementedError @@ -734,49 +803,55 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image = self._connector.fetch_image(image_url) - placeholder = self._tracker.add("image", image) + placeholder = self._tracker.add("image", image, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: if isinstance(image_embeds, dict): embeds = { k: self._connector.fetch_image_embedding(v) for k, v in image_embeds.items() } - placeholder = self._tracker.add("image_embeds", embeds) + placeholder = self._tracker.add("image_embeds", embeds, uuid) if 
isinstance(image_embeds, str): embedding = self._connector.fetch_image_embedding(image_embeds) - placeholder = self._tracker.add("image_embeds", embedding) + placeholder = self._tracker.add("image_embeds", embedding, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: - placeholder = self._tracker.add("image", image_pil) + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: + placeholder = self._tracker.add("image", image_pil, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio = self._connector.fetch_audio(audio_url) - placeholder = self._tracker.add("audio", audio) + placeholder = self._tracker.add("audio", audio, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -790,14 +865,16 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: allowed_local_media_path=tracker.allowed_local_media_path, ) - def parse_image(self, image_url: str) -> None: + def parse_image(self, image_url: str, uuid: Optional[str] = None) -> None: image_coro = self._connector.fetch_image_async(image_url) - placeholder = self._tracker.add("image", image_coro) + placeholder = self._tracker.add("image", image_coro, uuid) self._add_placeholder("image", placeholder) def parse_image_embeds( - self, image_embeds: Union[str, dict[str, str]] + self, + image_embeds: Union[str, dict[str, str]], + uuid: Optional[str] = None, ) -> None: future: asyncio.Future[Union[str, dict[str, str]]] = asyncio.Future() @@ -812,33 +889,37 @@ def parse_image_embeds( embedding = self._connector.fetch_image_embedding(image_embeds) future.set_result(embedding) - placeholder = self._tracker.add("image_embeds", future) + placeholder = self._tracker.add("image_embeds", future, uuid) self._add_placeholder("image", placeholder) - def parse_image_pil(self, image_pil: Image.Image) -> None: + def parse_image_pil( + self, image_pil: Image.Image, uuid: Optional[str] = None + ) -> None: future: asyncio.Future[Image.Image] = asyncio.Future() future.set_result(image_pil) - placeholder = self._tracker.add("image", future) + placeholder = self._tracker.add("image", future, uuid) self._add_placeholder("image", placeholder) - def parse_audio(self, audio_url: str) -> None: + def parse_audio(self, audio_url: str, uuid: Optional[str] = None) -> None: audio_coro = self._connector.fetch_audio_async(audio_url) - placeholder = self._tracker.add("audio", audio_coro) + placeholder = self._tracker.add("audio", audio_coro, uuid) self._add_placeholder("audio", placeholder) - def parse_input_audio(self, input_audio: InputAudio) -> None: + def parse_input_audio( + self, input_audio: InputAudio, uuid: Optional[str] = None + ) -> 
None: audio_data = input_audio.get("data", "") audio_format = input_audio.get("format", "") audio_url = f"data:audio/{audio_format};base64,{audio_data}" - return self.parse_audio(audio_url) + return self.parse_audio(audio_url, uuid) - def parse_video(self, video_url: str) -> None: + def parse_video(self, video_url: str, uuid: Optional[str] = None) -> None: video = self._connector.fetch_video_async(video_url=video_url) - placeholder = self._tracker.add("video", video) + placeholder = self._tracker.add("video", video, uuid) self._add_placeholder("video", placeholder) @@ -1177,30 +1258,36 @@ def _parse_chat_message_content_part( else: return str_content + # For media items, if a user has provided one, use it. Otherwise, insert + # a placeholder empty uuid. + uuid = part.get("uuid", None) + if uuid is not None: + uuid = str(uuid) + modality = None if part_type == "image_pil": image_content = cast(Image.Image, content) - mm_parser.parse_image_pil(image_content) + mm_parser.parse_image_pil(image_content, uuid) modality = "image" elif part_type in ("image_url", "input_image"): str_content = cast(str, content) - mm_parser.parse_image(str_content) + mm_parser.parse_image(str_content, uuid) modality = "image" elif part_type == "image_embeds": content = cast(Union[str, dict[str, str]], content) - mm_parser.parse_image_embeds(content) + mm_parser.parse_image_embeds(content, uuid) modality = "image" elif part_type == "audio_url": str_content = cast(str, content) - mm_parser.parse_audio(str_content) + mm_parser.parse_audio(str_content, uuid) modality = "audio" elif part_type == "input_audio": dict_content = cast(InputAudio, content) - mm_parser.parse_input_audio(dict_content) + mm_parser.parse_input_audio(dict_content, uuid) modality = "audio" elif part_type == "video_url": str_content = cast(str, content) - mm_parser.parse_video(str_content) + mm_parser.parse_video(str_content, uuid) modality = "video" else: raise NotImplementedError(f"Unknown part type: {part_type}") @@ -1288,7 +1375,11 @@ def parse_chat_messages( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Optional[MultiModalDataDict]]: +) -> tuple[ + list[ConversationMessage], + Optional[MultiModalDataDict], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -1308,7 +1399,7 @@ def parse_chat_messages( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def parse_chat_messages_futures( @@ -1316,7 +1407,11 @@ def parse_chat_messages_futures( model_config: ModelConfig, tokenizer: AnyTokenizer, content_format: _ChatTemplateContentFormat, -) -> tuple[list[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: +) -> tuple[ + list[ConversationMessage], + Awaitable[Optional[MultiModalDataDict]], + Optional[MultiModalUUIDDict], +]: conversation: list[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -1336,7 +1431,7 @@ def parse_chat_messages_futures( _postprocess_messages(conversation) - return conversation, mm_tracker.all_mm_data() + return conversation, mm_tracker.all_mm_data(), mm_tracker.all_mm_uuids() def apply_hf_chat_template( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 9b2ad808eb03..d33fd0ec0b49 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -796,7 +796,7 @@ def 
chat( # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. - conversation, mm_data = parse_chat_messages( + conversation, mm_data, mm_uuids = parse_chat_messages( msgs, model_config, tokenizer, @@ -826,6 +826,9 @@ def chat( if mm_data is not None: prompt["multi_modal_data"] = mm_data + if mm_uuids is not None: + prompt["multi_modal_uuids"] = mm_uuids + if mm_processor_kwargs is not None: prompt["mm_processor_kwargs"] = mm_processor_kwargs diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 1a2236de4fa4..d6e8d93a57e1 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -929,7 +929,7 @@ async def _preprocess_chat( tokenizer, model_config=model_config, ) - conversation, mm_data_future = parse_chat_messages_futures( + conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, model_config, tokenizer, @@ -1006,6 +1006,10 @@ async def _preprocess_chat( prompt_token_ids=prompt_inputs["prompt_token_ids"]) if mm_data is not None: engine_prompt["multi_modal_data"] = mm_data + + if mm_uuids is not None: + engine_prompt["multi_modal_uuids"] = mm_uuids + if request.mm_processor_kwargs is not None: engine_prompt["mm_processor_kwargs"] = request.mm_processor_kwargs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 094fcf021b61..ec82be831e0d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -276,13 +276,23 @@ def _process_multimodal( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input async def _process_multimodal_async( self, @@ -310,13 +320,23 @@ async def _process_multimodal_async( if mm_processor_kwargs is None: mm_processor_kwargs = {} - return mm_processor.apply( + mm_input = mm_processor.apply( prompt, mm_data, hf_processor_mm_kwargs=mm_processor_kwargs, tokenization_kwargs=tokenization_kwargs, mm_hash_overrides=mm_hash_overrides, ) + mm_hashes = mm_input["mm_hashes"] + + # Validate that all mm items have a string as their hash + if not contains_only_strings(mm_hashes): + raise ValueError( + f"mm_hashes must contain only strings, got: {mm_hashes}. " + "This is likely due to an incorrect custom implementation of " + "MultiModalProcessor.apply method.") + + return mm_input def _process_embeds( self, @@ -953,3 +973,15 @@ async def preprocess_async( def clear_cache(self) -> None: if self.mm_processor_cache is not None: self.mm_processor_cache.clear_cache() + + +# Helper function to validate that a nested dictionary contains +# only strings or list of strings as the leaf values. 
+def contains_only_strings(obj: object): + if isinstance(obj, str): + return True + if isinstance(obj, list): + return all(isinstance(x, str) for x in obj) + if isinstance(obj, dict): + return all(contains_only_strings(v) for v in obj.values()) + return False diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index 739396a4932c..453da1a51d98 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -174,9 +174,10 @@ def apply( mm_items = self._to_mm_items(mm_data) tokenization_kwargs = tokenization_kwargs or {} - mm_hashes = (mm_hash_overrides if mm_hash_overrides is not None else - self._hash_mm_items(mm_items, hf_processor_mm_kwargs, - tokenization_kwargs)) + mm_hashes = self._hash_mm_items(mm_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_hash_overrides=mm_hash_overrides) mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]} mm_processed_data = BatchFeature(image_data)