Skip to content

Commit ec8dd2c

Browse files
huachenheli, ywang96 (Roger Wang), DarkLight1337
authored and committed
[Frontend] User-provided uuids for medias in chat. (RFC vllm-project#22044) (vllm-project#23449)
Signed-off-by: Roger Wang <[email protected]> Signed-off-by: Chenheli Hua <[email protected]> Signed-off-by: Roger Wang <[email protected]> Signed-off-by: Cyrus Leung <[email protected]> Co-authored-by: Roger Wang <[email protected]> Co-authored-by: Roger Wang <[email protected]> Co-authored-by: Cyrus Leung <[email protected]>
1 parent b63bb16 commit ec8dd2c

File tree

8 files changed

+1087
-87
lines changed

8 files changed

+1087
-87
lines changed

docs/features/multimodal_inputs.md

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
215215

216216
```python
217217
from vllm import LLM
218-
218+
219219
# Default white background (no configuration needed)
220220
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
221-
221+
222222
# Custom black background for dark theme
223223
llm = LLM(
224224
model="llava-hf/llava-1.5-7b-hf",
225225
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
226226
)
227-
227+
228228
# Custom brand color background (e.g., blue)
229229
llm = LLM(
230-
model="llava-hf/llava-1.5-7b-hf",
230+
model="llava-hf/llava-1.5-7b-hf",
231231
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
232232
)
233233
```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
388388

389389
## Online Serving
390390

391-
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
391+
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests.
392392

393393
!!! important
394394
A chat template is **required** to use Chat Completions API.
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
438438
# NOTE: The prompt formatting with the image token `<image>` is not needed
439439
# since the prompt will be processed automatically by the API server.
440440
{"type": "text", "text": "What’s in this image?"},
441-
{"type": "image_url", "image_url": {"url": image_url}},
441+
{
442+
"type": "image_url",
443+
"image_url": {
444+
"url": image_url
445+
},
446+
"uuid": image_url # Optional
447+
},
442448
],
443449
}],
444450
)
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
454460
"role": "user",
455461
"content": [
456462
{"type": "text", "text": "What are the animals in these images?"},
457-
{"type": "image_url", "image_url": {"url": image_url_duck}},
458-
{"type": "image_url", "image_url": {"url": image_url_lion}},
463+
{
464+
"type": "image_url",
465+
"image_url": {
466+
"url": image_url_duck
467+
},
468+
"uuid": image_url_duck # Optional
469+
},
470+
{
471+
"type": "image_url",
472+
"image_url": {
473+
"url": image_url_lion
474+
},
475+
"uuid": image_url_lion # Optional
476+
},
459477
],
460478
}],
461479
)
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
522540
"video_url": {
523541
"url": video_url
524542
},
543+
"uuid": video_url # Optional
525544
},
526545
],
527546
}],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
613632
"data": audio_base64,
614633
"format": "wav"
615634
},
635+
"uuid": audio_url # Optional
616636
},
617637
],
618638
}],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
642662
"audio_url": {
643663
"url": audio_url
644664
},
665+
"uuid": audio_url # Optional
645666
},
646667
],
647668
}],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
695716
model = "llava-hf/llava-1.5-7b-hf"
696717
embeds = {
697718
"type": "image_embeds",
698-
"image_embeds": f"{base64_image_embedding}"
719+
"image_embeds": f"{base64_image_embedding}",
720+
"uuid": image_url # Optional
699721
}
700722

701723
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
706728
"image_embeds": f"{base64_image_embedding}" , # Required
707729
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
708730
},
731+
"uuid": image_url # Optional
709732
}
710733
model = "openbmb/MiniCPM-V-2_6"
711734
embeds = {
@@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
714737
"image_embeds": f"{base64_image_embedding}" , # Required
715738
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
716739
},
740+
"uuid": image_url # Optional
717741
}
718742
chat_completion = client.chat.completions.create(
719743
messages=[

tests/entrypoints/openai/test_vision.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
436436
)
437437
message = chat_completion.choices[0].message
438438
assert message.content is not None and len(message.content) >= 0
439+
440+
441+
@pytest.mark.asyncio
442+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
443+
@pytest.mark.parametrize(
444+
"image_urls",
445+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
446+
indirect=True)
447+
async def test_completions_with_image(
448+
client: openai.AsyncOpenAI,
449+
model_name: str,
450+
image_urls: list[str],
451+
):
452+
for image_url in image_urls:
453+
chat_completion = await client.chat.completions.create(
454+
messages=[
455+
{
456+
"role": "system",
457+
"content": "You are a helpful assistant."
458+
},
459+
{
460+
"role":
461+
"user",
462+
"content": [
463+
{
464+
"type": "text",
465+
"text": "Describe this image.",
466+
},
467+
{
468+
"type": "image_url",
469+
"image_url": {
470+
"url": image_url,
471+
}
472+
},
473+
],
474+
},
475+
],
476+
model=model_name,
477+
)
478+
assert chat_completion.choices[0].message.content is not None
479+
assert isinstance(chat_completion.choices[0].message.content, str)
480+
assert len(chat_completion.choices[0].message.content) > 0
481+
482+
483+
@pytest.mark.asyncio
484+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
485+
@pytest.mark.parametrize(
486+
"image_urls",
487+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
488+
indirect=True)
489+
async def test_completions_with_image_with_uuid(
490+
client: openai.AsyncOpenAI,
491+
model_name: str,
492+
image_urls: list[str],
493+
):
494+
for image_url in image_urls:
495+
chat_completion = await client.chat.completions.create(
496+
messages=[
497+
{
498+
"role": "system",
499+
"content": "You are a helpful assistant."
500+
},
501+
{
502+
"role":
503+
"user",
504+
"content": [
505+
{
506+
"type": "text",
507+
"text": "Describe this image.",
508+
},
509+
{
510+
"type": "image_url",
511+
"image_url": {
512+
"url": image_url,
513+
},
514+
"uuid": image_url
515+
},
516+
],
517+
},
518+
],
519+
model=model_name,
520+
)
521+
assert chat_completion.choices[0].message.content is not None
522+
assert isinstance(chat_completion.choices[0].message.content, str)
523+
assert len(chat_completion.choices[0].message.content) > 0
524+
525+
526+
@pytest.mark.asyncio
527+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
528+
@pytest.mark.parametrize(
529+
"image_urls",
530+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
531+
indirect=True)
532+
async def test_completions_with_image_with_incorrect_uuid_format(
533+
client: openai.AsyncOpenAI,
534+
model_name: str,
535+
image_urls: list[str],
536+
):
537+
for image_url in image_urls:
538+
chat_completion = await client.chat.completions.create(
539+
messages=[
540+
{
541+
"role": "system",
542+
"content": "You are a helpful assistant."
543+
},
544+
{
545+
"role":
546+
"user",
547+
"content": [
548+
{
549+
"type": "text",
550+
"text": "Describe this image.",
551+
},
552+
{
553+
"type": "image_url",
554+
"image_url": {
555+
"url": image_url,
556+
"incorrect_uuid_key": image_url,
557+
},
558+
"also_incorrect_uuid_key": image_url,
559+
},
560+
],
561+
},
562+
],
563+
model=model_name,
564+
)
565+
assert chat_completion.choices[0].message.content is not None
566+
assert isinstance(chat_completion.choices[0].message.content, str)
567+
assert len(chat_completion.choices[0].message.content) > 0

0 commit comments

Comments
 (0)