Skip to content

Commit ec8dd2c

Browse files
huachenheli, ywang96 (Roger Wang), DarkLight1337
authored and committed
[Frontend] User-provided uuids for medias in chat. (RFC vllm-project#22044) (vllm-project#23449)
Signed-off-by: Roger Wang <[email protected]> Signed-off-by: Chenheli Hua <[email protected]> Signed-off-by: Roger Wang <[email protected]> Signed-off-by: Cyrus Leung <[email protected]> Co-authored-by: Roger Wang <[email protected]> Co-authored-by: Roger Wang <[email protected]> Co-authored-by: Cyrus Leung <[email protected]>
1 parent b63bb16 commit ec8dd2c

File tree

8 files changed

+1087
-87
lines changed

8 files changed

+1087
-87
lines changed

docs/features/multimodal_inputs.md

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -215,19 +215,19 @@ When loading RGBA images (images with transparency), vLLM converts them to RGB f
215215

216216
```python
217217
from vllm import LLM
218-
218+
219219
# Default white background (no configuration needed)
220220
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
221-
221+
222222
# Custom black background for dark theme
223223
llm = LLM(
224224
model="llava-hf/llava-1.5-7b-hf",
225225
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 0]}}
226226
)
227-
227+
228228
# Custom brand color background (e.g., blue)
229229
llm = LLM(
230-
model="llava-hf/llava-1.5-7b-hf",
230+
model="llava-hf/llava-1.5-7b-hf",
231231
media_io_kwargs={"image": {"rgba_background_color": [0, 0, 255]}}
232232
)
233233
```
@@ -388,7 +388,7 @@ For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embedd
388388

389389
## Online Serving
390390

391-
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
391+
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Media inputs also support optional UUIDs users can provide to uniquely identify each media, which is used to cache the media results across requests.
392392

393393
!!! important
394394
A chat template is **required** to use Chat Completions API.
@@ -438,7 +438,13 @@ Then, you can use the OpenAI client as follows:
438438
# NOTE: The prompt formatting with the image token `<image>` is not needed
439439
# since the prompt will be processed automatically by the API server.
440440
{"type": "text", "text": "What’s in this image?"},
441-
{"type": "image_url", "image_url": {"url": image_url}},
441+
{
442+
"type": "image_url",
443+
"image_url": {
444+
"url": image_url
445+
},
446+
"uuid": image_url # Optional
447+
},
442448
],
443449
}],
444450
)
@@ -454,8 +460,20 @@ Then, you can use the OpenAI client as follows:
454460
"role": "user",
455461
"content": [
456462
{"type": "text", "text": "What are the animals in these images?"},
457-
{"type": "image_url", "image_url": {"url": image_url_duck}},
458-
{"type": "image_url", "image_url": {"url": image_url_lion}},
463+
{
464+
"type": "image_url",
465+
"image_url": {
466+
"url": image_url_duck
467+
},
468+
"uuid": image_url_duck # Optional
469+
},
470+
{
471+
"type": "image_url",
472+
"image_url": {
473+
"url": image_url_lion
474+
},
475+
"uuid": image_url_lion # Optional
476+
},
459477
],
460478
}],
461479
)
@@ -522,6 +540,7 @@ Then, you can use the OpenAI client as follows:
522540
"video_url": {
523541
"url": video_url
524542
},
543+
"uuid": video_url # Optional
525544
},
526545
],
527546
}],
@@ -613,6 +632,7 @@ Then, you can use the OpenAI client as follows:
613632
"data": audio_base64,
614633
"format": "wav"
615634
},
635+
"uuid": audio_url # Optional
616636
},
617637
],
618638
}],
@@ -642,6 +662,7 @@ Alternatively, you can pass `audio_url`, which is the audio counterpart of `imag
642662
"audio_url": {
643663
"url": audio_url
644664
},
665+
"uuid": audio_url # Optional
645666
},
646667
],
647668
}],
@@ -695,7 +716,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
695716
model = "llava-hf/llava-1.5-7b-hf"
696717
embeds = {
697718
"type": "image_embeds",
698-
"image_embeds": f"{base64_image_embedding}"
719+
"image_embeds": f"{base64_image_embedding}",
720+
"uuid": image_url # Optional
699721
}
700722

701723
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
@@ -706,6 +728,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
706728
"image_embeds": f"{base64_image_embedding}" , # Required
707729
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
708730
},
731+
"uuid": image_url # Optional
709732
}
710733
model = "openbmb/MiniCPM-V-2_6"
711734
embeds = {
@@ -714,6 +737,7 @@ The following example demonstrates how to pass image embeddings to the OpenAI se
714737
"image_embeds": f"{base64_image_embedding}" , # Required
715738
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
716739
},
740+
"uuid": image_url # Optional
717741
}
718742
chat_completion = client.chat.completions.create(
719743
messages=[

tests/entrypoints/openai/test_vision.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,3 +436,132 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
436436
)
437437
message = chat_completion.choices[0].message
438438
assert message.content is not None and len(message.content) >= 0
439+
440+
441+
@pytest.mark.asyncio
442+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
443+
@pytest.mark.parametrize(
444+
"image_urls",
445+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
446+
indirect=True)
447+
async def test_completions_with_image(
448+
client: openai.AsyncOpenAI,
449+
model_name: str,
450+
image_urls: list[str],
451+
):
452+
for image_url in image_urls:
453+
chat_completion = await client.chat.completions.create(
454+
messages=[
455+
{
456+
"role": "system",
457+
"content": "You are a helpful assistant."
458+
},
459+
{
460+
"role":
461+
"user",
462+
"content": [
463+
{
464+
"type": "text",
465+
"text": "Describe this image.",
466+
},
467+
{
468+
"type": "image_url",
469+
"image_url": {
470+
"url": image_url,
471+
}
472+
},
473+
],
474+
},
475+
],
476+
model=model_name,
477+
)
478+
assert chat_completion.choices[0].message.content is not None
479+
assert isinstance(chat_completion.choices[0].message.content, str)
480+
assert len(chat_completion.choices[0].message.content) > 0
481+
482+
483+
@pytest.mark.asyncio
484+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
485+
@pytest.mark.parametrize(
486+
"image_urls",
487+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
488+
indirect=True)
489+
async def test_completions_with_image_with_uuid(
490+
client: openai.AsyncOpenAI,
491+
model_name: str,
492+
image_urls: list[str],
493+
):
494+
for image_url in image_urls:
495+
chat_completion = await client.chat.completions.create(
496+
messages=[
497+
{
498+
"role": "system",
499+
"content": "You are a helpful assistant."
500+
},
501+
{
502+
"role":
503+
"user",
504+
"content": [
505+
{
506+
"type": "text",
507+
"text": "Describe this image.",
508+
},
509+
{
510+
"type": "image_url",
511+
"image_url": {
512+
"url": image_url,
513+
},
514+
"uuid": image_url
515+
},
516+
],
517+
},
518+
],
519+
model=model_name,
520+
)
521+
assert chat_completion.choices[0].message.content is not None
522+
assert isinstance(chat_completion.choices[0].message.content, str)
523+
assert len(chat_completion.choices[0].message.content) > 0
524+
525+
526+
@pytest.mark.asyncio
527+
@pytest.mark.parametrize("model_name", [MODEL_NAME])
528+
@pytest.mark.parametrize(
529+
"image_urls",
530+
[TEST_IMAGE_ASSETS[:i] for i in range(2, len(TEST_IMAGE_ASSETS))],
531+
indirect=True)
532+
async def test_completions_with_image_with_incorrect_uuid_format(
533+
client: openai.AsyncOpenAI,
534+
model_name: str,
535+
image_urls: list[str],
536+
):
537+
for image_url in image_urls:
538+
chat_completion = await client.chat.completions.create(
539+
messages=[
540+
{
541+
"role": "system",
542+
"content": "You are a helpful assistant."
543+
},
544+
{
545+
"role":
546+
"user",
547+
"content": [
548+
{
549+
"type": "text",
550+
"text": "Describe this image.",
551+
},
552+
{
553+
"type": "image_url",
554+
"image_url": {
555+
"url": image_url,
556+
"incorrect_uuid_key": image_url,
557+
},
558+
"also_incorrect_uuid_key": image_url,
559+
},
560+
],
561+
},
562+
],
563+
model=model_name,
564+
)
565+
assert chat_completion.choices[0].message.content is not None
566+
assert isinstance(chat_completion.choices[0].message.content, str)
567+
assert len(chat_completion.choices[0].message.content) > 0

0 commit comments

Comments
 (0)