Merged
Changes from all commits (40 commits)
e63fe82
Prompt Embeddings Support for v1 Engine
jesse996 Sep 19, 2025
3f29bec
merge
jesse996 Sep 22, 2025
777046f
[Fix] Update input embeddings condition to include prompt embeddings …
jesse996 Sep 22, 2025
c522293
fix param
jesse996 Sep 22, 2025
f239a9b
merge main
jesse996 Sep 25, 2025
a8187a5
format
jesse996 Sep 26, 2025
75cfdd7
add test
jesse996 Sep 26, 2025
6548360
merge main
jesse996 Sep 26, 2025
6d47582
fix test
jesse996 Oct 9, 2025
f706331
fix test
jesse996 Oct 9, 2025
7d6f819
fix test
jesse996 Oct 10, 2025
24706a5
fix test
jesse996 Oct 10, 2025
1526fc1
Merge branch 'main' into enable-prompt-embeds-in-v1
jesse996 Oct 10, 2025
cff8886
fix test
jesse996 Oct 10, 2025
4daf970
fix test
jesse996 Oct 11, 2025
41b6cdc
fix test
jesse996 Oct 11, 2025
565e3bf
fix test
jesse996 Oct 11, 2025
b84600a
fix test
jesse996 Oct 13, 2025
8973cd3
fix test
jesse996 Oct 13, 2025
a32adc3
Merge branch 'vllm-project:main' into enable-prompt-embeds-in-v1
jesse996 Oct 15, 2025
e41c84a
Merge branch 'vllm-project:main' into enable-prompt-embeds-in-v1
jesse996 Oct 16, 2025
aef7626
fix test
jesse996 Oct 16, 2025
60dbffe
fix code
jesse996 Oct 16, 2025
6a5ea17
add example
jesse996 Oct 16, 2025
75ee4a2
fix
jesse996 Oct 16, 2025
7f7f992
Merge branch 'vllm-project:main' into enable-prompt-embeds-in-v1
jesse996 Oct 21, 2025
cf0f217
Merge branch 'vllm-project:main' into enable-prompt-embeds-in-v1
jesse996 Oct 22, 2025
b072b3c
fix comment
jesse996 Oct 23, 2025
be3fe4c
Merge branch 'main' into enable-prompt-embeds-in-v1
jesse996 Oct 24, 2025
dfada67
remove unused
jesse996 Oct 24, 2025
d1327c6
Merge branch 'main' into enable-prompt-embeds-in-v1
jesse996 Oct 26, 2025
407bf75
add test to workflows
jesse996 Oct 26, 2025
d3b9fbb
fix test
jesse996 Oct 26, 2025
60256c4
fix test
jesse996 Oct 26, 2025
b4e4098
fix test
jesse996 Oct 26, 2025
46358ed
fix test
jesse996 Oct 26, 2025
c74dc8f
fix test
jesse996 Oct 27, 2025
e5a5c3c
Merge branch 'main' into enable-prompt-embeds-in-v1
jesse996 Oct 28, 2025
256414b
fix test
jesse996 Oct 28, 2025
182c911
fix test
jesse996 Oct 28, 2025
1 change: 1 addition & 0 deletions .github/workflows/_e2e_test.yaml
@@ -88,6 +88,7 @@ jobs:
  # We found that if running aclgraph tests in batch, it will cause AclmdlRICaptureBegin error. So we run
  # the test separately.
+ pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
  pytest -sv tests/e2e/singlecard/test_aclgraph.py
  pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
  pytest -sv tests/e2e/singlecard/test_bge_model.py
97 changes: 97 additions & 0 deletions examples/prompt_embed_inference.py
@@ -0,0 +1,97 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.

Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct

Requirements:
- vLLM
- transformers

Run:
python examples/prompt_embed_inference.py
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer

from vllm import LLM


def init_tokenizer_and_llm(model_name: str):
llm = LLM(model=model_name, enable_prompt_embeds=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
return tokenizer, embedding_layer, llm


def get_prompt_embeds(
chat: list[dict[str, str]],
tokenizer: PreTrainedTokenizer,
embedding_layer: torch.nn.Module,
):
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds


def single_prompt_inference(
llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

outputs = llm.generate(
{
"prompt_embeds": prompt_embeds,
}
)

print("\n[Single Inference Output]")
print("-" * 30)
for o in outputs:
print(o.outputs[0].text)
print("-" * 30)


def batch_prompt_inference(
llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
chats = [
[{"role": "user", "content": "Please tell me about the capital of France."}],
[{"role": "user", "content": "When is the day longest during the year?"}],
[{"role": "user", "content": "Where is bigger, the moon or the sun?"}],
]

prompt_embeds_list = [
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
]

outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])

print("\n[Batch Inference Outputs]")
print("-" * 30)
for i, o in enumerate(outputs):
print(f"Q{i + 1}: {chats[i][0]['content']}")
print(f"A{i + 1}: {o.outputs[0].text}\n")
print("-" * 30)


def main():
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
single_prompt_inference(llm, tokenizer, embedding_layer)
batch_prompt_inference(llm, tokenizer, embedding_layer)


if __name__ == "__main__":
main()
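
A note on sampling: the example above relies on vLLM's default sampling settings. The sketch below is an illustration, not part of this PR; it assumes the same gated Llama model and helper logic as the example, and uses vLLM's standard SamplingParams together with a prompt-embeddings input.

from transformers import AutoModelForCausalLM, AutoTokenizer
from vllm import LLM, SamplingParams

model_name = "meta-llama/Llama-3.2-1B-Instruct"
llm = LLM(model=model_name, enable_prompt_embeds=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
embedding_layer = AutoModelForCausalLM.from_pretrained(model_name).get_input_embeddings()

chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
prompt_embeds = embedding_layer(token_ids).squeeze(0)  # shape: (seq_len, hidden_size)

# Greedy decoding with a bounded output length.
sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate({"prompt_embeds": prompt_embeds}, sampling_params)
print(outputs[0].outputs[0].text)
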
197 changes: 197 additions & 0 deletions tests/e2e/singlecard/test_completion_with_prompt_embeds.py
@@ -0,0 +1,197 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
import os

import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer

from tests.e2e.conftest import VllmRunner

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


def get_prompt_embeds(chat, tokenizer, embedding_layer):
"""Convert chat messages to prompt embeddings."""
token_ids = tokenizer.apply_chat_template(chat,
add_generation_prompt=True,
return_tensors='pt')
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds


@pytest.mark.parametrize("model_name", MODELS)
def test_single_prompt_embeds_inference(model_name):
"""Test single prompt inference with prompt embeddings."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()

chat = [{
"role": "user",
"content": "Please tell me about the capital of France."
}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

# Run inference with prompt embeddings
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
outputs = vllm_runner.model.generate({
"prompt_embeds": prompt_embeds,
})

# Verify output
assert len(outputs) == 1
assert len(outputs[0].outputs) > 0
assert len(outputs[0].outputs[0].text) > 0
print(f"\n[Single Inference Output]: {outputs[0].outputs[0].text}")


@pytest.mark.parametrize("model_name", MODELS)
def test_batch_prompt_embeds_inference(model_name):
"""Test batch prompt inference with prompt embeddings."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()

chats = [[{
"role": "user",
"content": "Please tell me about the capital of France."
}],
[{
"role": "user",
"content": "When is the day longest during the year?"
}],
[{
"role": "user",
"content": "Which is bigger, the moon or the sun?"
}]]

prompt_embeds_list = [
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
]

# Run batch inference with prompt embeddings
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
outputs = vllm_runner.model.generate([{
"prompt_embeds": embeds
} for embeds in prompt_embeds_list])

# Verify outputs
assert len(outputs) == len(chats)
for i, output in enumerate(outputs):
assert len(output.outputs) > 0
assert len(output.outputs[0].text) > 0
print(f"\nQ{i+1}: {chats[i][0]['content']}")
print(f"A{i+1}: {output.outputs[0].text}")


@pytest.mark.parametrize("model_name", MODELS)
def test_prompt_embeds_with_aclgraph(model_name):
"""Test prompt embeddings with ACL graph enabled vs disabled."""
# Prepare prompt embeddings
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()

chat = [{"role": "user", "content": "What is the capital of China?"}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

# Run with ACL graph enabled (enforce_eager=False)
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=False,
) as vllm_aclgraph_runner:
aclgraph_outputs = vllm_aclgraph_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})

# Run with ACL graph disabled (enforce_eager=True)
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_eager_runner:
eager_outputs = vllm_eager_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})

# Verify both produce valid outputs
assert len(aclgraph_outputs) == 1
assert len(eager_outputs) == 1
assert len(aclgraph_outputs[0].outputs[0].text) > 0
assert len(eager_outputs[0].outputs[0].text) > 0

print("\n[ACL Graph Output]:", aclgraph_outputs[0].outputs[0].text)
print("[Eager Output]:", eager_outputs[0].outputs[0].text)

# Note: Outputs may differ slightly due to different execution paths,
# but both should be valid responses


@pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name):
"""Test mixed inputs with both prompt embeddings and text prompts."""
# Prepare prompt embeddings for first request
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()

chat = [{"role": "user", "content": "What is AI?"}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)

# Prepare text prompt for second request
text_prompt = "What is machine learning?"

# Run inference with mixed inputs
with VllmRunner(
model_name,
enable_prompt_embeds=True,
enforce_eager=True,
) as vllm_runner:
# Test prompt embeddings
embeds_output = vllm_runner.model.generate({
"prompt_embeds":
prompt_embeds,
})

# Test text prompt
text_output = vllm_runner.model.generate(text_prompt)

# Verify both types of inputs work
assert len(embeds_output) == 1
assert len(text_output) == 1
assert len(embeds_output[0].outputs[0].text) > 0
assert len(text_output[0].outputs[0].text) > 0

print("\n[Prompt Embeds Output]:", embeds_output[0].outputs[0].text)
print("[Text Prompt Output]:", text_output[0].outputs[0].text)