
Commit 4721e4f

[bugfix] async scheduler bug fix (#4968)
### What this PR does / why we need it?

vllm-ascend now uses vLLM's AsyncGPUModelRunnerOutput; the AsyncNPUModelRunnerOutput it used before is outdated, so this fixes the call sites to match.

- vLLM version: v0.12.0
- vLLM main: vllm-project/vllm@ad32e3e

Signed-off-by: zhenwenqi2024 <[email protected]>
1 parent 3581946 commit 4721e4f
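
For context, here is a minimal, self-contained sketch of the idea behind AsyncGPUModelRunnerOutput. This is not vllm-ascend's actual implementation and the class and variable names are illustrative: sampled token IDs are copied device-to-host on a separate stream, and the caller only blocks on that copy when it actually reads the result, so the next model step can overlap with the transfer.

```python
import torch


class AsyncSampledTokens:
    """Illustrative stand-in for the async-output pattern (hypothetical class)."""

    def __init__(self, sampled_token_ids: torch.Tensor,
                 copy_stream: torch.cuda.Stream):
        compute_stream = torch.cuda.current_stream()
        self._done = torch.cuda.Event()
        with torch.cuda.stream(copy_stream):
            # Make sure sampling has finished on the compute stream before
            # the copy stream reads the tensor.
            copy_stream.wait_stream(compute_stream)
            # Async device-to-host copy; a pinned destination buffer would
            # make this truly non-blocking in production code.
            self._host_ids = sampled_token_ids.to("cpu", non_blocking=True)
            self._done.record(copy_stream)

    def get(self) -> torch.Tensor:
        # Block only when the scheduler actually consumes the tokens; until
        # then, the next forward pass can overlap with the copy.
        self._done.synchronize()
        return self._host_ids


if torch.cuda.is_available():
    stream = torch.cuda.Stream()
    ids = torch.randint(0, 1000, (4,), device="cuda")
    out = AsyncSampledTokens(ids, stream)
    print(out.get())
```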

File tree

3 files changed: +10 -3 lines

.github/workflows/_e2e_test.yaml

Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ jobs:
           pytest -sv tests/e2e/singlecard/test_completion_with_prompt_embeds.py
           pytest -sv tests/e2e/singlecard/test_aclgraph_accuracy.py
           pytest -sv tests/e2e/singlecard/test_aclgraph_mem.py
+          pytest -sv tests/e2e/singlecard/test_async_scheduling.py
           pytest -sv tests/e2e/singlecard/test_camem.py
           pytest -sv tests/e2e/singlecard/test_guided_decoding.py
           # torch 2.8 doesn't work with lora, fix me
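
The newly wired-in test can be run locally with the same command the workflow uses: `pytest -sv tests/e2e/singlecard/test_async_scheduling.py`.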

tests/e2e/singlecard/test_async_scheduling.py

Lines changed: 6 additions & 2 deletions

@@ -17,8 +17,12 @@
 
 first_prompt = ("The following numbers of the sequence " +
                 ", ".join(str(i) for i in range(10)) + " are:")
-example_prompts = [first_prompt, "In one word, the capital of France is "
-                   ] + [f"Tell me about the number {i}: " for i in range(32)]
+example_prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
 
 default_params = dict(
     temperature=0.0,  # greedy
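
A hedged sketch of what this test exercises, assuming the async_scheduling engine option present in recent vLLM releases and a placeholder model name (neither is taken from this diff): with greedy sampling, outputs with and without async scheduling should match.

```python
from vllm import LLM, SamplingParams

example_prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
greedy = SamplingParams(temperature=0.0, max_tokens=32)  # greedy decoding


def generate_texts(async_scheduling: bool) -> list[str]:
    # Hypothetical model choice for illustration; the real test configures
    # its own model. In practice each engine would run in its own process,
    # as the e2e harness does.
    llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct",
              async_scheduling=async_scheduling)
    outputs = llm.generate(example_prompts, greedy)
    return [o.outputs[0].text for o in outputs]


# Greedy decoding should be stable across scheduling modes.
assert generate_texts(True) == generate_texts(False)
```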

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 1 deletion

@@ -1875,8 +1875,10 @@ def propose_draft_token_ids(sampled_token_ids):
         return AsyncGPUModelRunnerOutput(
             model_runner_output=model_runner_output,
             sampled_token_ids=sampled_token_ids,
+            logprobs_tensors=sampler_output.logprobs_tensors,
             invalid_req_indices=invalid_req_indices,
             async_output_copy_stream=self.async_output_copy_stream,
+            vocab_size=self.input_batch.vocab_size,
         )
 
     def _build_dummy_attn_metadata(

@@ -3472,7 +3474,7 @@ def __init__(self, *args, **kwargs) -> None:
 
         try:
             # replace cuda APIs with xpu APIs, this should work by default
-            torch.cuda.Event = _EventPlaceholder
+            torch.cuda.Event = torch.npu.Event
             torch.cuda.Stream = torch.npu.Stream
             torch.cuda.default_stream = torch.npu.default_stream
             torch.cuda.current_stream = torch.npu.current_stream
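
The second hunk matters for the first: AsyncGPUModelRunnerOutput records and synchronizes real events, so aliasing torch.cuda.Event to a placeholder no longer suffices. Below is a minimal sketch of the aliasing pattern, assuming the torch_npu plugin is installed (it registers the torch.npu namespace on import).

```python
import torch

try:
    import torch_npu  # noqa: F401  # registers torch.npu on import

    # Point the CUDA stream/event entry points at their NPU counterparts so
    # CUDA-only call sites in upstream vLLM work unmodified on Ascend.
    torch.cuda.Event = torch.npu.Event
    torch.cuda.Stream = torch.npu.Stream
    torch.cuda.default_stream = torch.npu.default_stream
    torch.cuda.current_stream = torch.npu.current_stream
except ImportError:
    # No Ascend runtime available; leave the CUDA APIs untouched.
    pass
```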
