Commit 0ce743f

Fix(llm): Abort orphaned requests when llm.chat() batch fails Fixes #26081 (#27420)
Signed-off-by: vensenmu <[email protected]>
1 parent 6c317a6 commit 0ce743f
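
The bug this addresses (#26081): when one conversation in a batched llm.chat() call failed validation part-way through, for example because its prompt exceeded max_model_len, the requests that had already been enqueued were never aborted, so they stayed in the engine queue and polluted the next call. The patch records each request ID as it is added and aborts all of them if a later prompt raises. Below is a minimal sketch of the user-visible behavior, mirroring the regression test added in this commit (the model name and length limits are simply the values that test happens to use):

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",  # same small model the test uses
    max_model_len=128,                         # small limit so a long prompt fails validation
    enforce_eager=True,
)
params = SamplingParams(temperature=0, max_tokens=10)

valid = [{"role": "user", "content": "Hello"}]
too_long = [{"role": "user", "content": "This is a very long text to test the error " * 50}]

# The third conversation exceeds max_model_len, so the whole batched call raises.
try:
    llm.chat([valid, valid, too_long], sampling_params=params)
except ValueError:
    pass

# Before this fix, the two valid requests enqueued above were left behind and
# leaked into the next call; with the fix they are aborted, so the next batch is clean.
outputs = llm.chat([valid, valid], sampling_params=params)
assert len(outputs) == 2
assert llm.llm_engine.get_num_unfinished_requests() == 0
```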

2 files changed: +75 −14 lines
tests/entrypoints/llm/test_chat.py (53 additions, 0 deletions)

```diff
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.sampling_params import SamplingParams
 
 from ..openai.test_vision import TEST_IMAGE_ASSETS
 
@@ -23,6 +24,29 @@ def text_llm():
     cleanup_dist_env_and_memory()
 
 
+@pytest.fixture(scope="function")
+def llm_for_failure_test():
+    """
+    Fixture for testing issue #26081.
+    Uses a small max_model_len to easily trigger length errors.
+    """
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        seed=0,
+        max_model_len=128,
+        disable_log_stats=True,
+    )
+
+    yield weakref.proxy(llm)
+
+    del llm
+
+    cleanup_dist_env_and_memory()
+
+
 def test_chat(text_llm):
     prompt1 = "Explain the concept of entropy."
     messages = [
@@ -157,3 +181,32 @@ def test_chat_extra_kwargs(thinking_llm, enable_thinking):
     else:
         # The chat template includes dummy thinking process
         assert think_id in prompt_token_ids
+
+
+def test_chat_batch_failure_cleanup(llm_for_failure_test):
+    """
+    Tests that if a batch call to llm.chat() fails mid-way
+    (e.g., due to one invalid prompt), the requests that
+    were already enqueued are properly aborted and do not
+    pollute the queue for subsequent calls.
+    (Fixes Issue #26081)
+    """
+    llm = llm_for_failure_test
+    valid_msg = [{"role": "user", "content": "Hello"}]
+    long_text = "This is a very long text to test the error " * 50
+    invalid_msg = [{"role": "user", "content": long_text}]
+    batch_1 = [
+        valid_msg,
+        valid_msg,
+        invalid_msg,
+    ]
+    batch_2 = [
+        valid_msg,
+        valid_msg,
+    ]
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+    with pytest.raises(ValueError, match="longer than the maximum model length"):
+        llm.chat(batch_1, sampling_params=sampling_params)
+    outputs_2 = llm.chat(batch_2, sampling_params=sampling_params)
+    assert len(outputs_2) == len(batch_2)
+    assert llm.llm_engine.get_num_unfinished_requests() == 0
```

vllm/entrypoints/llm.py (22 additions, 14 deletions)

```diff
@@ -1588,20 +1588,27 @@ def _validate_and_add_requests(
             tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
             it = tqdm_func(it, desc="Adding requests")
 
-        for i, prompt in enumerate(it):
-            if isinstance(prompt, dict):
-                self._validate_mm_data_and_uuids(
-                    prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids")
-                )
+        added_request_ids: list[str] = []
 
-            self._add_request(
-                prompt,
-                params[i] if isinstance(params, Sequence) else params,
-                lora_request=lora_request[i]
-                if isinstance(lora_request, Sequence)
-                else lora_request,
-                priority=priority[i] if priority else 0,
-            )
+        try:
+            for i, prompt in enumerate(it):
+                if isinstance(prompt, dict):
+                    self._validate_mm_data_and_uuids(
+                        prompt.get("multi_modal_data"), prompt.get("multi_modal_uuids")
+                    )
+                request_id = self._add_request(
+                    prompt,
+                    params[i] if isinstance(params, Sequence) else params,
+                    lora_request=lora_request[i]
+                    if isinstance(lora_request, Sequence)
+                    else lora_request,
+                    priority=priority[i] if priority else 0,
+                )
+                added_request_ids.append(request_id)
+        except Exception as e:
+            if added_request_ids:
+                self.llm_engine.abort_request(added_request_ids)
+            raise e
 
     def _validate_mm_data_and_uuids(
         self,
@@ -1684,7 +1691,7 @@ def _add_request(
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
         priority: int = 0,
-    ) -> None:
+    ) -> str:
        prompt_text, _, _ = get_prompt_components(prompt)
        request_id = str(next(self.request_counter))
 
@@ -1705,6 +1712,7 @@ def _add_request(
             priority=priority,
             prompt_text=prompt_text,
         )
+        return request_id
 
     def _run_engine(
         self, *, use_tqdm: bool | Callable[..., tqdm] = True
```