Commit 66d433b

[V1] Revert the default max_num_seqs to V0 values for most hardware (vllm-project#16158)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent: 027b204

File tree

3 files changed: +6, -10 lines

docs/source/getting_started/v1_user_guide.md
tests/v1/engine/test_engine_args.py
vllm/engine/arg_utils.py


docs/source/getting_started/v1_user_guide.md

Lines changed: 0 additions & 7 deletions
@@ -156,10 +156,3 @@ vLLM V1 is currently optimized for decoder-only transformers. Models requiring
 cross-attention between separate encoder and decoder are not yet supported (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`).
 
 For a complete list of supported models, see the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html).
-
-## Frequently Asked Questions
-
-**I'm using vLLM V1 and I'm getting CUDA OOM errors. What should I do?**
-
-The default `max_num_seqs` has been raised from `256` in V0 to `1024` in V1. If you encounter CUDA OOM only when using V1 engine, try setting a lower value of `max_num_seqs` or `gpu_memory_utilization`.
-
-On the other hand, if you get an error about insufficient memory for the cache blocks, you should increase `gpu_memory_utilization` as this indicates that your GPU has sufficient memory but you're not allocating enough to vLLM for KV cache blocks.
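
The removed FAQ advice still applies when overriding the defaults by hand; this commit simply makes the implicit default match V0 again on most hardware. A minimal sketch of what that looks like with the offline `vllm.LLM` entry point, assuming both knobs are forwarded to the engine as in the docs and using the small `facebook/opt-125m` model already referenced in this commit's tests:

```python
from vllm import LLM, SamplingParams

# Sketch only: cap the number of concurrently scheduled sequences and the
# fraction of GPU memory vLLM may reserve. Per the removed FAQ, lower
# max_num_seqs (and/or gpu_memory_utilization) if you hit CUDA OOM with V1;
# raise gpu_memory_utilization instead if KV-cache block allocation fails.
llm = LLM(
    model="facebook/opt-125m",
    max_num_seqs=256,            # the V0-era default this commit restores for most hardware
    gpu_memory_utilization=0.80, # lower than the usual 0.90 to leave headroom
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```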

tests/v1/engine/test_engine_args.py

Lines changed: 4 additions & 2 deletions
@@ -64,15 +64,17 @@ def test_defaults_with_usage_context():
         # For H100 and H200, we use larger default values.
         default_llm_tokens = 16384
         default_server_tokens = 8192
+        default_max_num_seqs = 1024
     else:
         default_llm_tokens = 8192
         default_server_tokens = 2048
+        default_max_num_seqs = 256
 
-    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
     assert vllm_config.scheduler_config.max_num_batched_tokens == default_llm_tokens  # noqa: E501
 
     engine_args = EngineArgs(model="facebook/opt-125m")
     vllm_config = engine_args.create_engine_config(
         UsageContext.OPENAI_API_SERVER)
-    assert vllm_config.scheduler_config.max_num_seqs == 1024
+    assert vllm_config.scheduler_config.max_num_seqs == default_max_num_seqs
     assert vllm_config.scheduler_config.max_num_batched_tokens == default_server_tokens  # noqa: E501

vllm/engine/arg_utils.py

Lines changed: 2 additions & 1 deletion
@@ -1666,12 +1666,14 @@ def _set_default_args_v1(self, usage_context: UsageContext) -> None:
                 UsageContext.LLM_CLASS: 16384,
                 UsageContext.OPENAI_API_SERVER: 8192,
             }
+            default_max_num_seqs = 1024
         else:
             # TODO(woosuk): Tune the default values for other hardware.
             default_max_num_batched_tokens = {
                 UsageContext.LLM_CLASS: 8192,
                 UsageContext.OPENAI_API_SERVER: 2048,
             }
+            default_max_num_seqs = 256
 
         use_context_value = usage_context.value if usage_context else None
         if (self.max_num_batched_tokens is None
@@ -1682,7 +1684,6 @@ def _set_default_args_v1(self, usage_context: UsageContext) -> None:
                 "Setting max_num_batched_tokens to %d for %s usage context.",
                 self.max_num_batched_tokens, use_context_value)
 
-        default_max_num_seqs = 1024
         if self.max_num_seqs is None:
             self.max_num_seqs = default_max_num_seqs
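
The net effect of the `arg_utils.py` change is that `default_max_num_seqs` now lives in each hardware branch (1024 for H100/H200, 256 otherwise) and, as before, is only applied when the user leaves `max_num_seqs` unset. A rough sketch of that behaviour through the same `EngineArgs`/`UsageContext` API the updated test exercises, assuming the usual `vllm.usage.usage_lib` import for `UsageContext`; the default you actually observe depends on the GPU the engine detects:

```python
from vllm.engine.arg_utils import EngineArgs
from vllm.usage.usage_lib import UsageContext

# Left unset, max_num_seqs falls back to the hardware-dependent V1 default
# (1024 on H100/H200, 256 on most other devices after this commit).
default_cfg = EngineArgs(model="facebook/opt-125m").create_engine_config(
    UsageContext.LLM_CLASS)
print("default:", default_cfg.scheduler_config.max_num_seqs)

# An explicit value always overrides the hardware-dependent default.
explicit_cfg = EngineArgs(
    model="facebook/opt-125m",
    max_num_seqs=64,
).create_engine_config(UsageContext.LLM_CLASS)
assert explicit_cfg.scheduler_config.max_num_seqs == 64
```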
