Commit af826e0

[V0 deprecation] Remove VLLM_USE_V1 usage in config module (#27784)
Signed-off-by: wangxiyuan <[email protected]>
1 parent e806178 commit af826e0

4 files changed: +9 additions, -62 deletions
vllm/config/lora.py
Lines changed: 0 additions & 5 deletions

@@ -9,7 +9,6 @@
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self

-import vllm.envs as envs
 from vllm.config.utils import config
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -106,10 +105,6 @@ def _validate_lora_config(self) -> Self:

         return self

-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError("V0 LoRA does not support CPU offload, please use V1.")
-
     def verify_with_model_config(self, model_config: ModelConfig):
         if self.lora_dtype in (None, "auto"):
             self.lora_dtype = model_config.dtype
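The removed verify_with_cache_config was the only V0-era coupling between the LoRA and cache configs: it rejected CPU offload (cpu_offload_gb > 0) whenever VLLM_USE_V1 was unset. On the V1 engine the combination is allowed, so both the method and its call site in vllm/config/vllm.py (below) go away. A minimal sketch of the setup the old check used to reject; the model name is illustrative only, not part of this commit:

    from vllm import LLM

    # LoRA together with CPU offload: previously raised
    # "V0 LoRA does not support CPU offload, please use V1." when VLLM_USE_V1 was off.
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model
        enable_lora=True,
        cpu_offload_gb=4,
    )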

vllm/config/model.py
Lines changed: 2 additions & 23 deletions

@@ -32,7 +32,6 @@
     get_pooling_config,
     get_sentence_transformer_tokenizer_config,
     is_encoder_decoder,
-    is_interleaved,
     try_get_dense_modules,
     try_get_generation_config,
     try_get_safetensors_metadata,
@@ -442,15 +441,12 @@ def __post_init__(
             self.enforce_eager = True

         # Set the default seed to 0 in V1.
-        # NOTE(woosuk): In V0, we set the default seed to None because the
-        # driver worker shares the same process as the user process, and thus
-        # setting a seed affects the user process as well.
-        # In V1, we use separate processes for workers (unless
+        # NOTE(woosuk): In V1, we use separate processes for workers (unless
         # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here
         # doesn't affect the user process. However, without a consistent seed,
         # different tensor parallel workers would sample different tokens,
         # leading to inconsistent results.
-        if envs.VLLM_USE_V1 and self.seed is None:
+        if self.seed is None:
            self.seed = 0
            if not envs.VLLM_ENABLE_V1_MULTIPROCESSING:
                logger.warning(
@@ -703,23 +699,6 @@ def _task_to_convert(task: TaskOption) -> ConvertType:
                revision=self.revision,
            )

-        # Interleaved attention is not supported by some backends in V0
-        if (
-            not self.disable_sliding_window
-            and is_interleaved(self.hf_text_config)
-            and not envs.VLLM_USE_V1
-            and (backend := envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER")
-        ):
-            logger.warning_once(
-                "%s has interleaved attention, which is currently not "
-                "supported by the %s backend. Disabling sliding window and "
-                "capping the max length to the sliding window size (%d).",
-                self.hf_text_config.model_type,
-                backend,
-                self.hf_text_config.sliding_window,
-            )
-            self.disable_sliding_window = True
-
         self.original_max_model_len = self.max_model_len
         self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
         # Init multimodal config if needed
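Net effect of the seed change: when no seed is supplied, ModelConfig now always defaults it to 0 (previously only when VLLM_USE_V1 was set), so tensor-parallel workers sample consistently by default. A short sketch using the public API; the model name is illustrative, and a per-request seed is just one way to opt back into varied sampling:

    from vllm import LLM, SamplingParams

    # With the default seed now 0 whenever none is passed, repeated runs are
    # reproducible out of the box; pass an explicit seed to vary sampling.
    llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct")  # illustrative model
    params = SamplingParams(temperature=0.8, seed=1234)
    outputs = llm.generate(["The capital of France is"], params)
    print(outputs[0].outputs[0].text)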

vllm/config/speculative.py
Lines changed: 0 additions & 7 deletions

@@ -9,7 +9,6 @@
 from pydantic.dataclasses import dataclass
 from typing_extensions import Self

-import vllm.envs as envs
 from vllm.config.parallel import ParallelConfig
 from vllm.config.utils import config
 from vllm.logger import init_logger
@@ -366,12 +365,6 @@ def __post_init__(self):

         # Replace hf_config for EAGLE draft_model
         if self.method in ("eagle", "eagle3"):
-            if self.enable_chunked_prefill and not envs.VLLM_USE_V1:
-                raise ValueError(
-                    "Chunked prefill and EAGLE are not compatible "
-                    "when using V0."
-                )
-
             from vllm.transformers_utils.configs import SpeculatorsConfig
             from vllm.transformers_utils.configs.eagle import EAGLEConfig
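The deleted branch was SpeculativeConfig's only V0 guard: EAGLE combined with chunked prefill was rejected outright when VLLM_USE_V1 was unset. A hedged sketch of the combination this config check no longer blocks; both model names and the exact speculative_config keys are illustrative, not taken from this commit:

    from vllm import LLM

    # EAGLE speculative decoding with chunked prefill enabled: the removed
    # V0-only ValueError no longer applies on the V1 engine.
    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative target model
        speculative_config={
            "method": "eagle",
            "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",  # illustrative draft
            "num_speculative_tokens": 3,
        },
        enable_chunked_prefill=True,
    )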

vllm/config/vllm.py
Lines changed: 7 additions & 27 deletions

@@ -130,7 +130,6 @@ def compute_hash(self) -> str:
         from vllm import __version__

         vllm_factors.append(__version__)
-        vllm_factors.append(envs.VLLM_USE_V1)
         if self.model_config:
             vllm_factors.append(self.model_config.compute_hash())
         else:
@@ -306,7 +305,6 @@ def __post_init__(self):
         self.cache_config.verify_with_parallel_config(self.parallel_config)

         if self.lora_config is not None:
-            self.lora_config.verify_with_cache_config(self.cache_config)
             self.lora_config.verify_with_model_config(self.model_config)

         if self.quant_config is None and self.model_config is not None:
@@ -332,18 +330,9 @@ def __post_init__(self):
         # we use the default mode. The default mode depends on other
         # settings (see the below code).
         if self.compilation_config.mode is None:
-            if envs.VLLM_USE_V1:
-                if (
-                    self.model_config is not None
-                    and not self.model_config.enforce_eager
-                ):
-                    self.compilation_config.mode = CompilationMode.VLLM_COMPILE
-                else:
-                    self.compilation_config.mode = CompilationMode.NONE
-
+            if self.model_config is not None and not self.model_config.enforce_eager:
+                self.compilation_config.mode = CompilationMode.VLLM_COMPILE
             else:
-                # NB: Passing both --enforce-eager and a compilation mode
-                # in V0 means the compilation mode wins out.
                 self.compilation_config.mode = CompilationMode.NONE
         else:
             assert self.compilation_config.mode >= CompilationMode.NONE
@@ -371,10 +360,7 @@ def __post_init__(self):
         # if cudagraph_mode is not explicitly set by users, set default
         # value
         if self.compilation_config.cudagraph_mode is None:
-            if (
-                envs.VLLM_USE_V1
-                and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
-            ):
+            if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
                 # default to full and piecewise for most models
                 self.compilation_config.cudagraph_mode = (
                     CUDAGraphMode.FULL_AND_PIECEWISE
@@ -428,7 +414,7 @@ def __post_init__(self):
             # override related settings when enforce eager
             self.compilation_config.max_cudagraph_capture_size = 0
             self.compilation_config.cudagraph_capture_sizes = []
-        elif envs.VLLM_USE_V1:
+        else:
             self.compilation_config.cudagraph_num_of_warmups = 1

         self._set_cudagraph_sizes()
@@ -535,14 +521,11 @@ def __post_init__(self):
         current_platform.check_and_update_config(self)

         # Do this after all the updates to compilation_config.mode
-        if (
-            envs.VLLM_USE_V1
-            and self.compilation_config.mode == CompilationMode.VLLM_COMPILE
-        ):
+        if self.compilation_config.mode == CompilationMode.VLLM_COMPILE:
             self.compilation_config.set_splitting_ops_for_v1()

         # final check of cudagraph mode after all possible updates
-        if envs.VLLM_USE_V1 and current_platform.is_cuda_alike():
+        if current_platform.is_cuda_alike():
             if (
                 self.compilation_config.cudagraph_mode.has_full_cudagraphs()
                 and self.model_config is not None
@@ -587,10 +570,7 @@ def __post_init__(self):
         if not self.instance_id:
             self.instance_id = random_uuid()[:5]

-        if (
-            envs.VLLM_USE_V1
-            and not self.scheduler_config.disable_hybrid_kv_cache_manager
-        ):
+        if not self.scheduler_config.disable_hybrid_kv_cache_manager:
             # logger should only print warning message for hybrid models. As we
             # can't know whether the model is hybrid or not now, so we don't log
             # warning message here and will log it later.
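Taken together, the vllm.py changes collapse the compilation-mode and CUDA-graph defaulting onto the single V1 path. A self-contained sketch of the resulting decision logic, kept separate from the real VllmConfig; the enums are stubs with illustrative values and the helper names are hypothetical:

    from enum import Enum

    class CompilationMode(Enum):  # stub of vLLM's enum, values illustrative
        NONE = 0
        VLLM_COMPILE = 1

    class CUDAGraphMode(Enum):  # stub, values illustrative
        NONE = 0
        FULL_AND_PIECEWISE = 1

    def default_compilation_mode(has_model_config: bool, enforce_eager: bool) -> CompilationMode:
        # Mirrors the simplified branch: compile unless eager execution is forced
        # or there is no model config; VLLM_USE_V1 is no longer consulted.
        if has_model_config and not enforce_eager:
            return CompilationMode.VLLM_COMPILE
        return CompilationMode.NONE

    def default_cudagraph_mode(mode: CompilationMode) -> CUDAGraphMode:
        # Mirrors the simplified cudagraph default: full-and-piecewise when
        # compiling. The non-compile fallback sits outside this diff and is
        # shown as NONE purely for illustration.
        if mode is CompilationMode.VLLM_COMPILE:
            return CUDAGraphMode.FULL_AND_PIECEWISE
        return CUDAGraphMode.NONE

    assert default_compilation_mode(True, False) is CompilationMode.VLLM_COMPILE
    assert default_cudagraph_mode(CompilationMode.NONE) is CUDAGraphMode.NONE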
