@@ -130,7 +130,6 @@ def compute_hash(self) -> str:
130130 from vllm import __version__
131131
132132 vllm_factors .append (__version__ )
133- vllm_factors .append (envs .VLLM_USE_V1 )
134133 if self .model_config :
135134 vllm_factors .append (self .model_config .compute_hash ())
136135 else :
@@ -306,7 +305,6 @@ def __post_init__(self):
306305 self .cache_config .verify_with_parallel_config (self .parallel_config )
307306
308307 if self .lora_config is not None :
309- self .lora_config .verify_with_cache_config (self .cache_config )
310308 self .lora_config .verify_with_model_config (self .model_config )
311309
312310 if self .quant_config is None and self .model_config is not None :
@@ -332,18 +330,9 @@ def __post_init__(self):
332330 # we use the default mode. The default mode depends on other
333331 # settings (see the below code).
334332 if self .compilation_config .mode is None :
335- if envs .VLLM_USE_V1 :
336- if (
337- self .model_config is not None
338- and not self .model_config .enforce_eager
339- ):
340- self .compilation_config .mode = CompilationMode .VLLM_COMPILE
341- else :
342- self .compilation_config .mode = CompilationMode .NONE
343-
333+ if self .model_config is not None and not self .model_config .enforce_eager :
334+ self .compilation_config .mode = CompilationMode .VLLM_COMPILE
344335 else :
345- # NB: Passing both --enforce-eager and a compilation mode
346- # in V0 means the compilation mode wins out.
347336 self .compilation_config .mode = CompilationMode .NONE
348337 else :
349338 assert self .compilation_config .mode >= CompilationMode .NONE
@@ -371,10 +360,7 @@ def __post_init__(self):
371360 # if cudagraph_mode is not explicitly set by users, set default
372361 # value
373362 if self .compilation_config .cudagraph_mode is None :
374- if (
375- envs .VLLM_USE_V1
376- and self .compilation_config .mode == CompilationMode .VLLM_COMPILE
377- ):
363+ if self .compilation_config .mode == CompilationMode .VLLM_COMPILE :
378364 # default to full and piecewise for most models
379365 self .compilation_config .cudagraph_mode = (
380366 CUDAGraphMode .FULL_AND_PIECEWISE
@@ -428,7 +414,7 @@ def __post_init__(self):
428414 # override related settings when enforce eager
429415 self .compilation_config .max_cudagraph_capture_size = 0
430416 self .compilation_config .cudagraph_capture_sizes = []
431- elif envs . VLLM_USE_V1 :
417+ else :
432418 self .compilation_config .cudagraph_num_of_warmups = 1
433419
434420 self ._set_cudagraph_sizes ()
@@ -535,14 +521,11 @@ def __post_init__(self):
535521 current_platform .check_and_update_config (self )
536522
537523 # Do this after all the updates to compilation_config.mode
538- if (
539- envs .VLLM_USE_V1
540- and self .compilation_config .mode == CompilationMode .VLLM_COMPILE
541- ):
524+ if self .compilation_config .mode == CompilationMode .VLLM_COMPILE :
542525 self .compilation_config .set_splitting_ops_for_v1 ()
543526
544527 # final check of cudagraph mode after all possible updates
545- if envs . VLLM_USE_V1 and current_platform .is_cuda_alike ():
528+ if current_platform .is_cuda_alike ():
546529 if (
547530 self .compilation_config .cudagraph_mode .has_full_cudagraphs ()
548531 and self .model_config is not None
@@ -587,10 +570,7 @@ def __post_init__(self):
587570 if not self .instance_id :
588571 self .instance_id = random_uuid ()[:5 ]
589572
590- if (
591- envs .VLLM_USE_V1
592- and not self .scheduler_config .disable_hybrid_kv_cache_manager
593- ):
573+ if not self .scheduler_config .disable_hybrid_kv_cache_manager :
594574 # logger should only print warning message for hybrid models. As we
595575 # can't know whether the model is hybrid or not now, so we don't log
596576 # warning message here and will log it later.
0 commit comments