@@ -142,24 +142,47 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
142142 "Non-MLA LLMs forcibly disable the chunked prefill feature,"
143143 "as the performance of operators supporting this feature "
144144 "functionality is currently suboptimal." )
-        if not model_config.is_multimodal_model and \
-            structured_outputs_config.backend == "auto" and \
-            not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-            scheduler_config.policy == "fcfs":
-            ascend_scheduler_config.enabled = True
-            chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                ascend_scheduler_config, "enable_chunked_prefill", False)
-            if chunked_prefill_enabled_in_ascend_scheduler:
-                logger.warning(
-                    "Chunked prefill feature is enabled in ascend_scheduler,"
-                    "but note that the operator supporting this feature "
-                    "would lead to performance degradation.")
-            # In this situation, max_num_batched_tokens would have been rewritten.
-            # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-            if (scheduler_config.max_num_batched_tokens
-                    < scheduler_config.max_model_len
-                    and not chunked_prefill_enabled_in_ascend_scheduler):
-                scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        if vllm_version_is("0.11.0"):
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not scheduler_config.send_delta_data and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler, "
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+        else:
+            if not model_config.is_multimodal_model and \
+                structured_outputs_config.backend == "auto" and \
+                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                scheduler_config.policy == "fcfs":
+                ascend_scheduler_config.enabled = True
+                chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                    ascend_scheduler_config, "enable_chunked_prefill",
+                    False)
+                if chunked_prefill_enabled_in_ascend_scheduler:
+                    logger.warning(
+                        "Chunked prefill feature is enabled in ascend_scheduler, "
+                        "but note that the operator supporting this feature "
+                        "would lead to performance degradation.")
+                # In this situation, max_num_batched_tokens would have been rewritten.
+                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                if (scheduler_config.max_num_batched_tokens
+                        < scheduler_config.max_model_len and
+                        not chunked_prefill_enabled_in_ascend_scheduler):
+                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len

         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)