@@ -33,7 +33,8 @@
                                         delete_torchair_cache_file)
 from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD, enable_sp, is_310p,
                                prefill_context_parallel_enable,
-                               update_aclgraph_sizes, vllm_version_is)
+                               update_aclgraph_sizes,
+                               update_cudagraph_capture_sizes, vllm_version_is)
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
@@ -142,24 +143,47 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 "Non-MLA LLMs forcibly disable the chunked prefill feature,"
                 "as the performance of operators supporting this feature "
                 "functionality is currently suboptimal.")
-            if not model_config.is_multimodal_model and \
-                structured_outputs_config.backend == "auto" and \
-                not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
-                scheduler_config.policy == "fcfs":
-                ascend_scheduler_config.enabled = True
-                chunked_prefill_enabled_in_ascend_scheduler = getattr(
-                    ascend_scheduler_config, "enable_chunked_prefill", False)
-                if chunked_prefill_enabled_in_ascend_scheduler:
-                    logger.warning(
-                        "Chunked prefill feature is enabled in ascend_scheduler,"
-                        "but note that the operator supporting this feature "
-                        "would lead to performance degradation.")
-                # In this situation, max_num_batched_tokens would have been rewritten.
-                # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
-                if (scheduler_config.max_num_batched_tokens
-                        < scheduler_config.max_model_len
-                        and not chunked_prefill_enabled_in_ascend_scheduler):
-                    scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+            if vllm_version_is("0.11.0"):
+                if not model_config.is_multimodal_model and \
+                    structured_outputs_config.backend == "auto" and \
+                    not scheduler_config.send_delta_data and \
+                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                    scheduler_config.policy == "fcfs":
+                    ascend_scheduler_config.enabled = True
+                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                        ascend_scheduler_config, "enable_chunked_prefill",
+                        False)
+                    if chunked_prefill_enabled_in_ascend_scheduler:
+                        logger.warning(
+                            "Chunked prefill feature is enabled in ascend_scheduler,"
+                            "but note that the operator supporting this feature "
+                            "would lead to performance degradation.")
+                    # In this situation, max_num_batched_tokens would have been rewritten.
+                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                    if (scheduler_config.max_num_batched_tokens
+                            < scheduler_config.max_model_len and
+                            not chunked_prefill_enabled_in_ascend_scheduler):
+                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
+            else:
+                if not model_config.is_multimodal_model and \
+                    structured_outputs_config.backend == "auto" and \
+                    not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0 and \
+                    scheduler_config.policy == "fcfs":
+                    ascend_scheduler_config.enabled = True
+                    chunked_prefill_enabled_in_ascend_scheduler = getattr(
+                        ascend_scheduler_config, "enable_chunked_prefill",
+                        False)
+                    if chunked_prefill_enabled_in_ascend_scheduler:
+                        logger.warning(
+                            "Chunked prefill feature is enabled in ascend_scheduler,"
+                            "but note that the operator supporting this feature "
+                            "would lead to performance degradation.")
+                    # In this situation, max_num_batched_tokens would have been rewritten.
+                    # So we must make sure max_num_batched_tokens is not smaller than max_model_len.
+                    if (scheduler_config.max_num_batched_tokens
+                            < scheduler_config.max_model_len and
+                            not chunked_prefill_enabled_in_ascend_scheduler):
+                        scheduler_config.max_num_batched_tokens = scheduler_config.max_model_len
 
         kv_cache_dtype = vllm_config.additional_config.get(
             "kv_cache_dtype", None)
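
Both branches of the new version gate apply the same enablement conditions; the only difference is the extra `not scheduler_config.send_delta_data` check on the 0.11.0 path (that field is no longer present in newer vLLM). A minimal sketch of the shared condition, for illustration only and using a hypothetical helper name that is not part of this change:

from vllm_ascend.utils import vllm_version_is


def _should_enable_ascend_scheduler(model_config, structured_outputs_config,
                                    scheduler_config) -> bool:
    # Hypothetical helper (sketch only): the enablement conditions shared by
    # both branches of the version gate above.
    common = (not model_config.is_multimodal_model
              and structured_outputs_config.backend == "auto"
              and not getattr(scheduler_config, "scheduler_delay_factor", 0) > 0
              and scheduler_config.policy == "fcfs")
    if vllm_version_is("0.11.0"):
        # send_delta_data only exists on the vLLM 0.11.0 SchedulerConfig.
        return common and not scheduler_config.send_delta_data
    return common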
@@ -237,8 +261,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 f"{vllm_config.parallel_config.tensor_parallel_size}")
             if len(sp_aclgraph_sizes) != len(original_sizes):
                 compilation_config.cudagraph_capture_sizes = sp_aclgraph_sizes
-                vllm_config.compilation_config.init_with_cudagraph_sizes(
-                    sp_aclgraph_sizes)
+                if vllm_version_is("0.11.0"):
+                    compilation_config.init_with_cudagraph_sizes(
+                        sp_aclgraph_sizes)
+                else:
+                    update_cudagraph_capture_sizes(vllm_config,
+                                                   sp_aclgraph_sizes)
 
         # TODO: Full graph is fully supported later, and the default value will be set to full graph.
         if compilation_config.cudagraph_mode == CUDAGraphMode.FULL_AND_PIECEWISE:
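
This hunk keeps the sequence-parallel ACL graph capture sizes in sync with whichever API the installed vLLM provides: 0.11.0 still exposes `CompilationConfig.init_with_cudagraph_sizes`, while newer versions go through the `update_cudagraph_capture_sizes` helper imported above. A minimal usage sketch; the wrapper name `apply_sp_capture_sizes` is hypothetical:

from vllm_ascend.utils import update_cudagraph_capture_sizes, vllm_version_is


def apply_sp_capture_sizes(vllm_config, sizes):
    # Hypothetical wrapper (sketch only): route the capture-size override to
    # the API available in the installed vLLM version.
    compilation_config = vllm_config.compilation_config
    compilation_config.cudagraph_capture_sizes = sizes
    if vllm_version_is("0.11.0"):
        # vLLM 0.11.0: CompilationConfig still has init_with_cudagraph_sizes.
        compilation_config.init_with_cudagraph_sizes(sizes)
    else:
        # Newer vLLM: use the helper added to vllm_ascend.utils in this change.
        update_cudagraph_capture_sizes(vllm_config, sizes)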