@@ -122,7 +122,7 @@
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_enable_nz,
-                               is_moe_model, lmhead_tp_enable)
+                               is_moe_model, is_vl_model, lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import InputBatch
 
 if TYPE_CHECKING:
@@ -270,6 +270,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
 
         set_cos_and_sin(vllm_config, self.max_num_reqs,
                         self.uniform_decode_query_len, self.dtype, self.device)
+        if not is_vl_model(self.vllm_config
+                           ) and not self.vllm_config.model_config.use_mla:
+            initialize_cos_sin(self.vllm_config, self.dtype, self.device)
         set_mc2_tokens_capacity(vllm_config, self.max_num_reqs,
                                 self.uniform_decode_query_len)
         set_mc2_mask(vllm_config, self.device)
@@ -2198,6 +2201,9 @@ def _dummy_run(
         else:
             positions = self.positions.gpu[:num_tokens_padded]
 
+        # update global cos, sin
+        update_cos_sin(positions)
+
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
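The bodies of `initialize_cos_sin` and `update_cos_sin` are not part of this hunk. The sketch below is only an illustration of one way a global rotary cos/sin cache of this shape could work: a table precomputed once for all positions up to `max_model_len`, then indexed by the batch's position ids before each forward. The module-level variables, the rope base of 10000.0, and the use of `model_config.get_head_size()` are assumptions for the sketch, not details confirmed by the diff.

```python
from typing import Optional

import torch

# Hypothetical module-level cache; the real helpers live in vllm_ascend and
# may be structured differently.
_COS_CACHE: Optional[torch.Tensor] = None
_SIN_CACHE: Optional[torch.Tensor] = None
_COS: Optional[torch.Tensor] = None
_SIN: Optional[torch.Tensor] = None


def initialize_cos_sin(vllm_config, dtype: torch.dtype,
                       device: torch.device) -> None:
    """Precompute cos/sin tables for every position up to max_model_len.

    Sketch only: assumes rotary dim == head_size and the default rope base
    of 10000.0; the real values would come from the model's rope config.
    """
    global _COS_CACHE, _SIN_CACHE
    max_len = vllm_config.model_config.max_model_len
    head_size = vllm_config.model_config.get_head_size()
    inv_freq = 1.0 / (10000.0 ** (
        torch.arange(0, head_size, 2, dtype=torch.float32, device=device)
        / head_size))
    t = torch.arange(max_len, dtype=torch.float32, device=device)
    freqs = torch.outer(t, inv_freq)  # [max_len, head_size // 2]
    _COS_CACHE = freqs.cos().to(dtype)
    _SIN_CACHE = freqs.sin().to(dtype)


def update_cos_sin(positions: torch.Tensor) -> None:
    """Select the cos/sin rows matching the current batch's position ids."""
    global _COS, _SIN
    if _COS_CACHE is None or _SIN_CACHE is None:
        # Cache was never initialized (e.g. the VL or MLA path above).
        return
    _COS = _COS_CACHE.index_select(0, positions)
    _SIN = _SIN_CACHE.index_select(0, positions)
```

The diff gates initialization on `not is_vl_model(...)` and `not ...use_mla`, so the cache is presumably skipped on paths that handle rotary embeddings differently; the hunk itself does not show that reasoning.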