@@ -50,7 +50,7 @@
                                                update_graph_params_workspaces)
 from vllm_ascend.ops.attention import vanilla_chunked_prefill
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, aligned_16, is_310p,
-                               nd_to_nz_2d, nd_to_nz_spec,
+                               is_Ascend950, nd_to_nz_2d, nd_to_nz_spec,
                                prefill_context_parallel_enable,
                                weak_ref_tensors)
 
@@ -1448,6 +1448,71 @@ def _load_kv_for_chunk(self, attn_metadata, kv_cache,
         )
         return key, value
 
+    def _forward_ascend_950(self, query: torch.Tensor, key: torch.Tensor,
+                            value: torch.Tensor, attn_metadata: AscendMetadata,
+                            output: torch.Tensor) -> torch.Tensor:
+        if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
+            num_tokens = attn_metadata.query_start_loc[-1]
+            output, _ = torch_npu.npu_fused_infer_attention_score_v2(
+                query[:num_tokens],
+                key[:num_tokens],
+                value[:num_tokens],
+                atten_mask=attn_metadata.attn_mask.to(torch.bool),
+                actual_seq_qlen=attn_metadata.query_lens.cumsum(0),
+                actual_seq_kvlen=attn_metadata.seq_lens.cumsum(0),
+                num_query_heads=self.num_heads,
+                num_key_value_heads=self.num_kv_heads,
+                input_layout="TND",
+                softmax_scale=self.scale)
+            return output[:num_tokens]
+        else:
+            batch_size = attn_metadata.query_lens.shape[0]
+            block_table = attn_metadata.block_tables[:batch_size, :]
+            if attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+                query = query.view(batch_size, 1, self.num_heads * self.head_size)
+                key = self.key_cache.flatten(2, 3).contiguous()
+                value = self.value_cache.flatten(2, 3).contiguous()
+                # block_size is also required by the fused kernel call below.
+                block_size = self.key_cache.shape[1]  # type: ignore
+                atten_mask = None
+                actual_seq_qlen = None
+                actual_seq_kvlen = attn_metadata.seq_lens
+                sparse_mode = 0
+                input_layout = "BSH"
+            else:
+                num_block, block_size, _, _ = self.key_cache.shape  # type: ignore
+                key = self.key_cache.view(  # type: ignore
+                    num_block, block_size, -1)
+                value = self.value_cache.view(  # type: ignore
+                    num_block, block_size, -1)
+                input_layout = "TND"
+                atten_mask = attn_metadata.attn_mask.to(torch.bool)
+                if attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
+                    actual_seq_qlen = attn_metadata.query_lens.cumsum(0)
+                    actual_seq_kvlen = attn_metadata.seq_lens
+                    sparse_mode = 2
+                else:
+                    actual_seq_qlen = attn_metadata.actual_seq_lengths_q
+                    actual_seq_kvlen = attn_metadata.seq_lens_list
+                    sparse_mode = 0
+            output, _ = torch_npu.npu_fused_infer_attention_score_v2(
+                query=query,
+                key=key,
+                value=value,
+                block_table=block_table,
+                atten_mask=atten_mask,
+                actual_seq_qlen=actual_seq_qlen,
+                actual_seq_kvlen=actual_seq_kvlen,
+                num_query_heads=self.num_heads,
+                num_key_value_heads=self.num_kv_heads,
+                softmax_scale=self.scale,
+                sparse_mode=sparse_mode,
+                block_size=block_size,
+                input_layout=input_layout)
+            if attn_metadata.attn_state == AscendAttentionState.DecodeOnly:
+                output = output.view(batch_size, self.num_heads, self.head_size)
+            return output
+
     def forward(
         self,
         layer: AttentionLayer,
@@ -1507,12 +1570,20 @@ def forward(
         if has_decode:
             slot_mapping = attn_metadata.slot_mapping[:num_decode_tokens * self.pcp_size:self.pcp_size] \
                 if self.pcp_size * self.dcp_size > 1 else attn_metadata.slot_mapping[:num_decode_tokens]
-            torch_npu._npu_reshape_and_cache(
-                key=key[:num_decode_tokens],
-                value=value[:num_decode_tokens],
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                slot_indices=slot_mapping)
+            if is_Ascend950():
+                num_tokens = slot_mapping.shape[0]
+                torch_npu.npu_scatter_pa_kv_cache(
+                    key=key[:num_tokens],
+                    value=value[:num_tokens].contiguous(),
+                    slot_mapping=slot_mapping,
+                    out=(self.key_cache, self.value_cache))
+            else:
+                torch_npu._npu_reshape_and_cache(
+                    key=key[:num_decode_tokens],
+                    value=value[:num_decode_tokens],
+                    key_cache=self.key_cache,
+                    value_cache=self.value_cache,
+                    slot_indices=slot_mapping)
 
         if has_prefill:
             if self.pcp_size > 1:
@@ -1526,22 +1597,34 @@ def forward(
                 key, value = all_kv.split([self.head_size, self.head_size],
                                           dim=-1)
 
-            torch_npu._npu_reshape_and_cache(
-                key=key[self.pcp_size * num_decode_tokens:attn_metadata.
-                        num_actual_tokens_pcp_padded],
-                value=value[self.pcp_size *
+            if is_Ascend950():
+                num_tokens = attn_metadata.slot_mapping.shape[0]
+                torch_npu.npu_scatter_pa_kv_cache(
+                    key=key[:num_tokens],
+                    value=value[:num_tokens].contiguous(),
+                    slot_mapping=attn_metadata.slot_mapping,
+                    out=(self.key_cache, self.value_cache))
+            else:
+                torch_npu._npu_reshape_and_cache(
+                    key=key[self.pcp_size *
                             num_decode_tokens:attn_metadata.
                             num_actual_tokens_pcp_padded],
-                key_cache=self.key_cache,
-                value_cache=self.value_cache,
-                slot_indices=attn_metadata.
-                slot_mapping[self.pcp_size *
-                             num_decode_tokens:attn_metadata.
-                             num_actual_tokens_pcp_padded])
+                    value=value[self.pcp_size *
+                                num_decode_tokens:attn_metadata.
+                                num_actual_tokens_pcp_padded],
+                    key_cache=self.key_cache,
+                    value_cache=self.value_cache,
+                    slot_indices=attn_metadata.
+                    slot_mapping[self.pcp_size *
+                                 num_decode_tokens:attn_metadata.
+                                 num_actual_tokens_pcp_padded])
 
         forward_context: ForwardContext = get_forward_context()
         if not forward_context.capturing:
-            if self.pcp_size * self.dcp_size > 1:
+            if is_Ascend950():
+                intermediate_output = self._forward_ascend_950(
+                    query, key, value, attn_metadata, output)
+            elif self.pcp_size * self.dcp_size > 1:
                 intermediate_output = self._forward_pcp_dcp(
                     query, key, value, kv_cache, attn_metadata, output)
             elif attn_type == AttentionType.ENCODER_ONLY:
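
Not part of the diff: a compact reading aid, derived only from the hunks above, summarizing which arguments _forward_ascend_950 passes to torch_npu.npu_fused_infer_attention_score_v2 for each attention state (field names are the metadata attributes shown in the diff; states not listed fall into the last row).

# attn_state        input_layout  atten_mask        actual_seq_qlen        actual_seq_kvlen     sparse_mode  block_table
# PrefillNoCache    "TND"         attn_mask (bool)  query_lens.cumsum(0)   seq_lens.cumsum(0)   default      not passed
# DecodeOnly        "BSH"         None              None                   seq_lens             0            passed
# PrefillCacheHit   "TND"         attn_mask (bool)  query_lens.cumsum(0)   seq_lens             2            passed
# other states      "TND"         attn_mask (bool)  actual_seq_lengths_q   seq_lens_list        0            passed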