Skip to content

Commit a919aef

Browse files
author
weijinqian_v1
committed
[Refactor] add fia_v3 attention & remove other attention operators.
Signed-off-by: weijinqian_v1 <[email protected]>
1 parent 0250679 commit a919aef

File tree

1 file changed

+5
-5
lines changed

1 file changed

+5
-5
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -890,14 +890,14 @@ def get_supported_tasks(self) -> "tuple[SupportedTask, ...]":
890890

891891
def _make_attention_mask(self, seq_lens, position,
892892
attn_state) -> torch.Tensor:
893+
if self.vllm_config.model_config.use_mla:
894+
return None
893895
# Pooling situation.
894896
if self.model_config.runner_type == "pooling" and self.model_config.pooler_config.pooling_type == "CLS":
895897
return self.attn_mask_builder.get_pooling_mask(self.device)
896-
# Chunk Prefill situation.
897-
if attn_state == AscendAttentionState.ChunkedPrefill and not self.vllm_config.model_config.use_mla and not self.use_sparse:
898-
return self.attn_mask_builder.get_splitfuse_attn_mask()
899-
# Prefill without cache situation and Prefill with cache hit.
900-
if attn_state == AscendAttentionState.PrefillNoCache or attn_state == AscendAttentionState.PrefillCacheHit:
898+
# fia prefill situation.
899+
if attn_state in [AscendAttentionState.PrefillNoCache, AscendAttentionState.PrefillCacheHit,
900+
AscendAttentionState.ChunkedPrefill]:
901901
return self.attn_mask_builder.get_splitfuse_attn_mask()
902902
# Decode-only situation.
903903
return None

0 commit comments

Comments (0)