Commit 7d5b507
Author: weijinqian_v1
Parent: 994d6d8

[Refactor] add fia_v3 attention & remove other attention operator.

Signed-off-by: weijinqian_v1 <[email protected]>
1 file changed: +3 -2 lines

vllm_ascend/attention/attention_v1.py (3 additions, 2 deletions)
@@ -540,16 +540,17 @@ def forward(
                 value=value,
                 output=output,
                 layer_name=layer.layer_name)
-            return output.view(num_tokens, self.hidden_size)
+            return output
 
         if attn_metadata is None:
             return output.fill_(0)
 
         if hasattr(layer, 'quant_method') and use_kv_cache_int8:
-            output = layer.quant_method.apply(layer, query, key, value,
+            attn_output = layer.quant_method.apply(layer, query, key, value,
                                              kv_cache, attn_metadata,
                                              self.attn_type, self.scale,
                                              output)
+            output[:num_tokens] = attn_output[:num_tokens]
         return output
 
         # View q k v to BSH.
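
For context, a minimal standalone sketch of the copy-back pattern the second hunk introduces (this is not the vllm-ascend implementation; the shapes and names below are illustrative assumptions). The quantized KV-cache path now keeps its result in a separate tensor and copies only the first num_tokens rows into the caller-owned output buffer, so any padding rows in output are left untouched:

    import torch

    # Assumed, illustrative shapes: the runtime may allocate the output
    # buffer with padding beyond the real token count.
    num_tokens, padded_tokens, hidden_size = 5, 8, 16

    output = torch.zeros(padded_tokens, hidden_size)       # caller-owned buffer
    attn_output = torch.randn(padded_tokens, hidden_size)  # stand-in for the quant path's result

    # The pattern from the diff: copy only the valid rows back into the
    # buffer, leaving padding rows untouched, then return the buffer itself.
    output[:num_tokens] = attn_output[:num_tokens]

The first hunk makes the same point from the other direction: the early-return path now hands back the output buffer as-is instead of reshaping it with .view(num_tokens, self.hidden_size), leaving any view/slicing to the caller.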
