 from dataclasses import dataclass
 from typing import TYPE_CHECKING, ClassVar, Optional, Tuple, Type, TypeVar
-import math
+
 import torch
 import torch_npu
 from torch import nn
 from vllm_ascend.attention.mla_v1 import MAX_O_PROJ_PREFETCH_SIZE
 from vllm_ascend.attention.utils import (AscendCommonAttentionMetadata,
                                          wait_for_kv_layer_from_connector)
-from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
-from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               is_enable_nz, _round_up)
-from vllm_ascend.worker.npu_input_batch import InputBatch
-from vllm_ascend.utils import dispose_tensor, dispose_layer, replace_layer, enable_sp
 from vllm_ascend.ops.shared_weight_layer import (
     is_hidden_layer, post_process_after_loading_for_shared_weight_series,
     reach_layer_for_shared_weight_series,
     register_layer_to_shared_weight_series)
 from vllm_ascend.ops.triton.rope import rope_forward_triton
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
-                               dispose_layer, is_enable_nz, replace_layer)
+                               _round_up, dispose_layer, enable_sp,
+                               is_enable_nz, replace_layer)
 from vllm_ascend.worker.npu_input_batch import InputBatch
-from vllm.forward_context import get_forward_context
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -59,7 +54,6 @@ def get_impl_cls() -> Type["AscendSFAImpl"]:
         return AscendSFAImpl
 
 
-
 @dataclass
 class SfaCpContext:
     num_tokens: int
@@ -73,6 +67,7 @@ class SfaCpContext:
     actual_seq_lengths_query: torch.Tensor
     actual_seq_lengths_key: torch.Tensor
 
+
 @dataclass
 class AscendSFAMetadata:
     """Metadata for MLACommon.
@@ -198,7 +193,7 @@ def build(
             1).unsqueeze(2)
         sin = self.sin_cache[input_positions].unsqueeze(  # type: ignore
             1).unsqueeze(2)
-
+
         sfa_cp_context = None
         if self.enable_sfa_cp:
             global_tp_size = get_tp_group().world_size
@@ -214,12 +209,13 @@ def build(
             if pad_size > 0:
                 cos = nn.functional.pad(cos, (0, 0, 0, 0, 0, 0, 0, pad_size))
                 sin = nn.functional.pad(sin, (0, 0, 0, 0, 0, 0, 0, pad_size))
-                slot_mapping = nn.functional.pad(slot_mapping, (0, pad_size), value=-1)
+                slot_mapping = nn.functional.pad(slot_mapping, (0, pad_size),
                                                  value=-1)
             cos = cos[local_start:local_end_with_pad]
             sin = sin[local_start:local_end_with_pad]
             slot_mapping_cp = slot_mapping[local_start:local_end_with_pad]
 
-            actual_seq_lengths_query = torch.empty_like(cum_query_lens)
+            actual_seq_lengths_query = torch.empty_like(cum_query_lens)
             actual_seq_lengths_key = torch.empty_like(seq_lens)
             num_segs = cum_query_lens.shape[0]
             last_token = 0
@@ -347,7 +343,7 @@ def __init__(
         self.enable_kv_nz = ascend_config.torchair_graph_config.enable_kv_nz
         self.model_config = get_current_vllm_config().model_config
         assert self.indexer is not None, "Indexer is required for DSA."
-
+
         self.enable_sfa_cp = enable_sp() and \
             hasattr(self.model_config.hf_config, "index_topk")
         self.local_num_heads = self.num_heads
@@ -357,7 +353,8 @@ def __init__(
 
         #TODO: Temporarily adapt sfa-cp, remove after adapting near PCP. --clrs97
         self._replace_linear_class_for_sfa_cp()
-        from vllm_ascend.distributed.parallel_state import get_shared_weight_group
+        from vllm_ascend.distributed.parallel_state import \
+            get_shared_weight_group
         register_layer_to_shared_weight_series(
             series_name="q_proj",
             group=get_shared_weight_group(),
@@ -625,23 +622,24 @@ def forward(
 
         ql_nope, q_pe = self._q_proj_and_k_up_proj(q_c)
         q_pe = self.rope_single(q_pe, cos, sin)
-
+
         actual_seq_lengths_query = attn_metadata.cum_query_lens
         actual_seq_lengths_key = attn_metadata.seq_lens
 
         if self.enable_sfa_cp:
             actual_seq_lengths_query = attn_metadata.sfa_cp_context.actual_seq_lengths_query
             actual_seq_lengths_key = attn_metadata.sfa_cp_context.actual_seq_lengths_key
-
-        topk_indices = self.indexer_select(x=hidden_states,
-                                           qr=q_c,
-                                           kv_cache=kv_cache,
-                                           attn_metadata=attn_metadata,
-                                           cos=cos,
-                                           sin=sin,
-                                           actual_seq_lengths_query=actual_seq_lengths_query,
-                                           actual_seq_lengths_key=actual_seq_lengths_key,
-                                           need_gather_q_kv=need_gather_q_kv)
+
+        topk_indices = self.indexer_select(
+            x=hidden_states,
+            qr=q_c,
+            kv_cache=kv_cache,
+            attn_metadata=attn_metadata,
+            cos=cos,
+            sin=sin,
+            actual_seq_lengths_query=actual_seq_lengths_query,
+            actual_seq_lengths_key=actual_seq_lengths_key,
+            need_gather_q_kv=need_gather_q_kv)
         attn_output = torch.ops._C_ascend.npu_sparse_flash_attention(
             query=ql_nope,
             key=kv_cache[0],
@@ -751,19 +749,18 @@ def indexer_select(
             sparse_count=2048,
             sparse_mode=3)
         return topk_indices
-
+
     def _replace_linear_class_for_sfa_cp(self):
 
         vllm_config = get_current_vllm_config()
         # Dispose tensor from the original q_proj
         dispose_layer(self.q_proj)
         # Construct the new q_proj using ReplicatedLinear
-        new_q_proj = ReplicatedLinear(
-            self.q_lora_rank,
-            self.local_num_heads * self.qk_head_dim,
-            bias=False,
-            quant_config=vllm_config.quant_config,
-            prefix=self.q_proj.prefix)
+        new_q_proj = ReplicatedLinear(self.q_lora_rank,
+                                      self.local_num_heads * self.qk_head_dim,
+                                      bias=False,
+                                      quant_config=vllm_config.quant_config,
+                                      prefix=self.q_proj.prefix)
         # Replace the q_proj with the new one
         replace_layer(self.q_proj, new_q_proj)
 
@@ -783,13 +780,11 @@ def _replace_linear_class_for_sfa_cp(self):
         dispose_layer(self.o_proj)
         # Construct the new o_proj using ReplicatedLinear
         config = vllm_config.model_config.hf_config
-        new_o_proj = ReplicatedLinear(
-            config.num_attention_heads * config.v_head_dim,
-            config.hidden_size,
-            bias=False,
-            quant_config=vllm_config.quant_config,
-            prefix=self.o_proj.prefix)
+        new_o_proj = ReplicatedLinear(config.num_attention_heads *
+                                      config.v_head_dim,
+                                      config.hidden_size,
+                                      bias=False,
+                                      quant_config=vllm_config.quant_config,
+                                      prefix=self.o_proj.prefix)
         # Replace the o_proj with the new one
         replace_layer(self.o_proj, new_o_proj)
-
-