Commit a41460e

fix eagle spec decode
Signed-off-by: wxsIcey <[email protected]>
1 parent 59f15a7 commit a41460e

File tree: 5 files changed, +99 -12 lines


vllm_ascend/ascend_config.py

Lines changed: 1 addition & 0 deletions
@@ -211,6 +211,7 @@ def __init__(self,
         self.enable_quantization_fusion = enable_quantization_fusion
         self.fuse_qknorm_rope = fuse_qknorm_rope
 
+
 class XliteGraphConfig:
     """
     Configuration Object for xlite_graph_config from additional_config

vllm_ascend/ops/rotary_embedding.py

Lines changed: 69 additions & 7 deletions
@@ -29,6 +29,69 @@
 from vllm_ascend.utils import (AscendDeviceType, enable_custom_op,
                                get_ascend_device_type)
 
+# Currently, the rope ops used on NPU require detached cos and sin tensors as
+# inputs, whereas RotaryEmbedding in vLLM keeps a single cos_sin_cache, so we
+# have to preprocess cos_sin_cache into cos and sin here. In the future, we
+# should implement a new rope op that accepts cos_sin_cache directly.
+_cos_sin_cache: Optional[torch.Tensor] = None
+_cos_cache: Optional[torch.Tensor] = None
+_sin_cache: Optional[torch.Tensor] = None
+_cos: Optional[torch.Tensor] = None
+_sin: Optional[torch.Tensor] = None
+
+
+def _record_cos_sin_cache(cos_sin_cache):
+    global _cos_sin_cache
+    if _cos_sin_cache is not None:
+        return
+    _cos_sin_cache = cos_sin_cache
+
+
+def initialize_cos_sin(vllm_config, dtype, device):
+    global _cos_cache
+    global _sin_cache
+
+    head_dim = vllm_config.model_config.get_head_size()
+    max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+    _cos_cache = torch.ones(1,
+                            max_num_batched_tokens,
+                            1,
+                            head_dim,
+                            dtype=dtype,
+                            device=device)
+    _sin_cache = torch.zeros(1,
+                             max_num_batched_tokens,
+                             1,
+                             head_dim,
+                             dtype=dtype,
+                             device=device)
+
+
+def update_cos_sin(positions):
+    global _cos_cache
+    global _sin_cache
+    global _cos
+    global _sin
+
+    if _cos_sin_cache is None or \
+            _cos_cache is None or \
+            _sin_cache is None:
+        return
+
+    num_tokens = positions.size(0)
+    _cos_cache[:, :num_tokens] = _cos_sin_cache.index_select(
+        0, positions).view(num_tokens, 2, -1).repeat(1, 1, 2).chunk(2,
+                                                                    dim=-2)[0]
+    _sin_cache[:, :num_tokens] = _cos_sin_cache.index_select(
+        0, positions).view(num_tokens, 2, -1).repeat(1, 1, 2).chunk(2,
+                                                                    dim=-2)[1]
+    _cos = _cos_cache[:, :num_tokens]
+    _sin = _sin_cache[:, :num_tokens]
+
+
+def get_cos_sin():
+    return _cos, _sin
+
 
 def _custom_rotary_embedding_enabled(query, neox_style, head_size):
     return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op(
@@ -65,8 +128,9 @@ def _rope_forward_oot(
         raise NotImplementedError(
             "Batched rotary embedding is currently not supported on NPU.")
     else:
-        if hasattr(self, "cos") and hasattr(self, "sin") and \
-                self.cos is not None and self.sin is not None:
+        cos, sin = get_cos_sin()
+        if is_neox_style and self.head_size == 128 and self.cos_sin_cache.shape[
+                -1] == 128 and cos is not None and sin is not None:
             # If cos and sin are generated outside, use npu_apply_rotary_pos_emb to avoid redundant calculation.
             # This method requires head_size and rotary_dim equal 128 and neox_style is True
             query = query.contiguous().view(1, query.shape[0], -1,
@@ -75,7 +139,7 @@ def _rope_forward_oot(
             # Although this function modifies in-place, please retain the function's return value.
             # Otherwise, the graph fusion operation may fail.
             query, key = torch_npu.npu_apply_rotary_pos_emb(
-                query, key, forward_context.cos, forward_context.sin)
+                query, key, cos, sin)
         elif self.rotary_dim < self.head_size:
             num_tokens = query.shape[0]
             query = query.view(num_tokens, -1, self.head_size)
@@ -125,10 +189,9 @@ def __init__(
         is_neox_style: bool,
         dtype: torch.dtype,
     ) -> None:
-        self.cos = None
-        self.sin = None
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
+        _record_cos_sin_cache(self.cos_sin_cache)
 
     def forward_oot(
         self,
@@ -162,8 +225,6 @@ def __init__(
         beta_fast: int = 32,
         beta_slow: int = 1,
     ) -> None:
-        self.cos = None
-        self.sin = None
         extra_kwargs = {
             "extrapolation_factor": extrapolation_factor,
             "attn_factor": attn_factor,
@@ -172,6 +233,7 @@ def __init__(
         }
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, scaling_factor, dtype, **extra_kwargs)
+        _record_cos_sin_cache(self.cos_sin_cache)
 
     def forward_oot(
         self,
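
For readers who want to see what update_cos_sin computes, here is a minimal standalone sketch of the reshaping, assuming vLLM's neox-style cache layout cos_sin_cache = torch.cat((cos, sin), dim=-1) and rotary_dim == head_size == 128; the sizes, position values, and variable names below are illustrative, not taken from the commit.

import torch

head_dim = 128
max_positions = 16
inv_freq = 1.0 / (10000**(torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(max_positions).float(), inv_freq)
# Assumed cache layout: cos half and sin half concatenated along the last dim.
cos_sin_cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1)  # (max_positions, head_dim)

positions = torch.tensor([0, 3, 5])  # token positions for one forward step
num_tokens = positions.size(0)

selected = cos_sin_cache.index_select(0, positions)  # (num_tokens, head_dim)
# Split into the cos half and the sin half, duplicate each half so it spans the
# full head_dim (neox rope applies the same cos/sin to both halves of the head),
# then separate cos and sin again along the middle axis.
cos, sin = selected.view(num_tokens, 2, -1).repeat(1, 1, 2).chunk(2, dim=-2)

# The npu rope kernel consumes (1, num_tokens, 1, head_dim) tensors, which is
# exactly the slice shape of _cos_cache[:, :num_tokens] above.
cos = cos.reshape(1, num_tokens, 1, head_dim)
sin = sin.reshape(1, num_tokens, 1, head_dim)
print(cos.shape, sin.shape)  # torch.Size([1, 3, 1, 128]) twice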

vllm_ascend/ops/triton/linearnorm/split_qkv_rmsnorm_rope.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 
 import torch
 import triton
-import triton.language as tl # type: ignore
+import triton.language as tl  # type: ignore
 import triton.runtime.driver as driver
 from vllm.utils.torch_utils import direct_register_custom_op

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 17 additions & 3 deletions
@@ -23,6 +23,7 @@
 from vllm_ascend.attention.attention_v1 import (AscendAttentionState,
                                                 AscendMetadata)
 from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
+from vllm_ascend.ops.rotary_embedding import update_cos_sin
 from vllm_ascend.spec_decode.interface import Proposer, SpecDcodeType
 
 PADDING_SLOT_ID = -1
@@ -124,13 +125,16 @@ def dummy_run(self,
                   batch_descriptor=None,
                   dummy_compute_logits=lambda hidden_states: None):
         moe_comm_type = self.runner._select_moe_comm_method(num_tokens)
+        positions = self.positions[:num_tokens]
+        # update global cos, sin
+        update_cos_sin(positions)
         with set_ascend_forward_context(None,
                                         self.vllm_config,
                                         moe_comm_type=moe_comm_type,
                                         num_tokens=num_tokens):
             self.model(
                 input_ids=self.input_ids[:num_tokens],
-                positions=self.positions[:num_tokens],
+                positions=positions,
                 hidden_states=self.hidden_states[:num_tokens],
             )
             dummy_compute_logits(self.hidden_states)
@@ -464,13 +468,18 @@ def _propose(
         self.positions[:num_tokens] = target_positions.to(device)
         self.hidden_states[:num_tokens] = target_hidden_states
         attn_metadata.block_tables = block_table.to(device)
+
+        positions = self.positions[:num_input_tokens]
+        # update global cos, sin
+        update_cos_sin(positions)
+
         with set_ascend_forward_context(attn_metadata,
                                         self.vllm_config,
                                         moe_comm_type=moe_comm_type,
                                         num_tokens=num_input_tokens):
             last_hidden_states, hidden_states = self.model(
                 input_ids=self.input_ids[:num_input_tokens],
-                positions=self.positions[:num_input_tokens],
+                positions=positions,
                 hidden_states=self.hidden_states[:num_input_tokens],
             )
         sample_hidden_states = last_hidden_states[last_token_indices]
@@ -573,14 +582,19 @@ def _propose(
             attn_metadata.attn_mask = attn_mask
             attn_metadata.block_tables = block_table.to(device)
             # Run the model.
+
+            positions = self.positions[:input_batch_size]
+            # update global cos, sin
+            update_cos_sin(positions)
+
             with set_ascend_forward_context(attn_metadata,
                                             self.vllm_config,
                                             moe_comm_type=moe_comm_type,
                                             num_tokens=input_batch_size):
 
                 last_hidden_states, hidden_states = self.model(
                     input_ids=self.input_ids[:input_batch_size],
-                    positions=self.positions[:input_batch_size],
+                    positions=positions,
                     hidden_states=self.hidden_states[:input_batch_size],
                 )
             hidden_states = hidden_states[:batch_size]
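
The gist of the fix, roughly stated: the rope op inside the draft model now reads cos and sin from the module-level cache, so the EAGLE proposer has to refresh that cache with the draft positions before every draft forward pass; otherwise the rope op would keep consuming the cos/sin left over from the previous target-model step. A hedged sketch of the resulting call pattern follows; the wrapper name and the assertion are illustrative, not part of the commit.

from vllm_ascend.ops.rotary_embedding import get_cos_sin, update_cos_sin

def run_draft_forward(model, input_ids, positions, hidden_states):
    # Refresh the global cos/sin for exactly these positions before the call;
    # _rope_forward_oot picks them up via get_cos_sin() inside the model.
    update_cos_sin(positions)
    cos, sin = get_cos_sin()
    if cos is not None:
        # cos has shape (1, num_tokens, 1, head_dim) after update_cos_sin.
        assert cos.shape[1] == positions.size(0)
    return model(input_ids=input_ids,
                 positions=positions,
                 hidden_states=hidden_states)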

vllm_ascend/worker/model_runner_v1.py

Lines changed: 11 additions & 1 deletion
@@ -136,6 +136,7 @@
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
 from vllm_ascend.eplb.eplb_updator import EplbUpdator
 from vllm_ascend.eplb.utils import model_register
+from vllm_ascend.ops.rotary_embedding import initialize_cos_sin, update_cos_sin
 from vllm_ascend.ops.weight_prefetch import WeightPrefetchMethod
 from vllm_ascend.patch.worker.patch_module import patch_torch_npu_argsort
 from vllm_ascend.platform import NPUPlatform
@@ -149,7 +150,7 @@
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
                                AscendDeviceType, ProfileExecuteDuration,
                                enable_sp, get_ascend_device_type, is_enable_nz,
-                               is_moe_model, lmhead_tp_enable)
+                               is_moe_model, is_vl_model, lmhead_tp_enable)
 from vllm_ascend.worker.npu_input_batch import CachedRequestState, InputBatch
 
 if TYPE_CHECKING:
@@ -434,6 +435,9 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         else:
             self.cos = None
             self.sin = None
+        if not is_vl_model(self.vllm_config
+                           ) and not self.vllm_config.model_config.use_mla:
+            initialize_cos_sin(self.vllm_config, self.dtype, self.device)
 
         self.uses_mrope = self.model_config.uses_mrope
         # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
@@ -2025,6 +2029,9 @@ def _prepare_inputs(
             for layer_name in attn_group.layer_names:
                 attn_metadata[layer_name] = attn_metadata_i
 
+        # update global cos, sin
+        update_cos_sin(positions)
+
         if lmhead_tp_enable():
             max_num_reqs_across_dp = maybe_padded_num_tokens if not with_prefill else self.max_num_reqs
             logits_indices = nn.functional.pad(
@@ -3224,6 +3231,9 @@ def _dummy_run(
         else:
             positions = self.positions[:num_tokens_padded]
 
+        # update global cos, sin
+        update_cos_sin(positions)
+
         if get_pp_group().is_first_rank:
             intermediate_tensors = None
         else:
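
Taken together, the model runner owns the lifecycle of the cached cos/sin: allocate once at init (only for non-VL, non-MLA models), then refresh per step from the positions about to be fed to the model. A condensed sketch of that lifecycle, with the helper names and the split into two functions invented here for illustration:

from vllm_ascend.ops.rotary_embedding import initialize_cos_sin, update_cos_sin
from vllm_ascend.utils import is_vl_model

def setup_rope_cache(vllm_config, dtype, device):
    # Mirrors the gate added in NPUModelRunner.__init__: multimodal (VL) and
    # MLA models take different rope paths, so the cache is skipped for them.
    if is_vl_model(vllm_config) or vllm_config.model_config.use_mla:
        return
    initialize_cos_sin(vllm_config, dtype, device)

def before_each_forward(positions):
    # Called from _prepare_inputs / _dummy_run (and the eagle proposer) right
    # before the model call, so the rope op sees cos/sin for these positions.
    update_cos_sin(positions)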
