@@ -418,20 +418,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
                                    rope_dim,
                                    dtype=self.dtype,
                                    device=self.device)
-        # For GQA models.
-        elif not self.vllm_config.model_config.use_mla:
-            self.cos = torch.ones(1,
-                                  self.max_num_tokens,
-                                  1,
-                                  128,
-                                  dtype=self.dtype,
-                                  device=self.device)
-            self.sin = torch.zeros(1,
-                                   self.max_num_tokens,
-                                   1,
-                                   128,
-                                   dtype=self.dtype,
-                                   device=self.device)
         else:
             self.cos = None
             self.sin = None
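For reference, the branch deleted above preallocated full-length rotary-embedding buffers for GQA (non-MLA) models at init time, with a hard-coded head size of 128. A minimal standalone sketch of that allocation pattern (the helper name alloc_rope_buffers is hypothetical, not part of vllm-ascend):

    import torch

    def alloc_rope_buffers(max_num_tokens: int,
                           dtype: torch.dtype,
                           device: torch.device,
                           head_dim: int = 128):
        # BSNH layout: [batch=1, seq=max_num_tokens, heads=1, head_dim],
        # matching the buffers the removed __init__ branch created.
        cos = torch.ones(1, max_num_tokens, 1, head_dim, dtype=dtype, device=device)
        sin = torch.zeros(1, max_num_tokens, 1, head_dim, dtype=dtype, device=device)
        return cos, sin
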
@@ -2530,22 +2516,6 @@ def execute_model(
             aclgraph_runtime_mode, batch_descriptor = \
                 self.aclgraph_dispatcher.dispatch(num_tokens=num_input_tokens, uniform_decode=uniform_decode, has_lora=has_lora)

-            # initialize rope
-            cos_sin_cache = self.model.model.layers[
-                self.model.model.
-                start_layer].self_attn.rotary_emb.cos_sin_cache.index_select(
-                    0, positions)
-            last_dim = cos_sin_cache.size()[-1]
-            cos, sin = cos_sin_cache.reshape(-1, 2,
-                                             last_dim // 2).repeat(1, 1,
-                                                                   2).chunk(2,
-                                                                            dim=-2)
-            # BSNH
-            self.cos[:, :maybe_padded_num_tokens] = cos.view(
-                1, -1, 1, last_dim).contiguous()
-            self.sin[:, :maybe_padded_num_tokens] = sin.view(
-                1, -1, 1, last_dim).contiguous()
-
         # Run forward pass
         with ProfileExecuteDuration().capture_async("forward"):
             with set_ascend_forward_context(
@@ -2562,11 +2532,7 @@ def execute_model(
                     total_num_scheduled_tokens,
                     prefetch_stream=self.prefetch_stream,
                     model_instance=self.model,
-                    weight_prefetch_method=self.weight_prefetch_method,
-                    cos=self.cos[:, :maybe_padded_num_tokens]
-                    if self.cos is not None else None,
-                    sin=self.sin[:, :maybe_padded_num_tokens]
-                    if self.sin is not None else None):
+                    weight_prefetch_method=self.weight_prefetch_method):
                 self.maybe_setup_kv_connector(scheduler_output)

                 hidden_states = self._generate_process_reqs_hidden_states(
@@ -3274,20 +3240,6 @@ def dummy_drafter_compute_logits(hidden_states):
             return self.drafter.model.compute_logits(
                 hidden_states[dummy_indices])

-        # initialize rope
-        cos_sin_cache = self.model.model.layers[
-            self.model.model.
-            start_layer].self_attn.rotary_emb.cos_sin_cache.index_select(
-                0, positions)
-        last_dim = cos_sin_cache.size()[-1]
-        cos, sin = cos_sin_cache.reshape(-1, 2, last_dim // 2).repeat(
-            1, 1, 2).chunk(2, dim=-2)
-        # BSNH
-        self.cos[:, :num_tokens] = cos.view(1, -1, 1,
-                                            last_dim).contiguous()
-        self.sin[:, :num_tokens] = sin.view(1, -1, 1,
-                                            last_dim).contiguous()
-
         with set_ascend_forward_context(
                 attn_metadata,
                 self.vllm_config,
@@ -3302,11 +3254,7 @@ def dummy_drafter_compute_logits(hidden_states):
                 batch_descriptor=batch_descriptor,
                 prefetch_stream=self.prefetch_stream,
                 model_instance=self.model,
-                weight_prefetch_method=self.weight_prefetch_method,
-                cos=self.cos[:, :num_tokens]
-                if self.cos is not None else None,
-                sin=self.sin[:, :num_tokens]
-                if self.sin is not None else None):
+                weight_prefetch_method=self.weight_prefetch_method):
             hidden_states = self._generate_dummy_run_hidden_states(
                 with_prefill, input_ids, positions, attn_metadata,
                 num_tokens_padded, intermediate_tensors, inputs_embeds)
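
Both removed blocks (in execute_model and in the dummy-run path) did the same per-step work: gather cos/sin rows from the first local layer's (start_layer) self_attn.rotary_emb.cos_sin_cache at the scheduled positions, duplicate each half, and write the results into the preallocated BSNH buffers before entering set_ascend_forward_context, which then received them as the cos=/sin= keyword arguments dropped above. A standalone sketch of that computation, mirroring the deleted code (fill_rope_buffers is a hypothetical name, and it assumes positions holds exactly one entry per scheduled token):

    import torch

    def fill_rope_buffers(cos_buf: torch.Tensor,
                          sin_buf: torch.Tensor,
                          cos_sin_cache: torch.Tensor,
                          positions: torch.Tensor) -> None:
        # Gather the cached [cos | sin] rows for the scheduled token positions.
        cache = cos_sin_cache.index_select(0, positions)
        last_dim = cache.size(-1)
        # Each row packs a cos half and a sin half; duplicate each half so
        # cos and sin each span last_dim elements, then split them apart.
        cos, sin = cache.reshape(-1, 2, last_dim // 2).repeat(1, 1, 2).chunk(2, dim=-2)
        num_tokens = positions.size(0)
        # Write into the preallocated BSNH buffers: [1, seq, 1, head_dim].
        cos_buf[:, :num_tokens] = cos.view(1, -1, 1, last_dim).contiguous()
        sin_buf[:, :num_tokens] = sin.view(1, -1, 1, last_dim).contiguous()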