Commit aab23e7

Upgrade to the newest vLLM 0.11.1 commit
Signed-off-by: Icey <[email protected]>
1 parent e5676fc commit aab23e7

5 files changed (+68 −22 lines)
vllm_ascend/kv_offload/cpu_npu.py

Lines changed: 7 additions & 1 deletion

@@ -2,11 +2,17 @@
 import torch
 from vllm.attention import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (OffloadingHandler,
                                               TransferResult, TransferSpec)
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.utils import is_pin_memory_available
+else:
+    from vllm.utils.platform_utils import is_pin_memory_available
+
 logger = init_logger(__name__)
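This import gate is the pattern the whole commit repeats: vLLM moved is_pin_memory_available out of vllm.utils after 0.11.0, so the plugin checks the installed version at import time and picks the matching location. A minimal sketch of such a gate, assuming a helper that compares the installed vLLM version string (the real vllm_version_is lives in vllm_ascend/utils.py and may differ, e.g. in how it handles dev or post-release suffixes):

# Hypothetical stand-in for vllm_ascend.utils.vllm_version_is;
# the shipped helper may be more lenient about version suffixes.
from importlib.metadata import version

def vllm_version_is(target: str) -> bool:
    # True when the installed vllm release exactly matches `target`.
    return version("vllm") == target

# Pick the import location that matches the installed vLLM.
if vllm_version_is("0.11.0"):
    from vllm.utils import is_pin_memory_available
else:
    from vllm.utils.platform_utils import is_pin_memory_available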

vllm_ascend/models/qwen3_next.py

Lines changed: 36 additions & 12 deletions

@@ -51,6 +51,8 @@
 from vllm.transformers_utils.configs import Qwen3NextConfig
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadata
 
+from vllm_ascend.utils import vllm_version_is
+
 from vllm.model_executor.models.qwen3_next import ( # isort: skip
     Qwen3NextAttention, Qwen3NextDecoderLayer, Qwen3NextForCausalLM,
     Qwen3NextGatedDeltaNet, Qwen3NextModel, Qwen3NextSparseMoeBlock,
@@ -201,7 +203,11 @@ def _forward(
         spec_query_start_loc = attn_metadata.spec_query_start_loc
         non_spec_query_start_loc = attn_metadata.non_spec_query_start_loc
         spec_sequence_masks = attn_metadata.spec_sequence_masks
-        spec_token_masks = attn_metadata.spec_token_masks
+        if vllm_version_is("0.11.0"):
+            spec_token_masks = attn_metadata.spec_token_masks
+        else:
+            spec_token_indx = attn_metadata.spec_token_indx
+            non_spec_token_indx = attn_metadata.non_spec_token_indx
         spec_state_indices_tensor = attn_metadata.spec_state_indices_tensor  # noqa: E501
         non_spec_state_indices_tensor = attn_metadata.non_spec_state_indices_tensor  # noqa: E501
         self_kv_cache = self.kv_cache[forward_context.virtual_engine]
@@ -216,8 +222,9 @@ def _forward(
 
         # 1. Set up dimensions for reshapes later
        projected_states, _ = self.in_proj(hidden_states[:num_actual_tokens])
-        if spec_token_masks is not None:
-            spec_token_masks = spec_token_masks[:num_actual_tokens]
+        if vllm_version_is("0.11.0"):
+            if spec_token_masks is not None:
+                spec_token_masks = spec_token_masks[:num_actual_tokens]
         projected_states_qkvz, projected_states_ba = torch.split(
             projected_states,
             [
@@ -242,8 +249,13 @@ def _forward(
                 mixed_qkv_spec = mixed_qkv
                 mixed_qkv_non_spec = None
             else:
-                mixed_qkv_spec = mixed_qkv[spec_token_masks]
-                mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
+                if vllm_version_is("0.11.0"):
+                    mixed_qkv_spec = mixed_qkv[spec_token_masks]
+                    mixed_qkv_non_spec = mixed_qkv[~spec_token_masks]
+                else:
+                    mixed_qkv_spec = mixed_qkv.index_select(0, spec_token_indx)
+                    mixed_qkv_non_spec = mixed_qkv.index_select(
+                        0, non_spec_token_indx)
         else:
             mixed_qkv_spec = None
             mixed_qkv_non_spec = mixed_qkv
@@ -293,10 +305,16 @@ def _forward(
                 g_non_spec = None
                 beta_non_spec = None
             else:
-                g_spec = g[:, spec_token_masks]
-                beta_spec = beta[:, spec_token_masks]
-                g_non_spec = g[:, ~spec_token_masks]
-                beta_non_spec = beta[:, ~spec_token_masks]
+                if vllm_version_is("0.11.0"):
+                    g_spec = g[:, spec_token_masks]
+                    beta_spec = beta[:, spec_token_masks]
+                    g_non_spec = g[:, ~spec_token_masks]
+                    beta_non_spec = beta[:, ~spec_token_masks]
+                else:
+                    g_spec = g.index_select(1, spec_token_indx)
+                    beta_spec = beta.index_select(1, spec_token_indx)
+                    g_non_spec = g.index_select(1, non_spec_token_indx)
+                    beta_non_spec = beta.index_select(1, non_spec_token_indx)
         else:
             g_spec = None
             beta_spec = None
@@ -404,8 +422,14 @@ def _forward(
                 dtype=core_attn_out_non_spec.dtype,
                 device=core_attn_out_non_spec.device,
             )
-            core_attn_out[:, spec_token_masks] = core_attn_out_spec
-            core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
+            if vllm_version_is("0.11.0"):
+                core_attn_out[:, spec_token_masks] = core_attn_out_spec
+                core_attn_out[:, ~spec_token_masks] = core_attn_out_non_spec
+            else:
+                core_attn_out.index_copy_(1, spec_token_indx,
+                                          core_attn_out_spec)
+                core_attn_out.index_copy_(1, non_spec_token_indx,
+                                          core_attn_out_non_spec)
         elif spec_sequence_masks is not None:
             core_attn_out = core_attn_out_spec
         else:
@@ -673,4 +697,4 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.num_physical_experts = example_layer.n_physical_experts
         self.num_local_physical_experts = example_layer.n_local_physical_experts
         self.num_routed_experts = example_layer.n_routed_experts
-        self.num_redundant_experts = example_layer.n_redundant_experts
\ No newline at end of file
+        self.num_redundant_experts = example_layer.n_redundant_experts
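The substantive change in this file tracks an upstream metadata switch: 0.11.1's GDNAttentionMetadata replaces the boolean spec_token_masks with precomputed spec_token_indx / non_spec_token_indx index tensors, so mask indexing becomes index_select and mask assignment becomes index_copy_. A small self-contained check of the equivalence the new branches rely on (toy tensors, not values from the model):

import torch

x = torch.arange(12.0).reshape(6, 2)
mask = torch.tensor([True, False, True, False, False, True])

# Index tensors playing the role of spec_token_indx / non_spec_token_indx.
spec_indx = mask.nonzero(as_tuple=False).squeeze(-1)        # tensor([0, 2, 5])
non_spec_indx = (~mask).nonzero(as_tuple=False).squeeze(-1) # tensor([1, 3, 4])

# Boolean-mask gather (0.11.0 path) == index_select gather (0.11.1 path).
assert torch.equal(x[mask], x.index_select(0, spec_indx))
assert torch.equal(x[~mask], x.index_select(0, non_spec_indx))

# index_copy_ is the scatter counterpart used to reassemble core_attn_out.
out = torch.empty_like(x)
out.index_copy_(0, spec_indx, x[mask])
out.index_copy_(0, non_spec_indx, x[~mask])
assert torch.equal(out, x)

Precomputed index tensors also sidestep the data-dependent shape of boolean-mask indexing, which is presumably part of why upstream made the switch.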

vllm_ascend/sample/rejection_sampler.py

Lines changed: 20 additions & 6 deletions

@@ -5,10 +5,17 @@
 import torch.nn as nn
 import vllm.v1.sample.rejection_sampler as rs
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.rejection_sampler import (RejectionSampler, compute_probs,
+from vllm.v1.sample.rejection_sampler import (RejectionSampler,
                                               generate_uniform_probs)
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 
+from vllm_ascend.utils import vllm_version_is
+
+if vllm_version_is("0.11.0"):
+    from vllm.v1.sample.rejection_sampler import compute_probs
+else:
+    from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
+
 PLACEHOLDER_TOKEN_ID = -1
 GREEDY_TEMPERATURE = -1
 # Maximum number of speculative draft tokens allowed per request in a single
@@ -82,11 +89,18 @@ def forward(
         # [num_tokens, vocab_size]
         # NOTE(woosuk): `target_logits` can be updated in place inside the
         # `compute_probs` function.
-        target_probs = compute_probs(
-            target_logits,
-            metadata.cu_num_draft_tokens,
-            sampling_metadata,
-        )
+        if vllm_version_is("0.11.0"):
+            target_probs = compute_probs(
+                target_logits,
+                metadata.cu_num_draft_tokens,
+                sampling_metadata,
+            )
+        else:
+            target_probs = apply_sampling_constraints(
+                target_logits,
+                metadata.cu_num_draft_tokens,
+                sampling_metadata,
+            )
 
         output_token_ids = rejection_sample(
             metadata.draft_token_ids,
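Per the diff, compute_probs was renamed to apply_sampling_constraints in 0.11.1 with the call signature unchanged. An alternative to branching at each call site is to alias the symbol once at import time; a sketch of that pattern (not what this commit does, just a common way to keep call sites version-agnostic):

try:
    # Newer vLLM (post-0.11.0) exposes the renamed function.
    from vllm.v1.sample.rejection_sampler import apply_sampling_constraints
except ImportError:
    # Older vLLM (0.11.0) still ships the pre-rename name.
    from vllm.v1.sample.rejection_sampler import (
        compute_probs as apply_sampling_constraints)

# Call sites can then invoke apply_sampling_constraints(...) unconditionally.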

vllm_ascend/spec_decode/eagle_proposer.py

Lines changed: 2 additions & 1 deletion

@@ -12,7 +12,6 @@
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm.utils import is_pin_memory_available
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
@@ -27,8 +26,10 @@
 
 if vllm_version_is("0.11.0"):
     from vllm.config import CompilationLevel
+    from vllm.utils import is_pin_memory_available
 else:
     from vllm.config import CompilationMode
+    from vllm.utils.platform_utils import is_pin_memory_available
 
 PADDING_SLOT_ID = -1

vllm_ascend/worker/model_runner_v1.py

Lines changed: 3 additions & 2 deletions

@@ -72,7 +72,7 @@
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import cdiv, is_pin_memory_available
+from vllm.utils import cdiv
 from vllm.utils.jsontree import json_map_leaves
 from vllm.v1.attention.backends.gdn_attn import GDNAttentionMetadataBuilder
 from vllm.v1.attention.backends.utils import (
@@ -155,13 +155,14 @@
 if vllm_version_is("0.11.0"):
     from vllm.attention.layer import Attention
     from vllm.config import CompilationLevel
-    from vllm.utils import LazyLoader
+    from vllm.utils import LazyLoader, is_pin_memory_available
 
     from vllm_ascend.models.layers.mla import AscendMultiHeadLatentAttention
 else:
     from vllm.attention.layer import MLAAttention
     from vllm.config import CompilationMode
     from vllm.utils.import_utils import LazyLoader
+    from vllm.utils.platform_utils import is_pin_memory_available
 
 if TYPE_CHECKING:
     import xgrammar as xgr  # type: ignore[import-untyped]
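Across the five files, the commit gates three upstream moves: is_pin_memory_available to vllm.utils.platform_utils, LazyLoader to vllm.utils.import_utils, and the CompilationLevel to CompilationMode rename in vllm.config. When the same gate recurs in many modules, the branches can be centralized in one compat module; a hypothetical sketch (the module name and the CompilationLevel-as-CompilationMode aliasing are assumptions, not part of this commit, which keeps per-file gates instead):

# Hypothetical vllm_ascend/compat.py; importers would pull these
# names from here instead of gating in every module.
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("0.11.0"):
    from vllm.config import CompilationLevel as CompilationMode
    from vllm.utils import LazyLoader, is_pin_memory_available
else:
    from vllm.config import CompilationMode
    from vllm.utils.import_utils import LazyLoader
    from vllm.utils.platform_utils import is_pin_memory_available

__all__ = ["CompilationMode", "LazyLoader", "is_pin_memory_available"]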
