41 changes: 40 additions & 1 deletion vllm_ascend/ops/rotary_embedding.py
@@ -16,14 +16,16 @@
#

import math
from typing import Optional, Tuple
from typing import Callable, Optional, Tuple

import einops
import torch
import torch_npu
from vllm.forward_context import get_forward_context
from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding,
YaRNScalingRotaryEmbedding)
from vllm.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb
from vllm.platforms import CpuArchEnum

from vllm_ascend.platform import NPUPlatform
@@ -435,3 +437,40 @@ def forward_oot(
rotary_mode='half')

return query, key


class AscendApplyRotaryEmb(ApplyRotaryEmb):

def __init__(
self,
is_neox_style: bool = False,
is_unsqueeze: bool = False,
        default: Optional[Callable[..., torch.Tensor]] = None,
) -> None:
super().__init__(is_neox_style, is_unsqueeze, default)

def forward_oot(
self,
x: torch.Tensor,
cos: torch.Tensor,
sin: torch.Tensor,
) -> torch.Tensor:
# x: [2 * b, s, head, head_dim]
qk = einops.rearrange(
x, "(two b) s head head_dim -> b s two head head_dim", two=2)
# q/k: [b, s, head, head_dim]
q, k = qk[:, :, 0], qk[:, :, 1]
head_dim = q.shape[-1]

cos = torch.cat((cos, cos), dim=-1)
sin = torch.cat((sin, sin), dim=-1)
cos = cos.reshape(1, -1, 1, head_dim)
sin = sin.reshape(1, -1, 1, head_dim)
        # cos/sin: [1, s, 1, head_dim]

q = torch_npu.npu_rotary_mul(q, cos, sin)
k = torch_npu.npu_rotary_mul(k, cos, sin)

        # output: [2 * b, s, head, head_dim]
output = torch.cat([q, k], dim=0)
return output
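For reference, a minimal shape-check sketch of the packing convention forward_oot expects; the tensor sizes below are illustrative only and not taken from the PR.

import einops
import torch

b, s, head, head_dim = 2, 8, 4, 128    # hypothetical sizes
q = torch.randn(b, s, head, head_dim)
k = torch.randn(b, s, head, head_dim)

# The caller packs q and k along the batch axis: x is [2 * b, s, head, head_dim].
x = torch.cat([q, k], dim=0)

# forward_oot splits them back out before rotating each tensor on the NPU.
qk = einops.rearrange(x, "(two b) s head head_dim -> b s two head head_dim", two=2)
assert torch.equal(qk[:, :, 0], q) and torch.equal(qk[:, :, 1], k)

# The rotated tensors are concatenated the same way, so the output is again
# [2 * b, s, head, head_dim].
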
52 changes: 28 additions & 24 deletions vllm_ascend/patch/worker/patch_qwen2_5_vl.py
@@ -32,8 +32,6 @@
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.rotary_embedding.common import (
apply_rotary_emb_torch, dispatch_rotary_emb_function)
from vllm.model_executor.models.qwen2_5_vl import (
Qwen2_5_VisionAttention, Qwen2_5_VisionBlock, Qwen2_5_VisionPatchEmbed,
Qwen2_5_VisionPatchMerger, Qwen2_5_VisionTransformer,
@@ -69,36 +67,50 @@ def forward(
x, _ = self.qkv(x)
seq_len, batch_size, _ = x.shape

# Split q k v.
qkv = einops.rearrange(
x,
"s b (three head head_dim) -> b s three head head_dim",
three=3,
head=self.num_attention_heads_per_partition,
)
q, k, v = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
origin_shape = q.shape[-1]

        # Convert the cumulative sequence-length tensor to per-sequence lengths and move it to CPU.
cu_seqlens = torch.diff(cu_seqlens).to("cpu")

cos = torch.cat((rotary_pos_emb_cos, rotary_pos_emb_cos), dim=-1)
sin = torch.cat((rotary_pos_emb_sin, rotary_pos_emb_sin), dim=-1)
cos = cos.reshape(1, -1, 1, self.hidden_size_per_attention_head)
sin = sin.reshape(1, -1, 1, self.hidden_size_per_attention_head)
q = torch_npu.npu_rotary_mul(q, cos, sin)
k = torch_npu.npu_rotary_mul(k, cos, sin)
if rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None:
qk, v = qkv[:, :, :2], qkv[:, :, 2]

q, k, v = [
einops.rearrange(x, "b s h d -> (b s) h d").contiguous()
for x in (q, k, v)
]
qk_reshaped = einops.rearrange(
qk, "b s two head head_dim -> (two b) s head head_dim", two=2)
qk_rotated = self.apply_rotary_emb(
qk_reshaped,
rotary_pos_emb_cos,
rotary_pos_emb_sin,
)
qk_rotated = qk_rotated.view(
2,
batch_size,
seq_len,
self.num_attention_heads_per_partition,
self.hidden_size_per_attention_head,
)
q, k = qk_rotated.unbind(dim=0)
else:
q, k, v = qkv.unbind(dim=2)

        # TODO(shen-shanshan): Move the code below into the MMEncoderAttention CustomOp
# ----------------------------------------------------------------------
enable_pad = (envs_ascend.USE_OPTIMIZED_MODEL
and self.hidden_size_per_attention_head > MIN_PAD_SIZE
and self.hidden_size_per_attention_head < MAX_PAD_SIZE)

q, k, v = [
einops.rearrange(x, "b s h d -> (b s) h d").contiguous()
for x in (q, k, v)
]

if enable_pad:
origin_shape = q.shape[-1]
pad_len = MAX_PAD_SIZE - origin_shape
# q/k/v: [b * s, head, head_dim] -> [b * s, head, MAX_PAD_SIZE]
q = F.pad(q, (0, pad_len), mode="constant", value=0)
@@ -125,6 +137,7 @@ def forward(
context_layer = einops.rearrange(context_layer,
"(b s) h d -> s b (h d)",
b=batch_size).contiguous()
# ----------------------------------------------------------------------

Review comment: Meaningless comments, recommended for deletion.

Reply (Collaborator, PR author): OK. In fact, all of this code will be removed soon anyway. 😀

output, _ = self.proj(context_layer)
return output
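
For reference, a minimal sketch of the head-dim padding round trip used by the enable_pad path above; the MIN_PAD_SIZE / MAX_PAD_SIZE values and the slice-back step after attention are assumptions for illustration, since this diff truncates that part.

import torch
import torch.nn.functional as F

MIN_PAD_SIZE, MAX_PAD_SIZE = 64, 128   # assumed bounds, not taken from this diff
head_dim = 80                          # hypothetical head size inside (MIN_PAD_SIZE, MAX_PAD_SIZE)

q = torch.randn(16, 4, head_dim)       # [b * s, head, head_dim]
pad_len = MAX_PAD_SIZE - head_dim
q_padded = F.pad(q, (0, pad_len), mode="constant", value=0)  # -> [b * s, head, MAX_PAD_SIZE]
assert q_padded.shape[-1] == MAX_PAD_SIZE

# After attention, the padded tail is assumed to be sliced off to restore head_dim.
q_restored = q_padded[..., :head_dim]
assert torch.equal(q_restored, q)
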
@@ -650,14 +663,6 @@ def _process_video_input(
return video_embeds.split(sizes)


def _apply_rotary_pos_emb_vision(t: torch.Tensor, cos: torch.Tensor,
sin: torch.Tensor) -> torch.Tensor:
rotary_emb_function = dispatch_rotary_emb_function(
default=partial(apply_rotary_emb_torch, is_neox_style=True))
output = rotary_emb_function(t, cos, sin).type_as(t)
return output


# NOTE: This will be removed after MMEncoderAttention has been extracted as a CustomOp in vllm.
Qwen2VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
Qwen2_5_VisionAttention.forward = AscendQwen2_5_VisionAttention.forward
@@ -676,4 +681,3 @@ def _apply_rotary_pos_emb_vision(t: torch.Tensor, cos: torch.Tensor,
Qwen2_5_VisionTransformer.rotary_pos_emb_thw = AscendQwen2_5_VisionTransformer.rotary_pos_emb_thw
Qwen2_5_VisionTransformer.get_rope_by_thw = AscendQwen2_5_VisionTransformer.get_rope_by_thw
Qwen2_5_VisionTransformer.forward = AscendQwen2_5_VisionTransformer.forward
apply_rotary_pos_emb_vision = _apply_rotary_pos_emb_vision
6 changes: 4 additions & 2 deletions vllm_ascend/utils.py
@@ -679,8 +679,9 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
AscendRowParallelLinear)
from vllm_ascend.ops.mla import AscendMultiHeadLatentAttention
from vllm_ascend.ops.rotary_embedding import (
AscendDeepseekScalingRotaryEmbedding, AscendMRotaryEmbedding,
AscendRotaryEmbedding, AscendYaRNRotaryEmbedding)
AscendApplyRotaryEmb, AscendDeepseekScalingRotaryEmbedding,
AscendMRotaryEmbedding, AscendRotaryEmbedding,
AscendYaRNRotaryEmbedding)
from vllm_ascend.ops.vocab_parallel_embedding import (
AscendLogitsProcessor, AscendParallelLMHead,
AscendVocabParallelEmbedding)
@@ -706,6 +707,7 @@ def register_ascend_customop(vllm_config: Optional[VllmConfig] = None):
"FusedMoE": AscendFusedMoE,
"SharedFusedMoE": AscendSharedFusedMoE,
"MultiHeadLatentAttentionWrapper": AscendMultiHeadLatentAttention,
"ApplyRotaryEmb": AscendApplyRotaryEmb,
}

for name, op_cls in REGISTERED_ASCEND_OPS.items():