Merged
17 changes: 2 additions & 15 deletions vllm_ascend/ops/common_fused_moe.py
@@ -19,7 +19,7 @@
 
 import torch
 import torch_npu
-from vllm.config import CompilationLevel, get_current_vllm_config
+from vllm.config import get_current_vllm_config
 from vllm.distributed import (get_dp_group, get_ep_group, get_tp_group,
                               tensor_model_parallel_all_reduce)
 from vllm.forward_context import get_forward_context
@@ -51,20 +51,7 @@ class AscendUnquantizedFusedMoEMethod(UnquantizedFusedMoEMethod):
     def __init__(self, moe: FusedMoEConfig = None):
 
         super().__init__(moe=moe)
-
-        # NOTE: Currently, this self.use_aclgraph is only used in
-        # UnquantizedFusedMoEMethod.forward_oot to decide whether to use in
-        # ops/fused_moe.py:568 to circumvent torch.randint_like not supported issue.
-        # Once torch.randint_like is supported or removed, this flag can be removed.
-        vllm_config = get_current_vllm_config()
-        ascend_config = get_ascend_config()
         self.dynamic_eplb = get_ascend_config().dynamic_eplb
-        if ascend_config.torchair_graph_config.enabled:
-            self.use_aclgraph = False
-        else:
-            self.use_aclgraph = (vllm_config.compilation_config.level
-                                 == CompilationLevel.PIECEWISE and
-                                 not vllm_config.model_config.enforce_eager)
         self.transpose = True
 
     def process_weights_after_loading(self, layer):
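For reference, the flag computation deleted above can be read as a small standalone helper. The sketch below is reconstructed from the removed lines and is not code from this PR; the helper name and the vllm_ascend import path are assumptions.

    # Sketch: how the pre-PR code decided whether ACL graph (piecewise
    # compilation) was in effect, reconstructed from the removed lines above.
    from vllm.config import CompilationLevel, get_current_vllm_config

    from vllm_ascend.ascend_config import get_ascend_config  # assumed import path

    def _use_aclgraph() -> bool:  # hypothetical helper name
        vllm_config = get_current_vllm_config()
        ascend_config = get_ascend_config()
        if ascend_config.torchair_graph_config.enabled:
            # In the removed code, TorchAir graph mode never used ACL graph capture.
            return False
        return (vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
                and not vllm_config.model_config.enforce_eager)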
@@ -133,7 +120,7 @@ def apply(self,
         # this is a naive implementation for experts load balance so as
         # to avoid accumulating too much tokens on a single rank.
         # currently it is only activated when doing profile runs.
-        if enable_force_load_balance and not self.use_aclgraph:
+        if enable_force_load_balance:
Contributor review comment (critical):
This change enables torch.randint_like to be executed within an ACL graph context. This contradicts the comment on lines 55-58, which states that torch.randint_like is unsupported in ACL graph and was intentionally disabled. Executing an unsupported operation within a graph will cause failures during profiling runs. This introduces a critical risk of breaking profiling with ACL graph.

             topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)
 
         moe_comm_method = get_forward_context().moe_comm_method
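A minimal sketch of what keeping the guard the reviewer points to would look like, assuming self.use_aclgraph were still computed in __init__ as it was before this change. It is a fragment of the apply method body shown above, illustrating the concern rather than code from the PR.

    # Sketch only: skip the random expert shuffle under ACL graph capture,
    # where torch.randint_like was flagged as unsupported. Assumes
    # self.use_aclgraph is still set in __init__ (removed by this PR).
    if enable_force_load_balance and not getattr(self, "use_aclgraph", False):
        topk_ids = torch.randint_like(topk_ids, 0, global_num_experts)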