
Commit d5b59ad

move multi-stream to eplb utils
Signed-off-by: daishixun <[email protected]>
Parent: e09650a

4 files changed (+17, -13 lines)

vllm_ascend/eplb/eplb_updator.py (2 additions, 1 deletion)

@@ -23,7 +23,8 @@
 
 from vllm_ascend.eplb.core.eplb_utils import EPLBParamUtils
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
-from vllm_ascend.utils import moe_load_async_stream, npu_stream_switch
+from vllm_ascend.eplb.utils import moe_load_async_stream
+from vllm_ascend.utils import npu_stream_switch
 
 
 class EplbUpdator:

vllm_ascend/eplb/utils.py (12 additions, 0 deletions)

@@ -18,6 +18,9 @@
 import types
 
 import torch
+import torch_npu
+
+_MOE_LOAD_ASYNC_STREAM = None
 
 
 def get_expert_map(self, layer_id):

@@ -75,3 +78,12 @@ def model_register(model, model_config):
         model.num_moe_layers = config.num_hidden_layers - model.num_dense_layers
     else:
         raise NotImplementedError("EPLB is not supported.")
+
+
+def moe_load_async_stream() -> torch_npu.npu.Stream:
+    global _MOE_LOAD_ASYNC_STREAM
+    if _MOE_LOAD_ASYNC_STREAM is None:
+        # Lazily create the dedicated stream on first call; subsequent
+        # calls return the same cached stream object.
+        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
+    return _MOE_LOAD_ASYNC_STREAM
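
The relocated helper lazily creates one dedicated NPU stream on first use and caches it at module level, so every caller in the package shares the same side stream. A minimal sketch of that behaviour, assuming an Ascend NPU device with torch_npu installed (the variable names are illustrative only):

from vllm_ascend.eplb.utils import moe_load_async_stream

# The first call constructs the stream; later calls return the cached
# module-level object, so eplb_updator.py and fused_moe.py end up
# issuing work on the same side stream.
first = moe_load_async_stream()
second = moe_load_async_stream()
assert first is second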

vllm_ascend/ops/fused_moe/fused_moe.py (3 additions, 2 deletions)

@@ -36,6 +36,7 @@
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
+from vllm_ascend.eplb.utils import moe_load_async_stream
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.ops.fused_moe.moe_comm_method import setup_moe_comm_method

@@ -46,8 +47,7 @@
     AscendW8A8DynamicFusedMoEMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
                                enable_sp, get_ascend_device_type, is_enable_nz,
-                               moe_load_async_stream, npu_stream_switch,
-                               shared_expert_dp_enabled,
+                               npu_stream_switch, shared_expert_dp_enabled,
                                shared_experts_calculation_stream)
 
 

@@ -380,6 +380,7 @@ def forward_impl(self, hidden_states: torch.Tensor,
         if isinstance(final_hidden_states, tuple):
             final_hidden_states, group_list_type, expert_tokens = final_hidden_states
         if self.dynamic_eplb:
+
             moe_load_stream = moe_load_async_stream()
             cur_stream = torch.npu.current_stream()
 
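
The hunk above is cut off after the current stream is captured. For context, the usual side-stream pattern this enables looks roughly like the sketch below; it assumes npu_stream_switch acts as a context manager over the target stream and that the stream API exposes wait_stream, and the helper and tensor names are hypothetical rather than taken from this file:

import torch
import torch_npu  # noqa: F401  (provides the torch.npu backend)

from vllm_ascend.eplb.utils import moe_load_async_stream
from vllm_ascend.utils import npu_stream_switch


def record_moe_load(expert_tokens: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: copy per-expert token counts on the dedicated
    # side stream so the bookkeeping does not block the compute stream.
    moe_load_stream = moe_load_async_stream()
    cur_stream = torch.npu.current_stream()
    # Make the side stream wait for the kernels that produced
    # expert_tokens before it reads the tensor.
    moe_load_stream.wait_stream(cur_stream)
    with npu_stream_switch(moe_load_stream):
        moe_load = expert_tokens.clone()
    return moe_load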

vllm_ascend/utils.py (0 additions, 10 deletions)

@@ -53,7 +53,6 @@
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _SHARED_EXPERTS_CALCULATION_STREAM = None
-_MOE_LOAD_ASYNC_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50

@@ -324,15 +323,6 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
     return _SHARED_EXPERTS_CALCULATION_STREAM
 
 
-def moe_load_async_stream() -> torch_npu.npu.Stream:
-    global _MOE_LOAD_ASYNC_STREAM
-    if _MOE_LOAD_ASYNC_STREAM is None:
-        # when this function is called before any stream is set,
-        # we return the default stream.
-        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
-    return _MOE_LOAD_ASYNC_STREAM
-
-
 def adapt_patch(is_global_patch: bool = False):
     if is_global_patch:
         from vllm_ascend.patch import platform  # noqa: F401
