
Commit d5b59ad

move multi-stream to eplb utils
Signed-off-by: daishixun <[email protected]>
Parent: e09650a

4 files changed (+17, -13 lines)

vllm_ascend/eplb/eplb_updator.py (2 additions, 1 deletion)

@@ -23,7 +23,8 @@
 
 from vllm_ascend.eplb.core.eplb_utils import EPLBParamUtils
 from vllm_ascend.eplb.core.eplb_worker import EplbProcess
-from vllm_ascend.utils import moe_load_async_stream, npu_stream_switch
+from vllm_ascend.eplb.utils import moe_load_async_stream
+from vllm_ascend.utils import npu_stream_switch
 
 
 class EplbUpdator:

vllm_ascend/eplb/utils.py (12 additions, 0 deletions)

@@ -18,6 +18,9 @@
 import types
 
 import torch
+import torch_npu
+
+_MOE_LOAD_ASYNC_STREAM = None
 
 
 def get_expert_map(self, layer_id):

@@ -75,3 +78,12 @@ def model_register(model, model_config):
         model.num_moe_layers = config.num_hidden_layers - model.num_dense_layers
     else:
         raise NotImplementedError("EPLB is not supported.")
+
+
+def moe_load_async_stream() -> torch_npu.npu.Stream:
+    global _MOE_LOAD_ASYNC_STREAM
+    if _MOE_LOAD_ASYNC_STREAM is None:
+        # Lazily create the dedicated stream on first call; subsequent
+        # calls return the same cached stream object.
+        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
+    return _MOE_LOAD_ASYNC_STREAM
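
The relocated helper lazily creates one dedicated NPU stream on first use and caches it at module level, so every caller in the package shares the same side stream. A minimal sketch of that behaviour, assuming an Ascend NPU device with torch_npu installed (the variable names are illustrative only):

from vllm_ascend.eplb.utils import moe_load_async_stream

# The first call constructs the stream; later calls return the cached
# module-level object, so eplb_updator.py and fused_moe.py end up
# issuing work on the same side stream.
first = moe_load_async_stream()
second = moe_load_async_stream()
assert first is second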

vllm_ascend/ops/fused_moe/fused_moe.py (3 additions, 2 deletions)

@@ -36,6 +36,7 @@
 from vllm_ascend.ascend_forward_context import MoECommType
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.eplb.core.eplb_utils import determine_default_log2phy_map
+from vllm_ascend.eplb.utils import moe_load_async_stream
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.fused_moe.experts_selector import select_experts
 from vllm_ascend.ops.fused_moe.moe_comm_method import setup_moe_comm_method

@@ -46,8 +47,7 @@
     AscendW8A8DynamicFusedMoEMethod
 from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_NZ, AscendDeviceType,
                                enable_sp, get_ascend_device_type, is_enable_nz,
-                               moe_load_async_stream, npu_stream_switch,
-                               shared_expert_dp_enabled,
+                               npu_stream_switch, shared_expert_dp_enabled,
                                shared_experts_calculation_stream)
 
 

@@ -380,6 +380,7 @@ def forward_impl(self, hidden_states: torch.Tensor,
         if isinstance(final_hidden_states, tuple):
             final_hidden_states, group_list_type, expert_tokens = final_hidden_states
         if self.dynamic_eplb:
+
             moe_load_stream = moe_load_async_stream()
             cur_stream = torch.npu.current_stream()
 
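
The hunk above is cut off after the current stream is captured. For context, the usual side-stream pattern this enables looks roughly like the sketch below; it assumes npu_stream_switch acts as a context manager over the target stream and that the stream API exposes wait_stream, and the helper and tensor names are hypothetical rather than taken from this file:

import torch
import torch_npu  # noqa: F401  (provides the torch.npu backend)

from vllm_ascend.eplb.utils import moe_load_async_stream
from vllm_ascend.utils import npu_stream_switch


def record_moe_load(expert_tokens: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: copy per-expert token counts on the dedicated
    # side stream so the bookkeeping does not block the compute stream.
    moe_load_stream = moe_load_async_stream()
    cur_stream = torch.npu.current_stream()
    # Make the side stream wait for the kernels that produced
    # expert_tokens before it reads the tensor.
    moe_load_stream.wait_stream(cur_stream)
    with npu_stream_switch(moe_load_stream):
        moe_load = expert_tokens.clone()
    return moe_load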

vllm_ascend/utils.py (0 additions, 10 deletions)

@@ -53,7 +53,6 @@
 _CURRENT_STREAM = None
 _PREFETCH_STREAM = None
 _SHARED_EXPERTS_CALCULATION_STREAM = None
-_MOE_LOAD_ASYNC_STREAM = None
 _ASCEND_CUSTOMOP_IS_REIGISTERED = False
 _DEFAULT_BUFFER_SIZE = 200
 _MIN_DP_BUFFER_SIZE = 50

@@ -324,15 +323,6 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
     return _SHARED_EXPERTS_CALCULATION_STREAM
 
 
-def moe_load_async_stream() -> torch_npu.npu.Stream:
-    global _MOE_LOAD_ASYNC_STREAM
-    if _MOE_LOAD_ASYNC_STREAM is None:
-        # when this function is called before any stream is set,
-        # we return the default stream.
-        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
-    return _MOE_LOAD_ASYNC_STREAM
-
-
 def adapt_patch(is_global_patch: bool = False):
     if is_global_patch:
         from vllm_ascend.patch import platform  # noqa: F401
