32 changes: 17 additions & 15 deletions vllm_ascend/eplb/eplb_updator.py
@@ -23,7 +23,8 @@

from vllm_ascend.eplb.core.eplb_utils import EPLBParamUtils
from vllm_ascend.eplb.core.eplb_worker import EplbProcess

from vllm_ascend.utils import (npu_stream_switch,
                               moe_load_async_stream)

class EplbUpdator:

@@ -152,20 +153,21 @@ def compute_and_set_moe_load(self, is_clear=False):

        self._gather_buffer = None
        if dist.is_initialized():
            self.world_size = dist.get_world_size()
            self.device = local_load.device
            if self._gather_buffer is None:
                shape = (self.world_size, *local_load.shape)
                self._gather_buffer = torch.empty(shape,
                                                  dtype=local_load.dtype,
                                                  device=self.device)

            dist.all_gather_into_tensor(self._gather_buffer, local_load)

            moe_load = self._gather_buffer.permute(1, 0, 2)
            self.shared_dict["moe_load"] = moe_load.cpu()
            logger.debug(
                f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
            with npu_stream_switch(moe_load_async_stream()):
Contributor:

It might be better to set moe_load_async_stream as a class attribute of EplbUpdator.

Contributor (author):

This function has already been moved to the eplb module; since other files will also call this stream, moving it to the eplb utils is better.
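For reference, the class-attribute alternative the reviewer suggests would look roughly like the sketch below (illustrative only; the PR instead keeps a module-level accessor, moe_load_async_stream, so that fused_moe.py can obtain the same stream without holding an EplbUpdator instance):

import torch_npu


class EplbUpdator:
    # Shared across instances; created lazily on first use.
    _moe_load_async_stream = None

    @classmethod
    def moe_load_async_stream(cls) -> "torch_npu.npu.Stream":
        if cls._moe_load_async_stream is None:
            cls._moe_load_async_stream = torch_npu.npu.Stream()
        return cls._moe_load_async_stream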

                self.world_size = dist.get_world_size()
                self.device = local_load.device
                if self._gather_buffer is None:
                    shape = (self.world_size, *local_load.shape)
                    self._gather_buffer = torch.empty(shape,
                                                      dtype=local_load.dtype,
                                                      device=self.device)
Contributor:

high

self._gather_buffer is reset to None on every call to compute_and_set_moe_load (on line 154). This makes this condition always true, causing the buffer to be re-allocated on every invocation, which is inefficient. To avoid this performance issue, self._gather_buffer should be initialized to None in the __init__ method of the EplbUpdator class, and the line self._gather_buffer = None should be removed from this method.
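A minimal sketch of the suggested change (the signature is simplified for illustration; in the real class, local_load comes from self.adaptor.get_rank_expert_workload() rather than a parameter, and the buffer shape is assumed constant across calls):

import torch
import torch.distributed as dist


class EplbUpdator:

    def __init__(self):
        # Allocate lazily and reuse across calls instead of re-creating
        # the gather buffer on every invocation.
        self._gather_buffer = None

    def compute_and_set_moe_load(self, local_load: torch.Tensor):
        # Note: no `self._gather_buffer = None` here anymore.
        if dist.is_initialized():
            if self._gather_buffer is None:
                shape = (dist.get_world_size(), *local_load.shape)
                self._gather_buffer = torch.empty(shape,
                                                  dtype=local_load.dtype,
                                                  device=local_load.device)
            dist.all_gather_into_tensor(self._gather_buffer, local_load)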


                dist.all_gather_into_tensor(self._gather_buffer, local_load)

                moe_load = self._gather_buffer.permute(1, 0, 2)
                self.shared_dict["moe_load"] = moe_load.cpu()
                logger.debug(
                    f"[ModelRunner] Updated shared_dict['moe_load'] shape={moe_load.shape}"
                )
Contributor:

critical

There is a race condition here. The moe_load tensors are updated asynchronously on moe_load_async_stream in fused_moe.py. However, self.adaptor.get_rank_expert_workload() is called on the default stream (on line 152, before this block) to read these tensors without any synchronization. This can lead to reading stale or incomplete data, causing incorrect load balancing. To fix this, you must synchronize the streams before reading moe_load. For example, you could add moe_load_async_stream().synchronize() before the call to self.adaptor.get_rank_expert_workload() on line 152.
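A minimal sketch of the fix the comment proposes, assuming get_rank_expert_workload() returns the tensors that fused_moe.py accumulates on the side stream (shown as a fragment of the method):

from vllm_ascend.utils import moe_load_async_stream

def compute_and_set_moe_load(self, is_clear=False):
    # Drain any moe_load accumulation still pending on the side stream
    # before the default stream reads the tensors.
    moe_load_async_stream().synchronize()
    local_load = self.adaptor.get_rank_expert_workload()
    ...

A device-side alternative with less host blocking would typically be torch.npu.current_stream().wait_stream(moe_load_async_stream()), which only orders subsequent default-stream work after the side stream instead of blocking the CPU.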

        else:
            moe_load = local_load.unsqueeze(1)
7 changes: 5 additions & 2 deletions vllm_ascend/ops/fused_moe/fused_moe.py
@@ -46,6 +46,7 @@
                               is_enable_nz, npu_stream_switch,
                               shared_expert_dp_enabled,
                               shared_experts_calculation_stream,
                               moe_load_async_stream,
                               vllm_version_is)

if vllm_version_is("0.11.0"):
@@ -392,8 +393,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
        if isinstance(final_hidden_states, tuple):
            final_hidden_states, group_list_type, expert_tokens = final_hidden_states
            if self.dynamic_eplb:
                self.moe_load += expert_tokens if group_list_type == 1 else \
                    torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])
                with npu_stream_switch(moe_load_async_stream()):
                    moe_load_async_stream().wait_stream(torch.npu.current_stream(device=expert_tokens.device))
                    self.moe_load += expert_tokens if group_list_type == 1 else \
                        torch.cat([expert_tokens[:1], expert_tokens[1:] - expert_tokens[:-1]])

        final_hidden_states = forward_context.moe_comm_method.finalize(
            hidden_states=final_hidden_states,
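For context on the added lines: the moe_load accumulation is moved onto a dedicated side stream so it can overlap with the rest of forward_impl, and the wait_stream() call orders the side stream after the current stream so that expert_tokens is fully produced before it is read. A generic sketch of this producer-side pattern, using the npu_stream_switch helper the diff already imports (the accumulate_async name is illustrative only):

import torch
import torch_npu  # noqa: F401  # provides the torch.npu namespace

from vllm_ascend.utils import npu_stream_switch


def accumulate_async(acc: torch.Tensor, update: torch.Tensor, side_stream) -> None:
    # Order the side stream after the producer: work enqueued on the side
    # stream below will not start before the kernels already queued on the
    # current stream (which produce `update`) have finished.
    side_stream.wait_stream(torch.npu.current_stream(device=update.device))
    with npu_stream_switch(side_stream):
        # Runs on the side stream and can overlap with whatever
        # forward_impl launches next on the default stream.
        acc += update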
10 changes: 10 additions & 0 deletions vllm_ascend/utils.py
@@ -53,6 +53,7 @@
_CURRENT_STREAM = None
_PREFETCH_STREAM = None
_SHARED_EXPERTS_CALCULATION_STREAM = None
_MOE_LOAD_ASYNC_STREAM = None
_ASCEND_CUSTOMOP_IS_REIGISTERED = False
_DEFAULT_BUFFER_SIZE = 200
_MIN_DP_BUFFER_SIZE = 50
@@ -336,6 +337,15 @@ def shared_experts_calculation_stream() -> torch.npu.Stream:
    return _SHARED_EXPERTS_CALCULATION_STREAM


def moe_load_async_stream() -> torch_npu.npu.Stream:
Collaborator:

Move this function to the eplb module.

    global _MOE_LOAD_ASYNC_STREAM
    if _MOE_LOAD_ASYNC_STREAM is None:
        # Lazily create a dedicated stream for asynchronous MoE load
        # accumulation; subsequent calls return the same cached stream.
        _MOE_LOAD_ASYNC_STREAM = torch_npu.npu.Stream()
    return _MOE_LOAD_ASYNC_STREAM
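One note on the caching above: because the stream is created once and stored in a module-level global, every caller receives the same Stream object, so the producer in fused_moe.py and the consumer in eplb_updator.py order their work on one shared stream. A small illustration:

from vllm_ascend.utils import moe_load_async_stream

# Repeated calls return the identical cached stream, so wait_stream() /
# synchronize() issued at one call site applies to work submitted by another.
assert moe_load_async_stream() is moe_load_async_stream()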


def adapt_patch(is_global_patch: bool = False):
    if is_global_patch:
        from vllm_ascend.patch import platform  # noqa: F401