Commit 60c5bb3

top9
2 parents d628411 + 29cbae3 commit 60c5bb3

File tree

6 files changed: +368 -18 lines changed

vllm_ascend/ascend_config.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ class AscendConfig:
 
     def __init__(self, vllm_config):
         additional_config = vllm_config.additional_config if vllm_config.additional_config is not None else {}
+        self.mix_placement = additional_config.get("mix_placement",False)
         torchair_graph_config = additional_config.get("torchair_graph_config",
                                                       {})
 
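The new flag lands in AscendConfig via vLLM's additional_config, so it would presumably be switched on from the engine arguments. A minimal usage sketch, assuming the usual vllm-ascend pattern of passing plugin options through additional_config (the model name is only a placeholder, not part of this commit):

# Usage sketch (assumption, not from this commit): enable the new flag
# through vLLM's additional_config; the model name is a placeholder.
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",            # placeholder model
    additional_config={"mix_placement": True},  # read by AscendConfig.__init__ above
)
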
vllm_ascend/ops/fused_moe/experts_selector.py

Lines changed: 15 additions & 0 deletions
@@ -33,6 +33,8 @@ def select_experts(hidden_states: torch.Tensor,
                    routed_scaling_factor=1.0,
                    e_score_correction_bias: Optional[torch.Tensor] = None,
                    indices_type: Optional[torch.dtype] = None,
+                   mix_placement: Optional[bool] = False,
+                   num_logical_experts: int = -1,
                    global_num_experts: int = -1):
     """
     Fused experts with select experts.
@@ -95,6 +97,19 @@ def select_experts(hidden_states: torch.Tensor,
             e_score_correction_bias=e_score_correction_bias,
             global_num_experts=global_num_experts,
         )
+    if mix_placement:
+        pad_shared_expert_ids = torch.full((topk_ids.shape[0], 1),
+                                           num_logical_experts,
+                                           dtype=topk_ids.dtype,
+                                           device=topk_ids.device)
+
+        pad_shared_expert_weights = torch.full((topk_weights.shape[0], 1),
+                                               0.4,
+                                               dtype=topk_weights.dtype,
+                                               device=topk_weights.device)
+        topk_ids = torch.cat([topk_ids, pad_shared_expert_ids], dim=1)
+        topk_weights = torch.cat([topk_weights, pad_shared_expert_weights],
+                                 dim=1)
     return topk_weights, topk_ids
 
 
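To see what the mix_placement branch above does in isolation, here is a small self-contained sketch on dummy tensors. The token count, top-k, and expert count are made up; the fixed 0.4 pad weight and the num_logical_experts pad id mirror the diff:

# Self-contained sketch of the padding step: append one extra "shared expert"
# column to every token's top-k ids/weights. All tensor values are dummies.
import torch

num_tokens, top_k = 4, 8
num_logical_experts = 256   # id reserved for the padded shared-expert slot

topk_ids = torch.randint(0, num_logical_experts, (num_tokens, top_k), dtype=torch.int32)
topk_weights = torch.rand(num_tokens, top_k)

pad_ids = torch.full((num_tokens, 1), num_logical_experts, dtype=topk_ids.dtype)
pad_weights = torch.full((num_tokens, 1), 0.4, dtype=topk_weights.dtype)

topk_ids = torch.cat([topk_ids, pad_ids], dim=1)              # shape becomes (4, 9)
topk_weights = torch.cat([topk_weights, pad_weights], dim=1)  # shape becomes (4, 9)
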
vllm_ascend/ops/fused_moe/fused_moe.py

Lines changed: 32 additions & 18 deletions
@@ -172,10 +172,10 @@ def __init__(self, *args, **kwargs):
         self.moe_config.dp_group = get_dp_group()
         self.moe_config.ep_group = get_ep_group()
         self.moe_config.mc2_group = get_mc2_group()
-        ascend_config = get_ascend_config()
-        self.dynamic_eplb = ascend_config.dynamic_eplb or ascend_config.expert_map_record_path
-        self.expert_map_path = ascend_config.expert_map_path
-        self.global_redundant_expert_num = ascend_config.init_redundancy_expert
+        self.ascend_config = get_ascend_config()
+        self.dynamic_eplb = self.ascend_config.dynamic_eplb or self.ascend_config.expert_map_record_path
+        self.expert_map_path = self.ascend_config.expert_map_path
+        self.global_redundant_expert_num = self.ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
         if self.custom_routing_function is None and self.e_score_correction_bias is not None:
             vllm_config = get_current_vllm_config()
@@ -195,8 +195,8 @@ def __init__(self, *args, **kwargs):
             self.expert_load_balancer = ExpertLoadBalancer(
                 self.expert_map_path, num_experts)
             self.expert_load_balancer.check_expert_map_tensor()
-            self.global_redundant_expert_num = (
-                self.expert_load_balancer.get_global_redundant_expert_num())
+            # self.global_redundant_expert_num = (
+            #     self.expert_load_balancer.get_global_redundant_expert_num())
             self.global_num_experts = num_experts + self.global_redundant_expert_num
             try:
                 self.local_num_experts, self.expert_map = (
@@ -254,7 +254,7 @@ def __init__(self, *args, **kwargs):
             moe_quant_params["intermediate_size_full"] = intermediate_size
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
-        self.enable_shared_expert_dp = ascend_config.enable_shared_expert_dp
+        self.enable_shared_expert_dp = self.ascend_config.enable_shared_expert_dp
 
         setup_moe_comm_method(self.moe_config)
         self.quant_type = self._get_quant_type()
@@ -460,8 +460,8 @@ def __init__(
         self._shared_experts = shared_experts
         self.use_overlapped = use_overlapped
         self.shared_expert_stream = None
-        ascend_config = get_ascend_config()
-        self.multistream_overlap_shared_expert = ascend_config.multistream_overlap_shared_expert
+        self.ascend_config = get_ascend_config()
+        self.multistream_overlap_shared_expert = self.ascend_config.multistream_overlap_shared_expert
         if enable_sp():
             logger.info_once(
                 "Sequence parallelism is enabled, shared experts are replicated for best performance."
@@ -489,11 +489,19 @@ def forward(
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        shared_out, fused_out = AscendFusedMoE.forward(
-            self,
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-        )
+        if self._shared_experts is None:
+            fused_out = AscendFusedMoE.forward(
+                self,
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+            shared_out = None
+        else:
+            shared_out, fused_out = AscendFusedMoE.forward(
+                self,
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
         return shared_out, fused_out
 
     def forward_impl(self, hidden_states: torch.Tensor,
@@ -507,7 +515,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
             # Use a separate stream to run shared experts.
            # Note that currently we only support calculations in separate streams with aclgraph.
            # Communication operations in another stream might cause unknown errors.
-            shared_out = self._shared_experts(hidden_states)
+            if self._shared_experts is None:
+                shared_out = None
+            else:
+                shared_out = self._shared_experts(hidden_states)
 
         fused_output = AscendFusedMoE.forward_impl(
             self,
@@ -521,7 +532,10 @@ def forward_impl(self, hidden_states: torch.Tensor,
         # NOTE: This is exactly the opposite of `maybe_all_reduce_tensor_model_parallel`
         forward_context = get_forward_context()
         moe_comm_type = forward_context.moe_comm_type
-        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2, MoECommType.FUSED_ALLTOALL} \
-                and not shared_expert_dp_enabled():
+        if moe_comm_type in {MoECommType.ALLTOALL, MoECommType.MC2} \
+                and not shared_expert_dp_enabled() and shared_out is not None:
             shared_out = tensor_model_parallel_all_reduce(shared_out)
-        return shared_out, fused_output
+        if shared_out is None:
+            return fused_output
+        else:
+            return shared_out, fused_output
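Note that after the last hunk, forward_impl returns either a (shared_out, fused_output) tuple or a bare fused_output tensor when no shared experts are configured, while forward keeps the tuple shape but with shared_out set to None. A call-site sketch of the resulting contract; the variable names and the way the two outputs are combined are placeholders, not code from this commit:

# Hypothetical caller handling both return shapes of forward_impl;
# moe_layer, hidden_states, and router_logits are placeholders.
out = moe_layer.forward_impl(hidden_states, router_logits)
if isinstance(out, tuple):
    shared_out, fused_out = out     # shared experts present
    final = shared_out + fused_out  # placeholder combination, model-specific in practice
else:
    final = out                     # no shared experts: routed-expert output only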

vllm_ascend/ops/fused_moe/moe_mlp.py

Lines changed: 1 addition & 0 deletions
@@ -299,3 +299,4 @@ def unified_apply_mlp(hidden_states: torch.Tensor,
                                   group_list_type=group_list_type,
                                   topk_scales=topk_scales,
                                   need_trans=need_trans)
+

vllm_ascend/patch/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -138,3 +138,4 @@
 # Future Plan:
 # Remove this patch when adapted vllm version contains the above PR.
 #
+from vllm_ascend.patch.worker import patch_deepseekv3
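For context, the vllm_ascend.patch modules apply their changes as a side effect of being imported, which is why the package __init__ only needs the bare import above. The general shape of such a module, as a rough sketch; the TargetModel class below is a stand-in defined for the example, not real vLLM or patch_deepseekv3 code:

# Illustrative import-time monkey-patch pattern; TargetModel is a stand-in
# defined here so the sketch runs on its own.


class TargetModel:
    def forward(self, x):
        return x


_original_forward = TargetModel.forward


def _patched_forward(self, x):
    # plugin-specific adjustment would go here, then delegate to the original
    return _original_forward(self, x)


# Applied when this module is imported.
TargetModel.forward = _patched_forward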
