67 | 67 |
68 | 68 | import vllm_ascend.envs as envs_ascend |
69 | 69 | from vllm_ascend.ascend_config import get_ascend_config |
| 70 | +from vllm_ascend.distributed.parallel_state import get_ep_group |
70 | 71 | from vllm_ascend.ops.fused_moe import AscendFusedMoE |
71 | 72 | from vllm_ascend.quantization.w8a8_dynamic import AscendW8A8DynamicLinearMethod |
72 | 73 | from vllm_ascend.utils import dispose_tensor |
@@ -211,13 +212,15 @@ def __init__( |
211 | 212 |
212 | 213 | self.tp_group = get_tp_group().device_group |
213 | 214 | self.tp_rank = get_tp_group().rank_in_group |
| 215 | + self.ep_group = get_ep_group() |
214 | 216 |
215 | 217 | self.params_dtype = torch.get_default_dtype() |
216 | 218 |
217 | 219 | ascend_config = get_ascend_config() |
218 | 220 | self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled |
| 221 | + # NOTE: multistream is only effective when `VLLM_ENABLE_MC2` is on
219 | 222 | self.enable_multistream_shared_expert = \ |
220 | | - ascend_config.torchair_graph_config.enable_multistream_shared_expert |
| 223 | + ascend_config.torchair_graph_config.enable_multistream_shared_expert and VLLM_ENABLE_MC2 |
221 | 224 |
222 | 225 | def forward( |
223 | 226 | self, |
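
The gating added above means the multistream shared-expert path is only taken when both the torchair graph config flag and the MC2 flag are set. A minimal sketch of that gating, assuming `VLLM_ENABLE_MC2` is a boolean environment switch (the class and helper names below are illustrative, not vllm-ascend APIs):

```python
import os

# Illustrative stand-in for envs_ascend.VLLM_ENABLE_MC2; the exact parsing
# done by vllm_ascend.envs is an assumption here.
VLLM_ENABLE_MC2 = os.environ.get("VLLM_ENABLE_MC2", "0") == "1"


class TorchairGraphConfig:
    """Hypothetical stand-in for ascend_config.torchair_graph_config."""

    def __init__(self, enabled: bool, enable_multistream_shared_expert: bool):
        self.enabled = enabled
        self.enable_multistream_shared_expert = enable_multistream_shared_expert


def resolve_multistream_shared_expert(cfg: TorchairGraphConfig) -> bool:
    # Overlapping the shared expert on a second stream only pays off when
    # the MC2 communication path is active, so both switches must be on.
    return cfg.enable_multistream_shared_expert and VLLM_ENABLE_MC2
```
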
@@ -245,16 +248,12 @@ def forward( |
245 | 248 | old_hidden_states = hidden_states.clone() |
246 | 249 |
247 | 250 | if self.tp_size > 1: |
248 | | - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: |
249 | | - chunks = torch.chunk(hidden_states, self.tp_size, dim=0) |
250 | | - hidden_states = chunks[self.tp_rank] |
251 | | - elif not self.torchair_graph_enabled: |
252 | | - num_padding_tokens = (self.tp_size - |
253 | | - num_tokens % self.tp_size) % self.tp_size |
254 | | - # Pad hidden_states to make it divisible by tp_size to avoid cross-ring AllGatherV on 910B2C |
255 | | - if num_padding_tokens > 0: |
| 251 | + if (VLLM_ENABLE_MC2 |
| 252 | + and not is_prefill) or not (self.torchair_graph_enabled or |
| 253 | + self.ep_group.world_size == 1): |
| 254 | + if num_tokens < self.tp_size: |
256 | 255 | hidden_states = nn.functional.pad( |
257 | | - hidden_states, (0, 0, 0, num_padding_tokens)) |
| 256 | + hidden_states, (0, 0, 0, self.tp_size - num_tokens)) |
258 | 257 | chunk_hidden_states = torch.tensor_split(hidden_states, |
259 | 258 | self.tp_size, |
260 | 259 | dim=0) |
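
With this change the decode path no longer pads `num_tokens` up to a multiple of `tp_size`: `torch.tensor_split` already copes with uneven sizes, so padding is only required when there are fewer tokens than TP ranks (otherwise some ranks would get an empty chunk). A standalone sketch under that assumption; `split_for_tp` is a hypothetical helper, not a function in this diff:

```python
import torch
import torch.nn as nn


def split_for_tp(hidden_states: torch.Tensor, tp_size: int, tp_rank: int):
    """Hypothetical standalone version of the split above."""
    num_tokens = hidden_states.size(0)
    # tensor_split tolerates sizes that are not divisible by tp_size, so we
    # only pad when there are fewer tokens than ranks.
    if num_tokens < tp_size:
        hidden_states = nn.functional.pad(
            hidden_states, (0, 0, 0, tp_size - num_tokens))
    chunk_hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)
    return chunk_hidden_states, chunk_hidden_states[tp_rank]


chunks, local = split_for_tp(torch.randn(3, 8), tp_size=4, tp_rank=1)
print([tuple(c.shape) for c in chunks])  # four (1, 8) chunks after padding
```
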
@@ -284,24 +283,16 @@ def forward( |
284 | 283 | hidden_states = hidden_states * self.routed_scaling_factor |
285 | 284 |
286 | 285 | if self.tp_size > 1: |
287 | | - if self.torchair_graph_enabled: |
288 | | - if envs_ascend.VLLM_ENABLE_MC2 and not is_prefill: |
289 | | - final_hidden_states = torch.zeros( |
290 | | - [num_tokens, hidden_size], |
291 | | - dtype=self.params_dtype, |
292 | | - device="npu") |
293 | | - dist.all_gather_into_tensor(final_hidden_states, |
294 | | - hidden_states, self.tp_group) |
295 | | - hidden_states = final_hidden_states |
296 | | - else: |
297 | | - hidden_states = tensor_model_parallel_all_reduce( |
298 | | - hidden_states) |
299 | | - else: |
| 286 | + if (VLLM_ENABLE_MC2 |
| 287 | + and not is_prefill) or not (self.torchair_graph_enabled or |
| 288 | + self.ep_group.world_size == 1): |
300 | 289 | dist.all_gather(list(chunk_hidden_states), hidden_states, |
301 | 290 | self.tp_group) |
302 | 291 | hidden_states = torch.cat(chunk_hidden_states, dim=0) |
303 | | - if num_padding_tokens > 0: |
304 | | - hidden_states = hidden_states[:-num_padding_tokens] |
| 292 | + if num_tokens < self.tp_size: |
| 293 | + hidden_states = hidden_states[:num_tokens] |
| 294 | + else: |
| 295 | + hidden_states = tensor_model_parallel_all_reduce(hidden_states) |
305 | 296 |
306 | 297 | if self.n_shared_experts is not None: |
307 | 298 | if not multistream: |
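
On the way back, the chunked branch gathers every rank's slice, concatenates, and trims any rows that were added by the pad; the other branch keeps the plain `tensor_model_parallel_all_reduce`. A hedged sketch of the gather-and-trim step, assuming an already initialised `torch.distributed` process group (`gather_and_trim` is illustrative, not part of the diff):

```python
import torch
import torch.distributed as dist


def gather_and_trim(local_chunk: torch.Tensor,
                    chunk_hidden_states,
                    num_tokens: int,
                    tp_group) -> torch.Tensor:
    """Hypothetical standalone version of the re-assembly above."""
    # Every rank writes its processed slice into the pre-shaped buffers
    # produced by tensor_split on the way in.
    dist.all_gather(list(chunk_hidden_states), local_chunk, group=tp_group)
    hidden_states = torch.cat(list(chunk_hidden_states), dim=0)
    # Drop the rows appended when num_tokens < tp_size forced padding.
    if num_tokens < hidden_states.size(0):
        hidden_states = hidden_states[:num_tokens]
    return hidden_states
```
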