From 0ac8da49080c2dd34fb18a55116ce213b6ae1708 Mon Sep 17 00:00:00 2001
From: offline0806 <3337230449@qq.com>
Date: Sat, 29 Nov 2025 18:05:30 +0800
Subject: [PATCH 1/3] [Bugfix]Fix eplb enable when using mtp float weights.

Signed-off-by: offline0806 <3337230449@qq.com>
---
 vllm_ascend/ops/common_fused_moe.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py
index e33835b5016..8363e0a96d7 100644
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -172,6 +172,9 @@ def __init__(self, *args, **kwargs):
         self.expert_map_path = ascend_config.expert_map_path
         self.global_redundant_expert_num = ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
+        #Todo: init_eplb_enable is a flag of whether enable when using mtp and float weights or
+        # not enable. It will be remove when eplb supporting mtp and float weights
+        init_eplb_enable = False
         if self.custom_routing_function is None and self.e_score_correction_bias is not None:
             vllm_config = get_current_vllm_config()
             self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
@@ -191,6 +194,7 @@ def __init__(self, *args, **kwargs):
                         self.moe_instance_id, self.ep_rank))
                 self.log2phy = self.expert_load_balancer.get_rank_log2phy_map(
                     self.moe_instance_id, self.ep_rank).npu()
+                init_eplb_enable = True
             except Exception as e:
                 logger.warning(
                     f"Init expert map of mtp/eagle when using sample.{e}")
@@ -236,10 +240,10 @@ def __init__(self, *args, **kwargs):
         self.moe_load = torch.zeros(local_num_experts,
                                     dtype=torch.int64).npu()
 
-        eplb_enable = self.dynamic_eplb or (self.expert_map_path is not None)
-        if eplb_enable and (not hasattr(self.quant_method, "quant_method") or
-                            not isinstance(self.quant_method.quant_method,
-                                           AscendW8A8DynamicFusedMoEMethod)):
+        if init_eplb_enable and (
+                not hasattr(self.quant_method, "quant_method")
+                or not isinstance(self.quant_method.quant_method,
+                                  AscendW8A8DynamicFusedMoEMethod)):
             raise ValueError("Eplb supports only w8a8_dynamic quantization.")
 
         self.moe_config.num_experts = self.global_num_experts

From 08d41bc8b67e27adb528bd5db64a6c438f3a3e63 Mon Sep 17 00:00:00 2001
From: offline893 <158537145+offline893@users.noreply.github.com>
Date: Sat, 29 Nov 2025 18:13:19 +0800
Subject: [PATCH 2/3] Update vllm_ascend/ops/common_fused_moe.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: offline893 <158537145+offline893@users.noreply.github.com>
---
 vllm_ascend/ops/common_fused_moe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py
index 8363e0a96d7..bd6c2f76f92 100644
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -172,9 +172,9 @@ def __init__(self, *args, **kwargs):
         self.expert_map_path = ascend_config.expert_map_path
         self.global_redundant_expert_num = ascend_config.init_redundancy_expert
         self.global_num_experts = num_experts + self.global_redundant_expert_num
-        #Todo: init_eplb_enable is a flag of whether enable when using mtp and float weights or
-        # not enable. It will be remove when eplb supporting mtp and float weights
-        init_eplb_enable = False
+        # TODO: Flag for static expert placement. This is a temporary workaround
+        # to allow dynamic EPLB with float weights by skipping quantization checks.
+        self.static_eplb_enabled = False
         if self.custom_routing_function is None and self.e_score_correction_bias is not None:
             vllm_config = get_current_vllm_config()
             self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(

From 70459b2f9dcb70b175e53f3e483024df6b0659d8 Mon Sep 17 00:00:00 2001
From: offline0806 <3337230449@qq.com>
Date: Tue, 2 Dec 2025 09:30:34 +0800
Subject: [PATCH 3/3] [CI]Fix ci.

Signed-off-by: offline0806 <3337230449@qq.com>
---
 vllm_ascend/ops/common_fused_moe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm_ascend/ops/common_fused_moe.py b/vllm_ascend/ops/common_fused_moe.py
index bd6c2f76f92..448b79465a1 100644
--- a/vllm_ascend/ops/common_fused_moe.py
+++ b/vllm_ascend/ops/common_fused_moe.py
@@ -180,6 +180,7 @@ def __init__(self, *args, **kwargs):
             self.e_score_correction_bias.data = self.e_score_correction_bias.data.to(
                 dtype=vllm_config.model_config.dtype)
         # static eplb initializing with expert_map_path
+        init_eplb_enable = False
         if self.expert_map_path and os.path.exists(
                 self.expert_map_path) and os.access(self.expert_map_path,
                                                     os.R_OK):
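
Reviewer note (illustrative, not part of the series): the sketch below condenses the gating logic these patches converge on. The flag is raised only after a readable static expert map actually initializes, and EPLB is then rejected unless the quantization method is w8a8_dynamic, so MTP with float weights no longer trips the check. The names resolve_eplb_enable, check_eplb_quantization, _QuantMethod, and the local AscendW8A8DynamicFusedMoEMethod stand-in are assumptions for illustration only, not the repo's API.

import os


class AscendW8A8DynamicFusedMoEMethod:
    """Stand-in for the real Ascend w8a8_dynamic quant method class."""


class _QuantMethod:
    """Placeholder mimicking the nested `quant_method.quant_method` layout."""

    def __init__(self, inner):
        self.quant_method = inner


def resolve_eplb_enable(expert_map_path):
    """Raise the flag only when a readable static expert map is present."""
    init_eplb_enable = False
    if expert_map_path and os.path.exists(expert_map_path) and os.access(
            expert_map_path, os.R_OK):
        # The real __init__ builds ExpertLoadBalancer / expert_map / log2phy here
        # and flips the flag only after that succeeds, so float weights without a
        # static expert map never reach the quantization guard below.
        init_eplb_enable = True
    return init_eplb_enable


def check_eplb_quantization(eplb_enabled, quant_method):
    """Mirror the patched guard: EPLB requires w8a8_dynamic quantization."""
    if eplb_enabled and (
            not hasattr(quant_method, "quant_method")
            or not isinstance(quant_method.quant_method,
                              AscendW8A8DynamicFusedMoEMethod)):
        raise ValueError("Eplb supports only w8a8_dynamic quantization.")


if __name__ == "__main__":
    # Float weights, no expert map: the flag stays False, so no error is raised.
    check_eplb_quantization(resolve_eplb_enable(None), _QuantMethod(object()))
    # w8a8_dynamic quantization with EPLB enabled: the guard passes as well.
    check_eplb_quantization(True, _QuantMethod(AscendW8A8DynamicFusedMoEMethod()))
    print("EPLB gating checks behaved as expected")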