
Commit 08401b3

845473182, 白永斌, gemini-code-assist[bot], and wangxiyuan authored and committed
[Bugfix] fix quant_apply_mlp w1_scale type error & fix getting num_local_expert (vllm-project#4632)
### What this PR does / why we need it?

Fix bugs introduced by vllm-project@bc67696:

1. Fix the error in getting num_local_expert in vllm_adaptor.
2. Fix the w1_scale type error passed to npu_dequant_swiglu_quant in moe_mlp.quant_apply_mlp in the w4a8 quantized scenario.

- vLLM version: v0.12.0

---------

Signed-off-by: 白永斌 <[email protected]>
Signed-off-by: 欧派果奶我还要 <[email protected]>
Co-authored-by: 白永斌 <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: wangxiyuan <[email protected]>
1 parent: 2d08a8e · commit: 08401b3

File tree: 3 files changed, 4 additions & 4 deletions


vllm_ascend/eplb/adaptor/vllm_adaptor.py

Lines changed: 2 additions & 2 deletions
@@ -107,8 +107,8 @@ def init_buffer_tensor(self, num_buffer_tensor):
             self.buffer_tensor_list[buffer_id].append(buffer_tensor)
 
     def init_expert_param_per_layer(self):
-        num_local_expert = self.param_dict["model.layers." + str(self.num_dense_layers) + \
-            ".mlp.experts." + self.expert_weight_names[0]].data.shape[0]
+        key = f"model.layers.{self.num_dense_layers}.mlp.experts.{self.expert_weight_names[0]}"
+        num_local_expert = len(self.param_dict[key])
         for moe_layer_id in range(self.num_moe_layers):
             layer_idx = self.num_dense_layers + moe_layer_id
             self.expert_param_per_layer[layer_idx] = list()
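For context, a minimal sketch of why the replacement one-liner works; the dictionary contents and shapes below are hypothetical, not taken from the repository. `len()` on a `torch.Tensor` returns its size along dim 0, so `len(self.param_dict[key])` recovers the same expert count the old `.data.shape[0]` lookup computed, while the f-string avoids the fragile backslash-continued string concatenation.

```python
import torch

# Hypothetical stand-in for self.param_dict: parameter name -> fused tensor
# whose dim 0 stacks the experts local to this rank (shapes are made up).
param_dict = {
    "model.layers.3.mlp.experts.w13_weight": torch.empty(8, 256, 512),
}
num_dense_layers = 3
expert_weight_names = ["w13_weight"]

# Same key the fixed code builds with an f-string.
key = f"model.layers.{num_dense_layers}.mlp.experts.{expert_weight_names[0]}"

# len() of a torch.Tensor is its size along dim 0, i.e. the number of
# local experts stacked into the fused weight.
num_local_expert = len(param_dict[key])
assert num_local_expert == param_dict[key].shape[0] == 8
```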

vllm_ascend/ops/fused_moe/moe_mlp.py

Lines changed: 1 addition & 1 deletion
@@ -129,7 +129,7 @@ def quant_apply_mlp(hidden_states: torch.Tensor,
     # act_fn: swiglu
     hidden_states, swiglu_out_scale = torch_npu.npu_dequant_swiglu_quant(
         x=hidden_states,
-        weight_scale=w1_scale,
+        weight_scale=w1_scale[0],
         activation_scale=pertoken_scale,
         bias=None,
         quant_scale=None,
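A hedged illustration of the type mismatch this hunk fixes. The list layout of `w1_scale` and the shapes are assumptions for the sketch, and the real `npu_dequant_swiglu_quant` is stubbed out since it requires NPU hardware: in the w4a8 path `w1_scale` evidently arrives as a container of per-component scale tensors, while the kernel's `weight_scale` argument expects a single `torch.Tensor`, so indexing `[0]` selects the tensor the op can consume.

```python
import torch

# Assumption for this sketch: in the w4a8 path, w1_scale is a sequence of
# scale tensors rather than a single tensor (shapes are made up).
w1_scale = [torch.ones(8, 1024), torch.ones(8, 1024)]

def npu_dequant_swiglu_quant_stub(weight_scale: torch.Tensor) -> None:
    # Stand-in for the real NPU op: it rejects anything but a Tensor.
    if not isinstance(weight_scale, torch.Tensor):
        raise TypeError(f"expected Tensor, got {type(weight_scale).__name__}")

# Passing the whole list reproduces the reported type error...
try:
    npu_dequant_swiglu_quant_stub(weight_scale=w1_scale)
except TypeError as e:
    print(e)  # expected Tensor, got list

# ...while indexing the first element, as the fix does, passes a Tensor.
npu_dequant_swiglu_quant_stub(weight_scale=w1_scale[0])
```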

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 1 addition & 1 deletion
@@ -289,7 +289,7 @@ def process_weights_after_loading(self, layer):
         ]
         layer.w13_weight_scale_fp32_list = [
             weight.clone()
-            for weight in layer.w13_weight_scale.data.unbind(dim=0)
+            for weight in layer.w13_weight_scale_fp32.data.unbind(dim=0)
         ]
         layer.w2_weight_scale_list = [
             weight.clone()
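This hunk fixes what looks like a copy-paste slip: the fp32 list was being built from the non-fp32 scale tensor. A small sketch of the `unbind(dim=0)` pattern the comprehension relies on (the tensor shape here is hypothetical):

```python
import torch

# Hypothetical fused scale: one fp32 scale row per local expert, stacked
# along dim 0.
w13_weight_scale_fp32 = torch.rand(4, 1024, dtype=torch.float32)

# unbind(dim=0) yields one view per expert; clone() gives each list entry
# its own storage, mirroring the fixed comprehension.
w13_weight_scale_fp32_list = [
    w.clone() for w in w13_weight_scale_fp32.unbind(dim=0)
]
assert len(w13_weight_scale_fp32_list) == 4
assert torch.equal(w13_weight_scale_fp32_list[0], w13_weight_scale_fp32[0])
```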
