diff --git a/vllm_ascend/quantization/w4a8_dynamic.py b/vllm_ascend/quantization/w4a8_dynamic.py
index 11dc97d51e2..48ade692ec6 100644
--- a/vllm_ascend/quantization/w4a8_dynamic.py
+++ b/vllm_ascend/quantization/w4a8_dynamic.py
@@ -27,7 +27,7 @@
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ
 
 
 class AscendW4A8DynamicLinearMethod:
@@ -482,10 +482,9 @@ def process_weights_after_loading(self, layer):
         self.update_bias(layer, w13_bias, w2_bias)
 
-        if is_enable_nz():
-            layer.w13_weight.data = torch_npu.npu_format_cast(
-                layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
-            layer.w2_weight.data = torch_npu.npu_format_cast(
-                layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
 
         layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
         layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
diff --git a/vllm_ascend/quantization/w8a8.py b/vllm_ascend/quantization/w8a8.py
index 9df640c1893..cf5b1912c8e 100644
--- a/vllm_ascend/quantization/w8a8.py
+++ b/vllm_ascend/quantization/w8a8.py
@@ -347,7 +347,7 @@ def process_weights_after_loading(self, layer):
         # converting ACL_FORMAT_FRACTAL_NZ.
         # npu_quant_grouped_matmul_dequant in eager mode does not accept
         # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if not is_310p():
             layer.w13_weight.data = torch_npu.npu_format_cast(
                 layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
             layer.w2_weight.data = torch_npu.npu_format_cast(
diff --git a/vllm_ascend/quantization/w8a8_dynamic.py b/vllm_ascend/quantization/w8a8_dynamic.py
index 709f4884eb1..31b9ea912f5 100644
--- a/vllm_ascend/quantization/w8a8_dynamic.py
+++ b/vllm_ascend/quantization/w8a8_dynamic.py
@@ -270,9 +270,8 @@ def process_weights_after_loading(self, layer):
             1, 2).contiguous()
         layer.w2_weight.data = layer.w2_weight.data.transpose(
             1, 2).contiguous()
-        if is_enable_nz():
-            torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
-            torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
         layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
             layer.w13_weight_scale.data.shape[0], -1)
         layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 0d128eb8aad..20b89107a4d 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -81,10 +81,9 @@ def is_enable_nz(dtype: Optional[torch.dtype] = torch.int8,
                 "vllm_config must be provided when _ENABLE_NZ is None")
         _ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"
-        _IS_EAGLE_MODE = (
-            vllm_config.speculative_config is not None and
-            getattr(vllm_config.speculative_config, 'method', None) in ("eagle", "eagle3")
-        )
+        _IS_EAGLE_MODE = (vllm_config.speculative_config is not None
+                          and getattr(vllm_config.speculative_config, 'method',
+                                      None) in ("eagle", "eagle3"))
     if dtype in [torch.float16, torch.bfloat16]:
         return _ENABLE_NZ if _IS_EAGLE_MODE else False