11 changes: 5 additions & 6 deletions vllm_ascend/quantization/w4a8_dynamic.py
@@ -27,7 +27,7 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.ops.moe.experts_selector import select_experts
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ


class AscendW4A8DynamicLinearMethod:
@@ -482,10 +482,9 @@ def process_weights_after_loading(self, layer):

        self.update_bias(layer, w13_bias, w2_bias)

-        if is_enable_nz():
-            layer.w13_weight.data = torch_npu.npu_format_cast(
-                layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
-            layer.w2_weight.data = torch_npu.npu_format_cast(
-                layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w13_weight.data = torch_npu.npu_format_cast(
+            layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
+        layer.w2_weight.data = torch_npu.npu_format_cast(
+            layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)
Comment on lines +485 to +488
Contributor

critical

This change makes the ACL_FORMAT_FRACTAL_NZ conversion unconditional. However, the removed is_enable_nz() check contained specific logic to disable this conversion for the qwen3_next model, which could be for compatibility reasons. Applying this format conversion to qwen3_next may cause a regression if the model does not support it for W4A8 dynamic quantization. If this model-specific limitation still exists, the check should be preserved, perhaps without the dependency on the VLLM_ASCEND_ENABLE_NZ environment variable.
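One way to preserve the exception without the environment-variable dependency is a model-specific guard. A minimal sketch of that suggestion, with a hypothetical helper name (model_supports_nz) and assuming the model type is threaded through from vllm_config.model_config.hf_config.model_type as seen in utils.py below:

def model_supports_nz(model_type: str) -> bool:
    # Hypothetical helper: qwen3_next is the model the removed check special-cased.
    return model_type != "qwen3_next"

# At the cast site, guard on the model instead of dropping the check entirely:
# if model_supports_nz(model_type):
#     layer.w13_weight.data = torch_npu.npu_format_cast(
#         layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ)
#     layer.w2_weight.data = torch_npu.npu_format_cast(
#         layer.w2_weight.data, ACL_FORMAT_FRACTAL_NZ)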

        layer.w13_weight.data = self.pack_to_int32(layer.w13_weight.data)
        layer.w2_weight.data = self.pack_to_int32(layer.w2_weight.data)
2 changes: 1 addition & 1 deletion vllm_ascend/quantization/w8a8.py
@@ -347,7 +347,7 @@ def process_weights_after_loading(self, layer):
        # converting ACL_FORMAT_FRACTAL_NZ.
        # npu_quant_grouped_matmul_dequant in eager mode does not accept
        # ACL_FORMAT_FRACTAL_NZ.
-        if not is_310p() and is_enable_nz():
+        if not is_310p():
Contributor

critical

Removing the is_enable_nz() check from this condition makes the NZ format conversion unconditional for non-310p devices. The is_enable_nz() function contained an exception to disable NZ format for the qwen3_next model. This change will now apply the conversion to qwen3_next, which could cause a regression if this model does not support NZ format for W8A8 quantization. If the qwen3_next model still has this incompatibility, the check should be restored.
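With the hypothetical model_supports_nz helper sketched in the first comment, the restored condition might read:

# if not is_310p() and model_supports_nz(model_type):
#     layer.w13_weight.data = torch_npu.npu_format_cast(
#         layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()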

            layer.w13_weight.data = torch_npu.npu_format_cast(
                layer.w13_weight.data, ACL_FORMAT_FRACTAL_NZ).contiguous()
            layer.w2_weight.data = torch_npu.npu_format_cast(
5 changes: 2 additions & 3 deletions vllm_ascend/quantization/w8a8_dynamic.py
@@ -270,9 +270,8 @@ def process_weights_after_loading(self, layer):
                1, 2).contiguous()
            layer.w2_weight.data = layer.w2_weight.data.transpose(
                1, 2).contiguous()
-        if is_enable_nz():
-            torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
-            torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
+        torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)
Comment on lines +273 to +274
Contributor

critical

This change makes the ACL_FORMAT_FRACTAL_NZ conversion unconditional. The removed is_enable_nz() check included logic to disable this conversion for the qwen3_next model. This could lead to a regression for qwen3_next if it does not support the NZ format for W8A8 dynamic quantization. If qwen3_next still has this limitation, the model-specific check should be preserved.
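The same guard would apply to the in-place casts here, e.g. (reusing the hypothetical helper from the first comment):

# if model_supports_nz(model_type):
#     torch_npu.npu_format_cast_(layer.w13_weight, ACL_FORMAT_FRACTAL_NZ)
#     torch_npu.npu_format_cast_(layer.w2_weight, ACL_FORMAT_FRACTAL_NZ)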

        layer.w13_weight_scale.data = layer.w13_weight_scale.data.view(
            layer.w13_weight_scale.data.shape[0], -1)
        layer.w13_weight_scale_fp32 = layer.w13_weight_scale.data.to(
7 changes: 3 additions & 4 deletions vllm_ascend/utils.py
@@ -81,10 +81,9 @@ def is_enable_nz(dtype: Optional[torch.dtype] = torch.int8,
"vllm_config must be provided when _ENABLE_NZ is None")
_ENABLE_NZ = envs_ascend.VLLM_ASCEND_ENABLE_NZ and vllm_config.model_config.hf_config.model_type != "qwen3_next"

_IS_EAGLE_MODE = (
vllm_config.speculative_config is not None and
getattr(vllm_config.speculative_config, 'method', None) in ("eagle", "eagle3")
)
_IS_EAGLE_MODE = (vllm_config.speculative_config is not None
and getattr(vllm_config.speculative_config, 'method',
None) in ("eagle", "eagle3"))

if dtype in [torch.float16, torch.bfloat16]:
return _ENABLE_NZ if _IS_EAGLE_MODE else False
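For reference, the logic visible in this hunk can be restated as a condensed, self-contained sketch (hypothetical function and parameter names; the non-float dtype path is truncated in this diff and therefore elided):

def is_enable_nz_sketch(enable_nz_env: bool, model_type: str,
                        spec_method, is_float16_or_bf16: bool):
    # NZ is opted in via VLLM_ASCEND_ENABLE_NZ, with qwen3_next excluded.
    enable_nz = enable_nz_env and model_type != "qwen3_next"
    # eagle/eagle3 speculative decoding is detected from speculative_config.method.
    is_eagle = spec_method in ("eagle", "eagle3")
    if is_float16_or_bf16:
        # fp16/bf16 weights only use NZ under eagle-style speculative decoding.
        return enable_nz if is_eagle else False
    ...  # the int8 path is not shown in this hunk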