[BugFix] Fix mlapo accuracy problem related to weight processing. (#3857)

whx-sjtu · web-flow · commit 211d4b9da4c9 · 2025-10-30T00:35:50.000+08:00
This PR fixes a mlapo accuracy problem related to weight processing.
It also modifies the mlapo-related e2e test to use a quantized deepseek
model so that the test is effective.

Signed-off-by: whx-sjtu <2952154980@qq.com>
diff --git a/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py b/tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
@@ -111,19 +111,3 @@ def test_mtp2_correctness_full_graph(
     model_name: str,
 ):
     mtp_correctness(sampling_config, model_name, 2, CUDAGraphMode.FULL)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MLAPO": "1"})
-def test_mtp_correctness_piecewise_graph_with_mlapo_kernel(
-    sampling_config: SamplingParams,
-    model_name: str,
-):
-    mtp_correctness(sampling_config, model_name, 1)
-
-
-@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_MLAPO": "1"})
-def test_mtp_correctness_full_graph_with_mlapo_kernel(
-    sampling_config: SamplingParams,
-    model_name: str,
-):
-    mtp_correctness(sampling_config, model_name, 1, CUDAGraphMode.FULL)
diff --git a/vllm_ascend/attention/mla_v1.py b/vllm_ascend/attention/mla_v1.py
@@ -676,9 +676,9 @@ def _process_weights_for_fused_mlapo(self, act_dtype: torch.dtype):
             ..., self.q_lora_rank:].contiguous()
         q_a_proj_wt = self.fused_qkv_a_proj.weight.data[
             ..., :self.q_lora_rank].contiguous()
-        kv_a_proj_wt = kv_a_proj_wt.contiguous()
+        kv_a_proj_wt = kv_a_proj_wt.t().contiguous()
         kv_a_proj_wt = trans_rope_weight(kv_a_proj_wt, self.qk_rope_head_dim)
-        kv_a_proj_wt = kv_a_proj_wt.contiguous()
+        kv_a_proj_wt = kv_a_proj_wt.t().contiguous()
         wd_qkv = torch.cat((kv_a_proj_wt, q_a_proj_wt), dim=-1)
         wd_qkv = wd_qkv.t().contiguous()
         wd_qkv = transdata(wd_qkv,