
Commit 343b465

Update the way of capture

Signed-off-by: anon189Ty <[email protected]>
1 parent 6a3ec43 · commit 343b465

3 files changed: +161 -60 lines

vllm_ascend/attention/mla_v1.py

Lines changed: 53 additions & 2 deletions
@@ -2,6 +2,7 @@
 from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
                     TypeVar)
 
+import numpy as np
 import torch
 import torch_npu
 from torch import nn
@@ -290,6 +291,31 @@ def reorder_batch(self, input_batch: "InputBatch",
         # better way of doing this
         return modified_batch
 
+    def pad_actual_seq_len_q(self, num_reqs_pad_size, num_reqs,
+                             actual_seq_lengths_q):
+        """
+        Only use for acl full graph mode.
+        Pad the last element of the actual_seq_lengths_q equal to the TND(T) and
+        the num of dimensions equal to the batch_size of main model.
+
+        For example:
+        batch_size = 8, num_reqs = 4, num_speculative_tokens = 1
+        input actual_seq_lengths_q = [1, 2, 4, 5] (the 3rd req was accept a token)
+        After padding the actual_seq_lengths_q will be similar to [1, 2, 4, 5, 6, 6, 7, 8]
+        """
+        need_padding = num_reqs_pad_size > 0
+        if need_padding:
+            start_val = actual_seq_lengths_q[-1]
+            end_val = num_reqs + num_reqs_pad_size
+            num_step = num_reqs_pad_size
+            interpolated = np.round(
+                np.linspace(start_val, end_val,
+                            num_step + 1)[1:]).astype(int).tolist()
+            assert interpolated[-1] == end_val
+            assert len(interpolated) == num_reqs_pad_size
+            actual_seq_lengths_q = actual_seq_lengths_q + interpolated
+        return actual_seq_lengths_q
+
     def build(
         self,
         common_prefix_len: int,
@@ -310,7 +336,13 @@ def build(
         # it blocks on all previous kernels.
         device = self.device
 
-        block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
+        graph_pad_size = common_attn_metadata.graph_pad_size
+        if graph_pad_size > num_reqs:
+            # Should we add other check to make sure is in full graph mode?
+            block_table = (
+                common_attn_metadata.block_table_tensor[:graph_pad_size])
+        else:
+            block_table = (common_attn_metadata.block_table_tensor[:num_reqs])
         slot_mapping = common_attn_metadata.slot_mapping[:num_actual_tokens]
         input_positions = common_attn_metadata.positions[:
                                                           num_actual_tokens].long(
@@ -407,9 +439,28 @@ def build(
             max_seq_lens = seq_lens[:num_decodes].max().item()
             seq_lens = seq_lens[:num_decodes]
             input_positions = input_positions[:num_decode_tokens]
-            block_table = block_table[:num_decodes, ...]
+            if graph_pad_size > num_decodes:
+                # Should we add other check to make sure is in full graph mode?
+                block_table = block_table[:graph_pad_size, ...]
+            else:
+                block_table = block_table[:num_decodes, ...]
             seq_lens_list = seq_lens.tolist()
 
+            if graph_pad_size > num_reqs:
+                # Should we add other check to make sure is in full graph mode?
+                num_reqs_pad_size = graph_pad_size - num_reqs
+                actual_seq_lengths_q = self.pad_actual_seq_len_q(
+                    num_reqs_pad_size, num_reqs, actual_seq_lengths_q)
+                seq_lens_list = seq_lens_list + [0] * num_reqs_pad_size
+                num_block_pad_size = graph_pad_size - block_table.shape[0]
+                if num_block_pad_size > 0:
+                    block_table_padding = torch.zeros(
+                        (num_block_pad_size, ) + block_table.shape[1:],
+                        dtype=block_table.dtype,
+                        device=block_table.device)
+                    block_table = torch.cat([block_table, block_table_padding],
+                                            dim=0)
+
             # TODO: After the fullgraph supports MTP, the if branch needs to deleted
             assert self.cos_cache is not None
             assert self.sin_cache is not None
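
For reference, a minimal standalone sketch of what the new pad_actual_seq_len_q helper computes (not part of the commit, just the docstring example replayed with plain NumPy; num_reqs_pad_size = 4 is assumed from batch_size 8 minus num_reqs 4):

import numpy as np

# 4 real requests padded up to a captured batch size of 8.
actual_seq_lengths_q = [1, 2, 4, 5]
num_reqs, num_reqs_pad_size = 4, 4

start_val = actual_seq_lengths_q[-1]    # 5, last real cumulative query length
end_val = num_reqs + num_reqs_pad_size  # 8, the padded T of the TND layout per the docstring
interpolated = np.round(
    np.linspace(start_val, end_val,
                num_reqs_pad_size + 1)[1:]).astype(int).tolist()

# The padded tail rises monotonically from the last real value up to end_val.
print(actual_seq_lengths_q + interpolated)  # [1, 2, 4, 5, 6, 6, 7, 8]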

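The block-table handling in build follows the same idea: when graph_pad_size (the captured batch size) exceeds the number of live requests, the table is sliced to the capture size and any missing rows are filled with zeros so the captured graph always sees a tensor of the same shape. A hedged sketch of that padding step, with made-up shapes (not the module's actual call site):

import torch

def pad_block_table(block_table: torch.Tensor, graph_pad_size: int) -> torch.Tensor:
    # Append all-zero rows until the table has graph_pad_size rows.
    num_block_pad_size = graph_pad_size - block_table.shape[0]
    if num_block_pad_size > 0:
        padding = torch.zeros((num_block_pad_size, ) + block_table.shape[1:],
                              dtype=block_table.dtype,
                              device=block_table.device)
        block_table = torch.cat([block_table, padding], dim=0)
    return block_table

# 4 real requests, graph captured for a batch of 8.
table = torch.arange(12, dtype=torch.int32).reshape(4, 3)
print(pad_block_table(table, 8).shape)  # torch.Size([8, 3])
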
vllm_ascend/compilation/acl_graph.py

Lines changed: 10 additions & 11 deletions
@@ -279,10 +279,17 @@ def update_mla_attn_params(update_stream, forward_context, runtime_shape,
                     spec_multiple * (i + 1)
                     for i in range(runtime_shape // spec_multiple)
                 ]
+            elif forward_context.is_mtp_model:
+                actual_seq_lengths = forward_context.attn_metadata[
+                    key].decode.actual_seq_lengths_q
+                block_table = forward_context.attn_metadata[
+                    key].decode.block_table
+                seq_lens_list = seq_lens_list + [0] * (
+                    len(actual_seq_lengths) - len(seq_lens_list))
             else:
                 seq_lens_list = seq_lens_list + [0] * (runtime_shape -
                                                        len(seq_lens_list))
-            torch.npu.graph_task_update_begin(update_stream, handle)
+            torch.npu.graph_task_update_begin(update_stream, handle)
 
             torch_npu.npu_fused_infer_attention_score.out(
                 q_nope,
@@ -346,22 +353,14 @@ def get_graph_params():
     return _graph_params
 
 
-@dataclass
-class MTPGraphParams:
-    events: dict[int, list[torch.npu.ExternalEvent]]
-    workspaces: dict[int, torch.Tensor]
-    handles: dict[int, list[torch_npu._C._NPUTaskGroupHandle]]
-    attn_params: dict[int, list[tuple]]
-
-
-_mtp_graph_params: Optional[MTPGraphParams] = None
+_mtp_graph_params: Optional[GraphParams] = None
 
 
 def set_mtp_graph_params(aclgraph_capture_sizes: set[int]):
     global _mtp_graph_params
     if _mtp_graph_params is not None:
         raise ValueError("MTPGraph parameters have already been set!")
-    _mtp_graph_params = MTPGraphParams(
+    _mtp_graph_params = GraphParams(
         {size: []
          for size in aclgraph_capture_sizes},
         {size: None
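
On the graph-update side, the new elif forward_context.is_mtp_model branch reads actual_seq_lengths_q and block_table back from the captured decode metadata and pads seq_lens_list with zeros up to the captured length rather than up to runtime_shape. The dedicated MTPGraphParams dataclass is dropped; _mtp_graph_params now reuses GraphParams, which presumably carries the same events/workspaces/handles/attn_params fields as the removed class. A tiny sketch of the padding arithmetic with hypothetical values:

# Hypothetical values, only to illustrate the zero-padding in the MTP branch.
actual_seq_lengths = [1, 2, 4, 5, 6, 6, 7, 8]  # recorded at capture time (len 8)
seq_lens_list = [13, 7, 21, 9]                 # live KV lengths this step (len 4)

seq_lens_list = seq_lens_list + [0] * (len(actual_seq_lengths) - len(seq_lens_list))
print(seq_lens_list)  # [13, 7, 21, 9, 0, 0, 0, 0]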
