Fix synchronization error in _calc_spec_decode_metadata

Ronald1995 · Ronald1995 · commit b8ff04dfd401 · 2025-11-27T17:16:22.000+08:00
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
diff --git a/vllm_ascend/sample/rejection_sampler.py b/vllm_ascend/sample/rejection_sampler.py
@@ -383,7 +383,7 @@ def rejection_greedy_sample_pytorch(
         target_argmax[global_idx].to(output_token_ids.dtype),
         output_token_ids
     )
-    output_token_ids_.copy_(output_token_ids)
+    output_token_ids.copy_(output_token_ids_)
     # Fill bonus token.
     needs_bonus = is_greedy & (first_mismatch_pos_per_req
                                >= draft_tokens_per_req)
diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py
@@ -2144,16 +2144,31 @@ def _calc_spec_decode_metadata(
         target_logits_indices += arange
 
         # TODO: Optimize the CPU -> NPU copy.
-        cu_num_draft_tokens = torch.from_numpy(cu_num_draft_tokens).to(
-            self.device, non_blocking=True)
-        cu_num_sampled_tokens = torch.from_numpy(cu_num_sampled_tokens).to(
-            self.device, non_blocking=True)
-        logits_indices = torch.from_numpy(logits_indices).to(self.device,
-                                                             non_blocking=True)
-        target_logits_indices = torch.from_numpy(target_logits_indices).to(
-            self.device, non_blocking=True)
-        bonus_logits_indices = torch.from_numpy(bonus_logits_indices).to(
-            self.device, non_blocking=True)
+        cu_num_draft_tokens = (
+            torch.from_numpy(cu_num_draft_tokens)
+            .pin_memory()
+            .to(self.device, non_blocking=True)
+        )
+        cu_num_sampled_tokens = (
+            torch.from_numpy(cu_num_sampled_tokens)
+            .pin_memory()
+            .to(self.device, non_blocking=True)
+        )
+        logits_indices = (
+            torch.from_numpy(logits_indices)
+            .pin_memory()
+            .to(self.device, non_blocking=True)
+        )
+        target_logits_indices = (
+            torch.from_numpy(target_logits_indices)
+            .pin_memory()
+            .to(self.device, non_blocking=True)
+        )
+        bonus_logits_indices = (
+            torch.from_numpy(bonus_logits_indices)
+            .pin_memory()
+            .to(self.device, non_blocking=True)
+        )
 
         # Compute the draft token ids.
         # draft_token_indices:      [  1,   2,   3, 105, 106, 208]

Original file line number	Diff line number	Diff line change
`@@ -383,7 +383,7 @@ def rejection_greedy_sample_pytorch(`
`383`	`383`	`target_argmax[global_idx].to(output_token_ids.dtype),`
`384`	`384`	`output_token_ids`
`385`	`385`	`)`
`386`		`- output_token_ids_.copy_(output_token_ids)`
	`386`	`+ output_token_ids.copy_(output_token_ids_)`
`387`	`387`	`# Fill bonus token.`
`388`	`388`	`needs_bonus = is_greedy & (first_mismatch_pos_per_req`
`389`	`389`	`>= draft_tokens_per_req)`