@@ -2655,32 +2655,33 @@ def sample_tokens(
         # NOTE(woosuk): As an exception, when using PP, the scheduler sends
         # the sampled tokens back, because there's no direct communication
         # between the first-stage worker and the last-stage worker.
-        for req_idx in range(num_sampled_tokens):
-            sampled_ids: np.ndarray | None
-            if self.use_async_scheduling:
-                sampled_ids = (np.array([-1]) if req_idx
-                               not in invalid_req_indices_set else None)
-            else:
-                sampled_ids = valid_sampled_token_ids[req_idx]
-            if sampled_ids is None or sampled_ids.shape[0] == 0:
-                continue
+        if not self.use_async_scheduling:
+            for req_idx in range(num_sampled_tokens):
+                sampled_ids: np.ndarray | None
+                if self.use_async_scheduling:
+                    sampled_ids = (np.array([-1]) if req_idx
+                                   not in invalid_req_indices_set else None)
+                else:
+                    sampled_ids = valid_sampled_token_ids[req_idx]
+                if sampled_ids is None or sampled_ids.shape[0] == 0:
+                    continue
 
-            start_idx = self.input_batch.num_tokens_no_spec[req_idx]
-            end_idx = start_idx + sampled_ids.shape[0]
-            assert end_idx <= self.model_config.max_model_len, (
-                "Sampled token IDs exceed the max model length. "
-                f"Total number of tokens: {end_idx} > max_model_len: "
-                f"{self.model_config.max_model_len}")
-
-            self.input_batch.token_ids_cpu[req_idx,
-                                           start_idx:end_idx] = sampled_ids
-            self.input_batch.is_token_ids[req_idx,
-                                          start_idx:end_idx] = True
-            self.input_batch.num_tokens_no_spec[req_idx] = end_idx
-            self.input_batch.num_tokens[req_idx] = end_idx
-            req_id = self.input_batch.req_ids[req_idx]
-            req_state = self.requests[req_id]
-            req_state.output_token_ids.extend(sampled_ids.tolist())
+                start_idx = self.input_batch.num_tokens_no_spec[req_idx]
+                end_idx = start_idx + sampled_ids.shape[0]
+                assert end_idx <= self.model_config.max_model_len, (
+                    "Sampled token IDs exceed the max model length. "
+                    f"Total number of tokens: {end_idx} > max_model_len: "
+                    f"{self.model_config.max_model_len}")
+
+                self.input_batch.token_ids_cpu[req_idx,
+                                               start_idx:end_idx] = sampled_ids
+                self.input_batch.is_token_ids[req_idx,
+                                              start_idx:end_idx] = True
+                self.input_batch.num_tokens_no_spec[req_idx] = end_idx
+                self.input_batch.num_tokens[req_idx] = end_idx
+                req_id = self.input_batch.req_ids[req_idx]
+                req_state = self.requests[req_id]
+                req_state.output_token_ids.extend(sampled_ids.tolist())
         self.input_batch.prev_sampled_token_ids = None
     def propose_draft_token_ids(sampled_token_ids):
         assert self.spec_decode_common_attn_metadata is not None
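The loop gated off above is plain host-side bookkeeping: it appends each request's newly sampled token IDs after its committed (non-speculative) tokens, bumps the token counters, and extends the per-request output list. Below is a minimal sketch of that bookkeeping in isolation, not the vLLM implementation; MiniBatch, append_sampled, and MAX_MODEL_LEN are hypothetical stand-ins for InputBatch and model_config.max_model_len. It illustrates why the whole loop can be skipped under async scheduling: every write here touches CPU-side arrays, work the async path defers (note prev_sampled_token_ids being reset right after the block).

import numpy as np

MAX_MODEL_LEN = 16  # assumed tiny limit, stands in for model_config.max_model_len

class MiniBatch:
    """Hypothetical stand-in for the persistent CPU-side input batch."""

    def __init__(self, num_reqs: int):
        self.token_ids_cpu = np.zeros((num_reqs, MAX_MODEL_LEN), dtype=np.int64)
        self.is_token_ids = np.zeros((num_reqs, MAX_MODEL_LEN), dtype=bool)
        self.num_tokens_no_spec = np.zeros(num_reqs, dtype=np.int64)
        self.num_tokens = np.zeros(num_reqs, dtype=np.int64)

def append_sampled(batch: MiniBatch, req_idx: int, sampled_ids: np.ndarray) -> None:
    # Append this step's sampled tokens right after the request's committed
    # (non-speculative) tokens, mirroring the loop body in the diff above.
    start_idx = int(batch.num_tokens_no_spec[req_idx])
    end_idx = start_idx + sampled_ids.shape[0]
    assert end_idx <= MAX_MODEL_LEN, (
        f"Total number of tokens: {end_idx} > max_model_len: {MAX_MODEL_LEN}")
    batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
    batch.is_token_ids[req_idx, start_idx:end_idx] = True
    batch.num_tokens_no_spec[req_idx] = end_idx
    batch.num_tokens[req_idx] = end_idx

batch = MiniBatch(num_reqs=2)
append_sampled(batch, 0, np.array([101]))   # one sampled token accepted
append_sampled(batch, 1, np.array([7, 8]))  # e.g. a spec-decode step accepting two
print(batch.num_tokens)  # -> [1 2]

One observable consequence of the change: gating the entire loop on use_async_scheduling, rather than branching per request inside it, leaves the synchronous path byte-for-byte unchanged while removing all per-request Python work from the async step; the inner `if self.use_async_scheduling:` branch becomes unreachable under the new outer guard.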