|
3 | 3 |
|
4 | 4 | import torch |
5 | 5 | import torch.nn as nn |
| 6 | +import triton.runtime.driver as driver |
6 | 7 | import vllm.v1.sample.rejection_sampler as rs |
7 | 8 | from vllm.triton_utils import HAS_TRITON, tl, triton |
8 | 9 | from vllm.v1.sample.metadata import SamplingMetadata |
|
17 | 18 | # step. This value is chosen to be large enough to handle typical use cases. |
18 | 19 | MAX_SPEC_LEN = 32 |
19 | 20 |
|
| 21 | +device_properties = driver.active.utils.get_device_properties(torch.npu.current_device()) |
| 22 | +vectorcore_num = device_properties['num_vectorcore'] |
| 23 | +# Get the vector core count of the current device, used for grid tiling below |
20 | 24 |
|
21 | 25 | class AscendRejectionSampler(RejectionSampler, nn.Module): |
22 | 26 | """ |
@@ -155,8 +159,8 @@ def rejection_sample( |
155 | 159 | n = cu_num_draft_tokens.numel() |
156 | 160 | BLOCK_SIZE = 2 |
157 | 161 | grid = triton.cdiv(n, BLOCK_SIZE) |
158 | | - if n >= 40: |
159 | | - grid = 40 # Empirically tuned value |
| 162 | + if n >= vectorcore_num: |
| 163 | +        grid = vectorcore_num  # cap the grid at one program per vector core |
160 | 164 | BLOCK_SIZE = triton.next_power_of_2(n // grid) |
161 | 165 |
|
162 | 166 | if min(num_draft_tokens) == 1 and max( |
@@ -295,8 +299,8 @@ def expand_batch_to_tokens( |
295 | 299 | n = cu_num_tokens.numel() |
296 | 300 | BLOCK_SIZE = 2 |
297 | 301 | grid = triton.cdiv(n, BLOCK_SIZE) |
298 | | - if n >= 40: |
299 | | - grid = 40 |
| 302 | + if n >= vectorcore_num: |
| 303 | + grid = vectorcore_num |
300 | 304 | BLOCK_SIZE = triton.next_power_of_2(n // grid) |
301 | 305 |
|
302 | 306 | expand_kernel[(grid, )]( |
|
0 commit comments