
Commit e2752ac

test:profiling

1 parent 15df633 commit e2752ac

4 files changed, +57 -35 lines changed

rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -63,6 +63,7 @@ inline std::vector<int64_t> bufferShapeToTorchShape(const Buffer& buffer) {
     F(TYPE_INT16, torch::kShort)   \
     F(TYPE_INT32, torch::kInt)     \
     F(TYPE_INT64, torch::kLong)    \
+    F(TYPE_UINT64, torch::kUInt64) \
     F(TYPE_FP16, torch::kHalf)     \
     F(TYPE_FP32, torch::kFloat)    \
     F(TYPE_FP64, torch::kDouble)   \
```
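This one-line mapping is what the sampler change below depends on: with TYPE_UINT64 registered, the new philox seed/offset buffers can pass through Buffer2torchTensor. On the Python side the matching dtype is torch.uint64, which the updated test already uses; a minimal check (recent PyTorch; unsigned 64-bit coverage is limited to a subset of ops, but creation and copies suffice here):

```python
import torch

# torch::kUInt64 on the C++ side corresponds to torch.uint64 in Python.
seed = torch.zeros(4, dtype=torch.uint64)
assert seed.dtype == torch.uint64
```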

rtp_llm/cpp/devices/rocm_impl/ROCmSampleOp.cc

Lines changed: 26 additions & 17 deletions
```diff
@@ -12,6 +12,14 @@ namespace rtp_llm {
 
 using SamplerT = float;
 
+void _saveTorchDataTofile(const torch::Tensor& tensor, const std::string& fileName) {
+    auto tensor_cpu = tensor.contiguous().cpu();
+    auto pickled    = torch::pickle_save(tensor_cpu);
+    std::ofstream fout(fileName, std::ios::out | std::ios::binary);
+    fout.write(pickled.data(), pickled.size());
+    fout.close();
+}
+
 // batch sampling explained:
 // topk = [4, 0, 4]. topp = [0.0, 0.5, 0.5]
 // then topk_decode handles [4, x, 4 + 0.5]
```
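Bytes written with torch::pickle_save load back with torch.load on the Python side, which is how the updated test.py consumes these dumps; a minimal readback sketch (the path is illustrative, wherever XBJ_DUMP_PROBS pointed when the dump was written):

```python
import torch

# Read back a tensor written by _saveTorchDataTofile; the path is illustrative.
probs = torch.load("/tmp/probs/probs1.pt", weights_only=True)
print(probs.shape, probs.dtype)
```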
```diff
@@ -226,26 +234,22 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
     auto& top_k = params.top_k;
     auto& top_p = params.top_p;
 
-    auto logits_ref = params.logits.slice(0, params.logits.shape()[0]);
-    auto probs = softmax({logits_ref, std::nullopt, std::nullopt, 1.0f, DataType::TYPE_INVALID, std::nullopt});
-    auto samples = transposed_tokens->view(transposed_tokens->shape()[0] - 1, 1);
-
     bool deterministic = true;
-    std::vector<uint64_t> seed_v;
-    std::vector<uint64_t> offset_v;
+    auto seed_h   = allocateBuffer({DataType::TYPE_UINT64, {batch_size}, AllocationType::HOST});
+    auto offset_h = allocateBuffer({DataType::TYPE_UINT64, {batch_size}, AllocationType::HOST});
     for (int i = 0; i < batch_size; i++) {
-        if (params.generator[i].defined()) {
-            auto [sd, ofst] = get_seed_and_offset(batch_size * 32, params.generator[i]);
-            seed_v.push_back(sd);
-            offset_v.push_back(ofst);
-        } else {
-            seed_v.push_back(0);
-            offset_v.push_back(0);
-        }
+        std::tie(seed_h->data<uint64_t>()[i], offset_h->data<uint64_t>()[i]) = params.generator[i].defined() ?
+            get_seed_and_offset(batch_size * 32, params.generator[i]) :
+            std::make_pair(0ULL, 0ULL);
     }
-    auto seed = torch::from_blob(seed_v.data(), {static_cast<long>(batch_size)}, torch::kUInt64).to(torch::kCUDA);
-    auto offset =
-        torch::from_blob(offset_v.data(), {static_cast<long>(batch_size)}, torch::kUInt64).to(torch::kCUDA);
+    auto seed_d   = clone({*seed_h, AllocationType::DEVICE});
+    auto offset_d = clone({*offset_h, AllocationType::DEVICE});
+    auto seed     = Buffer2torchTensor(seed_d, false);
+    auto offset   = Buffer2torchTensor(offset_d, false);
+
+    auto logits_ref = params.logits.slice(0, params.logits.shape()[0]);
+    auto probs = softmax({logits_ref, std::nullopt, std::nullopt, 1.0f, DataType::TYPE_INVALID, std::nullopt});
+    auto samples = transposed_tokens->view(transposed_tokens->shape()[0] - 1, 1);
 
     bool need_output_all_probs = params.output_all_probs.has_value();
     torch::Tensor probs_t = Buffer2torchTensor(probs, false);
```
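The rewrite fills host buffers in place and clones each to the device in a single copy, replacing the std::vector + torch::from_blob + .to(torch::kCUDA) round trip. A rough Python sketch of the per-row fallback logic, with seed_and_offset as a hypothetical stand-in for the C++ get_seed_and_offset:

```python
def gather_philox_state(generators, batch_size):
    # One (seed, offset) pair per batch row; rows without a defined
    # generator fall back to (0, 0), matching the C++ ternary.
    pairs = [g.seed_and_offset(batch_size * 32) if g is not None else (0, 0)
             for g in generators]
    seeds, offsets = zip(*pairs)
    return list(seeds), list(offsets)
```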
```diff
@@ -267,6 +271,11 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
         }
     } else if (std::all_of(
                    top_k.data<uint32_t>(), top_k.data<uint32_t>() + batch_size, [&](auto t) { return t <= 0; })) {
+        static int fwd = 0;
+        ++fwd;
+        if (std::getenv("XBJ_DUMP_PROBS")) {
+            _saveTorchDataTofile(probs_t, std::string(std::getenv("XBJ_DUMP_PROBS")) + "/probs" + std::to_string(fwd) + ".pt");
+        }
         top_p_sampling_from_probs(probs_t,
                                   samples_t,
                                   std::nullopt,
```
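With XBJ_DUMP_PROBS set to a writable directory, each forward pass that reaches this all-top-p branch writes one probs<N>.pt file, numbered by the static fwd counter; these are the files the updated test.py sweeps in its __main__ block.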

rtp_llm/cpp/kernels/rocm/sampling/kernel.cuh

Lines changed: 6 additions & 13 deletions
```diff
@@ -58,14 +58,7 @@ using namespace hipcub;
         __VA_ARGS__                                                 \
     }
 
-#define DISPATCH_SOFTMAX_CACHE_INPUT(cache_input, CACHE_INPUT, ...) \
-    if (cache_input) {                                              \
-        constexpr bool CACHE_INPUT = true;                          \
-        __VA_ARGS__                                                 \
-    } else {                                                        \
-        constexpr bool CACHE_INPUT = false;                         \
-        __VA_ARGS__                                                 \
-    }
+#define VEC_BYTES 64
 
 constexpr BlockScanAlgorithm SCAN_ALGO = BLOCK_SCAN_WARP_SCANS;
 constexpr BlockReduceAlgorithm REDUCE_ALGO = BLOCK_REDUCE_WARP_REDUCTIONS;
```
```diff
@@ -650,7 +643,7 @@ hipError_t TopKSamplingFromProb(T* probs, IdType* output, IdType* indices, T* to
                                 uint32_t batch_size, uint32_t top_k_val, uint32_t d,
                                 bool deterministic, uint64_t* philox_seed, uint64_t* philox_offset,
                                 hipStream_t stream = 0) {
-    const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+    const uint32_t vec_size = std::gcd(VEC_BYTES / sizeof(T), d);
 
     auto compute_capacity = GetCudaComputeCapability();
     DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
```
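The effect of the constant: vec_size = std::gcd(budget / sizeof(T), d) picks the widest per-thread vector that still divides the row length d, so raising the budget from the hard-coded 16 bytes to VEC_BYTES = 64 allows loads up to 4x wider whenever d permits. A quick worked check (the vocab size is illustrative):

```python
from math import gcd

VEC_BYTES = 64
d = 152064  # illustrative vocab size, divisible by 32
for name, size in [("fp32", 4), ("fp16", 2)]:
    old = gcd(16 // size, d)         # previous 16-byte budget
    new = gcd(VEC_BYTES // size, d)  # new 64-byte budget
    print(f"{name}: vec_size {old} -> {new}")
# fp32: vec_size 4 -> 16
# fp16: vec_size 8 -> 32
```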
```diff
@@ -678,7 +671,7 @@ hipError_t TopPSamplingFromProb(T* probs, IdType* output, IdType* indices, T* to
                                 uint32_t batch_size, T top_p_val, uint32_t d, bool deterministic,
                                 uint64_t* philox_seed, uint64_t* philox_offset,
                                 hipStream_t stream = 0) {
-    const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+    const uint32_t vec_size = std::gcd(VEC_BYTES / sizeof(T), d);
 
     auto compute_capacity = GetCudaComputeCapability();
     DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
@@ -708,7 +701,7 @@ hipError_t TopKTopPSamplingFromProb(T* probs, IdType* top_k_arr, T* top_p_arr, I
                                     T top_p_val, uint32_t d, bool deterministic,
                                     uint64_t* philox_seed, uint64_t* philox_offset,
                                     hipStream_t stream = 0) {
-    const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+    const uint32_t vec_size = std::gcd(VEC_BYTES / sizeof(T), d);
 
     auto compute_capacity = GetCudaComputeCapability();
     DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
@@ -1053,7 +1046,7 @@ template <typename DType>
 hipError_t TopPRenormProb(DType* probs, DType* renormed_prob, float* top_p_arr,
                           uint32_t batch_size, float top_p_val, uint32_t d,
                           hipStream_t stream = 0) {
-    const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+    const uint32_t vec_size = std::gcd(VEC_BYTES / sizeof(DType), d);
 
     auto compute_capacity = GetCudaComputeCapability();
     DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
@@ -1075,7 +1068,7 @@ template <typename DType, typename IdType>
 hipError_t TopKRenormProb(DType* probs, DType* renormed_prob, IdType* top_k_arr,
                           uint32_t batch_size, uint32_t top_k_val, uint32_t d,
                           hipStream_t stream = 0) {
-    const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
+    const uint32_t vec_size = std::gcd(VEC_BYTES / sizeof(DType), d);
 
     auto compute_capacity = GetCudaComputeCapability();
     DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
```

rtp_llm/cpp/kernels/rocm/sampling/test.py

Lines changed: 24 additions & 5 deletions
```diff
@@ -170,6 +170,8 @@ def test_top_p_sampling(batch_size, vocab_size, p):
     realdata = Path(realdata)
     assert realdata.is_file()
     normalized_prob = torch.load(realdata, weights_only=True).to("cuda:0")
+    batch_size = normalized_prob.shape[0]
+    vocab_size = normalized_prob.shape[1]
 
     info["prob"] = normalized_prob.cpu().numpy().tolist()
     sorted_prob, indices = torch.sort(normalized_prob, descending=False)
```
```diff
@@ -183,10 +185,14 @@ def test_top_p_sampling(batch_size, vocab_size, p):
         file = Path(file)
         with file.open("w") as f:
             json.dump(info, f, ensure_ascii=False, indent=4)
-    num_trials = 1000
+    num_trials = 10
     info["out"] = []
+    samples = torch.empty(batch_size, dtype=torch.int32, device="cuda:0")
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+    start_event.record()
     for _ in range(num_trials):
-        samples = torch.empty(batch_size, dtype=torch.int32, device="cuda:0")
         top_p_sampling_from_probs(
             normalized_prob,
             samples,
```
```diff
@@ -197,9 +203,15 @@ def test_top_p_sampling(batch_size, vocab_size, p):
             torch.zeros(batch_size, dtype=torch.uint64, device="cuda:0"),
             torch.zeros(batch_size, dtype=torch.uint64, device="cuda:0"),
         )
-        assert torch.all(samples < vocab_size) and torch.all(samples >= 0)
-        assert torch.all(mask[torch.arange(batch_size), samples] == 1)
-        info["out"].append(samples.cpu().numpy().tolist())
+        # assert torch.all(samples < vocab_size) and torch.all(samples >= 0)
+        # assert torch.all(mask[torch.arange(batch_size), samples] == 1)
+        # info["out"].append(samples.cpu().numpy().tolist())
+    end_event.record()
+    torch.cuda.synchronize()
+    elapsed_time = start_event.elapsed_time(end_event) / num_trials
+    print(f"elapsed_time: {elapsed_time} ms")
+    return elapsed_time
+
     if file:
         with file.open("w") as f:
             json.dump(info, f, ensure_ascii=False, indent=4)
```
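The test now times the kernel rather than validating it: the asserts are commented out, samples is allocated once outside the loop so allocation cost is excluded, and CUDA events bracket the launches. The pattern, extracted as a reusable sketch:

```python
import torch

def time_cuda(fn, iters=10):
    # CUDA events bracket the launch loop; synchronize before reading
    # elapsed_time so every queued kernel has actually finished.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        fn()
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average ms per launch
```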
```diff
@@ -376,4 +388,11 @@ def test_top_k_renorm_probs(batch_size, vocab_size, k):
 
 
 if __name__ == "__main__":
+    for i in range(1, 377):
+        os.environ["REALDATA"] = f"/home/xiebaijie.xbj/qwen-vl/probs/probs{i}.pt"
+        rt = test_top_p_sampling(1, 1, 0.95)
+        if rt > 0.5:
+            exit(1)
+    print("no found")
+    exit(1)
     exit(pytest.main([__file__, "-s", "-v"]))
```
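The __main__ block turns the file into a search driver: it sweeps the dumped probs1.pt through probs376.pt (the hard-coded path is the author's dump directory), reruns top-p sampling on each, and exits nonzero at the first input whose average time exceeds 0.5 ms; "no found" signals that no slow input turned up. The (1, 1, 0.95) arguments are placeholders, since batch_size and vocab_size are now re-derived from the loaded tensor, and either way the script exits before pytest.main runs.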
