alibaba
diff --git a/‎rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc‎
Lines changed: 3 additions & 2 deletions b/‎rtp_llm/cpp/devices/rocm_impl/ROCmDevice.cc‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎rtp_llm/cpp/devices/rocm_impl/ROCmSampleOp.cc‎
Lines changed: 66 additions & 36 deletions b/‎rtp_llm/cpp/devices/rocm_impl/ROCmSampleOp.cc‎
Lines changed: 66 additions & 36 deletions
diff --git a/‎rtp_llm/cpp/kernels/rocm/sampling/api.cc‎
Lines changed: 20 additions & 7 deletions b/‎rtp_llm/cpp/kernels/rocm/sampling/api.cc‎
Lines changed: 20 additions & 7 deletions
diff --git a/‎rtp_llm/cpp/kernels/rocm/sampling/bind.cc‎
Lines changed: 1 addition & 0 deletions b/‎rtp_llm/cpp/kernels/rocm/sampling/bind.cc‎
Lines changed: 1 addition & 0 deletions
@@ -170,8 +170,9 @@ ROCmDevice::~ROCmDevice() {
 
 void ROCmDevice::init() {
     DeviceBase::init();
-    RTP_LLM_LOG_INFO("max batch size: %d", init_params_.max_batch_size);
-    curandstate_buf_ = allocateBuffer({init_params_.max_batch_size * sizeof(curandState_t)}, {"curandstate"});
+    int max_batch_size_deprecated = 128;  // fixed capacity; no longer read from init_params_.max_batch_size
+    RTP_LLM_LOG_INFO("max batch size: %d", max_batch_size_deprecated);
+    curandstate_buf_ = allocateBuffer({max_batch_size_deprecated * sizeof(curandState_t)}, {"curandstate"});  // NOTE(review): room for 128 curandState_t entries only -- confirm no remaining user needs more
 }
 
 DeviceProperties ROCmDevice::getDeviceProperties() {
 
@@ -20,11 +20,11 @@ using SamplerT = float;
 // topk should have higher priority than topp.
 
 GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
-    bool enable_flashinfer = init_params_.sampler_config.enable_flashinfer_sample_kernel;
-    const auto& logits     = params.logits;
-    const auto  batch_size = logits.shape()[0];
-    const auto vocab_size_padded = logits.shape()[1];
-    const auto step              = params.step;
+    bool        disable_dprs      = std::getenv("DISABLE_ROCM_DPRS") && std::string(std::getenv("DISABLE_ROCM_DPRS")) == "1";
+    const auto& logits            = params.logits;
+    const auto  batch_size        = logits.shape()[0];
+    const auto  vocab_size_padded = logits.shape()[1];
+    const auto  step              = params.step;
     RUNTIME_ASSERT_OP_ARG(batch_size == params.token_ids.shape()[0],
                           "logits.shape[0] should equal to token_ids.shape[0], but %d vs %d",
                           batch_size,
@@ -40,7 +40,7 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
     auto& top_k       = params.top_k;
     auto& top_p       = params.top_p;
     auto& temperature = params.temperature;
-    auto& random_seed = params.random_seed;
+    // auto& random_seed = params.random_seed;
     ROCM_CHECK_VALUE(top_k.size() == batch_size, "top_k.size() != batch_size");
     ROCM_CHECK_VALUE(top_p.size() == batch_size, "top_p.size() != batch_size");
     ROCM_CHECK_VALUE(temperature.size() == batch_size, "temperature.size() != batch_size");
@@ -129,24 +129,24 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
     // 3. prepare common inputs
 
     // 3.1. setup random seeds
-    if (random_seed) {
-        auto& seeds = random_seed.value().get();
-        if (seeds.size() == 1) {
-            invokeCurandInitialize(
-                (curandState_t*)curandstate_buf_->data(), batch_size, seeds.data<uint64_t>()[0], stream_);
-        } else {
-            auto random_seeds_buf = allocateBuffer({DataType::TYPE_UINT64, {batch_size}});
-            RUNTIME_ASSERT_OP_ARG((seeds.size() == batch_size),
-                                  "random_seed.size() should equal to batch_size, but %d vs %d",
-                                  seeds.size(),
-                                  batch_size);
-            copy({*random_seeds_buf, seeds});
-            invokeCurandBatchInitialize((curandState_t*)curandstate_buf_->data(),
-                                        batch_size,
-                                        (unsigned long long*)random_seeds_buf->data(),
-                                        stream_);
-        }
-    }
+    // if (random_seed) {
+    //     auto& seeds = random_seed.value().get();
+    //     if (seeds.size() == 1) {
+    //         invokeCurandInitialize(
+    //             (curandState_t*)curandstate_buf_->data(), batch_size, seeds.data<uint64_t>()[0], stream_);
+    //     } else {
+    //         auto random_seeds_buf = allocateBuffer({DataType::TYPE_UINT64, {batch_size}});
+    //         RUNTIME_ASSERT_OP_ARG((seeds.size() == batch_size),
+    //                               "random_seed.size() should equal to batch_size, but %d vs %d",
+    //                               seeds.size(),
+    //                               batch_size);
+    //         copy({*random_seeds_buf, seeds});
+    //         invokeCurandBatchInitialize((curandState_t*)curandstate_buf_->data(),
+    //                                     batch_size,
+    //                                     (unsigned long long*)random_seeds_buf->data(),
+    //                                     stream_);
+    //     }
+    // }
 
     // 3.2. compute logits penalty
     if (std::any_of(
@@ -221,17 +221,32 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
         return GreedyOutput{};
     }
 
-    if (enable_flashinfer) {
+    if (!disable_dprs) {
         const auto batch_size = params.logits.shape()[0];
         auto&      top_k      = params.top_k;
         auto&      top_p      = params.top_p;
 
-        auto      logits_ref = params.logits.slice(0, params.logits.shape()[0]);
-        auto      probs   = softmax({logits_ref, std::nullopt, std::nullopt, 1.0f, DataType::TYPE_INVALID, std::nullopt});
-        auto      samples = transposed_tokens->view(transposed_tokens->shape()[0] - 1, 1);
-        torch::TensorOptions options =
-            torch::TensorOptions(dataTypeToTorchType(probs->type())).device(torch::Device(torch::kCUDA));
-        bool deterministic = false;
+        auto logits_ref = params.logits.slice(0, params.logits.shape()[0]);
+        auto probs      = softmax({logits_ref, std::nullopt, std::nullopt, 1.0f, DataType::TYPE_INVALID, std::nullopt});
+        auto samples    = transposed_tokens->view(transposed_tokens->shape()[0] - 1, 1);
+
+        bool                  deterministic = true;
+        std::vector<uint64_t> seed_v;
+        std::vector<uint64_t> offset_v;
+        for (int i = 0; i < batch_size; i++) {
+            if (params.generator[i].defined()) {
+                auto [sd, ofst] = get_seed_and_offset(batch_size * 32, params.generator[i]);
+                seed_v.push_back(sd);
+                offset_v.push_back(ofst);
+            } else {
+                seed_v.push_back(0);
+                offset_v.push_back(0);
+            }
+        }
+        auto seed = torch::from_blob(seed_v.data(), {static_cast<long>(batch_size)}, torch::kUInt64).to(torch::kCUDA);
+        auto offset =
+            torch::from_blob(offset_v.data(), {static_cast<long>(batch_size)}, torch::kUInt64).to(torch::kCUDA);
+
         bool          need_output_all_probs = params.output_all_probs.has_value();
         torch::Tensor probs_t               = Buffer2torchTensor(probs, false);
         torch::Tensor samples_t             = Buffer2torchTensor(samples, false).flatten();
@@ -252,7 +267,15 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
             }
         } else if (std::all_of(
                        top_k.data<uint32_t>(), top_k.data<uint32_t>() + batch_size, [&](auto t) { return t <= 0; })) {
-            top_p_sampling_from_probs(probs_t, samples_t, std::nullopt, top_p_t, 1.0, deterministic, 0, 0, reinterpret_cast<uintptr_t>(stream_));
+            top_p_sampling_from_probs(probs_t,
+                                      samples_t,
+                                      std::nullopt,
+                                      top_p_t,
+                                      1.0,
+                                      deterministic,
+                                      seed,
+                                      offset,
+                                      reinterpret_cast<uintptr_t>(stream_));
             if (need_output_all_probs) {
                 top_p_renorm_probs(probs_t, output_all_probs_t, top_p_t, 1.0, reinterpret_cast<uintptr_t>(stream_));
             }
@@ -263,8 +286,15 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
                            top_k.data<uint32_t>() + batch_size,
                            top_k.data<uint32_t>(),
                            [&](auto t) { return t <= 0 ? 1 << 30 : t; });
-            top_k_sampling_from_probs(
-                probs_t, samples_t, std::nullopt, top_k_t, 0, deterministic, 0, 0, reinterpret_cast<uintptr_t>(stream_));
+            top_k_sampling_from_probs(probs_t,
+                                      samples_t,
+                                      std::nullopt,
+                                      top_k_t,
+                                      0,
+                                      deterministic,
+                                      seed,
+                                      offset,
+                                      reinterpret_cast<uintptr_t>(stream_));
             if (need_output_all_probs) {
                 top_k_renorm_probs(probs_t, output_all_probs_t, top_k_t, 0, reinterpret_cast<uintptr_t>(stream_));
             }
@@ -281,8 +311,8 @@ GreedyOutput ROCmDevice::sampleGreedy(const GreedyParams& params) {
                                             top_p_t,
                                             1.0,
                                             deterministic,
-                                            0,
-                                            0,
+                                            seed,
+                                            offset,
                                             reinterpret_cast<uintptr_t>(stream_));
             if (need_output_all_probs) {
                 torch::Tensor temp_t = torch::zeros_like(output_all_probs_t);
 
@@ -14,16 +14,29 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <ATen/hip/HIPGeneratorImpl.h>
+
 #include "sampling.h"
 #include "utils.h"
 #include "kernel.cuh"
 
 namespace rtp_llm {
 
+/// Queries a generator's Philox state and returns its (seed, offset) pair as plain integers.
+///
+/// @param increment_size Forwarded to philox_cuda_state(); must be non-negative.
+/// @param generator      Generator to query; when empty, the default generator returned by
+///                       at::cuda::detail::getDefaultCUDAGenerator() is used instead.
+/// @return Tuple of {seed, offset} read from the resulting PhiloxCudaState.
+/// @throws c10::Error if increment_size is negative or the returned state was produced
+///         under graph capture (its seed/offset are then not plain values).
+std::tuple<uint64_t, uint64_t> get_seed_and_offset(int increment_size, std::optional<at::Generator> generator) {
+  // A negative int would otherwise be converted implicitly to a huge unsigned increment.
+  TORCH_CHECK(increment_size >= 0, "increment_size must be non-negative, but got ", increment_size);
+  auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
+      generator, at::cuda::detail::getDefaultCUDAGenerator());
+  // Hold the generator's mutex while its Philox state is queried.
+  std::lock_guard<std::mutex> lock(gen->mutex_);
+  const at::PhiloxCudaState rng_engine_inputs =
+      gen->philox_cuda_state(static_cast<uint64_t>(increment_size));
+  // seed_ and offset_ are unions: in a captured state they carry pointers rather than values,
+  // so reading .val would silently return garbage. Fail loudly instead.
+  TORCH_CHECK(!rng_engine_inputs.captured_,
+              "get_seed_and_offset does not support a generator state captured in a graph");
+  return std::make_tuple(rng_engine_inputs.seed_.val, rng_engine_inputs.offset_.val);
+}
+
 void top_p_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
                                std::optional<torch::Tensor> maybe_indices,
                                std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
-                               bool deterministic, uint64_t philox_seed, uint64_t philox_offset, uintptr_t stream) {
+                               bool deterministic, torch::Tensor philox_seed, torch::Tensor philox_offset, uintptr_t stream) {
   CHECK_INPUT(probs);
   CHECK_DIM(2, probs);  // probs: (batch_size, vocab_size)
   unsigned int batch_size = output.sizes()[0];
@@ -35,14 +48,14 @@ void top_p_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
       static_cast<float*>(probs.data_ptr()), static_cast<int*>(output.data_ptr()),
       maybe_indices.has_value() ? static_cast<int*>(maybe_indices->data_ptr()) : nullptr,
       has_top_p_arr ? static_cast<float*>(maybe_top_p_arr->data_ptr()) : nullptr, batch_size,
-      top_p_val, vocab_size, deterministic, philox_seed, philox_offset, reinterpret_cast<hipStream_t>(stream));
+      top_p_val, vocab_size, deterministic, static_cast<uint64_t*>(philox_seed.data_ptr()), static_cast<uint64_t*>(philox_offset.data_ptr()), reinterpret_cast<hipStream_t>(stream));
   TORCH_CHECK(status == hipSuccess, "TopPSamplingFromProbs failed with error code " + std::string(hipGetErrorString(status)));
 }
 
 void top_k_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
                                std::optional<torch::Tensor> maybe_indices,
                                std::optional<torch::Tensor> maybe_top_k_arr, int64_t top_k_val,
-                               bool deterministic, uint64_t philox_seed, uint64_t philox_offset, uintptr_t stream) {
+                               bool deterministic, torch::Tensor philox_seed, torch::Tensor philox_offset, uintptr_t stream) {
   CHECK_INPUT(probs);
   CHECK_INPUT(output);
   CHECK_DEVICE(output, probs);
@@ -57,16 +70,16 @@ void top_k_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
       static_cast<float*>(probs.data_ptr()), static_cast<int*>(output.data_ptr()),
       maybe_indices.has_value() ? static_cast<int*>(maybe_indices->data_ptr()) : nullptr,
       has_top_k_arr ? static_cast<float*>(maybe_top_k_arr->data_ptr()) : nullptr, batch_size,
-      top_k_val, vocab_size, deterministic, philox_seed, philox_offset, reinterpret_cast<hipStream_t>(stream));
+      top_k_val, vocab_size, deterministic, static_cast<uint64_t*>(philox_seed.data_ptr()), static_cast<uint64_t*>(philox_offset.data_ptr()), reinterpret_cast<hipStream_t>(stream));
   TORCH_CHECK(status == hipSuccess, "TopKSamplingFromProbs failed with error code " + std::string(hipGetErrorString(status)));
 }
 
 void top_k_top_p_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
                                      std::optional<torch::Tensor> maybe_indices,
                                      std::optional<torch::Tensor> maybe_top_k_arr, double top_k_val,
                                      std::optional<torch::Tensor> maybe_top_p_arr, double top_p_val,
-                                     bool deterministic, uint64_t philox_seed,
-                                     uint64_t philox_offset, uintptr_t stream) {
+                                     bool deterministic, torch::Tensor philox_seed,
+                                     torch::Tensor philox_offset, uintptr_t stream) {
   CHECK_INPUT(probs);
   CHECK_INPUT(output);
   CHECK_DEVICE(output, probs);
@@ -84,7 +97,7 @@ void top_k_top_p_sampling_from_probs(torch::Tensor probs, torch::Tensor output,
       has_top_p_arr ? static_cast<float*>(maybe_top_p_arr->data_ptr()) : nullptr,
       static_cast<int*>(output.data_ptr()),
       maybe_indices.has_value() ? static_cast<int*>(maybe_indices->data_ptr()) : nullptr,
-      batch_size, top_k_val, top_p_val, vocab_size, deterministic, philox_seed, philox_offset,
+      batch_size, top_k_val, top_p_val, vocab_size, deterministic, static_cast<uint64_t*>(philox_seed.data_ptr()), static_cast<uint64_t*>(philox_offset.data_ptr()),
       reinterpret_cast<hipStream_t>(stream));
   TORCH_CHECK(status == hipSuccess, "TopKTopPSamplingFromProb failed with error code " + std::string(hipGetErrorString(status)));
 }
 
@@ -8,6 +8,7 @@ namespace rtp_llm {
 
 PYBIND11_MODULE(bind, m) {
     m.doc() = "sampling c++ api for test";
+    m.def("get_seed_and_offset", &get_seed_and_offset, py::arg(), py::arg("generator") = std::nullopt, "get_seed_and_offset");
     m.def("top_p_renorm_probs", &top_p_renorm_probs, py::arg(), py::arg(), py::arg(), py::arg(), py::arg("stream") = 0, "top_p_renorm_probs");
     m.def("top_k_renorm_probs", &top_k_renorm_probs, py::arg(), py::arg(), py::arg(), py::arg(), py::arg("stream") = 0, "top_k_renorm_probs");
     m.def("top_p_sampling_from_probs", &top_p_sampling_from_probs, py::arg(), py::arg(), py::arg(), py::arg(), py::arg(), py::arg(), py::arg(), py::arg(), py::arg("stream") = 0, "top_p_sampling_from_probs");