Commit 8ccaf09

WIP

1 parent 9e97afb

6 files changed: 34 additions & 36 deletions

examples/customized-collective-algorithm/customized_allgather.cu

Lines changed: 8 additions & 9 deletions
@@ -90,15 +90,14 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
     std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
         "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
         [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize,
-               size_t outputSize, int dtype, cudaStream_t stream, std::unordered_map<std::string, uintptr_t>& extras) {
-          return self->allgatherKernelFunc(ctx, input, output, inputSize, static_cast<ncclDataType_t>(dtype), stream,
-                                           extras);
+               size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
+               std::unordered_map<std::string, uintptr_t>& extras) {
+          return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream, extras);
         },
         [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
-               size_t outputSize, int dtype) {
-          return self->initAllgatherContext(comm, input, output, inputSize, static_cast<ncclDataType_t>(dtype));
-        },
-        [self](const void* input, void* output, size_t inputSize, size_t outputSize, int dtype) {
+               size_t outputSize,
+               mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
+        [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
           return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
                                                    static_cast<ncclDataType_t>(dtype));
         });
@@ -126,7 +125,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
   }

   ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
-                                   size_t inputSize, [[maybe_unused]] ncclDataType_t dtype, cudaStream_t stream,
+                                   size_t inputSize, [[maybe_unused]] mscclpp::DataType dtype, cudaStream_t stream,
                                    std::unordered_map<std::string, uintptr_t>& extras) {
     int rank = ctx->rank;
     int worldSize = ctx->workSize;
@@ -141,7 +140,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {

   std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                               const void* input, void* output, size_t inputSize,
-                                                              ncclDataType_t dtype) {
+                                                              mscclpp::DataType dtype) {
     auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
     ctx->rank = comm->bootstrap()->getRank();
     ctx->workSize = comm->bootstrap()->getNranks();

examples/torch-integration/customized_allgather.cu

Lines changed: 14 additions & 16 deletions
@@ -56,43 +56,41 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
     std::shared_ptr<mscclpp::Algorithm> allgatherAlgo = std::make_shared<mscclpp::NativeAlgorithm>(
         "allgather", "allgather", [self](std::shared_ptr<mscclpp::Communicator> comm) { self->initialize(comm); },
         [self](const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output, size_t inputSize,
-               size_t outputSize, int dtype, cudaStream_t stream, std::unordered_map<std::string, uintptr_t>& extras) {
-          return self->allgatherKernelFunc(ctx, input, output, inputSize, static_cast<ncclDataType_t>(dtype), stream,
-                                           extras);
+               size_t outputSize, mscclpp::DataType dtype, cudaStream_t stream,
+               std::unordered_map<std::string, uintptr_t>& extras) {
+          return self->allgatherKernelFunc(ctx, input, output, inputSize, dtype, stream, extras);
         },
         [self](std::shared_ptr<mscclpp::Communicator> comm, const void* input, void* output, size_t inputSize,
-               size_t outputSize, int dtype) {
-          return self->initAllgatherContext(comm, input, output, inputSize, static_cast<ncclDataType_t>(dtype));
-        },
-        [self](const void* input, void* output, size_t inputSize, size_t outputSize, int dtype) {
-          return self->generateAllgatherContextKey(input, output, inputSize, outputSize,
-                                                   static_cast<ncclDataType_t>(dtype));
+               size_t outputSize,
+               mscclpp::DataType dtype) { return self->initAllgatherContext(comm, input, output, inputSize, dtype); },
+        [self](const void* input, void* output, size_t inputSize, size_t outputSize, mscclpp::DataType dtype) {
+          return self->generateAllgatherContextKey(input, output, inputSize, outputSize, dtype);
         });
     return allgatherAlgo;
   }

  private:
-  std::vector<std::shared_ptr<mscclpp::Connection>> conns_;
+  std::vector<mscclpp::Connection> conns_;
   std::shared_ptr<mscclpp::ProxyService> proxyService_;
   int worldSize_;

   void initialize(std::shared_ptr<mscclpp::Communicator> comm) {
-    std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connectionFutures;
+    std::vector<std::shared_future<mscclpp::Connection>> connectionFutures;
     worldSize_ = comm->bootstrap()->getNranks();
     for (int i = 0; i < worldSize_; i++) {
       if (i == comm->bootstrap()->getRank()) continue;
       connectionFutures.push_back(comm->connect(mscclpp::Transport::CudaIpc, i));
     }
-    std::vector<std::shared_ptr<mscclpp::Connection>> connections;
+    std::vector<mscclpp::Connection> connections;
     std::transform(connectionFutures.begin(), connectionFutures.end(), std::back_inserter(connections),
                    [](const auto& future) { return future.get(); });
     this->conns_ = std::move(connections);
     proxyService_ = std::make_shared<mscclpp::ProxyService>();
-    proxyService_->startProxy();
+    proxyService_->startProxy(true);
   }

   ncclResult_t allgatherKernelFunc(const std::shared_ptr<mscclpp::AlgorithmCtx> ctx, const void* input, void* output,
-                                   size_t inputBytes, [[maybe_unused]] ncclDataType_t dtype, cudaStream_t stream,
+                                   size_t inputBytes, [[maybe_unused]] mscclpp::DataType dtype, cudaStream_t stream,
                                    std::unordered_map<std::string, uintptr_t>& extras) {
     int rank = ctx->rank;
     int worldSize = ctx->workSize;
@@ -107,7 +105,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {

   std::shared_ptr<mscclpp::AlgorithmCtx> initAllgatherContext(std::shared_ptr<mscclpp::Communicator> comm,
                                                               const void* input, void* output, size_t inputBytes,
-                                                              ncclDataType_t dtype) {
+                                                              mscclpp::DataType dtype) {
     auto ctx = std::make_shared<mscclpp::AlgorithmCtx>();
     ctx->rank = comm->bootstrap()->getRank();
     ctx->workSize = comm->bootstrap()->getNranks();
@@ -149,7 +147,7 @@ class AllgatherAlgoBuilder : public mscclpp::AlgorithmBuilder {
   }

   mscclpp::AlgorithmCtxKey generateAllgatherContextKey(const void* input, void* output, size_t inputSize,
-                                                       size_t outputSize, ncclDataType_t dtype) {
+                                                       size_t outputSize, mscclpp::DataType dtype) {
     return {(void*)input, output, inputSize, outputSize, 0};
   }
 };

examples/torch-integration/dsl_with_nccl_api.py

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.

-# LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 torch-integration/dsl_with_nccl_api.py
+# LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py

 import os
 from typing import Any, Dict
@@ -111,6 +111,7 @@ def main():
     dist.all_reduce(x, op=dist.ReduceOp.SUM)
     dist.barrier()
     dist.destroy_process_group()
+    print(f"Rank {local} allreduce completed successfully.")


 if __name__ == "__main__":

python/csrc/core_py.cpp

Lines changed: 7 additions & 0 deletions
@@ -36,6 +36,13 @@ void def_shared_future(nb::handle& m, const std::string& typestr) {
 void register_core(nb::module_& m) {
   m.def("version", &version);

+  nb::enum_<DataType>(m, "DataType")
+      .value("int32", DataType::INT32)
+      .value("uint32", DataType::UINT32)
+      .value("float16", DataType::FLOAT16)
+      .value("float32", DataType::FLOAT32)
+      .value("bfloat16", DataType::BFLOAT16);
+
   nb::class_<Bootstrap>(m, "Bootstrap")
       .def("get_rank", &Bootstrap::getRank)
       .def("get_n_ranks", &Bootstrap::getNranks)

python/csrc/executor_py.cpp

Lines changed: 0 additions & 7 deletions
@@ -15,13 +15,6 @@ namespace nb = nanobind;
 using namespace mscclpp;

 void register_executor(nb::module_& m) {
-  nb::enum_<DataType>(m, "DataType")
-      .value("int32", DataType::INT32)
-      .value("uint32", DataType::UINT32)
-      .value("float16", DataType::FLOAT16)
-      .value("float32", DataType::FLOAT32)
-      .value("bfloat16", DataType::BFLOAT16);
-
   nb::enum_<PacketType>(m, "PacketType").value("LL8", PacketType::LL8).value("LL16", PacketType::LL16);

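Taken together with the previous file, this moves the `DataType` binding from `register_executor` into `register_core`, so the enum is defined once, alongside the core types, rather than with the executor bindings. A minimal sketch of the Python-side effect; the top-level import path is an assumption based on how `python/mscclpp/_algorithm.py` imports `DataType` below, not something this commit shows directly:

    # Sketch only: DataType now comes from the core bindings.
    # The top-level re-export is assumed from the package's usual layout.
    from mscclpp import DataType

    dtype = DataType.float16  # members bound above: int32, uint32, float16, float32, bfloat16
    print(dtype)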

python/mscclpp/_algorithm.py

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@
     AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
     Communicator,
     CollectiveBufferMode,
-    DeviceType,
+    DataType,
     Executor,
     ExecutionPlan,
 )
@@ -100,7 +100,7 @@ def execute(
         output_buffer: int,
         input_size: int,
         output_size: int,
-        dtype: DeviceType,
+        dtype: DataType,
         stream: int,
         executor: Optional[Executor] = None,
         extras: Optional[Dict[str, int]] = None,
@@ -111,7 +111,7 @@ def execute(
             int(output_buffer),
             input_size,
             output_size,
-            int(dtype),
+            dtype,
             int(stream),
             executor,
             extras if extras is not None else {}
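
With the annotation corrected from `DeviceType` to `DataType`, `execute` now forwards the enum itself instead of coercing it with `int(...)`. An illustrative call under assumed placeholder names; `algo`, `in_ptr`, `out_ptr`, `nbytes`, and `stream_ptr` are not from this commit, and the leading `input_buffer` parameter name is inferred, since the diff shows the signature only from `output_buffer` on:

    # Illustrative only: pass the DataType member directly; no int() coercion.
    algo.execute(
        input_buffer=in_ptr,
        output_buffer=out_ptr,
        input_size=nbytes,
        output_size=nbytes,
        dtype=DataType.float32,
        stream=stream_ptr,
    )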
