Merged
Commits
133 commits
a91d273
WIP
Binyang2014 Aug 18, 2025
2fb4d74
WIP
Binyang2014 Aug 18, 2025
cc96414
WIP
Binyang2014 Aug 19, 2025
d561a5c
WIP
Binyang2014 Aug 19, 2025
0dfc13a
WIP
Binyang2014 Aug 19, 2025
11b3bdb
WIP
Binyang2014 Aug 19, 2025
a9e98ec
WIP
Binyang2014 Aug 20, 2025
f89ffe4
WIP
Binyang2014 Aug 21, 2025
82211f6
WIP
Binyang2014 Aug 21, 2025
978a182
WIP
Binyang2014 Aug 21, 2025
b89697d
WIP
Binyang2014 Aug 22, 2025
0f7ce20
Fix correctness
Binyang2014 Aug 22, 2025
e620110
WIP
Binyang2014 Aug 22, 2025
9df8ca3
clean code
Binyang2014 Aug 22, 2025
c922620
all works
Binyang2014 Aug 23, 2025
b5a8793
WIP
Binyang2014 Aug 23, 2025
f1a905f
Merge branch 'main' into binyli/nccl-algo
Binyang2014 Aug 23, 2025
1235a90
clean up
Binyang2014 Aug 23, 2025
030a4c2
WIP
Binyang2014 Aug 23, 2025
7bcd613
bug fix
Binyang2014 Aug 24, 2025
f1cde8b
fix compile error
Binyang2014 Aug 24, 2025
775f48e
WIP
Binyang2014 Aug 24, 2025
ca638b5
for amd
Binyang2014 Aug 24, 2025
646656b
WIP
Binyang2014 Aug 24, 2025
25f4ee2
WIP
Binyang2014 Aug 24, 2025
52211ce
WIP
Binyang2014 Aug 24, 2025
ffae384
add logs
Binyang2014 Aug 24, 2025
7be42b9
merge main
Binyang2014 Aug 25, 2025
f5b6f48
Merge branch 'main' into binyli/nccl-algo
Binyang2014 Aug 25, 2025
81ba828
WIP
Binyang2014 Aug 26, 2025
40f66e3
Update docs/guide/msccl_dsl_integration.md
Binyang2014 Aug 26, 2025
02f608a
address comments
Binyang2014 Aug 26, 2025
2313197
remove nccl.h from algorithm file
Binyang2014 Aug 26, 2025
1d6aee0
move algo to core lib
Binyang2014 Aug 26, 2025
3d3842b
WIP
Binyang2014 Aug 27, 2025
8e57b6b
WIP
Binyang2014 Aug 28, 2025
bea70b0
WIP
Binyang2014 Aug 28, 2025
b025bd3
WIP
Binyang2014 Aug 29, 2025
cd28260
WIP
Binyang2014 Aug 29, 2025
e2d33fb
example works
Binyang2014 Aug 29, 2025
d2941f1
add doc
Binyang2014 Aug 29, 2025
d1e636a
WIp
Binyang2014 Aug 29, 2025
33c6eb5
WIP
Binyang2014 Sep 2, 2025
44196ec
update doc
Binyang2014 Sep 2, 2025
0cbe6d7
update
Binyang2014 Sep 3, 2025
b51edcf
Merge branch 'main' into binyli/py-api
Binyang2014 Sep 4, 2025
04e53c8
refactor
Binyang2014 Sep 9, 2025
4f91a50
merge main
Binyang2014 Sep 9, 2025
6583661
WIP
Binyang2014 Sep 9, 2025
351b5b0
WIP
Binyang2014 Sep 9, 2025
3b3c2d4
Merge branch 'main' into binyli/py-api
Binyang2014 Sep 10, 2025
652b78c
WIP
Binyang2014 Sep 10, 2025
bfd1171
WIP
Binyang2014 Sep 11, 2025
dba7b5d
WIP
Binyang2014 Sep 11, 2025
bd38165
fix and update
Binyang2014 Sep 11, 2025
6c324a6
WIP
Binyang2014 Sep 11, 2025
c28d401
Merge branch 'binyli/nccl-algo' into binyli/py-api
Binyang2014 Sep 11, 2025
b1b0338
Merge branch 'main' into binyli/py-api
Binyang2014 Sep 11, 2025
6d6f9fa
WIP
Binyang2014 Sep 12, 2025
209ae57
WIP
Binyang2014 Sep 12, 2025
acea3e1
pass compile
Binyang2014 Sep 12, 2025
a65cbb1
WIP
Binyang2014 Sep 12, 2025
bfbdded
WIP
Binyang2014 Sep 12, 2025
3102fb5
Fix
Binyang2014 Sep 13, 2025
13d485e
WIP
Binyang2014 Sep 13, 2025
4ba2c0d
WIP
Binyang2014 Sep 15, 2025
eb8d926
WIP
Binyang2014 Sep 15, 2025
a2fa58e
update
Binyang2014 Sep 15, 2025
4e233fc
working with leak
Binyang2014 Sep 15, 2025
1b4023e
works for now
Binyang2014 Sep 15, 2025
ccf635f
WIP
Binyang2014 Sep 15, 2025
fd5fbc8
WIP
Binyang2014 Sep 16, 2025
e76073f
WIP
Binyang2014 Sep 16, 2025
1606dad
WIP
Binyang2014 Sep 23, 2025
c00f88f
WIP
Binyang2014 Sep 23, 2025
a1fc471
WIP
Binyang2014 Sep 23, 2025
a9ec4ef
merge main
Binyang2014 Sep 26, 2025
8e154d9
resolving conflicts
caiomcbr Oct 2, 2025
7fde9c1
wip
caiomcbr Oct 6, 2025
3db9f4b
Merge branch 'main' into binyli/py-api
caiomcbr Oct 6, 2025
ace7301
wip
caiomcbr Oct 6, 2025
f8ca314
wip
caiomcbr Oct 6, 2025
92b4d3a
wip
caiomcbr Oct 6, 2025
6ad10eb
Merge branch 'main' into binyli/py-api
Binyang2014 Oct 6, 2025
f05b857
lint
Binyang2014 Oct 6, 2025
1cb68ac
wip
caiomcbr Oct 7, 2025
175f49f
wip
caiomcbr Oct 7, 2025
1a3e93a
wip
caiomcbr Oct 7, 2025
0da9515
fix
Binyang2014 Oct 7, 2025
ef91b29
wip
caiomcbr Oct 7, 2025
15e018e
wip
caiomcbr Oct 7, 2025
0502327
update
Binyang2014 Oct 7, 2025
10b453b
fix hang issue
Oct 9, 2025
92ad551
wip
caiomcbr Oct 10, 2025
f595852
fix
Oct 10, 2025
7912487
bug fix
Binyang2014 Oct 12, 2025
a442f37
wip
caiomcbr Oct 15, 2025
9d7386a
wip
caiomcbr Oct 15, 2025
248461e
wip
caiomcbr Oct 15, 2025
3eefbeb
wip
caiomcbr Oct 15, 2025
43c2155
wip
caiomcbr Oct 16, 2025
be75d20
wip
caiomcbr Oct 16, 2025
2a0e39d
wip
caiomcbr Oct 16, 2025
dff8d86
wip
caiomcbr Oct 16, 2025
df9fcd0
WIP
Binyang2014 Oct 21, 2025
cbee2f6
Merge branch 'main' into binyli/py-api
Binyang2014 Oct 21, 2025
9b1afd7
address comments
Binyang2014 Oct 21, 2025
988935f
wip
caiomcbr Oct 21, 2025
c97b9c3
wip
caiomcbr Oct 22, 2025
cd9379d
wip
caiomcbr Oct 22, 2025
43a4417
wip
caiomcbr Oct 22, 2025
c3fea11
wip
caiomcbr Oct 22, 2025
305b16b
wip
caiomcbr Oct 24, 2025
a2b84da
wip
caiomcbr Oct 24, 2025
7a8e183
merge main
Binyang2014 Oct 27, 2025
fbe6911
fix test
Binyang2014 Oct 27, 2025
858c3b7
minor fix
Binyang2014 Oct 27, 2025
8e14e97
wip
caiomcbr Oct 27, 2025
35a7e04
update doc
Binyang2014 Oct 27, 2025
0d5cb6a
update the doc
Binyang2014 Oct 27, 2025
a01b5aa
Merge branch 'main' into binyli/py-api
Binyang2014 Oct 27, 2025
517767f
fix doc build
Binyang2014 Oct 27, 2025
08a8903
Merge branch 'main' into binyli/py-api
Binyang2014 Oct 27, 2025
f4a77c5
Fix rocm build issue
Binyang2014 Oct 27, 2025
2e79d28
Update python/mscclpp/__init__.py
Binyang2014 Oct 27, 2025
6af6fd0
Update python/mscclpp/__init__.py
Binyang2014 Oct 27, 2025
d18f276
Update python/mscclpp/language/default_algos/allreduce_2nodes.py
Binyang2014 Oct 27, 2025
87beb94
Update src/executor/execution_plan.cc
Binyang2014 Oct 28, 2025
f89a6f9
Fix
Binyang2014 Oct 28, 2025
42ef9cc
lint
Binyang2014 Oct 28, 2025
ceaf4d6
wip
caiomcbr Oct 28, 2025
488fda3
wip
caiomcbr Oct 29, 2025
860f5ea
Merge branch 'main' into binyli/py-api
chhwang Oct 29, 2025
1 change: 0 additions & 1 deletion apps/nccl/src/allreduce.hpp
@@ -1237,7 +1237,6 @@ class AllreduceNvlsPacket : public mscclpp::AlgorithmBuilder {

size_t scratchBufferSize_;
std::shared_ptr<char> scratchBuffer_;
const int nSegmentsForScratchBuffer_ = 2;
const size_t nvlsBufferSize_ = (1 << 30);

std::shared_ptr<uint32_t> deviceFlag_;
149 changes: 51 additions & 98 deletions apps/nccl/src/nccl.cu
@@ -120,7 +120,8 @@ static inline int mscclppNcclDlopenInit() {
return dlopenSuccess;
}

static inline void mscclppNcclDlopenFinalize() {
// No need to call this function, handle will be closed at program exit
[[maybe_unused]] static inline void mscclppNcclDlopenFinalize() {
if (mscclppNcclDlHandle) {
dlclose(mscclppNcclDlHandle);
}
@@ -159,17 +160,6 @@ static bool tryLoadNcclSharedLib() {
// Declare the global map to store associations between raw pointer and shared pointer
static std::unordered_map<void*, std::shared_ptr<char>> ptrMap;

struct planKey {
size_t minMessageSize;
size_t maxMessageSize;
bool isInPlace;
};

struct executionPlanInstance {
planKey key;
std::shared_ptr<mscclpp::ExecutionPlan> plan;
};

struct splitCommInfo {
int color;
int key;
@@ -179,23 +169,16 @@ struct splitCommInfo {
struct ncclComm {
std::shared_ptr<mscclpp::Communicator> comm;
std::shared_ptr<mscclpp::Executor> executor;
std::unordered_map<std::string, std::vector<executionPlanInstance>> executionPlans;
std::shared_ptr<mscclpp::AlgorithmCollection> algorithmCollection;
std::shared_ptr<char> scratchBuffer_;
const size_t scratchBufferSize_ = (1 << 27); // 128MB
std::shared_ptr<mscclpp::ExecutionPlanRegistry> planRegistry_;
int nRanksPerNode;
int worldSize;

void* mscclppNcclComm;
};

static std::pair<std::string, executionPlanInstance> loadExecutionPlan(const std::string& filename, int rank) {
std::shared_ptr<mscclpp::ExecutionPlan> plan = std::make_shared<mscclpp::ExecutionPlan>(filename, rank);
std::string collective = plan->collective();
planKey key{plan->minMessageSize(), plan->maxMessageSize(), plan->isInPlace()};
return std::make_pair(collective, executionPlanInstance{key, plan});
}

static ncclResult_t executeWithPlan(std::shared_ptr<mscclpp::Executor> executor, int rank, ncclDataType_t datatype,
const void* sendbuff, void* recvbuff, size_t sendBytes, size_t recvBytes,
std::shared_ptr<mscclpp::ExecutionPlan> plan, cudaStream_t stream) {
@@ -352,6 +335,20 @@ static mscclpp::Algorithm algoSelector(
return mscclpp::Algorithm();
}

std::shared_ptr<mscclpp::ExecutionPlanHandle> executionPlanDefaultSelector(
const std::vector<std::shared_ptr<mscclpp::ExecutionPlanHandle>> plans, const mscclpp::ExecutionRequest&) {
if (plans.empty()) {
INFO(MSCCLPP_NCCL, "No execution plans available for selection");
return nullptr;
}
for (auto plan : plans) {
if (plan->tags.find("default") == plan->tags.end()) {
return plan;
}
}
return plans[0];
}

NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) {
INFO(MSCCLPP_NCCL, "Initializing NCCL communicator for rank %d, world_size=%d", rank, nranks);
if (comm == nullptr) {
@@ -371,29 +368,13 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI

commPtr->comm = mscclppComm;
commPtr->scratchBuffer_ = mscclpp::GpuBuffer<char>(commPtr->scratchBufferSize_).memory();
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm);
commPtr->executor = std::make_shared<mscclpp::Executor>(mscclppComm, commPtr->scratchBuffer_);
commPtr->planRegistry_ = mscclpp::ExecutionPlanRegistry::getInstance();

commPtr->nRanksPerNode = mscclppComm->bootstrap()->getNranksPerNode();
commPtr->worldSize = mscclppComm->bootstrap()->getNranks();

if (commPtr->worldSize == 1) {
*comm = commPtr;
return ncclSuccess;
}

const std::string& collectiveDir = mscclpp::env()->executionPlanDir;
if (collectiveDir != "") {
if (!std::filesystem::is_directory(collectiveDir)) {
WARN("The value of the environment variable %s is not a directory", collectiveDir.c_str());
return ncclInvalidArgument;
}
for (const auto& entry : std::filesystem::directory_iterator(collectiveDir)) {
if (entry.is_regular_file()) {
auto plan = loadExecutionPlan(entry.path(), rank);
commPtr->executionPlans[plan.first].push_back(plan.second);
}
}
}

commPtr->planRegistry_->loadDefaultPlans(rank);
commPtr->planRegistry_->setDefaultSelector(executionPlanDefaultSelector);
mscclpp::AlgorithmCollectionBuilder::getInstance()->setFallbackAlgorithmSelector(algoSelector);
registerCustomizedAlgo();
commPtr->algorithmCollection = mscclpp::AlgorithmCollectionBuilder::getInstance()->build();
@@ -462,12 +443,12 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
}
#endif

if (mscclppNcclDlopenSharedLib == true) {
mscclppNcclOps.CommDestroy(*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm));
mscclppNcclDlopenFinalize();
delete static_cast<ncclComm_t*>(comm->mscclppNcclComm);
}
ncclComm_t* mscclppNcclCommPtr = reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm);
delete comm;
if (mscclppNcclCommPtr != nullptr) {
mscclppNcclOps.CommDestroy(*reinterpret_cast<ncclComm_t*>(mscclppNcclCommPtr));
delete static_cast<ncclComm_t*>(mscclppNcclCommPtr);
}
return ncclSuccess;
}

@@ -646,18 +627,13 @@ NCCL_API ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}

std::vector<executionPlanInstance>& plans = comm->executionPlans["broadcast"];
std::shared_ptr<mscclpp::ExecutionPlan> plan;
bool inPlace = sendbuff == recvbuff;
for (const auto& p : plans) {
if (bytes >= p.key.minMessageSize && bytes < p.key.maxMessageSize && inPlace == p.key.isInPlace) {
plan = p.plan;
break;
}
}

if (plan != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, bytes, plan, stream);
static std::unordered_map<std::string, std::vector<uint64_t>> hints{{"root", {static_cast<uint64_t>(root)}}};
hints["root"][0] = static_cast<uint64_t>(root);
auto planHandle = comm->planRegistry_->select("broadcast", comm->comm->bootstrap()->getNranks(),
comm->comm->bootstrap()->getNranksPerNode(),
comm->comm->bootstrap()->getRank(), sendbuff, recvbuff, bytes, hints);
if (planHandle != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, bytes, planHandle->plan, stream);
}
auto algo = comm->algorithmCollection->selectAlgorithm(
"broadcast", sendbuff, recvbuff, count * ncclTypeSize(datatype), datatype,
@@ -706,18 +682,11 @@ NCCL_API ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}

std::vector<executionPlanInstance>& plans = comm->executionPlans["allreduce"];
std::shared_ptr<mscclpp::ExecutionPlan> plan;
bool inPlace = sendbuff == recvbuff;
for (const auto& p : plans) {
if (bytes >= p.key.minMessageSize && bytes < p.key.maxMessageSize && inPlace == p.key.isInPlace) {
plan = p.plan;
break;
}
}

if (plan != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, bytes, plan, stream);
auto planHandler = comm->planRegistry_->select("allreduce", comm->comm->bootstrap()->getNranks(),
comm->comm->bootstrap()->getNranksPerNode(),
comm->comm->bootstrap()->getRank(), sendbuff, recvbuff, bytes, {});
if (planHandler != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, bytes, planHandler->plan, stream);
}

auto algo = comm->algorithmCollection->selectAlgorithm(
@@ -769,20 +738,12 @@ NCCL_API ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, si
int rank = comm->comm->bootstrap()->getRank();
int nRank = comm->comm->bootstrap()->getNranks();

std::vector<executionPlanInstance>& plans = comm->executionPlans["reducescatter"];
std::shared_ptr<mscclpp::ExecutionPlan> plan;
void* basePtr = (char*)sendbuff + rank * bytes;
bool inPlace = basePtr == recvbuff;
const size_t totalBytes = bytes * nRank;
for (const auto& p : plans) {
if (totalBytes >= p.key.minMessageSize && totalBytes < p.key.maxMessageSize && inPlace == p.key.isInPlace) {
plan = p.plan;
break;
}
}

if (plan != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, totalBytes, bytes, plan, stream);
auto planHandle = comm->planRegistry_->select("reducescatter", comm->comm->bootstrap()->getNranks(),
comm->comm->bootstrap()->getNranksPerNode(),
comm->comm->bootstrap()->getRank(), sendbuff, recvbuff, bytes, {});
if (planHandle != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes * nRank, bytes, planHandle->plan,
stream);
}

if (mscclppNcclDlopenSharedLib == true) {
@@ -821,20 +782,12 @@ NCCL_API ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t
*reinterpret_cast<ncclComm_t*>(comm->mscclppNcclComm), stream);
}

std::vector<executionPlanInstance>& plans = comm->executionPlans["allgather"];
std::shared_ptr<mscclpp::ExecutionPlan> plan;
void* basePtr = (char*)sendbuff - rank * bytes;
bool inPlace = basePtr == recvbuff;
const size_t totalBytes = bytes * nRank;
for (const auto& p : plans) {
if (totalBytes >= p.key.minMessageSize && totalBytes < p.key.maxMessageSize && inPlace == p.key.isInPlace) {
plan = p.plan;
break;
}
}

if (plan != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, totalBytes, plan, stream);
auto planHandle = comm->planRegistry_->select("allgather", comm->comm->bootstrap()->getNranks(),
comm->comm->bootstrap()->getNranksPerNode(),
comm->comm->bootstrap()->getRank(), sendbuff, recvbuff, bytes, {});
if (planHandle != nullptr) {
return executeWithPlan(comm->executor, rank, datatype, sendbuff, recvbuff, bytes, bytes * nRank, planHandle->plan,
stream);
}

auto algo = comm->algorithmCollection->selectAlgorithm(
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -49,7 +49,7 @@
"show-inheritance": True,
}
# only mock the C-extension when using the source tree
autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_mock_imports = ["mscclpp._version", "mscclpp._mscclpp", "blake3", "cupy", "mpi4py", "numpy", "sortedcontainers"]
autodoc_typehints = "description"
napoleon_google_docstring = True
napoleon_numpy_docstring = True
126 changes: 126 additions & 0 deletions docs/guide/mscclpp-dsl-integration.md
@@ -0,0 +1,126 @@
# MSCCL++ DSL Integration Guide

The MSCCL++ DSL (domain-specific language) lets you express collective algorithms concisely as Python functions.
MSCCL++ also provides Pythonic utilities to author, JIT-compile, register, and select execution plans. This guide walks through two integration paths: a customized MSCCL++ communicator, and NCCL interposition that accelerates existing PyTorch `backend="nccl"` workloads.

## Initial Setup

Run the following from the repository root after completing the basic project setup:

1. Install Python dependencies.
```bash
pip install -r ./python/<requirements_file>
```
Replace `<requirements_file>` with the file that matches your environment (e.g., `requirements_cuda11.txt`, `requirements_cuda12.txt`, or `requirements_rocm6.txt`).

2. Install the module and generate default algorithm plans.
```bash
pip install . && python3 -m mscclpp --install
```

## Integration Options

MSCCL++ DSL integrates into your training or inference workload in two ways:
1. **Custom MSCCL++ Communicator** — directly manage an MSCCL++ communicator and launch collectives with the MSCCL++ executor.
2. **NCCL Interposition** — keep using `backend="nccl"`; MSCCL++ intercepts NCCL calls at runtime for drop-in acceleration.

Both paths follow the same high-level flow:
1. Author (or reuse) a collective algorithm with the MSCCL++ DSL.
2. Compile it into an execution plan.
3. Register the plan with the MSCCL++ runtime.
4. Configure a selector to choose the plan for each collective call.

Below we show an AllReduce example and then detail each integration option.

### Example: AllReduce in the MSCCL++ DSL
The snippet below defines an AllReduce that uses NVLS for an intra-node reduce-scatter followed by a broadcast.
```python
# Imports are assumed for completeness; the DSL symbols are expected to come
# from mscclpp.language (check the example sources for the exact import list).
import mscclpp
from mscclpp.language import *


def allreduce_nvls(spec: mscclpp.AlgoSpec) -> CollectiveProgram:
    gpu_size = spec.world_size
    with CollectiveProgram(
        spec.name,
        spec.collective,
        gpu_size,
        instances=8,
        protocol=spec.protocol,
        num_threads_per_block=spec.num_threads_per_block,
        min_message_size=spec.min_message_size,
        max_message_size=spec.max_message_size,
    ) as program:
        # Create channels: one NVLS switch channel over all ranks, plus
        # peer-to-peer memory channels for synchronization
        nvls_chan = SwitchChannel(rank_list=[gpu for gpu in range(gpu_size)], buffer_type=BufferType.input)
        channels = {}
        for gpu in range(gpu_size):
            for peer in range(gpu_size):
                if peer != gpu:
                    channels[(peer, gpu)] = MemoryChannel(peer, gpu)

        # Synchronize to ensure all GPUs are ready
        for gpu in range(gpu_size):
            src_rank = gpu
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True)
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True, data_sync=SyncType.after)
        # Reduce and store the data
        for gpu in range(gpu_size):
            buffer_offset = gpu
            rank = Rank(gpu)
            input_buffer = rank.get_input_buffer()
            nvls_chan.at_rank(gpu).reduce(
                buffer_offset=buffer_offset, size=1, dst_chunk=input_buffer[gpu : gpu + 1], tb=0
            )
            nvls_chan.at_rank(gpu).broadcast(
                src_chunk=input_buffer[gpu : gpu + 1], buffer_offset=buffer_offset, size=1, tb=0
            )
        # Synchronize to ensure all GPUs have finished
        for gpu in range(gpu_size):
            src_rank = gpu
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].signal(tb=0, relaxed=True, data_sync=SyncType.before)
            for peer in range(gpu_size):
                if peer != src_rank:
                    dst_rank = peer
                    channels[(dst_rank, src_rank)].wait(tb=0, relaxed=True)

    return program
```
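
Once the function is defined, it still has to be compiled into an execution plan and registered with the runtime (steps 2 and 3 of the flow above). The sketch below only illustrates that shape: the helper names `compile()` and `register_plan()` are assumptions for illustration, not the confirmed API; see `examples/torch-integration/customized_comm.py` for the actual calls.

```python
# Illustrative sketch only: compile() and register_plan() are assumed names,
# not the confirmed MSCCL++ API. The AlgoSpec fields mirror those the
# example above reads; the constructor shown here is an assumption.
import mscclpp

spec = mscclpp.AlgoSpec(
    name="allreduce_nvls",
    collective="allreduce",
    world_size=8,
    protocol="Simple",            # assumed protocol label
    num_threads_per_block=1024,
    min_message_size=0,
    max_message_size=2**30,
)

program = allreduce_nvls(spec)    # author the program with the DSL
plan = program.compile()          # assumed: JIT-compile into an execution plan
mscclpp.register_plan(plan)       # assumed: register so a selector can pick it
```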

### Integrate with a Customized MSCCL++ Communicator
Use this path when you want a PyTorch-compatible interface with fine-grained control: you manage the communicator, compile and register DSL plans, and invoke collectives through a thin wrapper. The example below shows an AllReduce built on the MSCCL++ communicator and executor.
Example source directory:
```
examples/torch-integration
```
Key file: `customized_comm.py`.
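
At a high level, the example runs one process per GPU under `torchrun`, pins each process to its device, and bootstraps MSCCL++ from the `MSCCLPP_MASTER_ADDR`/`MSCCLPP_MASTER_PORT` environment variables. A minimal sketch of that per-process setup follows; the communicator and executor construction is elided, since the exact binding calls live in the example itself.

```python
# Per-process setup of the kind customized_comm.py performs (sketch only).
import os
import torch

rank = int(os.environ["RANK"])               # provided by torchrun
world_size = int(os.environ["WORLD_SIZE"])   # provided by torchrun
local_rank = int(os.environ["LOCAL_RANK"])   # provided by torchrun
torch.cuda.set_device(local_rank)            # one GPU per process

# MSCCL++ bootstrap address, read from the variables passed at launch.
master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
master_port = int(os.environ["MSCCLPP_MASTER_PORT"])
# ... construct the MSCCL++ communicator/executor and register DSL plans
# here, as done in customized_comm.py.
```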


#### Launch (single node)
```bash
MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 customized_comm.py
```

### Integrate via NCCL Interposition
Keep your script as-is: initialize PyTorch with `backend="nccl"`, and MSCCL++ intercepts the NCCL calls at runtime for drop-in acceleration.
Example source directory:
```
examples/torch-integration
```
Key file: `dsl_with_nccl_api.py`.
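
The script itself can stay plain PyTorch; nothing MSCCL++-specific is required in the code. A minimal sketch of the kind of script this path accelerates:

```python
# Ordinary PyTorch NCCL usage; under LD_PRELOAD the NCCL calls are
# intercepted by libmscclpp_nccl.so and routed to matching DSL plans.
import os
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")      # intercepted at runtime
local_rank = int(os.environ["LOCAL_RANK"])   # provided by torchrun
torch.cuda.set_device(local_rank)

x = torch.ones(1 << 20, device="cuda")       # one buffer per rank
dist.all_reduce(x)                           # served by a DSL plan when one matches
torch.cuda.synchronize()
print(f"rank {dist.get_rank()}: x[0] = {x[0].item()}")

dist.destroy_process_group()
```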

#### Launch with interposition
To run with NCCL interposition, preload the MSCCL++ shim so that it transparently intercepts the NCCL calls made by PyTorch's `nccl` backend.
```bash
LD_PRELOAD=<MSCCLPP_REPO>/build/apps/nccl/libmscclpp_nccl.so torchrun --nnodes=1 --nproc_per_node=8 dsl_with_nccl_api.py
```
## Notes
- When using NCCL interposition, the algorithm selection order is:
1. Check for registered DSL plans matching the collective call.
2. Check for a customized kernel implementation if no DSL plan fits.
3. Fall back to the default NCCL implementation (set `MSCCLPP_NCCL_LIB_PATH` to the original NCCL library).
1 change: 1 addition & 0 deletions docs/programming_guide.rst
@@ -13,3 +13,4 @@ This section provides advanced topics and best practices for using MSCCL++. It i
guide/cpp-examples
guide/mscclpp-dsl
guide/customized-algorithm-with-nccl-api
guide/mscclpp-dsl-integration