
Commit 203aa2b

Merge branch 'main' into chhwang/logger-fix
2 parents: 7ebc71d + ddf84a6

16 files changed: +315, -98 lines


docs/dsl_quick_start.md

Lines changed: 176 additions & 0 deletions
# DSL Quick Start

The MSCCL++ DSL (Domain Specific Language) provides a high-level Python API for defining custom collective communication algorithms. This guide will help you get started with writing and testing your own communication patterns.

## Installation

You can follow the same steps as in the [Quick Start](quickstart).

After finishing the installation in the quick start section, you can run the following command to install some default algorithms from the DSL:

```bash
python3 -m mscclpp --install
```

## Your First Algorithm: AllGather

Let's walk through a simple AllGather algorithm to understand the DSL basics. This example demonstrates the key concepts without diving into all the advanced features.

### Complete Example

```python
from mscclpp.language import *

def simple_allgather(name):
    """
    A simple AllGather implementation using the MSCCL++ DSL.

    This example demonstrates a 2-GPU AllGather where each GPU sends
    its data to all other GPUs, so all GPUs end up with everyone's data.

    Args:
        name: Algorithm name for identification
    """
    num_gpus = 2
    chunk_factor = 1  # Split data into num_gpus chunks

    # Define the collective operation
    collective = AllGather(num_gpus, chunk_factor, inplace=True)

    # Create the program context
    with CollectiveProgram(
        name,
        collective,
        num_gpus,
        protocol="Simple",  # Use Simple protocol (vs "LL" for low-latency)
        min_message_size=0,
        max_message_size=2**30  # 1GB
    ):
        # Loop over each source GPU rank
        for src_rank in range(num_gpus):
            # Create a Rank object for the source GPU
            rank = Rank(src_rank)
            # Get the output buffer where the data is stored
            src_buffer = rank.get_output_buffer()
            # Take a slice corresponding to this rank's data
            src_chunk = src_buffer[src_rank:src_rank + 1]

            # Loop over each destination GPU rank
            for dst_rank in range(num_gpus):
                # Skip sending from a rank to itself
                if src_rank != dst_rank:
                    # Create a Rank object for the destination GPU
                    dst_rank_obj = Rank(dst_rank)
                    # Get the destination buffer where data will be sent
                    dst_buffer = dst_rank_obj.get_output_buffer()
                    # Take a slice where the data will be placed
                    dst_chunk = dst_buffer[src_rank:src_rank + 1]

                    # Define a channel from src_rank → dst_rank
                    channel = MemoryChannel(dst_rank, src_rank)

                    # Step 1: Source signals it is ready to send data
                    channel.signal(tb=0, relaxed=True)

                    # Step 2: Wait for destination to be ready
                    channel.wait(tb=0, data_sync=SyncType.after, relaxed=True)

                    # Step 3: Source rank sends data to destination rank
                    channel.put(dst_chunk, src_chunk, tb=0)

                    # Step 4: Signal that put operation is complete
                    channel.signal(tb=0, data_sync=SyncType.before)

                    # Step 5: Wait for acknowledgment
                    channel.wait(tb=0, data_sync=SyncType.after)

        print(JSON())

simple_allgather("simple_allgather_2gpus")
```

### Key Concepts Explained

**1. Collective Definition**
```python
collective = AllGather(num_gpus, chunk_factor=1, inplace=True)
```
- Defines which collective operation to implement (AllGather in this case)
- `chunk_factor` determines the data chunking strategy
- `inplace=True` means input and output use the same buffer. For AllGather, the input buffer is a slice of the output buffer. For example, on rank 0, the input buffer is the first half of the output buffer, and on rank 1, the input buffer is the second half of the output buffer.

**2. Program Context**
```python
with CollectiveProgram(name, collective, num_gpus, ...):
```
- Sets up the execution environment
- Configures the protocol, threading, and message size range

**3. Ranks and Buffers**
```python
rank = Rank(src_rank)
src_buffer = rank.get_output_buffer()
src_chunk = src_buffer[src_rank:src_rank + 1]
```
- `Rank` represents a GPU in the collective
- Buffers hold the data being communicated
- Chunks are slices of buffers representing data portions

**4. Channels**
```python
channel = MemoryChannel(dst_rank, src_rank)
```
- Establishes communication paths between GPUs
- `MemoryChannel` is for intra-node communication (fast, direct memory access)
- Created for each source-destination pair
- You can also use `PortChannel` for inter-node communication

**5. Synchronization and Data Transfer**
```python
channel.signal(tb=0, relaxed=True)
channel.wait(tb=0, data_sync=SyncType.after, relaxed=True)
channel.put(dst_chunk, src_chunk, tb=0)
```
- `signal()`: Notify the remote GPU of state changes
- `wait()`: Wait for the remote GPU to reach a certain state
- `put()`: Write data from local to remote GPU memory
- `tb=0` assigns the operation to thread block 0
- `relaxed=True` uses relaxed memory ordering for performance

For more advanced topics such as fine-grained synchronization, scratch buffers, and pipelining, refer to the [full DSL documentation](py_api).

## Testing Your Algorithm

Once you've written your algorithm, run it and redirect the printed JSON to a file:

```bash
python3 path/to/simple_allgather.py > /path/to/simple_allgather.json
```

Then use `executor_test.py` to validate correctness and measure performance:

```bash
# Test with 2 GPUs on a single node
mpirun --allow-run-as-root -np 2 python3 python/test/executor_test.py \
  -path /path/to/simple_allgather.json \
  --size 1M \
  --in_place
```

## Next Steps

Now that you understand the basics:

1. **Explore Examples**: Check `python/mscclpp/language/tests/` for more algorithm examples
2. **Optimize**: Experiment with different chunk strategies, pipelining, and synchronization patterns
3. **Advanced Features**: Learn about scratch buffers, thread block groups, and packet-based communication

For detailed API documentation and advanced features, refer to:
- [Programming Guide](programming_guide)
- [Tutorials](tutorials)

## Troubleshooting

**Import Error**: If you see `ModuleNotFoundError: No module named 'mscclpp'`, ensure you've installed the package with `pip install .`
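
As a quick check (a minimal sketch, not part of the guide above), you can confirm which environment the package landed in by importing the modules used throughout this guide:

```python
# Minimal sanity check: run with the same python3 you used for installation.
import mscclpp           # core Python bindings
import mscclpp.language  # DSL frontend used by the examples in this guide

print("mscclpp imported from:", mscclpp.__file__)
```
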
For more help, please file an issue on the [GitHub repository](https://github.com/microsoft/mscclpp/issues).

docs/index.rst

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,7 @@ You can find the followings from this documentation.
 
 - **Overview:** An overview of MSCCL++ and its features. :doc:`🔗 <overview>`
 - **Quick Start:** A guide to build, install, and run MSCCL++. :doc:`🔗 <quickstart>`
+- **DSL Quick Start:** A guide to get started with the MSCCL++ DSL for defining custom algorithms. :doc:`🔗 <dsl_quick_start>`
 - **Tutorials:** A step-by-step guide for GPU communication using MSCCL++. :doc:`🔗 <tutorials>`
 - **Programming Guide:** Advanced topics and best practices for using MSCCL++. :doc:`🔗 <programming_guide>`
 - **C++ API Reference:** Detailed documentation of the MSCCL++ C++ API. :doc:`🔗 <cpp_api>`
@@ -21,6 +22,7 @@ You can find the followings from this documentation.
 
    overview
    quickstart
+   dsl_quick_start
    tutorials
    programming_guide
    cpp_api

include/mscclpp/gpu_utils.hpp

Lines changed: 17 additions & 0 deletions
@@ -41,10 +41,26 @@ namespace mscclpp {
 struct AvoidCudaGraphCaptureGuard {
   AvoidCudaGraphCaptureGuard();
   ~AvoidCudaGraphCaptureGuard();
+  AvoidCudaGraphCaptureGuard(const AvoidCudaGraphCaptureGuard&) = delete;
+  AvoidCudaGraphCaptureGuard& operator=(const AvoidCudaGraphCaptureGuard&) = delete;
+  AvoidCudaGraphCaptureGuard(AvoidCudaGraphCaptureGuard&&) = delete;
+  AvoidCudaGraphCaptureGuard& operator=(AvoidCudaGraphCaptureGuard&&) = delete;
   cudaStreamCaptureMode mode_;
   bool active_;
 };
 
+/// A RAII guard that will set the current device on construction and restore the previous device on destruction.
+struct CudaDeviceGuard {
+  CudaDeviceGuard(int deviceId);
+  ~CudaDeviceGuard();
+  CudaDeviceGuard(const CudaDeviceGuard&) = delete;
+  CudaDeviceGuard& operator=(const CudaDeviceGuard&) = delete;
+  CudaDeviceGuard(CudaDeviceGuard&&) = delete;
+  CudaDeviceGuard& operator=(CudaDeviceGuard&&) = delete;
+  int deviceId_;
+  int origDeviceId_;
+};
+
 /// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
 struct CudaStreamWithFlags {
   /// Constructor without flags. This will not create any stream. set() can be called later to create a stream with
@@ -128,6 +144,7 @@ std::shared_ptr<GpuStreamPool> gpuStreamPool();
 namespace detail {
 
 void setReadWriteMemoryAccess(void* base, size_t size);
+int gpuIdFromAddress(void* ptr);
 
 void* gpuCalloc(size_t bytes);
 void* gpuCallocHost(size_t bytes, unsigned int flags);

python/csrc/core_py.cpp

Lines changed: 2 additions & 1 deletion
@@ -216,6 +216,7 @@ void register_core(nb::module_& m) {
 
   def_shared_future<RegisteredMemory>(m, "RegisteredMemory");
   def_shared_future<Connection>(m, "Connection");
+  def_shared_future<Semaphore>(m, "Semaphore");
 
   nb::class_<Communicator>(m, "Communicator")
       .def(nb::init<std::shared_ptr<Bootstrap>, std::shared_ptr<Context>>(), nb::arg("bootstrap"),
@@ -242,7 +243,7 @@ void register_core(nb::module_& m) {
            nb::arg("remote_rank"), nb::arg("tag"), nb::arg("local_config"))
      .def("send_memory_on_setup", &Communicator::sendMemory, nb::arg("memory"), nb::arg("remote_rank"), nb::arg("tag"))
      .def("recv_memory_on_setup", &Communicator::recvMemory, nb::arg("remote_rank"), nb::arg("tag"))
-      .def("build_semaphore", &Communicator::buildSemaphore, nb::arg("local_flag"), nb::arg("remote_rank"),
+      .def("build_semaphore", &Communicator::buildSemaphore, nb::arg("connection"), nb::arg("remote_rank"),
           nb::arg("tag") = 0)
      .def("remote_rank_of", &Communicator::remoteRankOf)
      .def("tag_of", &Communicator::tagOf)

python/csrc/memory_channel_py.cpp

Lines changed: 14 additions & 8 deletions
@@ -26,14 +26,20 @@ void register_memory_channel(nb::module_& m) {
 
   nb::class_<MemoryChannel>(m, "MemoryChannel")
       .def(nb::init<>())
-      .def("__init__",
-           [](MemoryChannel* memoryChannel, std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore,
-              RegisteredMemory dst, RegisteredMemory src) { new (memoryChannel) MemoryChannel(semaphore, dst, src); })
-      .def("__init__",
-           [](MemoryChannel* memoryChannel, std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore,
-              RegisteredMemory dst, RegisteredMemory src, uintptr_t packet_buffer) {
-             new (memoryChannel) MemoryChannel(semaphore, dst, src, reinterpret_cast<void*>(packet_buffer));
-           })
+      .def(
+          "__init__",
+          [](MemoryChannel* memoryChannel, std::shared_ptr<MemoryDevice2DeviceSemaphore> semaphore,
+             RegisteredMemory dst, RegisteredMemory src, uintptr_t packet_buffer) {
+            new (memoryChannel) MemoryChannel(semaphore, dst, src, reinterpret_cast<void*>(packet_buffer));
+          },
+          nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0)
+      .def(
+          "__init__",
+          [](MemoryChannel* memoryChannel, const Semaphore& semaphore, RegisteredMemory dst, RegisteredMemory src,
+             uintptr_t packet_buffer = 0) {
+            new (memoryChannel) MemoryChannel(semaphore, dst, src, reinterpret_cast<void*>(packet_buffer));
+          },
+          nb::arg("semaphore"), nb::arg("dst"), nb::arg("src"), nb::arg("packet_buffer") = 0)
       .def("device_handle", &MemoryChannel::deviceHandle);
 
   nb::class_<MemoryChannel::DeviceHandle>(m, "MemoryChannelDeviceHandle")
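
The rewritten bindings above give `MemoryChannel` two Python constructors: the existing one taking a `MemoryDevice2DeviceSemaphore` and a new one taking the generic `Semaphore`, both with `packet_buffer` defaulting to 0. A rough sketch of how either overload is called (variable names are placeholders, not from this commit):

```python
# Sketch only: sem_d2d is a MemoryDevice2DeviceSemaphore, sem is a generic Semaphore,
# dst_mem/src_mem are RegisteredMemory handles for the remote and local buffers.
ch_legacy = MemoryChannel(sem_d2d, dst_mem, src_mem)  # packet_buffer defaults to 0
ch_new = MemoryChannel(sem, dst_mem, src_mem)         # new overload taking Semaphore
handle = ch_new.device_handle()                       # device-side handle, unchanged
```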

python/mscclpp/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -47,6 +47,7 @@
     connect_nvls_collective,
     EndpointConfig,
     Fifo,
+    Semaphore,
     Host2DeviceSemaphore,
     Host2HostSemaphore,
     numa,
@@ -79,6 +80,7 @@
     "connect_nvls_collective",
     "EndpointConfig",
     "Fifo",
+    "Semaphore",
     "Host2DeviceSemaphore",
     "Host2HostSemaphore",
     "numa",

python/mscclpp/comm.py

Lines changed: 10 additions & 13 deletions
@@ -10,6 +10,7 @@
     Connection,
     connect_nvls_collective,
     EndpointConfig,
+    Semaphore,
     Host2DeviceSemaphore,
     Host2HostSemaphore,
     ProxyService,
@@ -133,18 +134,14 @@ def _register_memory_with_connections(
             all_registered_memories[rank] = future_memories[rank].get()
         return all_registered_memories
 
-    def make_semaphore(
-        self,
-        connections: dict[int, Connection],
-        semaphore_type: Type[Host2HostSemaphore] | Type[Host2DeviceSemaphore] | Type[MemoryDevice2DeviceSemaphore],
-    ) -> dict[int, Host2HostSemaphore]:
-        semaphores = {}
+    def make_semaphores(self, connections: dict[int, Connection]) -> dict[int, Semaphore]:
+        future_semaphores = {}
         for rank in connections:
-            semaphores[rank] = semaphore_type(self.communicator, connections[rank])
-        return semaphores
+            future_semaphores[rank] = self.communicator.build_semaphore(connections[rank], rank)
+        return {rank: future.get() for rank, future in future_semaphores.items()}
 
     def make_memory_channels(self, tensor: cp.ndarray, connections: dict[int, Connection]) -> dict[int, MemoryChannel]:
-        semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore)
+        semaphores = self.make_semaphores(connections)
         registered_memories = self.register_tensor_with_connections(tensor, connections)
         channels = {}
         for rank in connections:
@@ -159,7 +156,7 @@ def make_memory_channels_with_scratch(
         registeredScratchBuffer: RegisteredMemory,
         connections: dict[int, Connection],
     ) -> dict[int, MemoryChannel]:
-        semaphores = self.make_semaphore(connections, MemoryDevice2DeviceSemaphore)
+        semaphores = self.make_semaphores(connections)
         registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections)
         channels = {}
         tensor_data_ptr = tensor.data_ptr() if is_torch_tensor(tensor) else tensor.data.ptr
@@ -177,7 +174,7 @@
     def make_port_channels(
         self, proxy_service: ProxyService, tensor: cp.ndarray, connections: dict[int, Connection]
     ) -> dict[int, PortChannel]:
-        semaphores = self.make_semaphore(connections, Host2DeviceSemaphore)
+        semaphores = self.make_semaphores(connections)
         registered_memories = self.register_tensor_with_connections(tensor, connections)
         memory_ids = {}
         semaphore_ids = {}
@@ -210,7 +207,7 @@ def make_port_channels_with_scratch(
         )
         local_reg_memory = self.communicator.register_memory(data_ptr, tensor_size, transport_flags)
 
-        semaphores = self.make_semaphore(connections, Host2DeviceSemaphore)
+        semaphores = self.make_semaphores(connections)
         registered_memories = self._register_memory_with_connections(registeredScratchBuffer, connections)
         memory_ids = {}
         semaphore_ids = {}
@@ -229,7 +226,7 @@ def register_semaphore_with_proxy(
     def register_semaphore_with_proxy(
         self, proxy_service: ProxyService, connections: dict[int, Connection]
     ) -> dict[int, PortChannel]:
-        semaphores = self.make_semaphore(connections, Host2DeviceSemaphore)
+        semaphores = self.make_semaphores(connections)
         semaphore_ids = {}
         for rank in semaphores:
             semaphore_ids[rank] = proxy_service.add_semaphore(semaphores[rank])
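
Net effect of the `comm.py` change above: the type-parameterized `make_semaphore` is replaced by `make_semaphores`, which builds generic `Semaphore` objects through `Communicator.build_semaphore`. A hedged before/after sketch for callers (assuming `group` is an existing `CommGroup` and `connections` is the dict it created):

```python
# Old API (removed above):
#   semaphores = group.make_semaphore(connections, MemoryDevice2DeviceSemaphore)
# New API:
semaphores = group.make_semaphores(connections)  # dict[int, Semaphore], one per remote rank
# Callers that still need the device-to-device type can wrap the result,
# as mscclpp_op.py does in the next file:
d2d_semaphores = {rank: MemoryDevice2DeviceSemaphore(sem) for rank, sem in semaphores.items()}
```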

python/mscclpp_benchmark/mscclpp_op.py

Lines changed: 4 additions & 1 deletion
@@ -453,7 +453,10 @@ def __init__(
         )
 
         # create a memory_channel for each remote neighbor
-        self.semaphores = group.make_semaphore(self.nvlink_connections, MemoryDevice2DeviceSemaphore)
+        self.semaphores = {
+            rank: MemoryDevice2DeviceSemaphore(sema)
+            for rank, sema in group.make_semaphores(self.nvlink_connections).items()
+        }
         file_dir = os.path.dirname(os.path.abspath(__file__))
         self.kernel = KernelBuilder(
             file="allreduce.cu",
