update

Binyang2014 · Binyang2014 · commit cef1f89ed7c9 · 2025-11-30T20:35:06.000Z
diff --git a/examples/torch-integration/customized_comm_with_dsl.py b/examples/torch-integration/customized_comm_with_dsl.py
diff --git a/examples/torch-integration/cutomized_comm_with_default_algo.py b/examples/torch-integration/cutomized_comm_with_default_algo.py
@@ -0,0 +1,137 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+# MSCCLPP_MASTER_ADDR=<master_ip> MSCCLPP_MASTER_PORT=<port> torchrun --nnodes=1 --nproc_per_node=8 cutomized_comm_with_default_algo.py
+
+import os
+import torch
+import mscclpp.comm as mscclpp_comm
+import mscclpp
+import netifaces as ni
+import ipaddress
+import ctypes
+
+
+def load_algorithms(scratch_buffer: torch.Tensor, rank: int) -> mscclpp.AlgorithmCollection:
+    """Build the default mscclpp algorithm collection for this rank.
+
+    Args:
+        scratch_buffer: Tensor used as scratch space; only its data pointer
+            and its size in bytes are handed to the builder.
+        rank: Rank forwarded to the builder.
+
+    Returns:
+        The collection returned by ``build_default_algorithms``.
+    """
+    collection_builder = mscclpp.AlgorithmCollectionBuilder()
+    return collection_builder.build_default_algorithms(
+        scratch_buffer=scratch_buffer.data_ptr(), scratch_buffer_size=scratch_buffer.nbytes, rank=rank
+    )
+
+
+def interfaces_for_ip_netifaces(ip: str):
+    """Return the name of the first interface that carries address *ip*.
+
+    Interfaces are scanned in the order ``netifaces`` reports them and only
+    their AF_INET entries are compared. Returns ``None`` when nothing matches.
+    """
+    wanted = ipaddress.ip_address(ip)
+    for iface in ni.interfaces():
+        addresses = ni.ifaddresses(iface)
+        if ni.AF_INET not in addresses:
+            continue
+        for entry in addresses[ni.AF_INET]:
+            if "addr" in entry and ipaddress.ip_address(entry["addr"]) == wanted:
+                return iface
+    return None
+
+
+def dtype_to_mscclpp_dtype(dtype: torch.dtype) -> mscclpp.DataType:
+    """Translate a torch dtype into the matching ``mscclpp.DataType`` member.
+
+    Handles float16, float32, int32 and bfloat16; any other dtype raises
+    ``ValueError``.
+    """
+    if dtype == torch.float16:
+        return mscclpp.DataType.float16
+    if dtype == torch.float32:
+        return mscclpp.DataType.float32
+    if dtype == torch.int32:
+        return mscclpp.DataType.int32
+    if dtype == torch.bfloat16:
+        return mscclpp.DataType.bfloat16
+    raise ValueError(f"Unknown data type: {dtype}")
+
+
+class CustomizedComm:
+    """In-place SUM all-reduce built on two of mscclpp's default algorithms.
+
+    At construction a scratch buffer is allocated, the default algorithm
+    collection is built from it, and the allreduce algorithms named
+    ``default_allreduce_nvls_packet`` and ``default_allreduce_nvls_with_copy``
+    are looked up. ``all_reduce`` picks between the two by tensor size.
+    """
+
+    def __init__(self, comm: mscclpp_comm.CommGroup):
+        """Cache rank information from *comm* and load the two algorithms.
+
+        Raises:
+            IndexError: If either expected allreduce algorithm is missing
+                from the default collection.
+        """
+        self.comm = comm
+        self.rank = comm.my_rank
+        self.world_size = comm.nranks
+        self.local_rank = comm.my_rank % comm.nranks_per_node
+        self.n_ranks_per_node = comm.nranks_per_node
+        dlpack = mscclpp.RawGpuBuffer(1 << 27).to_dlpack(data_type=str(torch.float16))
+        self.scratch_buffer = torch.utils.dlpack.from_dlpack(dlpack)
+        algorithms = load_algorithms(scratch_buffer=self.scratch_buffer, rank=self.rank)
+        self._algorithm_nvls_packet = self._find_allreduce(algorithms, "default_allreduce_nvls_packet")
+        self._algorithm_nvls_nonzero_copy = self._find_allreduce(algorithms, "default_allreduce_nvls_with_copy")
+
+    @staticmethod
+    def _find_allreduce(algorithms, name: str):
+        """Return the first allreduce algorithm in *algorithms* called *name*."""
+        for algo in algorithms:
+            if algo.collective == "allreduce" and algo.name == name:
+                return algo
+        # IndexError matches what the previous ``[...][0]`` lookup raised, but
+        # now says which algorithm was missing.
+        raise IndexError(f"allreduce algorithm {name!r} not found in the default algorithm collection")
+
+    def all_reduce(self, tensor: torch.Tensor, op=torch.distributed.ReduceOp.SUM, stream: torch.cuda.Stream = None):
+        """All-reduce *tensor* in place (input and output share one pointer).
+
+        Args:
+            tensor: Tensor to reduce; its dtype must be one supported by
+                ``dtype_to_mscclpp_dtype``.
+            op: Reduction operation; only ``ReduceOp.SUM`` is accepted.
+            stream: CUDA stream whose handle is passed to the algorithm; ``0``
+                is passed when ``None``.
+
+        Raises:
+            ValueError: If *op* is not ``ReduceOp.SUM`` or the dtype is
+                unsupported.
+        """
+        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
+        if op != torch.distributed.ReduceOp.SUM:
+            raise ValueError(f"Unsupported reduce op: {op}; only ReduceOp.SUM is supported")
+        # Tensors under 1 MiB use the packet algorithm, larger ones the copy variant.
+        if tensor.nbytes < 1 << 20:
+            algo = self._algorithm_nvls_packet
+        else:
+            algo = self._algorithm_nvls_nonzero_copy
+        # Only the address of ctype_op is passed on, so the object has to stay
+        # referenced until execute() returns.
+        ctype_op = ctypes.c_int32(op.value)
+        extras: dict[str, int] = {"op": ctypes.addressof(ctype_op)}
+        algo.execute(
+            comm=self.comm.communicator,
+            input_buffer=tensor.data_ptr(),
+            output_buffer=tensor.data_ptr(),
+            input_size=tensor.nbytes,
+            output_size=tensor.nbytes,
+            dtype=dtype_to_mscclpp_dtype(tensor.dtype),
+            stream=stream.cuda_stream if stream is not None else 0,
+            extras=extras,
+        )
+
+    def barrier(self):
+        """Issue a one-element float all-reduce on the current CUDA stream.
+
+        NOTE(review): no explicit stream or host synchronization happens here;
+        confirm whether ``execute`` blocks before relying on this as a host barrier.
+        """
+        tensor = torch.empty(1, dtype=torch.float, device=torch.device("cuda"))
+        self.all_reduce(tensor, op=torch.distributed.ReduceOp.SUM, stream=torch.cuda.current_stream())
+
+    def destroy(self):
+        """Drop the references to the algorithms, scratch buffer and comm group."""
+        self._algorithm_nvls_nonzero_copy = None
+        self._algorithm_nvls_packet = None
+        self.scratch_buffer = None
+        self.comm = None
+
+
+def init_dist() -> CustomizedComm:
+    """Create the mscclpp ``CommGroup`` from environment variables and wrap it.
+
+    Reads ``RANK``, ``WORLD_SIZE``, ``MSCCLPP_MASTER_ADDR`` and
+    ``MSCCLPP_MASTER_PORT`` (all required), looks up the interface that carries
+    the master address, and builds the ``"<interface>:<addr>:<port>"`` string
+    passed to ``CommGroup``.
+
+    Raises:
+        KeyError: If a required environment variable is not set.
+        ValueError: If no interface carries the master address.
+    """
+    rank = int(os.environ["RANK"])
+    world = int(os.environ["WORLD_SIZE"])
+    master_addr = os.environ["MSCCLPP_MASTER_ADDR"]
+    master_port = os.environ["MSCCLPP_MASTER_PORT"]
+    interface = interfaces_for_ip_netifaces(master_addr)
+    if interface is None:
+        raise ValueError(f"Cannot find network interface for IP address {master_addr}")
+    # Fallback order for ranks per node: MSCCLPP_NRANKS_PER_NODE, then
+    # LOCAL_WORLD_SIZE, then WORLD_SIZE // NNODES (only when it divides evenly),
+    # then the number of visible CUDA devices.
+    # NOTE(review): nranks_per_node is computed and clamped below but is not
+    # used afterwards in this function; confirm whether it should be passed to
+    # CommGroup or removed.
+    nranks_per_node = os.environ.get("MSCCLPP_NRANKS_PER_NODE")
+    if nranks_per_node is None:
+        nranks_per_node = os.environ.get("LOCAL_WORLD_SIZE")
+    if nranks_per_node is None:
+        nnodes = int(os.environ.get("NNODES", "1"))
+        if world % nnodes == 0:
+            nranks_per_node = world // nnodes
+    if nranks_per_node is None:
+        nranks_per_node = torch.cuda.device_count()
+    nranks_per_node = int(nranks_per_node)
+    # Clamp into the range [1, world].
+    nranks_per_node = max(1, min(world, nranks_per_node))
+    interfaceIpPortTrio = f"{interface}:{master_addr}:{master_port}"
+    mscclpp_group = mscclpp_comm.CommGroup(interfaceIpPortTrio=interfaceIpPortTrio, rank=rank, size=world)
+    return CustomizedComm(mscclpp_group)
+
+
+def main():
+    """Run one float16 all-reduce, bracketed by barriers, on this rank's GPU."""
+    local_rank = int(os.environ["LOCAL_RANK"])
+    torch.cuda.set_device(local_rank)
+    comm = init_dist()
+    comm.barrier()
+    # 1 << 22 float16 elements, allocated on the device selected above.
+    payload = torch.randn(1 << 22, dtype=torch.float16, device=torch.device("cuda"))
+    reduce_op = torch.distributed.ReduceOp.SUM
+    comm.all_reduce(payload, op=reduce_op, stream=torch.cuda.current_stream())
+    comm.barrier()
+    comm.destroy()
+
+
+# Run the example only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/python/csrc/algorithm.cpp b/python/csrc/algorithm.cpp
@@ -22,6 +22,17 @@ void register_algorithm(nb::module_& m) {
 
   nb::enum_<AlgorithmType>(m, "AlgorithmType").value("NATIVE", AlgorithmType::NATIVE).value("DSL", AlgorithmType::DSL);
 
+  nb::enum_<CommResult>(m, "CommResult")
+      .value("COMM_SUCCESS", CommResult::commSuccess)
+      .value("COMM_UNHANDLED_CUDA_ERROR", CommResult::commUnhandledCudaError)
+      .value("COMM_SYSTEM_ERROR", CommResult::commSystemError)
+      .value("COMM_INTERNAL_ERROR", CommResult::commInternalError)
+      .value("COMM_INVALID_ARGUMENT", CommResult::commInvalidArgument)
+      .value("COMM_INVALID_USAGE", CommResult::commInvalidUsage)
+      .value("COMM_REMOTE_ERROR", CommResult::commRemoteError)
+      .value("COMM_IN_PROGRESS", CommResult::commInProgress)
+      .value("COMM_NUM_RESULTS", CommResult::commNumResults);
+
   auto algorithmClass =
       nb::class_<Algorithm>(m, "Algorithm")
           .def_static(
diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py
@@ -10,7 +10,7 @@
 
 from functools import wraps
 from mscclpp._version import __version__, __commit_id__
-from mscclpp._algorithm import Algorithm, AlgorithmCollectionBuilder
+from mscclpp._algorithm import Algorithm, AlgorithmCollectionBuilder, AlgorithmCollection
 from mscclpp.language.utils import AlgoSpec
 from mscclpp._compiler import DslCompiler, NativeCodeCompiler
 
@@ -91,6 +91,7 @@
     # Python API
     "Algorithm",
     "AlgorithmCollectionBuilder",
+    "AlgorithmCollection",
     "AlgoSpec",
 ]
 
diff --git a/python/mscclpp/_algorithm.py b/python/mscclpp/_algorithm.py
@@ -11,6 +11,7 @@
     DslAlgorithm as _DslAlgorithm,
     AlgorithmType as _AlgorithmType,
     AlgorithmBuilder as _AlgorithmBuilder,
+    AlgorithmCollection as _AlgorithmCollection,
     AlgorithmCollectionBuilder as _AlgorithmCollectionBuilder,
     Communicator,
     CollectiveBufferMode,
@@ -126,6 +127,36 @@ def build(self) -> Algorithm:
         return Algorithm.create_from_native_handle(self._algorithm_builder.build())
 
 
+class AlgorithmCollection:
+    """Python-side view of a native algorithm collection.
+
+    Every handle returned by the native ``to_list()`` is wrapped in an
+    ``Algorithm`` once at construction. The wrappers are then exposed through
+    iteration, ``len()`` and indexing.
+    """
+
+    def __init__(self, native_collection: _AlgorithmCollection):
+        self._native_collection = native_collection
+        wrap = Algorithm.create_from_native_handle
+        self._algorithms = [wrap(handle) for handle in native_collection.to_list()]
+
+    def __iter__(self):
+        """Return an iterator over the wrapped algorithms."""
+        return iter(self._algorithms)
+
+    def __len__(self):
+        """Return how many wrapped algorithms are held."""
+        return len(self._algorithms)
+
+    def __getitem__(self, index: int) -> Algorithm:
+        """Return the wrapped algorithm stored at *index*."""
+        return self._algorithms[index]
+
+    def get_by_collective(self, collective: str):
+        """Return a list of the algorithms whose ``collective`` equals *collective*."""
+        return list(filter(lambda algo: algo.collective == collective, self._algorithms))
+
+    def register_algorithm(self, collective: str, algo_name: str, algorithm: Algorithm):
+        """Register *algorithm* natively, then add it to the Python-side list."""
+        native_handle = algorithm._algorithm
+        self._native_collection.register_algorithm(collective, algo_name, native_handle)
+        self._algorithms.append(algorithm)
+
+
 class AlgorithmCollectionBuilder:
     _instance = None
 
@@ -161,8 +192,14 @@ def set_algorithm_selector(self, selector):
     def set_fallback_algorithm_selector(self, selector):
         self._builder.set_fallback_algorithm_selector(selector)
 
-    def build(self):
-        return self._builder.build()
-
+    def build(self) -> AlgorithmCollection:
+        """Build the native collection and wrap it in an ``AlgorithmCollection``."""
+        return AlgorithmCollection(self._builder.build())
+
+    def build_default_algorithms(self, scratch_buffer: int, scratch_buffer_size: int, rank: int) -> AlgorithmCollection:
+        """Build the default algorithms and wrap them in an ``AlgorithmCollection``.
+
+        Args:
+            scratch_buffer: Scratch buffer handle; coerced with ``int()`` before
+                it is passed to the native builder.
+            scratch_buffer_size: Forwarded unchanged to the native builder.
+            rank: Forwarded unchanged to the native builder.
+        """
+        scratch_address = int(scratch_buffer)
+        native = self._builder.build_default_algorithms(scratch_address, scratch_buffer_size, rank)
+        return AlgorithmCollection(native)
 
 atexit.register(AlgorithmCollectionBuilder.reset)