48 changes: 22 additions & 26 deletions CMakeLists.txt
@@ -19,23 +19,23 @@ set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")
find_package(Git)
set(GIT_HASH "UNKNOWN")
if(Git_FOUND)
execute_process(
COMMAND "${GIT_EXECUTABLE}" rev-parse --short=12 HEAD
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE _git_out
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT _git_out STREQUAL "")
set(GIT_HASH "${_git_out}")
endif()
execute_process(
COMMAND "${GIT_EXECUTABLE}" rev-parse --short=12 HEAD
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE _git_out
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if(NOT _git_out STREQUAL "")
set(GIT_HASH "${_git_out}")
endif()
else()
message(WARNING "Git not found, setting GIT_HASH to 'UNKNOWN'")
endif()

configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/include/mscclpp/version.hpp.in"
"${CMAKE_CURRENT_BINARY_DIR}/include/mscclpp/version.hpp"
@ONLY
"${CMAKE_CURRENT_SOURCE_DIR}/include/mscclpp/version.hpp.in"
"${CMAKE_CURRENT_BINARY_DIR}/include/mscclpp/version.hpp"
@ONLY
)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
@@ -47,6 +47,7 @@ option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_USE_IB "Use InfiniBand." ON)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Set NPKIT flags" OFF)
set(MSCCLPP_GPU_ARCHS "" CACHE STRING "Specify GPU architectures with delimiters (comma, space, or semicolon).")
@@ -141,23 +142,18 @@ else()
endif()

if(CMAKE_BUILD_TYPE STREQUAL "Debug")
add_compile_definitions(DEBUG_BUILD)
add_compile_definitions(DEBUG_BUILD)
endif()

find_package(IBVerbs)
if(MSCCLPP_USE_IB)
find_package(IBVerbs)
if(NOT IBVERBS_FOUND)
message(FATAL_ERROR "IBVerbs not found. Install libibverbs-dev or rdma-core-devel. If you want to disable InfiniBand, add `-DMSCCLPP_USE_IB=OFF` in your cmake command.")
endif()
endif()
find_package(NUMA REQUIRED)
find_package(Threads REQUIRED)

set(CMAKE_COLOR_DIAGNOSTICS ON)
function(msg_red text)
string(ASCII 27 ESC)
message("${ESC}[31m${text}${ESC}[0m")
endfunction()

if(NOT IBVERBS_FOUND)
msg_red("libibverbs not found. Install libibverbs-dev or rdma-core-devel.")
endif()

include(FetchContent)
FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz)
FetchContent_MakeAvailable(json)
@@ -168,7 +164,7 @@ target_include_directories(mscclpp_obj
${GPU_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} nlohmann_json::nlohmann_json Threads::Threads dl)
if(IBVERBS_FOUND)
if(MSCCLPP_USE_IB)
target_include_directories(mscclpp_obj SYSTEM PRIVATE ${IBVERBS_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${IBVERBS_LIBRARIES})
target_compile_definitions(mscclpp_obj PUBLIC USE_IBVERBS)
@@ -179,7 +175,7 @@ if(MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_ROCM)
foreach(arch ${MSCCLPP_GPU_ARCHS})
target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
target_compile_options(mscclpp_obj PRIVATE --offload-arch=${arch})
endforeach()
endif()
if(MSCCLPP_ENABLE_TRACE)
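With `MSCCLPP_USE_IB` left at its default `ON`, the check added above hard-fails when the libibverbs development headers are missing. A rough sketch of installing them, using the package names the error message itself mentions (the package-manager invocations are assumptions, not part of this diff):

```bash
# Debian/Ubuntu: package name taken from the FATAL_ERROR message above
sudo apt-get install -y libibverbs-dev
# RHEL/Fedora equivalent (assumed)
sudo dnf install -y rdma-core-devel
```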
8 changes: 3 additions & 5 deletions docs/quickstart.md
@@ -19,7 +19,7 @@
* AMD MI250X GPUs + ROCm >= 5.7
* AMD MI300X GPUs + ROCm >= 6.0
* OS
* Tested on Ubuntu 18.04 and later
* Tested on Ubuntu 20.04 and later
* Libraries
* [libnuma](https://github.com/numactl/numactl)
```bash
@@ -32,10 +32,7 @@
If you don't want to build the Python module, you need to set `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF` in your `cmake` command (see details in [Install from Source](#install-from-source)).
* (Optional, for benchmarks) MPI
* Others
* For NVIDIA platforms, `nvidia_peermem` driver should be loaded on all nodes. Check it via:
```bash
lsmod | grep nvidia_peermem
```
* For RDMA (InfiniBand or RoCE) support on NVIDIA platforms, [GPUDirect RDMA](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#gpudirect-rdma-and-gpudirect-storage) should be supported by the system. See the detailed prerequisites from [this NVIDIA documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-rdma.html#common-prerequisites).
* For NVLink SHARP (NVLS) support on NVIDIA platforms, the Linux kernel version should be 5.6 or above.

(docker-images)=
@@ -91,6 +88,7 @@ There are a few optional CMake options you can set:
- `-DMSCCLPP_GPU_ARCHS=<arch-list>`: Specify the GPU architectures to build for. For example, `-DMSCCLPP_GPU_ARCHS="80,90"` for NVIDIA A100 and H100 GPUs, `-DMSCCLPP_GPU_ARCHS=gfx942` for AMD MI300x GPU.
- `-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON`: If the build environment doesn't have GPUs and only has CUDA installed, you can set these options to bypass GPU checks and use CUDA APIs. This is useful for building on CI systems or environments without GPUs.
- `-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON`: If the build environment doesn't have GPUs and only has ROCm installed, you can set these options to bypass GPU checks and use ROCm APIs.
- `-DMSCCLPP_USE_IB=OFF`: Don't build InfiniBand support.
- `-DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF`: Don't build the Python module.
- `-DMSCCLPP_BUILD_TESTS=OFF`: Don't build the tests.
- `-DMSCCLPP_BUILD_APPS_NCCL=OFF`: Don't build the NCCL API.
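Combining the options above, including the new `-DMSCCLPP_USE_IB=OFF`, a configure-and-build sketch might look like the following; the `-S`/`-B` source and build layout and the example architecture value are illustrative assumptions, not part of this change:

```bash
# Illustrative: CUDA build without InfiniBand, Python bindings, or tests
cmake -S . -B build \
      -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=90 \
      -DMSCCLPP_USE_IB=OFF -DMSCCLPP_BUILD_PYTHON_BINDINGS=OFF -DMSCCLPP_BUILD_TESTS=OFF
cmake --build build -j
```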
13 changes: 7 additions & 6 deletions src/ib.cc
@@ -25,7 +25,7 @@
#if !defined(__HIP_PLATFORM_AMD__)

// Check if nvidia_peermem kernel module is loaded
static bool checkNvPeerMemLoaded() {
[[maybe_unused]] static bool checkNvPeerMemLoaded() {
std::ifstream file("/proc/modules");
std::string line;
while (std::getline(file, line)) {
@@ -77,6 +77,12 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : buff(buff) {
throw Error("Registeration of dma-buf based memory region failed on HIP platform", ErrorCode::InvalidUsage);
#endif // !defined(__HIP_PLATFORM_AMD__)
} else {
#if !defined(__HIP_PLATFORM_AMD__)
// nvidia-peermem is needed only when DMABUF is not supported
if (!checkNvPeerMemLoaded()) {
throw Error("nvidia_peermem kernel module is not loaded", ErrorCode::InternalError);
}
#endif // !defined(__HIP_PLATFORM_AMD__)
this->mr = IBVerbs::ibv_reg_mr2(pd, reinterpret_cast<void*>(addr), pages * pageSize,
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_RELAXED_ORDERING | IBV_ACCESS_REMOTE_ATOMIC);
@@ -329,11 +335,6 @@ int IbQp::getWcStatus(int idx) const { return (*this->wcs)[idx].status; }
int IbQp::getNumCqItems() const { return this->numSignaledPostedItems; }

IbCtx::IbCtx(const std::string& devName) : devName(devName) {
#if !defined(__HIP_PLATFORM_AMD__)
if (!checkNvPeerMemLoaded()) {
throw Error("nvidia_peermem kernel module is not loaded", ErrorCode::InternalError);
}
#endif // !defined(__HIP_PLATFORM_AMD__)
int num;
struct ibv_device** devices = IBVerbs::ibv_get_device_list(&num);
for (int i = 0; i < num; ++i) {
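The relocated `checkNvPeerMemLoaded()` scans `/proc/modules` and is now consulted only when falling back to non-DMA-BUF registration. A roughly equivalent manual check from the shell, for illustration (the exact match the C++ code performs is not shown in this hunk):

```bash
# Roughly equivalent manual check for the nvidia_peermem kernel module
grep -q '^nvidia_peermem ' /proc/modules && echo "nvidia_peermem is loaded" || echo "nvidia_peermem is NOT loaded"
```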
4 changes: 2 additions & 2 deletions test/CMakeLists.txt
@@ -4,7 +4,7 @@
find_package(MPI)

set(TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads)
if(IBVERBS_FOUND)
if(MSCCLPP_USE_IB)
list(APPEND TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()
set(TEST_LIBS_GTEST GTest::gtest_main GTest::gmock_main)
@@ -19,7 +19,7 @@ endif()
function(add_test_executable name sources)
add_executable(${name} ${sources})
target_link_libraries(${name} ${TEST_LIBS_COMMON} MPI::MPI_CXX)
if(IBVERBS_FOUND)
if(MSCCLPP_USE_IB)
target_compile_definitions(${name} PRIVATE USE_IBVERBS)
endif()
target_include_directories(${name} ${TEST_INC_COMMON} ${TEST_INC_INTERNAL})
14 changes: 12 additions & 2 deletions test/mp_unit/communicator_tests.cu
@@ -104,7 +104,11 @@ void CommunicatorTest::SetUp() {

ASSERT_EQ((deviceBufferSize / sizeof(int)) % gEnv->worldSize, 0);

#if defined(USE_IBVERBS)
connectMesh(true, true, false);
#else
connectMesh(true, false, false);
#endif

devicePtr.resize(numBuffers);
localMemory.resize(numBuffers);
@@ -117,10 +121,16 @@ void CommunicatorTest::SetUp() {
}
}

#if defined(USE_IBVERBS)
auto transport = mscclpp::Transport::CudaIpc | ibTransport;
#else
auto transport = mscclpp::Transport::CudaIpc;
#endif

for (size_t n = 0; n < numBuffers; n++) {
devicePtr[n] = mscclpp::detail::gpuCallocShared<int>(deviceBufferSize / sizeof(int));
registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, mscclpp::Transport::CudaIpc | ibTransport, 0, remoteRanks,
localMemory[n], remoteMemory[n]);
registerMemoryPairs(devicePtr[n].get(), deviceBufferSize, transport, 0, remoteRanks, localMemory[n],
remoteMemory[n]);
}
}

4 changes: 4 additions & 0 deletions test/mp_unit/ib_tests.cu
@@ -18,6 +18,10 @@ void IbTestBase::SetUp() {
}

void IbPeerToPeerTest::SetUp() {
#if !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)

IbTestBase::SetUp();

mscclpp::UniqueId id;
9 changes: 3 additions & 6 deletions test/mp_unit/memory_channel_tests.cu
@@ -23,7 +23,7 @@ void MemoryChannelOneToOneTest::setupMeshConnections(std::vector<mscclpp::Memory
const int rank = communicator->bootstrap()->getRank();
const int worldSize = communicator->bootstrap()->getNranks();
const bool isInPlace = (outputBuff == nullptr);
mscclpp::TransportFlags transport = mscclpp::Transport::CudaIpc | ibTransport;
mscclpp::TransportFlags transport = mscclpp::Transport::CudaIpc;

std::vector<std::shared_future<std::shared_ptr<mscclpp::Connection>>> connectionFutures(worldSize);
std::vector<std::shared_future<mscclpp::RegisteredMemory>> remoteMemFutures(worldSize);
@@ -38,11 +38,8 @@
if (r == rank) {
continue;
}
if (rankToNode(r) == rankToNode(gEnv->rank)) {
connectionFutures[r] = communicator->connect(mscclpp::Transport::CudaIpc, r);
} else {
connectionFutures[r] = communicator->connect(ibTransport, r);
}
// No IB for MemoryChannel tests
connectionFutures[r] = communicator->connect(mscclpp::Transport::CudaIpc, r);

if (isInPlace) {
communicator->sendMemory(inputBufRegMem, r);
46 changes: 35 additions & 11 deletions test/mp_unit/port_channel_tests.cu
@@ -242,31 +242,43 @@ void PortChannelOneToOneTest::testPingPongPerf(PingPongTestParams params) {
}

TEST_F(PortChannelOneToOneTest, PingPong) {
testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false});
testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false});
}

TEST_F(PortChannelOneToOneTest, PingPongIb) {
#if defined(USE_IBVERBS)
testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

TEST_F(PortChannelOneToOneTest, PingPongEthernet) {
testPingPong(PingPongTestParams{.useIPC = false, .useIB = false, .useEthernet = true, .waitWithPoll = false});
}

TEST_F(PortChannelOneToOneTest, PingPongWithPoll) {
testPingPong(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = true});
testPingPong(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = true});
}

TEST_F(PortChannelOneToOneTest, PingPongIbWithPoll) {
#if defined(USE_IBVERBS)
testPingPong(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = true});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

TEST_F(PortChannelOneToOneTest, PingPongPerf) {
testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = true, .useEthernet = false, .waitWithPoll = false});
testPingPongPerf(PingPongTestParams{.useIPC = true, .useIB = false, .useEthernet = false, .waitWithPoll = false});
}

TEST_F(PortChannelOneToOneTest, PingPongPerfIb) {
#if defined(USE_IBVERBS)
testPingPongPerf(PingPongTestParams{.useIPC = false, .useIB = true, .useEthernet = false, .waitWithPoll = false});
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

TEST_F(PortChannelOneToOneTest, PingPongPerfEthernet) {
@@ -339,7 +351,7 @@ __global__ void kernelProxyLLPingPong(int* buff, mscclpp::LLPacket* putPktBuf, m
}
}

void PortChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
void PortChannelOneToOneTest::testPacketPingPong(bool useIb) {
if (gEnv->rank >= numRanksToUse) return;

const int nElem = 4 * 1024 * 1024;
Expand All @@ -351,8 +363,8 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();

setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(),
nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket),
getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));

ASSERT_EQ(portChannels.size(), 1);

@@ -406,7 +418,7 @@ void PortChannelOneToOneTest::testPacketPingPong(bool useIbOnly) {
proxyService->stopProxy();
}

void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) {
void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIb) {
if (gEnv->rank >= numRanksToUse) return;

const int nElem = 4 * 1024 * 1024;
Expand All @@ -418,8 +430,8 @@ void PortChannelOneToOneTest::testPacketPingPongPerf(bool useIbOnly) {
auto putPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();
auto getPacketBuffer = mscclpp::GpuBuffer<mscclpp::LLPacket>(nPacket).memory();

setupMeshConnections(portChannels, !useIbOnly, true, false, putPacketBuffer.get(),
nPacket * sizeof(mscclpp::LLPacket), getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));
setupMeshConnections(portChannels, !useIb, useIb, false, putPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket),
getPacketBuffer.get(), nPacket * sizeof(mscclpp::LLPacket));

ASSERT_EQ(portChannels.size(), 1);

@@ -464,8 +476,20 @@

TEST_F(PortChannelOneToOneTest, PacketPingPong) { testPacketPingPong(false); }

TEST_F(PortChannelOneToOneTest, PacketPingPongIb) { testPacketPingPong(true); }
TEST_F(PortChannelOneToOneTest, PacketPingPongIb) {
#if defined(USE_IBVERBS)
testPacketPingPong(true);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}

TEST_F(PortChannelOneToOneTest, PacketPingPongPerf) { testPacketPingPongPerf(false); }

TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) { testPacketPingPongPerf(true); }
TEST_F(PortChannelOneToOneTest, PacketPingPongPerfIb) {
#if defined(USE_IBVERBS)
testPacketPingPongPerf(true);
#else // !defined(USE_IBVERBS)
GTEST_SKIP() << "This test requires IBVerbs that the current build does not support.";
#endif // !defined(USE_IBVERBS)
}
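In a build configured with `-DMSCCLPP_USE_IB=OFF`, the IB variants above should now report as skipped rather than fail. One hypothetical way to confirm this (the test binary path, MPI launcher, and rank count are assumptions, not taken from this PR):

```bash
# Hypothetical invocation; adjust the binary path and launcher to your build layout
mpirun -np 2 ./build/test/mp_unit/mp_unit_tests --gtest_filter='PortChannelOneToOneTest.*Ib*'
# Expected without USE_IBVERBS: each matched test prints [  SKIPPED  ]
```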
4 changes: 2 additions & 2 deletions test/perf/CMakeLists.txt
@@ -8,7 +8,7 @@ find_package(MPI REQUIRED)

# Set up common libraries and includes for tests
set(PERF_TEST_LIBS_COMMON mscclpp ${GPU_LIBRARIES} ${NUMA_LIBRARIES} Threads::Threads MPI::MPI_CXX)
if(IBVERBS_FOUND)
if(MSCCLPP_USE_IB)
list(APPEND PERF_TEST_LIBS_COMMON ${IBVERBS_LIBRARIES})
endif()

@@ -27,7 +27,7 @@ function(add_perf_test_executable name sources)
# Link nlohmann_json - use the target from main project
target_link_libraries(${name} nlohmann_json::nlohmann_json)

if(IBVERBS_FOUND)
if(MSCCLPP_USE_IB)
target_compile_definitions(${name} PRIVATE USE_IBVERBS)
endif()
