Skip to content

Commit ddf84a6

Browse files
authored
Add CudaDeviceGuard (#691)
Add an RAII guard that sets a proper GPU device before a CUDA API call. We may make this stateful in the future to minimize `cudaGetDevice()` calls. This PR fixes a bug in tutorial 01.
1 parent 17247cd commit ddf84a6

File tree

7 files changed

+64
-35
lines changed

7 files changed

+64
-35
lines changed

include/mscclpp/gpu_utils.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,26 @@ namespace mscclpp {
4141
struct AvoidCudaGraphCaptureGuard {
4242
AvoidCudaGraphCaptureGuard();
4343
~AvoidCudaGraphCaptureGuard();
44+
AvoidCudaGraphCaptureGuard(const AvoidCudaGraphCaptureGuard&) = delete;
45+
AvoidCudaGraphCaptureGuard& operator=(const AvoidCudaGraphCaptureGuard&) = delete;
46+
AvoidCudaGraphCaptureGuard(AvoidCudaGraphCaptureGuard&&) = delete;
47+
AvoidCudaGraphCaptureGuard& operator=(AvoidCudaGraphCaptureGuard&&) = delete;
4448
cudaStreamCaptureMode mode_;
4549
bool active_;
4650
};
4751

52+
/// A RAII guard that will set the current device on construction and restore the previous device on destruction.
53+
struct CudaDeviceGuard {
54+
CudaDeviceGuard(int deviceId);
55+
~CudaDeviceGuard();
56+
CudaDeviceGuard(const CudaDeviceGuard&) = delete;
57+
CudaDeviceGuard& operator=(const CudaDeviceGuard&) = delete;
58+
CudaDeviceGuard(CudaDeviceGuard&&) = delete;
59+
CudaDeviceGuard& operator=(CudaDeviceGuard&&) = delete;
60+
int deviceId_;
61+
int origDeviceId_;
62+
};
63+
4864
/// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
4965
struct CudaStreamWithFlags {
5066
/// Constructor without flags. This will not create any stream. set() can be called later to create a stream with
@@ -128,6 +144,7 @@ std::shared_ptr<GpuStreamPool> gpuStreamPool();
128144
namespace detail {
129145

130146
void setReadWriteMemoryAccess(void* base, size_t size);
147+
int gpuIdFromAddress(void* ptr);
131148

132149
void* gpuCalloc(size_t bytes);
133150
void* gpuCallocHost(size_t bytes, unsigned int flags);

src/connection.cc

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,18 +96,11 @@ CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const End
9696
} else if (localEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().type == DeviceType::GPU) {
9797
if (isSameProcess(localEndpoint, remoteEndpoint) && localDeviceId != remoteDeviceId) {
9898
// Connecting two GPUs in the same process - need to enable peer access explicitly
99-
int originalDeviceId;
100-
MSCCLPP_CUDATHROW(cudaGetDevice(&originalDeviceId));
101-
if (originalDeviceId != localDeviceId) {
102-
MSCCLPP_CUDATHROW(cudaSetDevice(localDeviceId));
103-
}
99+
CudaDeviceGuard deviceGuard(localDeviceId);
104100
auto ret = cudaDeviceEnablePeerAccess(remoteDeviceId, 0);
105101
if (ret != cudaSuccess && ret != cudaErrorPeerAccessAlreadyEnabled) {
106102
MSCCLPP_CUDATHROW(ret);
107103
}
108-
if (originalDeviceId != localDeviceId) {
109-
MSCCLPP_CUDATHROW(cudaSetDevice(originalDeviceId));
110-
}
111104
}
112105
}
113106
int streamDeviceId = (localEndpoint.device().type == DeviceType::GPU) ? localDeviceId : remoteDeviceId;

src/context.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,28 @@ CudaIpcStream::CudaIpcStream(int deviceId)
1919

2020
void CudaIpcStream::setStreamIfNeeded() {
2121
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) {
22-
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
2322
stream_->set(cudaStreamNonBlocking);
2423
}
2524
}
2625

2726
void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) {
27+
CudaDeviceGuard deviceGuard(deviceId_);
2828
setStreamIfNeeded();
2929
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_));
3030
dirty_ = true;
3131
}
3232

3333
void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) {
34+
CudaDeviceGuard deviceGuard(deviceId_);
3435
setStreamIfNeeded();
3536
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_));
3637
dirty_ = true;
3738
}
3839

3940
void CudaIpcStream::sync() {
40-
setStreamIfNeeded();
4141
if (dirty_) {
42+
CudaDeviceGuard deviceGuard(deviceId_);
43+
setStreamIfNeeded();
4244
MSCCLPP_CUDATHROW(cudaStreamSynchronize(*stream_));
4345
dirty_ = false;
4446
}

src/gpu_utils.cc

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,21 @@ AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() {
6666
(void)cudaThreadExchangeStreamCaptureMode(&mode_);
6767
}
6868

69+
CudaDeviceGuard::CudaDeviceGuard(int deviceId) : deviceId_(deviceId), origDeviceId_(-1) {
70+
if (deviceId_ >= 0) {
71+
MSCCLPP_CUDATHROW(cudaGetDevice(&origDeviceId_));
72+
if (origDeviceId_ != deviceId_) {
73+
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
74+
}
75+
}
76+
}
77+
78+
CudaDeviceGuard::~CudaDeviceGuard() {
79+
if (deviceId_ >= 0 && origDeviceId_ >= 0 && origDeviceId_ != deviceId_) {
80+
(void)cudaSetDevice(origDeviceId_);
81+
}
82+
}
83+
6984
CudaStreamWithFlags::CudaStreamWithFlags() : stream_(nullptr) { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); }
7085

7186
CudaStreamWithFlags::CudaStreamWithFlags(unsigned int flags) {
@@ -79,11 +94,8 @@ CudaStreamWithFlags::~CudaStreamWithFlags() {
7994

8095
void CudaStreamWithFlags::set(unsigned int flags) {
8196
if (!empty()) throw Error("CudaStreamWithFlags already set", ErrorCode::InvalidUsage);
82-
int originalDeviceId;
83-
MSCCLPP_CUDATHROW(cudaGetDevice(&originalDeviceId)); // Save the current device
84-
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
97+
CudaDeviceGuard deviceGuard(deviceId_);
8598
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags));
86-
MSCCLPP_CUDATHROW(cudaSetDevice(originalDeviceId)); // Restore the original device
8799
}
88100

89101
bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }
@@ -123,6 +135,18 @@ namespace detail {
123135

124136
CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
125137

138+
int gpuIdFromAddress(void* ptr) {
139+
int deviceId;
140+
auto res = cuPointerGetAttribute(&deviceId, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, reinterpret_cast<CUdeviceptr>(ptr));
141+
if (res == CUDA_ERROR_INVALID_VALUE) {
142+
// not a GPU address
143+
return -1;
144+
} else {
145+
MSCCLPP_CUTHROW(res);
146+
}
147+
return deviceId;
148+
}
149+
126150
/// set memory access permission to read-write
127151
/// @param base Base memory pointer.
128152
/// @param size Size of the memory.

src/ib.cc

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,6 @@ namespace mscclpp {
4141

4242
#if defined(USE_IBVERBS)
4343

44-
static inline bool isGpuAddr(void* ptr) {
45-
CUmemorytype memType;
46-
auto res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, reinterpret_cast<CUdeviceptr>(ptr));
47-
if (res == CUDA_ERROR_INVALID_VALUE) {
48-
return false;
49-
} else if (res != CUDA_SUCCESS) {
50-
MSCCLPP_CUTHROW(res);
51-
}
52-
return (memType == CU_MEMORYTYPE_DEVICE);
53-
}
54-
55-
static inline int gpuAddrToDeviceId(CUdeviceptr devPtr) {
56-
int deviceId;
57-
MSCCLPP_CUTHROW(cuPointerGetAttribute(&deviceId, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, devPtr));
58-
return deviceId;
59-
}
60-
6144
static inline bool isDmabufSupportedByGpu(int gpuId) {
6245
static std::unordered_map<int, bool> cache;
6346
if (gpuId < 0 || !IBVerbs::isDmabufSupported()) {
@@ -92,8 +75,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
9275
uintptr_t addr = buffIntPtr & -pageSize;
9376
std::size_t pages = (size + (buffIntPtr - addr) + pageSize - 1) / pageSize;
9477

95-
bool isGpuBuff = isGpuAddr(buff_);
96-
int gpuId = isGpuBuff ? gpuAddrToDeviceId(reinterpret_cast<CUdeviceptr>(buff_)) : -1;
78+
int gpuId = detail::gpuIdFromAddress(buff_);
79+
bool isGpuBuff = (gpuId != -1);
9780
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
9881
#if !defined(__HIP_PLATFORM_AMD__)
9982
int fd;

src/registered_memory.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
5353
pidHash(getPidHash()),
5454
transports(transports) {
5555
if (transports.has(Transport::CudaIpc)) {
56+
CudaDeviceGuard deviceGuard(detail::gpuIdFromAddress(data));
57+
5658
TransportInfo transportInfo;
5759
transportInfo.transport = Transport::CudaIpc;
5860

@@ -204,7 +206,15 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
204206
// The memory is local to the process, so originalDataPtr is valid as is
205207
this->data = this->originalDataPtr;
206208
if (this->isCuMemMapAlloc) {
207-
detail::setReadWriteMemoryAccess(this->data, this->baseDataSize);
209+
// Query which device owns this memory
210+
int gpuId = detail::gpuIdFromAddress(this->data);
211+
int currentDevice = -1;
212+
MSCCLPP_CUDATHROW(cudaGetDevice(&currentDevice));
213+
214+
// Only set access if we're on a different device than where memory was allocated
215+
if (gpuId != currentDevice) {
216+
detail::setReadWriteMemoryAccess(this->data, this->baseDataSize);
217+
}
208218
}
209219
} else if (transports.has(Transport::CudaIpc)) {
210220
// The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle

src/semaphore.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
5151
if (localDevice.id < 0) {
5252
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
5353
}
54-
MSCCLPP_CUDATHROW(cudaSetDevice(localDevice.id));
54+
CudaDeviceGuard deviceGuard(localDevice.id);
5555
token_ = gpuCallocToken(connection_.context());
5656
} else {
5757
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);

0 commit comments

Comments
 (0)