Skip to content

Commit ddf84a6

Browse files
authored
Add CudaDeviceGuard (#691)
Add an RAII guard that sets a proper GPU device before a CUDA API call. We may make this stateful in the future to minimize `cudaGetDevice()` calls. This PR fixes a bug in tutorial 01.
1 parent 17247cd commit ddf84a6

File tree

7 files changed

+64
-35
lines changed

7 files changed

+64
-35
lines changed

include/mscclpp/gpu_utils.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,10 +41,26 @@ namespace mscclpp {
4141
struct AvoidCudaGraphCaptureGuard {
4242
AvoidCudaGraphCaptureGuard();
4343
~AvoidCudaGraphCaptureGuard();
44+
AvoidCudaGraphCaptureGuard(const AvoidCudaGraphCaptureGuard&) = delete;
45+
AvoidCudaGraphCaptureGuard& operator=(const AvoidCudaGraphCaptureGuard&) = delete;
46+
AvoidCudaGraphCaptureGuard(AvoidCudaGraphCaptureGuard&&) = delete;
47+
AvoidCudaGraphCaptureGuard& operator=(AvoidCudaGraphCaptureGuard&&) = delete;
4448
cudaStreamCaptureMode mode_;
4549
bool active_;
4650
};
4751

52+
/// A RAII guard that will set the current device on construction and restore the previous device on destruction.
53+
struct CudaDeviceGuard {
54+
CudaDeviceGuard(int deviceId);
55+
~CudaDeviceGuard();
56+
CudaDeviceGuard(const CudaDeviceGuard&) = delete;
57+
CudaDeviceGuard& operator=(const CudaDeviceGuard&) = delete;
58+
CudaDeviceGuard(CudaDeviceGuard&&) = delete;
59+
CudaDeviceGuard& operator=(CudaDeviceGuard&&) = delete;
60+
int deviceId_;
61+
int origDeviceId_;
62+
};
63+
4864
/// A RAII wrapper around cudaStream_t that will call cudaStreamDestroy on destruction.
4965
struct CudaStreamWithFlags {
5066
/// Constructor without flags. This will not create any stream. set() can be called later to create a stream with
@@ -128,6 +144,7 @@ std::shared_ptr<GpuStreamPool> gpuStreamPool();
128144
namespace detail {
129145

130146
void setReadWriteMemoryAccess(void* base, size_t size);
147+
int gpuIdFromAddress(void* ptr);
131148

132149
void* gpuCalloc(size_t bytes);
133150
void* gpuCallocHost(size_t bytes, unsigned int flags);

src/connection.cc

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,18 +96,11 @@ CudaIpcConnection::CudaIpcConnection(std::shared_ptr<Context> context, const End
9696
} else if (localEndpoint.device().type == DeviceType::GPU && remoteEndpoint.device().type == DeviceType::GPU) {
9797
if (isSameProcess(localEndpoint, remoteEndpoint) && localDeviceId != remoteDeviceId) {
9898
// Connecting two GPUs in the same process - need to enable peer access explicitly
99-
int originalDeviceId;
100-
MSCCLPP_CUDATHROW(cudaGetDevice(&originalDeviceId));
101-
if (originalDeviceId != localDeviceId) {
102-
MSCCLPP_CUDATHROW(cudaSetDevice(localDeviceId));
103-
}
99+
CudaDeviceGuard deviceGuard(localDeviceId);
104100
auto ret = cudaDeviceEnablePeerAccess(remoteDeviceId, 0);
105101
if (ret != cudaSuccess && ret != cudaErrorPeerAccessAlreadyEnabled) {
106102
MSCCLPP_CUDATHROW(ret);
107103
}
108-
if (originalDeviceId != localDeviceId) {
109-
MSCCLPP_CUDATHROW(cudaSetDevice(originalDeviceId));
110-
}
111104
}
112105
}
113106
int streamDeviceId = (localEndpoint.device().type == DeviceType::GPU) ? localDeviceId : remoteDeviceId;

src/context.cc

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,28 @@ CudaIpcStream::CudaIpcStream(int deviceId)
1919

2020
void CudaIpcStream::setStreamIfNeeded() {
2121
if (!env()->cudaIpcUseDefaultStream && stream_->empty()) {
22-
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
2322
stream_->set(cudaStreamNonBlocking);
2423
}
2524
}
2625

2726
void CudaIpcStream::memcpyD2D(void *dst, const void *src, size_t nbytes) {
27+
CudaDeviceGuard deviceGuard(deviceId_);
2828
setStreamIfNeeded();
2929
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, *stream_));
3030
dirty_ = true;
3131
}
3232

3333
void CudaIpcStream::memcpyH2D(void *dst, const void *src, size_t nbytes) {
34+
CudaDeviceGuard deviceGuard(deviceId_);
3435
setStreamIfNeeded();
3536
MSCCLPP_CUDATHROW(cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, *stream_));
3637
dirty_ = true;
3738
}
3839

3940
void CudaIpcStream::sync() {
40-
setStreamIfNeeded();
4141
if (dirty_) {
42+
CudaDeviceGuard deviceGuard(deviceId_);
43+
setStreamIfNeeded();
4244
MSCCLPP_CUDATHROW(cudaStreamSynchronize(*stream_));
4345
dirty_ = false;
4446
}

src/gpu_utils.cc

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,21 @@ AvoidCudaGraphCaptureGuard::~AvoidCudaGraphCaptureGuard() {
6666
(void)cudaThreadExchangeStreamCaptureMode(&mode_);
6767
}
6868

69+
CudaDeviceGuard::CudaDeviceGuard(int deviceId) : deviceId_(deviceId), origDeviceId_(-1) {
70+
if (deviceId_ >= 0) {
71+
MSCCLPP_CUDATHROW(cudaGetDevice(&origDeviceId_));
72+
if (origDeviceId_ != deviceId_) {
73+
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
74+
}
75+
}
76+
}
77+
78+
CudaDeviceGuard::~CudaDeviceGuard() {
79+
if (deviceId_ >= 0 && origDeviceId_ >= 0 && origDeviceId_ != deviceId_) {
80+
(void)cudaSetDevice(origDeviceId_);
81+
}
82+
}
83+
6984
CudaStreamWithFlags::CudaStreamWithFlags() : stream_(nullptr) { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId_)); }
7085

7186
CudaStreamWithFlags::CudaStreamWithFlags(unsigned int flags) {
@@ -79,11 +94,8 @@ CudaStreamWithFlags::~CudaStreamWithFlags() {
7994

8095
void CudaStreamWithFlags::set(unsigned int flags) {
8196
if (!empty()) throw Error("CudaStreamWithFlags already set", ErrorCode::InvalidUsage);
82-
int originalDeviceId;
83-
MSCCLPP_CUDATHROW(cudaGetDevice(&originalDeviceId)); // Save the current device
84-
MSCCLPP_CUDATHROW(cudaSetDevice(deviceId_));
97+
CudaDeviceGuard deviceGuard(deviceId_);
8598
MSCCLPP_CUDATHROW(cudaStreamCreateWithFlags(&stream_, flags));
86-
MSCCLPP_CUDATHROW(cudaSetDevice(originalDeviceId)); // Restore the original device
8799
}
88100

89101
bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }
@@ -123,6 +135,18 @@ namespace detail {
123135

124136
CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
125137

138+
int gpuIdFromAddress(void* ptr) {
139+
int deviceId;
140+
auto res = cuPointerGetAttribute(&deviceId, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, reinterpret_cast<CUdeviceptr>(ptr));
141+
if (res == CUDA_ERROR_INVALID_VALUE) {
142+
// not a GPU address
143+
return -1;
144+
} else {
145+
MSCCLPP_CUTHROW(res);
146+
}
147+
return deviceId;
148+
}
149+
126150
/// set memory access permission to read-write
127151
/// @param base Base memory pointer.
128152
/// @param size Size of the memory.

src/ib.cc

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,6 @@ namespace mscclpp {
4141

4242
#if defined(USE_IBVERBS)
4343

44-
static inline bool isGpuAddr(void* ptr) {
45-
CUmemorytype memType;
46-
auto res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, reinterpret_cast<CUdeviceptr>(ptr));
47-
if (res == CUDA_ERROR_INVALID_VALUE) {
48-
return false;
49-
} else if (res != CUDA_SUCCESS) {
50-
MSCCLPP_CUTHROW(res);
51-
}
52-
return (memType == CU_MEMORYTYPE_DEVICE);
53-
}
54-
55-
static inline int gpuAddrToDeviceId(CUdeviceptr devPtr) {
56-
int deviceId;
57-
MSCCLPP_CUTHROW(cuPointerGetAttribute(&deviceId, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, devPtr));
58-
return deviceId;
59-
}
60-
6144
static inline bool isDmabufSupportedByGpu(int gpuId) {
6245
static std::unordered_map<int, bool> cache;
6346
if (gpuId < 0 || !IBVerbs::isDmabufSupported()) {
@@ -92,8 +75,8 @@ IbMr::IbMr(ibv_pd* pd, void* buff, std::size_t size) : mr_(nullptr), buff_(buff)
9275
uintptr_t addr = buffIntPtr & -pageSize;
9376
std::size_t pages = (size + (buffIntPtr - addr) + pageSize - 1) / pageSize;
9477

95-
bool isGpuBuff = isGpuAddr(buff_);
96-
int gpuId = isGpuBuff ? gpuAddrToDeviceId(reinterpret_cast<CUdeviceptr>(buff_)) : -1;
78+
int gpuId = detail::gpuIdFromAddress(buff_);
79+
bool isGpuBuff = (gpuId != -1);
9780
if (isGpuBuff && isDmabufSupportedByGpu(gpuId)) {
9881
#if !defined(__HIP_PLATFORM_AMD__)
9982
int fd;

src/registered_memory.cc

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
5353
pidHash(getPidHash()),
5454
transports(transports) {
5555
if (transports.has(Transport::CudaIpc)) {
56+
CudaDeviceGuard deviceGuard(detail::gpuIdFromAddress(data));
57+
5658
TransportInfo transportInfo;
5759
transportInfo.transport = Transport::CudaIpc;
5860

@@ -204,7 +206,15 @@ RegisteredMemory::Impl::Impl(const std::vector<char>::const_iterator& begin,
204206
// The memory is local to the process, so originalDataPtr is valid as is
205207
this->data = this->originalDataPtr;
206208
if (this->isCuMemMapAlloc) {
207-
detail::setReadWriteMemoryAccess(this->data, this->baseDataSize);
209+
// Query which device owns this memory
210+
int gpuId = detail::gpuIdFromAddress(this->data);
211+
int currentDevice = -1;
212+
MSCCLPP_CUDATHROW(cudaGetDevice(&currentDevice));
213+
214+
// Only set access if we're on a different device than where memory was allocated
215+
if (gpuId != currentDevice) {
216+
detail::setReadWriteMemoryAccess(this->data, this->baseDataSize);
217+
}
208218
}
209219
} else if (transports.has(Transport::CudaIpc)) {
210220
// The memory is local to the machine but not to the process, so we need to open the CUDA IPC handle

src/semaphore.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ SemaphoreStub::Impl::Impl(const Connection& connection) : connection_(connection
5151
if (localDevice.id < 0) {
5252
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
5353
}
54-
MSCCLPP_CUDATHROW(cudaSetDevice(localDevice.id));
54+
CudaDeviceGuard deviceGuard(localDevice.id);
5555
token_ = gpuCallocToken(connection_.context());
5656
} else {
5757
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);

0 commit comments

Comments
 (0)