Skip to content

Commit a01b5aa

Browse files
authored
Merge branch 'main' into binyli/py-api
2 parents 0d5cb6a + fc0aaaf commit a01b5aa

File tree

10 files changed

+93
-8
lines changed

10 files changed

+93
-8
lines changed

cmake/CheckNvidiaGpu.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ if(NOT CUDAToolkit_FOUND)
99
return()
1010
endif()
1111

12-
set(CMAKE_CUDA_ARCHITECTURES "60")
12+
set(CMAKE_CUDA_ARCHITECTURES native)
1313
if(NOT CMAKE_CUDA_COMPILER)
1414
# In case the CUDA Toolkit directory is not in the PATH
1515
find_program(CUDA_COMPILER

include/mscclpp/core.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,7 @@ class Context : public std::enable_shared_from_this<Context> {
533533
friend class Endpoint;
534534
friend class Connection;
535535
friend class RegisteredMemory;
536+
friend class SemaphoreStub;
536537
};
537538

538539
/// Block of memory that has been registered to a Context.

include/mscclpp/gpu.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t;
2424
using CUmemAllocationProp = hipMemAllocationProp;
2525
using CUmemAccessDesc = hipMemAccessDesc;
2626
using CUmemAllocationHandleType = hipMemAllocationHandleType;
27+
using CUmemAllocationGranularity_flags = hipMemAllocationGranularity_flags;
2728

2829
constexpr auto cudaErrorPeerAccessAlreadyEnabled = hipErrorPeerAccessAlreadyEnabled;
2930
constexpr auto cudaErrorContextIsDestroyed = hipErrorContextIsDestroyed;
@@ -44,6 +45,7 @@ constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned;
4445
constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice;
4546
constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor;
4647
constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
48+
constexpr auto CU_MEM_ALLOC_GRANULARITY_MINIMUM = hipMemAllocationGranularityMinimum;
4749

4850
#ifndef CUDA_SUCCESS
4951
#define CUDA_SUCCESS hipSuccess
@@ -106,6 +108,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
106108
#define cuMemRetainAllocationHandle(...) hipMemRetainAllocationHandle(__VA_ARGS__)
107109
#define cuMemExportToShareableHandle(...) hipMemExportToShareableHandle(__VA_ARGS__)
108110
#define cuMemImportFromShareableHandle(...) hipMemImportFromShareableHandle(__VA_ARGS__)
111+
#define cuMemGetAllocationGranularity(...) hipMemGetAllocationGranularity(__VA_ARGS__)
109112

110113
#else
111114

include/mscclpp/gpu_utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ auto gpuCallocPhysicalUnique(size_t nelems = 1, size_t gran = 0, size_t align =
254254
}
255255

256256
size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag);
257+
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag);
257258

258259
#endif // CUDA_NVLS_API_AVAILABLE
259260

src/context.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,13 @@ IbCtx *Context::Impl::getIbContext(Transport ibTransport) {
5757
return it->second.get();
5858
}
5959

60+
std::shared_ptr<uint64_t> Context::Impl::getToken() {
61+
if (!tokenPool_) {
62+
tokenPool_ = std::make_shared<TokenPool>(maxNumTokens_);
63+
}
64+
return tokenPool_->getToken();
65+
}
66+
6067
MSCCLPP_API_CPP Context::Context() : pimpl_(std::make_unique<Impl>()) {}
6168

6269
MSCCLPP_API_CPP Context::~Context() = default;

src/gpu_utils.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,21 @@ void* gpuCallocUncached(size_t bytes) {
167167
#endif // defined(__HIP_PLATFORM_AMD__)
168168

169169
#if (CUDA_NVLS_API_AVAILABLE)
170+
// Query the allocation granularity for pinned device memory on the current
// CUDA device, with handle types matching those used by the physical
// allocators (POSIX FD and fabric).
//
// @param granFlag Which granularity to query (e.g. CU_MEM_ALLOC_GRANULARITY_MINIMUM).
// @return Granularity in bytes for the current device.
// @throws Error via MSCCLPP_CUDATHROW/MSCCLPP_CUTHROW on CUDA failure.
size_t getCuAllocationGranularity(CUmemAllocationGranularity_flags granFlag) {
  size_t gran = 0;
  int deviceId = -1;
  MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));

  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = deviceId;
  prop.requestedHandleTypes =
      (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
  // The return status was previously discarded, which would silently yield
  // gran == 0 on driver failure; check it like the other cuMem* calls.
  MSCCLPP_CUTHROW(cuMemGetAllocationGranularity(&gran, &prop, granFlag));
  return gran;
}
184+
170185
size_t getMulticastGranularity(size_t size, CUmulticastGranularity_flags granFlag) {
171186
size_t gran = 0;
172187
int numDevices = 0;

src/include/context.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,17 @@ class CudaIpcStream {
3535
int deviceId() const { return deviceId_; }
3636
};
3737

38+
class TokenPool;

struct Context::Impl {
  // IB contexts keyed by transport; see getIbContext.
  std::unordered_map<Transport, std::unique_ptr<IbCtx>> ibContexts_;
  std::vector<std::shared_ptr<CudaIpcStream>> ipcStreams_;
  // Lazily-created pool of semaphore tokens (see getToken); shared so that
  // outstanding tokens can keep the pool alive after the context goes away.
  std::shared_ptr<TokenPool> tokenPool_;
  const size_t maxNumTokens_ = 1 << 15;  // 32K tokens

  Impl();

  IbCtx *getIbContext(Transport ibTransport);
  // Returns one token from tokenPool_, constructing the pool on first call.
  std::shared_ptr<uint64_t> getToken();
};
4650

4751
} // namespace mscclpp

src/include/utils_internal.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,19 @@ struct PairHash {
6464
}
6565
};
6666

67+
// Fixed-capacity allocator of 64-bit "tokens" backed by a single GPU
// allocation. Allocation state is tracked in a bitmap (one bit per token);
// getToken() returns a shared_ptr whose deleter clears the bit, returning
// the token to the pool. enable_shared_from_this lets that deleter keep the
// pool alive while tokens are outstanding.
class TokenPool : public std::enable_shared_from_this<TokenPool> {
public:
  TokenPool(size_t nTokens);
  std::shared_ptr<uint64_t> getToken();

private:
  size_t nToken_;       // total number of tokens in the pool
  uint64_t* baseAddr_;  // base address of the backing allocation's mapped range
  uint64_t tailMask_;   // valid-bit mask for the last (possibly partial) bitmap word
  std::shared_ptr<uint64_t> tokens_;  // owning handle to the token storage
  std::vector<std::bitset<UINT64_WIDTH>> allocationMap_;  // set bit == token allocated
};
79+
6780
} // namespace mscclpp
6881

6982
#endif

src/semaphore.cc

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,20 @@ struct SemaphoreStub::Impl {
2020

2121
Impl(const std::vector<char>& data);
2222

23+
std::shared_ptr<uint64_t> gpuCallocToken(std::shared_ptr<Context> context);
24+
2325
std::shared_ptr<Connection> connection_;
2426
std::shared_ptr<uint64_t> token_;
2527
RegisteredMemory idMemory_;
2628
Device device_;
2729
};
2830

29-
static std::shared_ptr<uint64_t> gpuCallocToken() {
30-
// #if (CUDA_NVLS_API_AVAILABLE)
31-
// if (isNvlsSupported()) {
32-
// return detail::gpuCallocPhysicalShared<uint64_t>(1, 0);
33-
// }
34-
// #endif // CUDA_NVLS_API_AVAILABLE
31+
std::shared_ptr<uint64_t> SemaphoreStub::Impl::gpuCallocToken(std::shared_ptr<Context> context) {
32+
#if (CUDA_NVLS_API_AVAILABLE)
33+
if (isNvlsSupported()) {
34+
return context->pimpl_->getToken();
35+
}
36+
#endif // CUDA_NVLS_API_AVAILABLE
3537
#if defined(MSCCLPP_DEVICE_HIP)
3638
return detail::gpuCallocUncachedShared<uint64_t>();
3739
#else // !defined(MSCCLPP_DEVICE_HIP)
@@ -49,7 +51,7 @@ SemaphoreStub::Impl::Impl(std::shared_ptr<Connection> connection) : connection_(
4951
throw Error("Local GPU ID is not provided", ErrorCode::InvalidUsage);
5052
}
5153
MSCCLPP_CUDATHROW(cudaSetDevice(localDevice.id));
52-
token_ = gpuCallocToken();
54+
token_ = gpuCallocToken(connection_->context());
5355
} else {
5456
throw Error("Unsupported local device type", ErrorCode::InvalidUsage);
5557
}

src/utils_internal.cc

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <memory>
1313
#include <mscclpp/env.hpp>
1414
#include <mscclpp/errors.hpp>
15+
#include <mscclpp/gpu_utils.hpp>
1516
#include <sstream>
1617
#include <string>
1718

@@ -232,4 +233,42 @@ void getRandomData(void* buffer, size_t bytes) {
232233
}
233234
}
234235

236+
// Construct a pool backed by one physically-backed GPU allocation of
// `nToken` uint64 tokens, sized to the minimum CUDA allocation granularity.
// Only usable when the NVLS CUDA APIs are compiled in.
//
// @param nToken Number of tokens the pool can hand out.
// @throws Error(InvalidUsage) if built without NVLS support; CUDA errors
//         propagate via MSCCLPP_CUTHROW / the allocator.
TokenPool::TokenPool(size_t nToken) : nToken_(nToken) {
#if (CUDA_NVLS_API_AVAILABLE)
  tokens_ = detail::gpuCallocPhysicalShared<uint64_t>(
      nToken, detail::getCuAllocationGranularity(CU_MEM_ALLOC_GRANULARITY_MINIMUM));
  // The mapped range may be larger than requested; record its actual base
  // address, which the pointer -> bit-index arithmetic in getToken relies on.
  MSCCLPP_CUTHROW(cuMemGetAddressRange((CUdeviceptr*)(&baseAddr_), NULL, (CUdeviceptr)tokens_.get()));
  size_t nElems = (nToken + (UINT64_WIDTH - 1)) / UINT64_WIDTH;
  allocationMap_.resize(nElems, 0);
  // Mask of valid bits in the final bitmap word. Use 64-bit literals: `1UL`
  // and `~0UL` are only 32 bits wide on LLP64 platforms, which would make
  // the shift undefined for counts >= 32 and truncate the all-ones mask.
  tailMask_ = (nToken % UINT64_WIDTH) ? ((1ULL << (nToken % UINT64_WIDTH)) - 1) : ~0ULL;
#else
  throw Error("TokenPool only available on GPUs with NVLS support", ErrorCode::InvalidUsage);
#endif
}
248+
249+
std::shared_ptr<uint64_t> TokenPool::getToken() {
250+
auto deleter = [self = shared_from_this()](uint64_t* token) {
251+
size_t index = (token - self->baseAddr_) / UINT64_WIDTH;
252+
size_t bit = (token - self->baseAddr_) % UINT64_WIDTH;
253+
uint64_t mask = 1UL << bit;
254+
self->allocationMap_[index] &= ~mask;
255+
};
256+
257+
size_t size = allocationMap_.size();
258+
for (size_t i = 0; i < size; i++) {
259+
uint64_t ullong = allocationMap_[i].to_ullong();
260+
uint64_t mask = (i + 1 == size) ? tailMask_ : ~0ULL;
261+
uint64_t holes = (~ullong) & mask;
262+
if (!holes) continue;
263+
for (int bit = 0; bit < UINT64_WIDTH; bit++) {
264+
if (holes & (1UL << bit)) {
265+
allocationMap_[i].set(bit);
266+
INFO(MSCCLPP_ALLOC, "TokenPool allocated token at addr %p", baseAddr_ + i * UINT64_WIDTH + bit);
267+
return std::shared_ptr<uint64_t>(baseAddr_ + i * UINT64_WIDTH + bit, deleter);
268+
}
269+
}
270+
}
271+
throw Error("TokenPool is exhausted", ErrorCode::InternalError);
272+
}
273+
235274
} // namespace mscclpp

0 commit comments

Comments
 (0)