Skip to content

Commit 6484dce

Browse files
committed
apps/nccl: performance optimization for allreduce7
Add unroll and non-temporal store
1 parent ca6741c commit 6484dce

File tree

3 files changed

+8
-4
lines changed

3 files changed

+8
-4
lines changed

apps/nccl/src/allreduce.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ __global__ void __launch_bounds__(1024, 1)
289289
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
290290
//uint32_t data = 0;
291291
uint2 data = make_uint2(0, 0);
292-
for (int index = 0; index < nPeers; index++) {
292+
for (int index = 0; index < NPEERS; index++) {
293293
const int remoteRank = index < rank ? index : index + 1;
294294
mscclpp::LLPacket* dstPkt = (mscclpp::LLPacket*)scratchBuff + remoteRank * nPktsPerRank;
295295
//uint32_t val = dstPkt[idx].read(flag, -1);
@@ -312,7 +312,7 @@ __global__ void __launch_bounds__(1024, 1)
312312
packet.data2 = data.y;
313313
packet.flag2 = flag;
314314
size_t offset = scratchResultOffset / sizeof(mscclpp::LLPacket) + (idx + rank * nPktsPerRank);
315-
for (int index = 0; index < nPeers; index++) {
315+
for (int index = 0; index < NPEERS; index++) {
316316
channels[index].write(offset, packet);
317317
}
318318
}

apps/nccl/src/common.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
#endif
1313

1414
constexpr int NRANKS_PER_NODE = 8;
15+
constexpr int NPEERS = 7;
16+
1517
constexpr int SCRATCH_SIZE = 2 * 1024 * 1024 * 70; // double buffer * 35 thread-blocks * 8 ranks * 256KB = 70MB
1618

1719
#endif // NCCL_COMMON_HPP_

include/mscclpp/packet_device.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,10 @@ union alignas(16) LL16Packet {
4949
#else // !defined(MSCCLPP_DEVICE_CUDA)
5050
uint4 reg = make_uint4(val1, flag, val2, flag);
5151
ulonglong2* p = reinterpret_cast<ulonglong2*>(&reg);
52-
atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
53-
atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);
52+
/*atomicStore(&(raw_.x), p->x, memoryOrderRelaxed);
53+
atomicStore(&(raw_.y), p->y, memoryOrderRelaxed);*/
54+
__builtin_nontemporal_store(p->x, &(raw_.x));
55+
__builtin_nontemporal_store(p->y, &(raw_.y));
5456
#endif
5557
}
5658

0 commit comments

Comments
 (0)