Skip to content

Commit 68b5423

Browse files
committed
2.23.4-1
Add scalable init API * Add new ncclCommInitRankScalable to allow for passing multiple unique IDs to the init function. * Spreads the load onto multiple bootstrap roots, allowing for constant bootstrap time. * Requires multiple ranks to create a unique ID, and the CPU-side ID exchange code to call allgather[v] instead of broadcast. Accelerate init bootstrap operations * Reduce the number of calls to allgather. * Allow roots to reply early to ranks when information is already available. * Add an option to use ncclNet instead of sockets to perform bootstrap allgather operations. Add PAT algorithms for Allgather and ReduceScatter * Parallel Aggregated Trees, variation of Bruck algorithm. * Logarithmic number of network steps for small sizes at scale. * Only supports one rank per node at the moment. Add support for registered buffers for intra-node communication. * Allow registered user buffers to be accessed directly intra-node * Avoids extra copies in algorithms which permit it, saving memory bandwidth and helping with compute overlap. Add profiler plugin API * New plugin API for profiling * Supports various levels of profiling, with a hierarchy. Asynchronous graph allocation * Make calls to cudaMalloc and cudaMemcpy during graph allocation asynchronous. * Significantly speeds up graph capture. Use fatal IB asynchronous events to stop network operation * Avoids many other error messages * Only fatal errors are affected; potentially transient errors (e.g. port down) do not cause an immediate stop. Set P2P level to PXB on AMD CPUs when using more than 2 GPUs per node * P2P would cause a significant performance degradation when using many GPUs, and therefore many interleaved data flows. * Disable P2P through the CPU when we have 3+ GPUs per node; keep it enabled when we only have 2 GPUs. Improve the init logs to report the real NCCL function. * Make the log report ncclCommInitRank or ncclCommSplit, rather than the generic ncclCommInitRankFunc. 
Add a parameter to set the location of the user configuration file. * Add NCCL_CONF_FILE environment variable to set where the user's configuration file resides. Increase default IB timeout * Increase IB timeout value from 18 to 20. * Should help avoid fatal errors on large RoCE systems. Add new check for nvidia peermem * On linux kernels 6.6+, /sys/kernel/mm/memory_peers is no longer present; check for /sys/module/nvidia_peermem/version instead. Fix old performance regression when mixing small and large operations. * Improves distribution of work on channels. Fix crash when NUMA IDs are equal to -1. * Can happen when a NIC is a virtual NIC, or when linux doesn't know which NUMA node a device is attached to * Issue NVIDIA/nccl-tests#233 Fix tree graph search when NCCL_CROSS_NIC is set to 1. * Would force NCCL to use the balanced_tree pattern, thereby disabling LL128 on platforms with 1 GPU+1 NIC per PCI switch. * Would also try to use alternate rings even though it was not needed. Compiler tweaks and fixes * PR #1177 * PR #1228 Fix stack smash * PR #1325 Fixes for multi-node NVLink + IB operation Coverity fixes and comments.
1 parent 178b6b7 commit 68b5423

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

88 files changed

+7127
-1973
lines changed

ext-profiler/example/Makefile

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#
2+
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
#
4+
# See LICENSE.txt for license information
5+
#
6+
# Build the example NCCL profiler plugin as a shared library.
# CUDA_HOME is not set in this file; it must come from the environment
# or from the make command line.
NCCL_HOME := ../../build
INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl
PLUGIN_SO := libnccl-profiler.so

# default and clean name actions, not files on disk.
.PHONY: default clean

default: $(PLUGIN_SO)

$(PLUGIN_SO): plugin.c event.c print_event.c
	$(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^

clean:
	rm -f $(PLUGIN_SO)

ext-profiler/example/event.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#include <stdio.h>
8+
#include "event.h"
9+
10+
int taskEventQueueEmpty(struct group* g) {
11+
return g->eventHead == NULL;
12+
}
13+
14+
/*
 * Append event at the tail of the task-event queue of group g.
 *
 * event->next is reset to NULL, so the event must not currently be
 * linked into another list. Neither g nor event may be NULL.
 */
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) {
  event->next = NULL;
  if (g->eventHead == NULL) {
    // Empty queue: the new event is also the head.
    g->eventHead = event;
  } else {
    g->eventTail->next = event;
  }
  g->eventTail = event;
}
20+
21+
struct taskEventBase* taskEventQueueHead(struct group* g) {
22+
return g->eventHead;
23+
}
24+
25+
struct taskEventBase* taskEventQueueDequeue(struct group* g) {
26+
struct taskEventBase* tmp = g->eventHead;
27+
g->eventHead = g->eventHead->next;
28+
if (g->eventHead == NULL) g->eventTail = NULL;
29+
return tmp;
30+
}

ext-profiler/example/event.h

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef EVENT_H_
8+
#define EVENT_H_
9+
10+
#include <sys/types.h>
11+
#include <stdint.h>
12+
#include <unistd.h>
13+
#include "profiler.h"
14+
15+
// Sizes of the fixed per-event arrays declared in this header.
#define MAX_CHANNELS 32
#define MAX_STEPS 16

// First state of each send/recv state range; subtracted from a state
// value to obtain a 0-based array index.
#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted)
#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted)
#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait)
#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait)

// Number of states in each range, both end points included.
#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1)
#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1)
#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1)
#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1)

// Map a state value to its 0-based index within its range. The argument
// is parenthesized so that expression arguments (e.g. a conditional
// expression) are evaluated as a whole before the offset is subtracted.
#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET)
#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET)
#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET)
#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET)

// Larger of the send/recv state counts; used to size arrays that must
// hold either a send or a recv state set.
#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES)
#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES)

#define MAX_COMM_CLIQUES (32 * 8)
37+
38+
struct proxyOp;
39+
40+
struct proxyStep {
41+
uint8_t type; // type of event: network transfer
42+
int step; // network transfer id in given channel
43+
int isSend; // send/recv channel operation
44+
double timestamp[MAX_PROXY_STEP_STATES];
45+
double startTs;
46+
double stopTs;
47+
struct proxyOp* parent;
48+
};
49+
50+
struct proxyOp {
51+
uint8_t type; // type of event: proxy operation
52+
uint8_t channelId; // channel id for this proxy operation
53+
pid_t pid;
54+
int rank;
55+
int peer; // peer rank for this proxy operation
56+
int nSteps; // total number of network transfers for this proxy operation
57+
int chunkSize; // chunk size for this proxy operation
58+
int isSend; // send/recv channel operation
59+
size_t transSize; // transfer data size for this proxy operation
60+
struct {
61+
int steps; // completed steps for this proxy operation state
62+
double timestamp;
63+
} states[MAX_PROXY_OP_STATES];
64+
double startTs;
65+
double stopTs;
66+
int stepCount; // last processed network operation for this proxy operation
67+
struct proxyStep step[MAX_STEPS]; // array of network transfer events
68+
struct taskEventBase* parent; // parent event p2p/collective
69+
};
70+
71+
struct group;
72+
struct context;
73+
74+
// Event record for a proxy control event.
struct proxyCtrl {
  uint8_t type;
  struct context* ctx;  // profiler context
  double startTs;       // start timestamp
  double stopTs;        // stop timestamp
  int state;
  int appended;         // appended proxy operations
};
82+
83+
// Task level event base structure. It is embedded as the first member
// ("base") of struct collective and struct p2p, and is the element type
// of a group's task-event queue.
struct taskEventBase {
  uint8_t type;                // event type: collective/p2p
  int rank;                    // rank of the operation in NCCL communicator
  const char* name;            // FIXME: unused
  uint64_t commHash;           // communicator identifier
  uint8_t func;                // ncclFunc*
  int refCount;                // number of references for this operation
  struct group* parent;        // parent event group
  struct taskEventBase* next;  // next top level event in group
  double startTs;              // start timestamp
  double stopTs;               // stop timestamp
};
96+
97+
struct collective {
98+
struct taskEventBase base; // base structure for this event
99+
uint64_t seqNumber; // sequence number for this collective in communicator
100+
void const* sendBuff;
101+
void* recvBuff;
102+
size_t count;
103+
size_t trafficBytes;
104+
int root;
105+
uint8_t datatype;
106+
uint8_t nMaxChannels;
107+
uint8_t algo;
108+
uint8_t proto;
109+
int op;
110+
int nWarps;
111+
int isCollnet;
112+
int isNvls;
113+
struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events
114+
struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events
115+
};
116+
117+
struct p2p {
118+
struct taskEventBase base; // base structure for this event
119+
uint8_t func;
120+
void const* buff;
121+
size_t count;
122+
uint8_t datatype;
123+
int peer;
124+
struct proxyOp op;
125+
};
126+
127+
// Event record for a group; owns a singly linked queue of task events
// (see the taskEventQueue* functions).
struct group {
  uint8_t type;
  struct context* ctx;              // profiler context
  int groupId;
  int refCount;
  struct taskEventBase* eventHead;  // queue head for task events
  struct taskEventBase* eventTail;  // queue tail for task events
  double startTs;                   // start timestamp
  double stopTs;                    // stop timestamp
  struct group* next;               // next group event in queue
};
138+
139+
// Profiler context: arrays (pools) for the different event objects.
// Each pool has a size, a base and an index next to its storage pointer;
// NOTE(review): the exact meaning of base/index is defined by the code
// that manages the pools, which is not part of this header.
struct context {
  // group events
  int groupPoolSize;
  int groupPoolBase;
  int groupPoolIndex;
  struct group* groupPool;

  // collective events
  int collPoolSize;
  int collPoolBase;
  int collPoolIndex;
  struct collective* collPool;

  // point-to-point events
  int p2pPoolSize;
  int p2pPoolBase;
  int p2pPoolIndex;
  struct p2p* p2pPool;

  // proxy control events
  int proxyCtrlPoolSize;
  int proxyCtrlPoolBase;
  int proxyCtrlPoolIndex;
  struct proxyCtrl* proxyCtrlPool;
};
161+
162+
// Task-event queue operations on a group (implemented in event.c).

// Non-zero when the group's queue has no events.
int taskEventQueueEmpty(struct group* g);
// Append event at the tail of the group's queue.
void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event);
// First event of the group's queue, not removed.
struct taskEventBase* taskEventQueueHead(struct group* g);
// Remove and return the first event of the group's queue.
struct taskEventBase* taskEventQueueDequeue(struct group* g);
166+
167+
#endif

ext-profiler/example/nccl/common.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef COMMON_H_
8+
#define COMMON_H_
9+
10+
/* Debug log levels, numbered 0 through 5. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;
11+
/* Debug log subsystems. Each value is a distinct power of two, so values
 * can be combined as a bit mask; NCCL_ALL has every bit set. */
typedef enum {
  NCCL_INIT      = 1,
  NCCL_COLL      = 2,
  NCCL_P2P       = 4,
  NCCL_SHM       = 8,
  NCCL_NET       = 16,
  NCCL_GRAPH     = 32,
  NCCL_TUNING    = 64,
  NCCL_ENV       = 128,
  NCCL_ALLOC     = 256,
  NCCL_CALL      = 512,
  NCCL_PROXY     = 1024,
  NCCL_NVLS      = 2048,
  NCCL_BOOTSTRAP = 4096,
  NCCL_REG       = 8192,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;
12+
13+
/* Pointer to a variadic logging function. It receives the log level, a
 * flags word, the source file and line of the call site, and a format
 * string followed by its arguments.
 * NOTE(review): fmt presumably follows printf conventions - confirm. */
typedef void (*ncclDebugLogger_t)(
    ncclDebugLogLevel level,
    unsigned long flags,
    const char *file,
    int line,
    const char *fmt,
    ...);
14+
15+
#endif

ext-profiler/example/nccl/err.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef NCCL_ERR_H_
8+
#define NCCL_ERR_H_
9+
10+
/* Error type for plugins: ncclSuccess is 0, every other value is an
 * error code. */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;
18+
19+
#endif
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/*************************************************************************
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef NCCL_PROFILER_H_
8+
#define NCCL_PROFILER_H_
9+
10+
#include <stdint.h>
11+
#include <stdlib.h>
12+
13+
#include "common.h"
14+
#include "err.h"
15+
16+
#include "profiler_v1.h"
17+
18+
#endif // end include guard

0 commit comments

Comments
 (0)