|
| 1 | +/************************************************************************* |
| 2 | + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. |
| 3 | + * |
| 4 | + * See LICENSE.txt for license information |
| 5 | + ************************************************************************/ |
| 6 | + |
| 7 | +#ifndef EVENT_H_ |
| 8 | +#define EVENT_H_ |
| 9 | + |
| 10 | +#include <sys/types.h> |
| 11 | +#include <stdint.h> |
| 12 | +#include <unistd.h> |
| 13 | +#include "profiler.h" |
| 14 | + |
| 15 | +#define MAX_CHANNELS 32 // capacity of the send[]/recv[] proxyOp arrays in struct collective |
| 16 | +#define MAX_STEPS 16 // capacity of the step[] array in struct proxyOp |
| 17 | + |
| 18 | +#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) // value subtracted by PROXY_OP_SEND_STATE_IDX; enum constant comes from profiler.h |
| 19 | +#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) // value subtracted by PROXY_OP_RECV_STATE_IDX |
| 20 | +#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait) // value subtracted by PROXY_STEP_SEND_STATE_IDX |
| 21 | +#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait) // value subtracted by PROXY_STEP_RECV_STATE_IDX |
| 22 | + |
| 23 | +#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1) // inclusive span SendPosted..SendDone; assumes these enum values are contiguous and ascending - confirm in profiler.h |
| 24 | +#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1) // inclusive span RecvPosted..RecvDone (same assumption) |
| 25 | +#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1) // inclusive span StepSendGPUWait..StepSendWait (same assumption) |
| 26 | +#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1) // inclusive span StepRecvWait..StepRecvGPUWait (same assumption) |
| 27 | + |
| 28 | +#define PROXY_OP_SEND_STATE_IDX(state) ((state) - PROXY_OP_SEND_STATE_OFFSET) // zero-based index of a send proxy-op state; argument is parenthesized so any expression may be passed |
| 29 | +#define PROXY_OP_RECV_STATE_IDX(state) ((state) - PROXY_OP_RECV_STATE_OFFSET) // zero-based index of a recv proxy-op state |
| 30 | +#define PROXY_STEP_SEND_STATE_IDX(state) ((state) - PROXY_STEP_SEND_STATE_OFFSET) // zero-based index of a send proxy-step state |
| 31 | +#define PROXY_STEP_RECV_STATE_IDX(state) ((state) - PROXY_STEP_RECV_STATE_OFFSET) // zero-based index of a recv proxy-step state |
| 32 | + |
| 33 | +#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) // larger of the send/recv op state counts; sizes proxyOp.states[] |
| 34 | +#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) // larger of the send/recv step state counts; sizes proxyStep.timestamp[] |
| 35 | + |
| 36 | +#define MAX_COMM_CLIQUES (32 * 8) // not referenced elsewhere in this header; NOTE(review): meaning of the 32 and 8 factors is not documented here |
| 37 | + |
| 38 | +struct proxyOp; // forward declaration: proxyStep holds a pointer back to its proxyOp |
| 39 | + |
| 40 | +struct proxyStep { // one network transfer event; stored in the step[] array of struct proxyOp |
| 41 | + uint8_t type; // type of event: network transfer |
| 42 | + int step; // network transfer id in given channel |
| 43 | + int isSend; // send/recv channel operation |
| 44 | + double timestamp[MAX_PROXY_STEP_STATES]; // one slot per step state, sized for the larger of the send/recv ranges; presumably indexed via PROXY_STEP_*_STATE_IDX - confirm in the .c file |
| 45 | + double startTs; // event start timestamp (time base not specified in this header) |
| 46 | + double stopTs; // event stop timestamp (same time base as startTs, presumably) |
| 47 | + struct proxyOp* parent; // parent proxy operation |
| 48 | +}; |
| 49 | + |
| 50 | +struct proxyOp { // proxy operation event; embeds up to MAX_STEPS network transfer events |
| 51 | + uint8_t type; // type of event: proxy operation |
| 52 | + uint8_t channelId; // channel id for this proxy operation |
| 53 | + pid_t pid; // process id associated with this operation; NOTE(review): which process is not stated here |
| 54 | + int rank; // rank associated with this operation; NOTE(review): presumably the local rank in the communicator - confirm |
| 55 | + int peer; // peer rank for this proxy operation |
| 56 | + int nSteps; // total number of network transfers for this proxy operation |
| 57 | + int chunkSize; // chunk size for this proxy operation |
| 58 | + int isSend; // send/recv channel operation |
| 59 | + size_t transSize; // transfer data size for this proxy operation |
| 60 | + struct { |
| 61 | + int steps; // completed steps for this proxy operation state |
| 62 | + double timestamp; // timestamp recorded for this state |
| 63 | + } states[MAX_PROXY_OP_STATES]; // one record per op state, sized for the larger of the send/recv ranges |
| 64 | + double startTs; // event start timestamp (time base not specified in this header) |
| 65 | + double stopTs; // event stop timestamp |
| 66 | + int stepCount; // last processed network operation for this proxy operation |
| 67 | + struct proxyStep step[MAX_STEPS]; // array of network transfer events |
| 68 | + struct taskEventBase* parent; // parent event p2p/collective |
| 69 | +}; |
| 70 | + |
| 71 | +struct group; // forward declaration: referenced by pointer in struct taskEventBase before its definition |
| 72 | +struct context; // forward declaration: referenced by pointer in proxyCtrl and group before its definition |
| 73 | + |
| 74 | +struct proxyCtrl { // presumably a proxy control event record; pooled via context.proxyCtrlPool |
| 75 | + uint8_t type; // event type tag (same leading field as the other event structs in this header) |
| 76 | + struct context* ctx; // profiler context |
| 77 | + double startTs; // event start timestamp (time base not specified in this header) |
| 78 | + double stopTs; // event stop timestamp |
| 79 | + int state; // NOTE(review): set of valid values is not defined in this header |
| 80 | + int appended; // appended proxy operations |
| 81 | +}; |
| 82 | + |
| 83 | +// task level event base structure |
| 84 | +struct taskEventBase { // embedded as the first member ("base") of struct collective and struct p2p |
| 85 | + uint8_t type; // event type: collective/p2p |
| 86 | + int rank; // rank of the operation in NCCL communicator |
| 87 | + const char* name; // FIXME: unused |
| 88 | + uint64_t commHash; // communicator identifier |
| 89 | + uint8_t func; // ncclFunc* |
| 90 | + int refCount; // number of references for this operation |
| 91 | + struct group* parent; // parent event group |
| 92 | + struct taskEventBase* next; // next top level event in group |
| 93 | + double startTs; // event start timestamp (time base not specified in this header) |
| 94 | + double stopTs; // event stop timestamp |
| 95 | +}; |
| 96 | + |
| 97 | +struct collective { // collective task event; holds per-channel send and recv proxy operation events |
| 98 | + struct taskEventBase base; // base structure for this event |
| 99 | + uint64_t seqNumber; // sequence number for this collective in communicator |
| 100 | + void const* sendBuff; // send buffer pointer (pointee is const) |
| 101 | + void* recvBuff; // receive buffer pointer |
| 102 | + size_t count; // NOTE(review): presumably the element count of the collective - confirm against the caller |
| 103 | + size_t trafficBytes; // NOTE(review): presumably bytes moved by this collective - confirm |
| 104 | + int root; // NOTE(review): presumably the root rank for rooted collectives - confirm |
| 105 | + uint8_t datatype; // NOTE(review): encoding not defined in this header |
| 106 | + uint8_t nMaxChannels; // NOTE(review): presumably bounded by MAX_CHANNELS, since send[]/recv[] have that capacity - confirm |
| 107 | + uint8_t algo; // NOTE(review): encoding not defined in this header |
| 108 | + uint8_t proto; // NOTE(review): encoding not defined in this header |
| 109 | + int op; // NOTE(review): encoding not defined in this header |
| 110 | + int nWarps; |
| 111 | + int isCollnet; |
| 112 | + int isNvls; |
| 113 | + struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events |
| 114 | + struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events |
| 115 | +}; |
| 116 | + |
| 117 | +struct p2p { // point-to-point task event; holds a single proxy operation event |
| 118 | + struct taskEventBase base; // base structure for this event |
| 119 | + uint8_t func; // NOTE(review): base.func already exists in taskEventBase; unclear from this header which one is authoritative |
| 120 | + void const* buff; // buffer pointer (pointee is const) |
| 121 | + size_t count; // NOTE(review): presumably the element count of the transfer - confirm |
| 122 | + uint8_t datatype; // NOTE(review): encoding not defined in this header |
| 123 | + int peer; // NOTE(review): presumably the peer rank, as in proxyOp.peer - confirm |
| 124 | + struct proxyOp op; // the proxy operation event for this p2p |
| 125 | +}; |
| 126 | + |
| 127 | +struct group { // group event: owns a linked queue of task events (taskEventBase) |
| 128 | + uint8_t type; // event type tag (same leading field as the other event structs in this header) |
| 129 | + struct context* ctx; // profiler context |
| 130 | + int groupId; // identifier of this group; NOTE(review): how ids are assigned is not visible here |
| 131 | + int refCount; // reference count for this group; NOTE(review): who holds references is not visible here |
| 132 | + struct taskEventBase* eventHead; // queue head for task events |
| 133 | + struct taskEventBase* eventTail; // queue tail for task events |
| 134 | + double startTs; // event start timestamp (time base not specified in this header) |
| 135 | + double stopTs; // event stop timestamp |
| 136 | + struct group* next; // next group event in queue |
| 137 | +}; |
| 138 | + |
| 139 | +// arrays for different event objects |
| 140 | +struct context { // profiler context: one pool (size/base/index + storage pointer) per event kind |
| 141 | + int groupPoolSize; // NOTE(review): presumably the number of entries in groupPool - confirm in the allocator |
| 142 | + int groupPoolBase; // NOTE(review): Base/Index semantics are defined by the pool code, not visible in this header |
| 143 | + int groupPoolIndex; |
| 144 | + struct group* groupPool; // storage for group events |
| 145 | + |
| 146 | + int collPoolSize; // same size/base/index layout as the group pool |
| 147 | + int collPoolBase; |
| 148 | + int collPoolIndex; |
| 149 | + struct collective* collPool; // storage for collective events |
| 150 | + |
| 151 | + int p2pPoolSize; // same size/base/index layout as the group pool |
| 152 | + int p2pPoolBase; |
| 153 | + int p2pPoolIndex; |
| 154 | + struct p2p* p2pPool; // storage for p2p events |
| 155 | + |
| 156 | + int proxyCtrlPoolSize; // same size/base/index layout as the group pool |
| 157 | + int proxyCtrlPoolBase; |
| 158 | + int proxyCtrlPoolIndex; |
| 159 | + struct proxyCtrl* proxyCtrlPool; // storage for proxyCtrl events |
| 160 | +}; |
| 161 | + |
| 162 | +int taskEventQueueEmpty(struct group* g); // presumably nonzero when g has no queued task events; defined outside this header - confirm |
| 163 | +void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event); // presumably appends event to g's queue (eventHead/eventTail) - confirm in the definition |
| 164 | +struct taskEventBase* taskEventQueueHead(struct group* g); // presumably returns the head event without removing it - confirm |
| 165 | +struct taskEventBase* taskEventQueueDequeue(struct group* g); // presumably removes and returns the head event; behavior on an empty queue is not visible here |
| 166 | + |
| 167 | +#endif |
0 commit comments