Fix reduce primitive (loop over nInputs, handle REDUCE op) and validate execution plan limits

yzygitzh · yzygitzh · commit aaff57f8616b · 2024-10-24T04:59:15.000Z
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
@@ -305,6 +305,15 @@ void ExecutionPlan::Impl::setupChannels(const json& gpus) {
   }
 }
 
+void ExecutionPlan::Impl::checkChannelsPerOperation(int channels) {  // Throws Error if channels exceeds the limit.
+  if (channels > MAX_CHANNEL_PER_OPERATION) {  // a count equal to the limit is accepted
+    throw Error("Executor plan has " + std::to_string(channels) +
+                    " channels per operation, exceeding executor support (" +
+                    std::to_string(MAX_CHANNEL_PER_OPERATION) + ")",
+                ErrorCode::ExecutorError);
+  }
+}
+
 void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffset, size_t constDstOffset) {
   // setup threadblocks and operations
   for (const auto& gpu : gpus) {
@@ -332,6 +341,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
         }
         if (op.contains("i_cids")) {
           operation.nInputs = op["i_cids"].size();
+          checkChannelsPerOperation(operation.nInputs);
           for (int i = 0; i < operation.nInputs; i++) {
             BufferType srcBufferType = convertToBufferType(op["i_buff"]["src"]);
             BufferType dstBufferType = convertToBufferType(op["i_buff"]["dst"]);
@@ -347,6 +357,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
         // will have either srcs or i_cids
         if (op.contains("srcs")) {
           operation.nInputs = op["srcs"].size();
+          checkChannelsPerOperation(operation.nInputs);
           operation.inputBufferType = convertToBufferType(op["srcs"][0]["buff"]);
           for (int i = 0; i < operation.nInputs; i++) {
             operation.inputOffsets[i] =
@@ -357,6 +368,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
         }
         if (op.contains("o_cids")) {
           operation.nOutputs = op["o_cids"].size();
+          checkChannelsPerOperation(operation.nOutputs);
           for (int i = 0; i < operation.nOutputs; i++) {
             BufferType srcBufferType = convertToBufferType(op["o_buff"]["src"]);
             BufferType dstBufferType = convertToBufferType(op["o_buff"]["dst"]);
@@ -371,6 +383,7 @@ void ExecutionPlan::Impl::setupOperations(const json& gpus, size_t constSrcOffse
         // will have either dsts or o_cids
         if (op.contains("dsts")) {
           operation.nOutputs = op["dsts"].size();
+          checkChannelsPerOperation(operation.nOutputs);
           operation.outputBufferType = convertToBufferType(op["dsts"][0]["buff"]);
           for (int i = 0; i < operation.nOutputs; i++) {
             operation.outputOffsets[i] =
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
@@ -287,8 +287,19 @@ struct Executor::Impl {
       DeviceExecutionPlan deviceExecutionPlan = {};
       std::vector<Operation> ops = plan.impl_->getOperations(rank, threadblock);
       deviceExecutionPlan.nOperations = ops.size();
+      if (deviceExecutionPlan.nOperations > MAX_OPERATION) {
+        throw Error("Executor plan has " + std::to_string(deviceExecutionPlan.nOperations) +
+                        " operations, exceeding executor support (" + std::to_string(MAX_OPERATION) + ")",
+                    ErrorCode::ExecutorError);
+      }
       deviceExecutionPlan.nSmChannels = plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock).size();
       deviceExecutionPlan.nProxyChannels = plan.impl_->threadblockProxyChannelMap.at(rank).at(threadblock).size();
+      if (deviceExecutionPlan.nSmChannels > MAX_CHANNEL || deviceExecutionPlan.nProxyChannels > MAX_CHANNEL) {
+        throw Error("Executor plan has " +
+                        std::to_string(std::max(deviceExecutionPlan.nSmChannels, deviceExecutionPlan.nProxyChannels)) +
+                        " channels, exceeding executor support (" + std::to_string(MAX_CHANNEL) + ")",
+                    ErrorCode::ExecutorError);
+      }
       int chanIndex = 0;
       for (const auto& [index, _] : plan.impl_->threadblockSMChannelMap.at(rank).at(threadblock)) {
         deviceExecutionPlan.channels.smChannels[chanIndex++] = mscclpp::deviceHandle(context.smChannels[index]);
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
@@ -349,11 +349,11 @@ MSCCLPP_DEVICE_INLINE void handleTransformToPacket(void* dst, void* src, uint32_
   mscclpp::putPackets<PacketType>(dst, dstOffset, src, srcOffset, size, threadIdx.x, blockDim.x, flag);
 }
 
-template <typename T>
+template <typename T, bool SendToRemote = true>
 MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes,
-                                            T* input, uint32_t* inputOffsets, DeviceHandle<SmChannel>* smChannels,
-                                            uint8_t* outputChannelIndexes, uint32_t* outputOffsets, int nOutChannels,
-                                            uint32_t size) {
+                                            T* input, uint32_t* inputOffsets, int nInputs,
+                                            DeviceHandle<SmChannel>* smChannels, uint8_t* outputChannelIndexes,
+                                            uint32_t* outputOffsets, int nOutChannels, uint32_t size) {
   const size_t nInt4 = size / sizeof(int4);
   const size_t srcOffset4 = srcOffsetByBytes / sizeof(int4);
   const size_t dstOffset4 = dstOffsetByBytes / sizeof(int4);
@@ -362,15 +362,17 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T
   int4* input4 = (int4*)input;
   for (size_t idx = threadIdx.x; idx < nInt4; idx += blockDim.x) {
     int4 tmp = src4[srcOffset4 + idx];
-    for (int index = 0; index < nOutChannels; ++index) {
+    for (int index = 0; index < nInputs; ++index) {
       size_t offset = inputOffsets[index] / sizeof(int4);
       int4 val = input4[offset + idx];
       tmp = add_vectors<T>(tmp, val);
     }
     dst4[dstOffset4 + idx] = tmp;
-    for (int index = 0; index < nOutChannels; ++index) {
-      size_t offset = outputOffsets[index] / sizeof(int4);
-      smChannels[outputChannelIndexes[index]].write<int4>(offset + idx, tmp);
+    if constexpr (SendToRemote) {
+      for (int index = 0; index < nOutChannels; ++index) {
+        size_t offset = outputOffsets[index] / sizeof(int4);
+        smChannels[outputChannelIndexes[index]].write<int4>(offset + idx, tmp);
+      }
     }
   }
   // handle rest of data
@@ -379,14 +381,16 @@ MSCCLPP_DEVICE_INLINE void handleReduceSend(T* dst, uint32_t dstOffsetByBytes, T
   const size_t endIdx = (srcOffsetByBytes + size) / sizeof(T);
   for (size_t idx = threadIdx.x + startIdx; idx < endIdx; idx += blockDim.x) {
     T tmp = src[idx];
-    for (int index = 0; index < nOutChannels; ++index) {
+    for (int index = 0; index < nInputs; ++index) {
       size_t offset = inputOffsets[index] / sizeof(T);
       tmp = add_elements(tmp, input[offset + idx]);
     }
     dst[idx] = tmp;
-    for (int index = 0; index < nOutChannels; ++index) {
-      size_t offset = outputOffsets[index] / sizeof(T);
-      smChannels[outputChannelIndexes[index]].write<T>(offset + idx, tmp);
+    if constexpr (SendToRemote) {
+      for (int index = 0; index < nOutChannels; ++index) {
+        size_t offset = outputOffsets[index] / sizeof(T);
+        smChannels[outputChannelIndexes[index]].write<T>(offset + idx, tmp);
+      }
     }
   }
 }
@@ -523,8 +527,14 @@ __global__ void executionKernel([[maybe_unused]] int rank /*for debug*/, T* inpu
       T* dst = getBuffer(input, output, scratch, op.dstBufferType);
       T* src = getBuffer(input, output, scratch, op.srcBufferType);
       T* tmp = getBuffer(input, output, scratch, op.inputBufferType);
-      handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, smChannels, op.outputChannelIndexes,
-                       op.outputOffsets, op.nOutputs, op.size);
+      handleReduceSend(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, op.nInputs, smChannels,
+                       op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size);
+    } else if (op.type == OperationType::REDUCE) {
+      T* dst = getBuffer(input, output, scratch, op.dstBufferType);
+      T* src = getBuffer(input, output, scratch, op.srcBufferType);
+      T* tmp = getBuffer(input, output, scratch, op.inputBufferType);
+      handleReduceSend<T, false>(dst, op.dstOffset, src, op.srcOffset, tmp, op.inputOffsets, op.nInputs, smChannels,
+                                 op.outputChannelIndexes, op.outputOffsets, op.nOutputs, op.size);
     }
 
 #if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT)
diff --git a/src/include/execution_plan.hpp b/src/include/execution_plan.hpp
@@ -108,6 +108,7 @@ struct ExecutionPlan::Impl {
   size_t getNChunkSize(int rank, size_t inputSize, size_t outputSize, uint32_t nChunks,
                        const std::vector<uint32_t> offsets) const;
   void calcScratchBufferSizeAndOffset(int rank, size_t inputSize, size_t outputSize, int flag);
+  void checkChannelsPerOperation(int channels);
 };
 
 }  // namespace mscclpp