fix bug

yzygitzh · yzygitzh · commit 59d09179dba2 · 2024-10-22T14:02:42.000Z
diff --git a/src/executor/execution_kernel.cu b/src/executor/execution_kernel.cu
@@ -8,12 +8,12 @@ namespace mscclpp {
 
 template <typename PacketType>
 void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, void* src, void* dst, void* scratch,
-                                   size_t scratchSize, DataType dataType, DeviceExecutionPlan* plan,
-                                   size_t sharedMemSize, cudaStream_t stream, uint32_t flag) {
+                                   DataType dataType, DeviceExecutionPlan* plan, size_t sharedMemSize,
+                                   cudaStream_t stream, uint32_t flag) {
   switch (dataType) {
     case DataType::INT32:
       executionKernel<int32_t, PacketType><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, scratchSize, plan, flag
+          rank, (int32_t*)src, (int32_t*)dst, (int32_t*)scratch, plan, flag
 #if defined(ENABLE_NPKIT)
           ,
           NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
@@ -23,7 +23,7 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
       break;
     case DataType::UINT32:
       executionKernel<uint32_t><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, scratchSize, plan, flag
+          rank, (uint32_t*)src, (uint32_t*)dst, (uint32_t*)scratch, plan, flag
 #if defined(ENABLE_NPKIT)
           ,
           NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
@@ -33,7 +33,7 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
       break;
     case DataType::FLOAT16:
       executionKernel<half><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (half*)src, (half*)dst, (half*)scratch, scratchSize, plan, flag
+          rank, (half*)src, (half*)dst, (half*)scratch, plan, flag
 #if defined(ENABLE_NPKIT)
           ,
           NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
@@ -43,7 +43,7 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
       break;
     case DataType::FLOAT32:
       executionKernel<float><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (float*)src, (float*)dst, (float*)scratch, scratchSize, plan, flag
+          rank, (float*)src, (float*)dst, (float*)scratch, plan, flag
 #if defined(ENABLE_NPKIT)
           ,
           NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
@@ -53,7 +53,7 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
       break;
     case DataType::BFLOAT16:
       executionKernel<__bfloat16><<<nthreadblocks, nthreads, sharedMemSize, stream>>>(
-          rank, (__bfloat16*)src, (__bfloat16*)dst, (__bfloat16*)scratch, scratchSize, plan, flag
+          rank, (__bfloat16*)src, (__bfloat16*)dst, (__bfloat16*)scratch, plan, flag
 #if defined(ENABLE_NPKIT)
           ,
           NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
@@ -65,12 +65,10 @@ void ExecutionKernel::launchKernel(int rank, int nthreadblocks, int nthreads, vo
 }
 
 template void ExecutionKernel::launchKernel<LL16Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
-                                                        void* scratch, size_t scratchSize, DataType dataType,
-                                                        DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                                        cudaStream_t stream, uint32_t flag);
+                                                        void* scratch, DataType dataType, DeviceExecutionPlan* plan,
+                                                        size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
 template void ExecutionKernel::launchKernel<LL8Packet>(int rank, int nthreadblocks, int nthreads, void* src, void* dst,
-                                                       void* scratch, size_t scratchSize, DataType dataType,
-                                                       DeviceExecutionPlan* plan, size_t sharedMemSize,
-                                                       cudaStream_t stream, uint32_t flag);
+                                                       void* scratch, DataType dataType, DeviceExecutionPlan* plan,
+                                                       size_t sharedMemSize, cudaStream_t stream, uint32_t flag);
 }  // namespace mscclpp
 #endif
diff --git a/src/executor/execution_plan.cc b/src/executor/execution_plan.cc
@@ -100,7 +100,7 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy
 }
 
 std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const {
-  auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
+  auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };
   return filter(this->channelInfosByDstRank.at(rank), pred);
 }
 
@@ -159,10 +159,10 @@ size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize, siz
 
   size_t scratchBufferSize = sizePerRank * this->scratchChunks.at(rank);
   if (this->isUsingPacket) {
-    scratchBufferSize *= 2; // data + flag
+    scratchBufferSize *= 2; /* data + flag */
   }
   if (this->isUsingDoubleScratchBuffer) {
-    scratchBufferSize *= 2; // double buffer
+    scratchBufferSize *= 2; /* double buffer */
   }
   return scratchBufferSize;
 }
@@ -174,7 +174,7 @@ int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->oper
 
 int ExecutionPlan::Impl::getNThreadsPerBlock() const { return this->nThreadsPerBlock; }
 
-bool ExecutionPlan::Impl::getIsUsingDoubleScratchBuffer() const { return this->getIsUsingDoubleScratchBuffer; }
+bool ExecutionPlan::Impl::getIsUsingDoubleScratchBuffer() const { return this->isUsingDoubleScratchBuffer; }
 
 void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize, size_t contsSrcOffset,
                                             size_t constDstOffset) {
diff --git a/src/executor/executor.cc b/src/executor/executor.cc
@@ -309,12 +309,8 @@ struct Executor::Impl {
     static uint32_t flag = 0;
     int nthreadblocks = context.deviceExecutionPlans.size();
     char* kernelScratchBufferPtr = context.scratchBuffer.get();
-    size_t kernelScratchBufferSize = context.scratchBufferSize;
-    if (context.isUsingDoubleScratchBuffer) {
-      kernelScratchBufferSize /= 2;
-      if (flag % 2) {
-        kernelScratchBufferPtr += kernelScratchBufferSize;
-      }
+    if (context.isUsingDoubleScratchBuffer && (flag % 2)) {
+      kernelScratchBufferPtr += context.scratchBufferSize / 2;
     }
 #if defined(ENABLE_NPKIT)
 #if defined(__HIP_PLATFORM_AMD__)
@@ -332,13 +328,13 @@ struct Executor::Impl {
     switch (packetType) {
       case PacketType::LL16:
         ExecutionKernel::launchKernel<LL16Packet>(rank, nthreadblocks, context.nthreadsPerBlock, sendbuff, recvbuff,
-                                                  (void*)kernelScratchBufferPtr, kernelScratchBufferSize, dataType,
+                                                  (void*)kernelScratchBufferPtr, dataType,
                                                   (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(),
                                                   sharedMemSize, stream, ++flag);
         break;
       case PacketType::LL8:
         ExecutionKernel::launchKernel<LL8Packet>(rank, nthreadblocks, context.nthreadsPerBlock, sendbuff, recvbuff,
-                                                 (void*)kernelScratchBufferPtr, kernelScratchBufferSize, dataType,
+                                                 (void*)kernelScratchBufferPtr, dataType,
                                                  (DeviceExecutionPlan*)context.deviceExecutionPlansBuffer.get(),
                                                  sharedMemSize, stream, ++flag);
         break;
diff --git a/src/include/execution_kernel.hpp b/src/include/execution_kernel.hpp
@@ -301,7 +301,7 @@ MSCCLPP_DEVICE_INLINE void handlePutPacket(DeviceHandle<SmChannel>* smChannels,
 
 template <typename T, typename PacketType, bool SendToRemote = true>
 MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBytes, T* src, uint32_t srcOffsetByBytes,
-                                                  T* inputBuff, size_t inputBuffSize, uint32_t* inputOffsets, int nSrcs,
+                                                  T* inputBuff, uint32_t* inputOffsets, int nSrcs,
                                                   DeviceHandle<SmChannel>* smChannels, uint8_t* outputChannelIndexes,
                                                   uint32_t* outputOffsets, int nDstChannels, size_t size,
                                                   uint32_t flag) {
@@ -331,8 +331,8 @@ MSCCLPP_DEVICE_INLINE void handleReduceSendPacket(T* dst, uint32_t dstOffsetByBy
 }
 
 template <typename PacketType>
-MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize, uint32_t dstOffset,
-                                            uint32_t srcOffset, size_t size, uint32_t flag) {
+MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset, size_t size,
+                                            uint32_t flag) {
   PacketType* srcPackets = (PacketType*)((char*)src + 2 * srcOffset);
   PacketPayload<PacketType>* result = (PacketPayload<PacketType>*)((char*)dst + dstOffset);
   size_t nPackets = size * 2 / sizeof(PacketType);
@@ -343,8 +343,8 @@ MSCCLPP_DEVICE_INLINE void handleCopyPacket(void* dst, void* src, size_t srcSize
 }
 
 template <typename PacketType>
-MSCCLPP_DEVICE_INLINE void handleTransformToPacket(void* dst, void* src, size_t dstSize, uint32_t dstOffset,
-                                                   uint32_t srcOffset, size_t size, uint32_t flag) {
+MSCCLPP_DEVICE_INLINE void handleTransformToPacket(void* dst, void* src, uint32_t dstOffset, uint32_t srcOffset,
+                                                   size_t size, uint32_t flag) {
   dstOffset = dstOffset * 2;
   mscclpp::putPackets<PacketType>(dst, dstOffset, src, srcOffset, size, threadIdx.x, blockDim.x, flag);
 }

Original file line number	Diff line number	Diff line change
`@@ -100,7 +100,7 @@ std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfos(int rank, BufferTy`
`100`	`100`	`}`
`101`	`101`
`102`	`102`	`std::vector<ChannelInfo> ExecutionPlan::Impl::getChannelInfosByDstRank(int rank, BufferType bufferType) const {`
`103`		`- auto pred = [rank, bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };`
	`103`	`+ auto pred = [bufferType](const ChannelInfo& info) { return info.dstBufferType == bufferType; };`
`104`	`104`	`return filter(this->channelInfosByDstRank.at(rank), pred);`
`105`	`105`	`}`
`106`	`106`
`@@ -159,10 +159,10 @@ size_t ExecutionPlan::Impl::getScratchBufferSize(int rank, size_t inputSize, siz`
`159`	`159`
`160`	`160`	`size_t scratchBufferSize = sizePerRank * this->scratchChunks.at(rank);`
`161`	`161`	`if (this->isUsingPacket) {`
`162`		`- scratchBufferSize *= 2; // data + flag`
	`162`	`+ scratchBufferSize *= 2; /* data + flag */`
`163`	`163`	`}`
`164`	`164`	`if (this->isUsingDoubleScratchBuffer) {`
`165`		`- scratchBufferSize *= 2; // double buffer`
	`165`	`+ scratchBufferSize *= 2; /* double buffer */`
`166`	`166`	`}`
`167`	`167`	`return scratchBufferSize;`
`168`	`168`	`}`
`@@ -174,7 +174,7 @@ int ExecutionPlan::Impl::getThreadblockCount(int rank) const { return this->oper`
`174`	`174`
`175`	`175`	`int ExecutionPlan::Impl::getNThreadsPerBlock() const { return this->nThreadsPerBlock; }`
`176`	`176`
`177`		`-bool ExecutionPlan::Impl::getIsUsingDoubleScratchBuffer() const { return this->getIsUsingDoubleScratchBuffer; }`
	`177`	`+bool ExecutionPlan::Impl::getIsUsingDoubleScratchBuffer() const { return this->isUsingDoubleScratchBuffer; }`
`178`	`178`
`179`	`179`	`void ExecutionPlan::Impl::loadExecutionPlan(size_t inputSize, size_t outputSize, size_t contsSrcOffset,`
`180`	`180`	`size_t constDstOffset) {`