@@ -382,7 +382,7 @@ __global__ void __launch_bounds__(512, 1)
382382 __syncthreads ();
383383 // Starts allgather
384384 for (size_t idx = threadIdx.x ; idx < nInt4PerChunk; idx += blockDim.x ) {
385- for (int i = 0 ; i < nPeer ; i++) {
385+ for (int i = 0 ; i < NPEER ; i++) {
386386 const int peerIdx = (i + blockIdx.x ) % nPeer;
387387 const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
388388 int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -399,13 +399,13 @@ __global__ void __launch_bounds__(512, 1)
399399
400400 for (size_t idx = threadIdx.x ; idx < nInt4PerChunk; idx += blockDim.x ) {
401401 int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
402- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
402+ for (int peerIdx = 0 ; peerIdx < NPEER ; peerIdx++) {
403403 const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
404404 int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
405405 data = add_vectors<T>(val, data);
406406 }
407407 resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
408- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
408+ for (int peerIdx = 0 ; peerIdx < NPEER ; peerIdx++) {
409409 outChannels[peerIdx].write (nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof (int4),
410410 data);
411411 }
@@ -419,7 +419,7 @@ __global__ void __launch_bounds__(512, 1)
419419 }
420420 __syncthreads ();
421421 for (size_t idx = threadIdx.x ; idx < restNInt4; idx += blockDim.x ) {
422- for (int i = 0 ; i < nPeer ; i++) {
422+ for (int i = 0 ; i < NPEER ; i++) {
423423 const int peerIdx = (i + blockIdx.x ) % nPeer;
424424 const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
425425 int4 val = buff4[nInt4PerRank * remoteRank + idx + offsetOfThisBlock];
@@ -435,13 +435,13 @@ __global__ void __launch_bounds__(512, 1)
435435
436436 for (size_t idx = threadIdx.x ; idx < restNInt4; idx += blockDim.x ) {
437437 int4 data = buff4[nInt4PerRank * rank + idx + offsetOfThisBlock];
438- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
438+ for (int peerIdx = 0 ; peerIdx < NPEER ; peerIdx++) {
439439 const int remoteRank = (peerIdx < rank) ? peerIdx : peerIdx + 1 ;
440440 int4 val = scratch4[chunkSizePerRank * remoteRank + blockOffset + idx];
441441 data = add_vectors<T>(val, data);
442442 }
443443 resultBuff4[nInt4PerRank * rank + idx + offsetOfThisBlock] = data;
444- for (int peerIdx = 0 ; peerIdx < nPeer ; peerIdx++) {
444+ for (int peerIdx = 0 ; peerIdx < NPEER ; peerIdx++) {
445445 outChannels[peerIdx].write (nInt4PerRank * rank + idx + offsetOfThisBlock + channelOutDataOffset / sizeof (int4),
446446 data);
447447 }
0 commit comments