Commit a38c2ee

FP8 support for Allreduce (#646)
Add FP8 support for Allreduce on both the NVIDIA and AMD platforms, and add the new data types fp8_e4m3 and fp8_e5m2.

Co-authored-by: Binyang Li <[email protected]>
1 parent fc0aaaf commit a38c2ee

11 files changed (+680, −59 lines)

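Before the per-file diffs, here is a hypothetical caller-side sketch of what this change enables, assuming the NCCL-style ncclAllReduce entry point exposed by this shim and the include path "mscclpp/nccl.h"; the function, buffer names, and setup are illustrative and not part of the commit.

```cpp
#include <cstddef>
#include <cuda_runtime.h>
#include "mscclpp/nccl.h"  // include path assumed from apps/nccl/include/

// Hypothetical example: sum-allreduce a buffer of FP8 e4m3 values using the
// new ncclFloat8e4m3 data type. `comm` and `stream` are assumed to have been
// created with the usual ncclCommInitRank / cudaStreamCreate setup.
void allreduceFp8Example(ncclComm_t comm, cudaStream_t stream, size_t count) {
  void* sendbuff = nullptr;
  void* recvbuff = nullptr;
  cudaMalloc(&sendbuff, count * ncclTypeSize(ncclFloat8e4m3));  // 1 byte per element
  cudaMalloc(&recvbuff, count * ncclTypeSize(ncclFloat8e4m3));
  ncclAllReduce(sendbuff, recvbuff, count, ncclFloat8e4m3, ncclSum, comm, stream);
  cudaStreamSynchronize(stream);
  cudaFree(sendbuff);
  cudaFree(recvbuff);
}
```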

apps/nccl/include/mscclpp/nccl.h

Lines changed: 4 additions & 15 deletions
```diff
@@ -248,17 +248,10 @@ typedef enum {
   ncclFloat = 7,
   ncclFloat64 = 8,
   ncclDouble = 8,
-#if defined(__CUDA_BF16_TYPES_EXIST__) && defined(__CUDA_FP8_TYPES_EXIST__)
   ncclBfloat16 = 9,
-  ncclFp8E4M3 = 10,
-  ncclFp8E5M2 = 11,
+  ncclFloat8e4m3 = 10,
+  ncclFloat8e5m2 = 11,
   ncclNumTypes = 12
-#elif defined(__CUDA_BF16_TYPES_EXIST__)
-  ncclBfloat16 = 9,
-  ncclNumTypes = 10
-#else
-  ncclNumTypes = 9
-#endif
 } ncclDataType_t;
 
 static inline size_t ncclTypeSize(ncclDataType_t type) {
@@ -278,15 +271,11 @@ static inline size_t ncclTypeSize(ncclDataType_t type) {
       return 4;
     case ncclFloat64:
       return 8;
-#if defined(__CUDA_BF16_TYPES_EXIST__)
     case ncclBfloat16:
       return 2;
-#endif  // defined(__CUDA_BF16_TYPES_EXIST__)
-#if defined(__CUDA_FP8_TYPES_EXIST__)
-    case ncclFp8E4M3:
-    case ncclFp8E5M2:
+    case ncclFloat8e4m3:
+    case ncclFloat8e5m2:
       return 1;
-#endif  // defined(__CUDA_FP8_TYPES_EXIST__)
     case ncclNumTypes:
       return 0;
   }
```
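Because the FP8 enum members above are no longer wrapped in __CUDA_FP8_TYPES_EXIST__, their values and ncclNumTypes are the same in every build. A small illustrative check (not from the commit; the include path is assumed):

```cpp
#include <cstddef>
#include "mscclpp/nccl.h"  // path assumed from apps/nccl/include/

// These follow directly from the unconditional enum definition above.
static_assert(ncclFloat8e4m3 == 10 && ncclFloat8e5m2 == 11, "FP8 enum values");
static_assert(ncclNumTypes == 12, "type count no longer depends on CUDA type macros");

// Both FP8 formats occupy 1 byte per element, so sizing a buffer reduces to:
inline size_t fp8BufferBytes(size_t nelems) {
  return nelems * ncclTypeSize(ncclFloat8e4m3);  // equals nelems
}
```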

apps/nccl/src/allreduce.cu

Lines changed: 47 additions & 21 deletions
```diff
@@ -71,13 +71,20 @@ struct NvlsAdapter {
                    mscclpp::DeviceHandle<mscclpp::SwitchChannel>* nvlsOutChannels, size_t channelInOffset,
                    size_t channelOutOffset, size_t, int rank, int nRanksPerNode, int, size_t nelems,
                    cudaStream_t stream, uint32_t*, uint32_t*, uint32_t*, uint32_t) {
-    using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
-    int nBlocks = nRanksPerNode;
-    int nThreadsPerBlock = 1024;
-    allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels,
-                                                            channelInOffset, channelOutOffset, nelems * sizeof(T), rank,
-                                                            nRanksPerNode);
-    return cudaGetLastError();
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
+    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+      return cudaErrorNotSupported;
+    } else
+#endif
+    {
+      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
+      int nBlocks = nRanksPerNode;
+      int nThreadsPerBlock = 1024;
+      allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels,
+                                                              nvlsOutChannels, channelInOffset, channelOutOffset,
+                                                              nelems * sizeof(T), rank, nRanksPerNode);
+      return cudaGetLastError();
+    }
   }
 };
```
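The new guard pairs an `if constexpr` type check with an `#if defined(__CUDA_ARCH__)` wrapper so that FP8 element types fall back to cudaErrorNotSupported rather than instantiating the NVLS kernel launch. Below is a minimal standalone sketch of the `if constexpr` part of that pattern; the kernel and helper names are hypothetical, the CUDA built-in FP8 types are used in place of the project's __fp8_e4m3/__fp8_e5m2 aliases, and the __CUDA_ARCH__ wrapper is omitted for brevity.

```cpp
#include <cuda_fp8.h>      // defines __nv_fp8_e4m3 / __nv_fp8_e5m2 (CUDA 11.8+)
#include <cuda_runtime.h>
#include <type_traits>

// Stand-in kernel; the commit launches allreduce9 here instead.
template <typename T>
__global__ void dummyReduce(T* buf, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = buf[i];  // placeholder work
}

// Same shape as NvlsAdapter::call above: FP8 element types take the
// compile-time branch and report "not supported" without ever
// instantiating the kernel launch for those types.
template <typename T>
cudaError_t launchIfSupported(T* buf, size_t n, cudaStream_t stream) {
  if constexpr (std::is_same_v<T, __nv_fp8_e4m3> || std::is_same_v<T, __nv_fp8_e5m2>) {
    return cudaErrorNotSupported;
  } else {
    unsigned int nBlocks = static_cast<unsigned int>((n + 1023) / 1024);
    dummyReduce<T><<<nBlocks, 1024, 0, stream>>>(buf, n);
    return cudaGetLastError();
  }
}
```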

```diff
@@ -88,21 +95,28 @@ struct NvlsWithCopyAdapter {
                    mscclpp::DeviceHandle<mscclpp::SwitchChannel>*, size_t, size_t, size_t scratchBufferSize,
                    int rank, int nRanksPerNode, int, size_t nelems, cudaStream_t stream, uint32_t*, uint32_t*,
                    uint32_t*, uint32_t) {
-    using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
-    if (sizeof(T) * nelems < (1 << 24)) {
-      int nBlocks = nRanksPerNode * 4;
-      int nThreadsPerBlock = 1024;
-      allreduce10<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                               nvlsChannels, nelems * sizeof(T), scratchBufferSize,
-                                                               rank, nRanksPerNode);
-    } else {
-      int nBlocks = nRanksPerNode * 5;
-      int nThreadsPerBlock = 1024;
-      allreduce11<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
-                                                               nvlsChannels, nelems * sizeof(T), scratchBufferSize,
-                                                               rank, nRanksPerNode);
+#if defined(__CUDA_ARCH__)  // Skip the __CUDA_ARCH__ < 1000 since FP8 has not been supported for NVLS
+    if constexpr (std::is_same_v<T, __fp8_e4m3> || std::is_same_v<T, __fp8_e5m2>) {
+      return cudaErrorNotSupported;
+    } else
+#endif
+    {
+      using ChannelType = mscclpp::DeviceHandle<mscclpp::BaseMemoryChannel>;
+      if (sizeof(T) * nelems < (1 << 24)) {
+        int nBlocks = nRanksPerNode * 4;
+        int nThreadsPerBlock = 1024;
+        allreduce10<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
+                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
+                                                                 rank, nRanksPerNode);
+      } else {
+        int nBlocks = nRanksPerNode * 5;
+        int nThreadsPerBlock = 1024;
+        allreduce11<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>(input, scratch, output, (ChannelType*)memoryChannels,
+                                                                 nvlsChannels, nelems * sizeof(T), scratchBufferSize,
+                                                                 rank, nRanksPerNode);
+      }
+      return cudaGetLastError();
     }
-    return cudaGetLastError();
   }
 };
```

```diff
@@ -154,6 +168,12 @@ AllreduceFunc dispatch(ncclRedOp_t op, ncclDataType_t dtype) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
     } else if (dtype == ncclBfloat16) {
       return Adapter<SUM, __bfloat16>::call;
+#endif
+#if defined(__FP8_TYPES_EXIST__)
+    } else if (dtype == ncclFloat8e4m3) {
+      return Adapter<SUM, __fp8_e4m3>::call;
+    } else if (dtype == ncclFloat8e5m2) {
+      return Adapter<SUM, __fp8_e5m2>::call;
 #endif
     } else if (dtype == ncclInt32 || dtype == ncclUint32) {
       return Adapter<SUM, int>::call;
@@ -168,6 +188,12 @@ AllreduceFunc dispatch(ncclRedOp_t op, ncclDataType_t dtype) {
 #if defined(__CUDA_BF16_TYPES_EXIST__)
     } else if (dtype == ncclBfloat16) {
       return Adapter<MIN, __bfloat16>::call;
+#endif
+#if defined(__FP8_TYPES_EXIST__)
+    } else if (dtype == ncclFloat8e4m3) {
+      return Adapter<MIN, __fp8_e4m3>::call;
+    } else if (dtype == ncclFloat8e5m2) {
+      return Adapter<MIN, __fp8_e5m2>::call;
 #endif
     } else if (dtype == ncclInt32 || dtype == ncclUint32) {
       return Adapter<MIN, int>::call;
```
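The dispatch function maps (op, dtype) to a concrete launch adapter, and the new FP8 branches are compiled only when the FP8 types exist (the commit guards them with __FP8_TYPES_EXIST__). Below is a minimal, self-contained sketch of that ladder shape; AllreduceFunc, Adapter, and the Fp8 stand-in types are hypothetical, and only the enum values (7, 10, 11) come from nccl.h above.

```cpp
#include <cstddef>
#include <cstdint>

using AllreduceFunc = int (*)(const void* input, void* output, size_t nelems);

struct Fp8E4M3 { uint8_t bits; };  // stand-in for __fp8_e4m3
struct Fp8E5M2 { uint8_t bits; };  // stand-in for __fp8_e5m2

// Placeholder adapter; the real Adapter<OP, T>::call launches a CUDA kernel.
template <typename T>
struct Adapter {
  static int call(const void*, void*, size_t) { return 0; }
};

// Same ladder shape as dispatch() above: each supported dtype selects one
// template instantiation, and unsupported dtypes fall through to nullptr.
AllreduceFunc dispatchSketch(int dtype) {
  if (dtype == 7) {          // ncclFloat
    return Adapter<float>::call;
  } else if (dtype == 10) {  // ncclFloat8e4m3
    return Adapter<Fp8E4M3>::call;
  } else if (dtype == 11) {  // ncclFloat8e5m2
    return Adapter<Fp8E5M2>::call;
  }
  return nullptr;            // unsupported dtype
}
```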
