Changes from all commits
29 commits
b7c1fb1
EP: port internode_dispatch to amd.
zhenhuang12 Oct 31, 2025
05e7df8
EP: port internode_combine to amd.
zhenhuang12 Nov 3, 2025
04c6a14
ziming fix amd port
MaoZiming Nov 4, 2025
946db83
fixing wr bug
MaoZiming Nov 4, 2025
fbda03b
debugging
MaoZiming Nov 4, 2025
d6f3355
clean
MaoZiming Nov 4, 2025
1903181
Merge branch 'zm-amd-port' of https://github.com/uccl-project/uccl in…
MaoZiming Nov 4, 2025
7324fcf
checkpt
MaoZiming Nov 5, 2025
b136aaa
merge main
MaoZiming Nov 5, 2025
4d63369
adding wr_id_to_wr_ids emplace for normal mode atomics
MaoZiming Nov 5, 2025
de4abc4
EP: fix ep internode
zhenhuang12 Nov 6, 2025
49c0bab
EP: fix RDMAAndNVLForwarder copy data
zhenhuang12 Nov 11, 2025
e5b2ef3
merge main
MaoZiming Nov 11, 2025
a4d9d4b
Merge branch 'main' of https://github.com/uccl-project/uccl into zm-a…
YangZhou1997 Nov 14, 2025
026a836
Merge branch 'main' of https://github.com/uccl-project/uccl into zm-a…
YangZhou1997 Nov 14, 2025
b64412d
debugging
YangZhou1997 Nov 15, 2025
9cb051f
run on nebius
YangZhou1997 Nov 15, 2025
f19fd62
merge with main
YangZhou1997 Nov 24, 2025
3a7d093
fixing setup.py on amd
YangZhou1997 Nov 24, 2025
8c86136
add printf to internode
YangZhou1997 Nov 24, 2025
fa1d720
Merge branch 'main' into yang-amd-normal
zhenhuang12 Nov 26, 2025
b6451b4
debug internode on amd-gpu
zhenhuang12 Nov 26, 2025
a142d9e
trying to debug dispatch kernel hang issues, but fails
YangZhou1997 Nov 28, 2025
a2d9f26
merge
YangZhou1997 Nov 28, 2025
5d38ecb
EP: fix normal dispatch bug
zhenhuang12 Dec 1, 2025
33e9bb1
Merge branch 'main' into yang-amd-normal
zhenhuang12 Dec 1, 2025
97d905e
EP: format code
zhenhuang12 Dec 1, 2025
4a44245
EP: restore internode_ll.cu
zhenhuang12 Dec 1, 2025
61eba6f
EP: format code
zhenhuang12 Dec 1, 2025
2 changes: 1 addition & 1 deletion ep/bench/run_ep.sh
@@ -15,7 +15,7 @@ if [ "$MODE" = "ll" ]; then
else
torchrun --nnodes=$NNODES --nproc_per_node=8 --node_rank=$RANK \
--master_addr=$MAIN_IP --master_port=12355 \
test_internode.py --num-tokens=4096 \
--log-dir=./logs --redirect 3 test_internode.py --num-tokens=1024 \
--hidden=7168 --num-topk=8 --num-experts=288 --test-ll-compatibility
fi
# --log-dir=logs --redirect=3
8 changes: 8 additions & 0 deletions ep/include/ep_configs.cuh
@@ -12,11 +12,19 @@
// #define ENABLE_FAST_DEBUG
#ifndef ENABLE_FAST_DEBUG
#define NUM_CPU_TIMEOUT_SECS 100
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define NUM_TIMEOUT_CYCLES 20000000000ull
#else
#define NUM_TIMEOUT_CYCLES 200000000000ull // 200G cycles ~= 100s
#endif
#else
#define NUM_CPU_TIMEOUT_SECS 10
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#define NUM_TIMEOUT_CYCLES 2000000000ull
#else
#define NUM_TIMEOUT_CYCLES 20000000000ull // 20G cycles ~= 10s
#endif
#endif

#define LOW_LATENCY_SEND_PHASE 1
#define LOW_LATENCY_RECV_PHASE 2
16 changes: 0 additions & 16 deletions ep/include/ep_launch.cuh
@@ -1,21 +1,6 @@
#pragma once
#include "exception.cuh"

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#ifndef SETUP_LAUNCH_CONFIG
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
hipLaunchConfig_t cfg = {(num_sms), (num_threads), 0, stream, nullptr, 0}; \
hipLaunchAttribute attr[1]; \
attr[0].id = hipLaunchAttributeCooperative; \
attr[0].val.cooperative = 1; \
cfg.attrs = attr; \
cfg.numAttrs = 1
#endif
#ifndef LAUNCH_KERNEL
#define LAUNCH_KERNEL(config, kernel, ...) \
CUDA_CHECK(hipLaunchKernelEx(config, kernel, ##__VA_ARGS__))
#endif
#else
#ifndef SETUP_LAUNCH_CONFIG
#ifndef DISABLE_SM90_FEATURES
#define SETUP_LAUNCH_CONFIG(num_sms, num_threads, stream) \
@@ -55,7 +40,6 @@
} while (0)
#endif
#endif
#endif

#ifndef SET_SHARED_MEMORY_FOR_TMA
#ifndef DISABLE_SM90_FEATURES
3 changes: 1 addition & 2 deletions ep/include/ep_utils.cuh
@@ -5,7 +5,6 @@

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#include "amd_nanosleep.cuh"
#define __syncwarp() __builtin_amdgcn_wave_barrier()
#ifndef clock64
#define clock64 wall_clock64
#endif
@@ -906,7 +905,7 @@ __device__ __forceinline__ void st_relaxed_sys_global(int const* ptr, int val) {
__device__ __forceinline__ int ld_acquire_cta(int const* ptr) {
int ret;
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
HIP_ATOMIC_LOAD(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP);
ret = HIP_ATOMIC_LOAD(ptr, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_WORKGROUP);
#else
asm volatile("ld.acquire.cta.s32 %0, [%1];" : "=r"(ret) : "l"(ptr));
#endif
45 changes: 37 additions & 8 deletions ep/setup.py
@@ -2,12 +2,12 @@
import subprocess
import setuptools
from glob import glob
import torch
import shutil
import site

import re
from pathlib import Path

import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
from setuptools.command.install import install

@@ -156,11 +156,37 @@ def run(self):
if float(default_arch) >= 9.0:
nvcc_flags.extend(["--ptxas-options=--register-usage-level=10"])

os.environ["TORCH_CUDA_ARCH_LIST"] = os.getenv(
"TORCH_CUDA_ARCH_LIST", default_arch
)
device_arch = os.environ["TORCH_CUDA_ARCH_LIST"]
# Set architecture environment variable before creating CUDAExtension
device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", default_arch)
os.environ["TORCH_CUDA_ARCH_LIST"] = device_arch
else:
gpu_archs = os.getenv("TORCH_CUDA_ARCH_LIST", None)
if gpu_archs is None or gpu_archs.strip() == "":
# Detect GPU architecture on AMD
GPU_ARCH_PATTERN = re.compile(r"Name:\s*(gfx\d+\w*)")
try:
result = subprocess.run(
["rocminfo"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
check=True,
)
except Exception as e:
raise RuntimeError(f"rocminfo failed: {e}")

matches = set(GPU_ARCH_PATTERN.findall(result.stdout))

if not matches:
raise RuntimeError("No gfx architecture found in rocminfo output.")
arch_list = list(matches)

else:
gpu_archs = gpu_archs.split(",")
Copilot AI commented on Dec 1, 2025:
The variable 'arch_list' is only defined inside the 'if not matches' block (line 182) but is used here unconditionally. If 'gpu_archs' is provided via environment variable, 'arch_list' will be undefined, causing a NameError. The logic should use 'gpu_archs' when set, or define 'arch_list' in both branches.
Suggested change:
- gpu_archs = gpu_archs.split(",")
+ arch_list = gpu_archs.split(",")
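For reference, a minimal sketch of the fix this comment describes, where both branches populate the same variable so the env-var path no longer raises a NameError. The helper name detect_rocm_arch_list and the usage line at the end are illustrative assumptions, not code from this PR.

import os
import re
import subprocess

def detect_rocm_arch_list():
    # Hypothetical helper: both branches assign the same `arch_list` name.
    gpu_archs = os.getenv("TORCH_CUDA_ARCH_LIST", "")
    if gpu_archs.strip():
        # Environment override takes priority; split the comma-separated list.
        arch_list = [a.strip() for a in gpu_archs.split(",") if a.strip()]
    else:
        # Otherwise probe rocminfo for gfx targets, mirroring the diff above.
        result = subprocess.run(
            ["rocminfo"], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            text=True, check=True,
        )
        arch_list = sorted(set(re.findall(r"Name:\s*(gfx\d+\w*)", result.stdout)))
        if not arch_list:
            raise RuntimeError("No gfx architecture found in rocminfo output.")
    return arch_list

# Usage sketch: nvcc_flags += [f"--offload-arch={a.lower()}" for a in detect_rocm_arch_list()]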

for arch in arch_list:
nvcc_flags.append(f"--offload-arch={arch.lower()}")

# Disable SM90 features on AMD
cxx_flags.append("-DDISABLE_SM90_FEATURES")
nvcc_flags.append("-DDISABLE_SM90_FEATURES")
@@ -169,8 +195,11 @@ def run(self):
cxx_flags.append("-DDISABLE_AGGRESSIVE_ATOMIC")
nvcc_flags.append("-DDISABLE_AGGRESSIVE_ATOMIC")

device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", "gfx942")
os.environ["PYTORCH_ROCM_ARCH"] = device_arch
cxx_flags.append("-DUSE_GRACE_HOPPER")
nvcc_flags.append("-DUSE_GRACE_HOPPER")
Comment on lines +198 to +199
Copilot AI commented on Dec 1, 2025:
The USE_GRACE_HOPPER flag is being added in the AMD/ROCm code path, but Grace Hopper is an NVIDIA architecture. This appears to be a copy-paste error and should likely be removed from the AMD-specific section.
Suggested change:
- cxx_flags.append("-DUSE_GRACE_HOPPER")
- nvcc_flags.append("-DUSE_GRACE_HOPPER")
+ # Removed erroneous Grace Hopper flag for AMD/ROCm
+ # (No action needed)
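A short sketch, assuming the fix is simply to drop the NVIDIA-only define from the ROCm branch; flag names other than those visible in the diff are not asserted here.

cxx_flags, nvcc_flags = [], []

# ROCm path keeps the workaround defines shown in the diff above,
# but omits USE_GRACE_HOPPER, which targets an NVIDIA architecture.
for flags in (cxx_flags, nvcc_flags):
    flags.append("-DDISABLE_SM90_FEATURES")
    flags.append("-DDISABLE_AGGRESSIVE_ATOMIC")
    # "-DUSE_GRACE_HOPPER" intentionally not appended here.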

# Get device architecture (already set at top of file)
device_arch = os.getenv("PYTORCH_ROCM_ARCH", "gfx942")

# Disable LD/ST tricks, as some CUDA version does not support `.L1::no_allocate`
# Only enable aggressive PTX instructions for SM 9.0+ (H100/H800/B200)
106 changes: 103 additions & 3 deletions ep/src/internode.cu
@@ -582,7 +582,12 @@ __global__ void __launch_bounds__(
// RDMA sender warp synchronization
// NOTES: `rdma_send_channel_tail` means the latest released tail
// NOTES: `rdma_send_channel_window` means the ongoing 32 transactions' status
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
__shared__ int volatile rdma_send_next_token_idx;
__shared__ int volatile rdma_send_channel_next_tail[kNumRDMARanks];
#else
__shared__ int rdma_send_channel_lock[kNumRDMARanks];
#endif
__shared__ int rdma_send_channel_tail[kNumRDMARanks];
__shared__ uint32_t rdma_send_channel_window[kNumRDMARanks];

@@ -629,6 +634,12 @@ __global__ void __launch_bounds__(
get_channel_task_range(num_tokens, num_channels, channel_id,
token_start_idx, token_end_idx);

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
(warp_id == 0 and lane_id == 0)
? (rdma_send_next_token_idx = token_start_idx)
: 0;
#endif

// Send number of tokens in this channel by `-value - 1`
EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS * 2 + 2 <= WARP_SIZE,
"Invalid number of NVL peers");
@@ -694,13 +705,67 @@ __global__ void __launch_bounds__(
auto send_buffer = lane_id == rdma_rank
? rdma_channel_data.recv_buffer(lane_id)
: rdma_channel_data.send_buffer(lane_id);

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
// NOTE: sequential lock works for amd.
int last_rdma_tail_idx = -1;
for (token_idx = token_start_idx + warp_id; token_idx < token_end_idx;
token_idx += kNumDispatchRDMASenderWarps) {
// Read RDMA rank existence
uint64_t is_token_in_rank_uint64 = 0;
if (lane_id < kNumRDMARanks) {
is_token_in_rank_uint64 = __ldg(reinterpret_cast<uint64_t const*>(
is_token_in_rank + token_idx * num_ranks +
lane_id * NUM_MAX_NVL_PEERS));
}

// Acquire sequential lock
while (lane_id == 0 and rdma_send_next_token_idx != token_idx)
;
__syncwarp();

// Acquire next tail
int rdma_tail_idx = -1;
auto start_time = clock64();
if (is_token_in_rank_uint64 != 0) {
rdma_tail_idx = rdma_send_channel_next_tail[lane_id]++;
// Wait the remote buffer to be released
while (rdma_tail_idx - cached_rdma_channel_head >=
num_max_rdma_chunked_recv_tokens) {
cached_rdma_channel_head = static_cast<int>(
ld_acquire_sys_global(rdma_channel_head.buffer(lane_id)));

// Timeout check
if (clock64() - start_time >= NUM_TIMEOUT_CYCLES) {
printf(
"DeepEP dispatch RDMA sender timeout, channel: %d, RDMA: %d, "
"nvl: %d, dst RDMA lane: %d, head: %d, tail: %d\n",
channel_id, rdma_rank, nvl_rank, lane_id,
cached_rdma_channel_head, rdma_tail_idx);
trap();
}
}
}
__syncwarp();

// Update last token tail
if (last_rdma_tail_idx >= 0)
st_release_cta(const_cast<int const*>(rdma_send_channel_tail + lane_id),
last_rdma_tail_idx + 1);
last_rdma_tail_idx = rdma_tail_idx;

// Release sequential lock
lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;

#else
for (token_idx = token_start_idx; token_idx < token_end_idx; ++token_idx) {
// Read RDMA rank existence
uint64_t is_token_in_rank_uint64 = 0;
if (lane_id < kNumRDMARanks) {
is_token_in_rank_uint64 = __ldg(reinterpret_cast<uint64_t const*>(
is_token_in_rank + token_idx * num_ranks +
lane_id * NUM_MAX_NVL_PEERS));

global_rdma_tail_idx += (is_token_in_rank_uint64 != 0);
}
__syncwarp();
@@ -730,6 +795,7 @@ __global__ void __launch_bounds__(
trap();
}
}
#endif
__syncwarp();

// Store RDMA head for combine
@@ -813,6 +879,7 @@ __global__ void __launch_bounds__(
}
__syncwarp();

#if defined(__NVCC__)
// Release the transaction in the window
if (is_token_in_rank_uint64 != 0) {
// Acquire lock first
@@ -841,8 +908,25 @@ __global__ void __launch_bounds__(
// Release lock
release_lock(rdma_send_channel_lock + lane_id);
}
#endif
__syncwarp();
}

#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
// Epilogue
// Acquire sequential lock
while (lane_id == 0 and rdma_send_next_token_idx != token_idx)
;
__syncwarp();

// Update last token tail
if (last_rdma_tail_idx >= 0)
st_release_cta(const_cast<int const*>(rdma_send_channel_tail + lane_id),
last_rdma_tail_idx + 1);

// Release sequential lock
lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;
#endif
} else if (warp_role == WarpRole::kRDMASenderCoordinator) {
// NOTES: in case of splitting, the issued put at the end of the buffer
EP_DEVICE_ASSERT(num_max_rdma_chunked_recv_tokens %
@@ -852,7 +936,11 @@ __global__ void __launch_bounds__(
// Clean shared memory
EP_STATIC_ASSERT(kNumRDMARanks <= WARP_SIZE,
"Invalid number of RDMA ranks");
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
(lane_id < kNumRDMARanks) ? (rdma_send_channel_next_tail[lane_id] = 0) : 0;
#else
(lane_id < kNumRDMARanks) ? (rdma_send_channel_lock[lane_id] = 0) : 0;
#endif
(lane_id < kNumRDMARanks) ? (rdma_send_channel_tail[lane_id] = 0) : 0;
(lane_id < kNumRDMARanks) ? (rdma_send_channel_window[lane_id] = 0) : 0;

@@ -1114,9 +1202,10 @@ __global__ void __launch_bounds__(

// Copy data
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
UNROLLED_WARP_COPY(
5, lane_id, hidden_int4, reinterpret_cast<int4*>(dst_shifted),
reinterpret_cast<int4*>(shifted), ld_nc_global, st_na_global);
UNROLLED_WARP_COPY(5, lane_id, num_bytes_per_token / sizeof(int4),
reinterpret_cast<int4*>(dst_shifted),
reinterpret_cast<int4*>(shifted), ld_nc_global,
st_na_global);
Comment on lines +1205 to +1208
Copilot AI commented on Dec 1, 2025:
[nitpick] The magic number '5' in the UNROLLED_WARP_COPY macro is unclear. Consider replacing it with a named constant (e.g., 'UNROLL_FACTOR' or 'NUM_ITERATIONS') to improve code readability and maintainability.
#else
if (lane_id == 0) {
tma_load_1d(tma_buffer, shifted, tma_mbarrier, num_bytes_per_token,
@@ -1298,6 +1387,12 @@ __global__ void __launch_bounds__(
5, lane_id, hidden_int4,
reinterpret_cast<int4*>(recv_x + recv_token_idx * hidden_int4),
reinterpret_cast<int4*>(shifted), ld_nc_global, st_na_global);
if (scale_aligned)
UNROLLED_WARP_COPY(1, lane_id, num_scales,
recv_x_scales + recv_token_idx * num_scales,
reinterpret_cast<float*>(shifted + hidden_bytes),
ld_nc_global, st_na_global);

#else
if (lane_id == 0) {
tma_load_1d(tma_buffer, shifted, tma_mbarrier, tma_load_bytes);
@@ -1660,7 +1755,12 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx,
bool is_cached_dispatch, bool low_latency_mode,
uint64_t const* d2h_channel_addrs, int num_d2h_channel_addrs,
void* atomic_buffer_ptr) {
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
int const num_threads =
std::max(128, WARP_SIZE * (is_cached_dispatch ? 2 : num_channels));
#else
int const num_threads = std::max(128, WARP_SIZE * num_channels);
#endif
int const num_warps = num_threads / WARP_SIZE;
auto const num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
int const kNumTMABytesPerWarp = 8192;
2 changes: 1 addition & 1 deletion ep/src/internode_ll.cu
@@ -1125,4 +1125,4 @@ void combine(void* combined_x, void* rdma_recv_x, int* rdma_recv_flag,
}

} // namespace internode_ll
} // namespace uccl
} // namespace uccl
4 changes: 0 additions & 4 deletions ep/src/proxy.cpp
@@ -196,9 +196,6 @@ void Proxy::init_common() {
reinterpret_cast<uint32_t*>(static_cast<uint8_t*>(cfg_.gpu_buffer) +
cfg_.total_size - atomic_buf_size);

// printf("[PROXY_INIT] Atomic buffer at %p, size %zu bytes\n",
// ctx_.atomic_old_values_buf, atomic_buf_size);

int num_ranks = ctxs_for_all_ranks_.size();
local_infos_.assign(num_ranks, RDMAConnectionInfo{});
remote_infos_.assign(num_ranks, RDMAConnectionInfo{});
@@ -846,7 +843,6 @@ void Proxy::post_gpu_commands_mixed(
0) {
return;
}

// Handle regular RDMA writes
if (!rdma_wrs.empty()) {
post_rdma_async_batched(ctx_, cfg_.gpu_buffer, rdma_wrs.size(), rdma_wrs,