Commits (31)
b7c1fb1
EP: port internode_dispatch to amd.
zhenhuang12 Oct 31, 2025
05e7df8
EP: port internode_combine to amd.
zhenhuang12 Nov 3, 2025
04c6a14
ziming fix amd port
MaoZiming Nov 4, 2025
946db83
fixing wr bug
MaoZiming Nov 4, 2025
fbda03b
debugging
MaoZiming Nov 4, 2025
d6f3355
clean
MaoZiming Nov 4, 2025
1903181
Merge branch 'zm-amd-port' of https://github.com/uccl-project/uccl in…
MaoZiming Nov 4, 2025
7324fcf
checkpt
MaoZiming Nov 5, 2025
b136aaa
merge main
MaoZiming Nov 5, 2025
4d63369
adding wr_id_to_wr_ids emplace for normal mode atomics
MaoZiming Nov 5, 2025
de4abc4
EP: fix ep internode
zhenhuang12 Nov 6, 2025
49c0bab
EP: fix RDMAAndNVLForwarder copy data
zhenhuang12 Nov 11, 2025
e5b2ef3
merge main
MaoZiming Nov 11, 2025
a4d9d4b
Merge branch 'main' of https://github.com/uccl-project/uccl into zm-a…
YangZhou1997 Nov 14, 2025
026a836
Merge branch 'main' of https://github.com/uccl-project/uccl into zm-a…
YangZhou1997 Nov 14, 2025
b64412d
debugging
YangZhou1997 Nov 15, 2025
9cb051f
run on nebius
YangZhou1997 Nov 15, 2025
f19fd62
merge with main
YangZhou1997 Nov 24, 2025
3a7d093
fixing setup.py on amd
YangZhou1997 Nov 24, 2025
8c86136
add printf to internode
YangZhou1997 Nov 24, 2025
fa1d720
Merge branch 'main' into yang-amd-normal
zhenhuang12 Nov 26, 2025
b6451b4
debug internode on amd-gpu
zhenhuang12 Nov 26, 2025
a142d9e
trying to debug dispatch kernel hang issues, but fails
YangZhou1997 Nov 28, 2025
a2d9f26
merge
YangZhou1997 Nov 28, 2025
5d38ecb
EP: fix normal dispatch bug
zhenhuang12 Dec 1, 2025
33e9bb1
Merge branch 'main' into yang-amd-normal
zhenhuang12 Dec 1, 2025
97d905e
EP: format code
zhenhuang12 Dec 1, 2025
4a44245
EP: restore internode_ll.cu
zhenhuang12 Dec 1, 2025
61eba6f
EP: format code
zhenhuang12 Dec 1, 2025
ce30d31
Merge branch 'main' into yang-amd-normal
YangZhou1997 Dec 2, 2025
6687cb6
EP: modify as suggested.
zhenhuang12 Dec 2, 2025
58 changes: 25 additions & 33 deletions ep/src/internode.cu
@@ -442,8 +442,6 @@ void notify_dispatch(
   EP_HOST_ASSERT(num_nvl_bytes < std::numeric_limits<int>::max());
 
   // Launch kernel
-  printf("notify dispatch num_sms = %d, num_threads = %d", 1 + num_rdma_ranks,
-         kNumThreads);
   SETUP_LAUNCH_CONFIG(1 + num_rdma_ranks, kNumThreads, stream);
   SWITCH_RDMA_RANKS(NOTIFY_DISPATCH_LAUNCH_CASE);
 #undef NOTIFY_DISPATCH_LAUNCH_CASE
@@ -705,9 +703,7 @@ __global__ void __launch_bounds__(
       __syncwarp();
 
       // Skip the token which does not belong to this warp
-      if ((token_idx - token_start_idx) % kNumDispatchRDMASenderWarps !=
-          warp_id)
-        continue;
+      if ((token_idx - token_start_idx) % 2 != warp_id) continue;
zhenhuang12 (Collaborator) commented on Nov 26, 2025:
I set kNumDispatchRDMASenderWarps = 2 here, and it works! But I'm still investigating. There are two possible causes:

  • a porting error in the RDMA transaction logic
  • the sender warp count of 2 throttling how quickly RDMA commands are committed, which avoids the atomic errors

YangZhou1997 (Member, Author) replied on Nov 28, 2025:
@zhenhuang12 indeed, setting it to 2 makes it work! Nice!

cc @MaoZiming, @CalebZ9909

zhenhuang12 (Collaborator) replied:

@YangZhou1997 The change doesn't explain the root cause of the error, but it gives us directions for troubleshooting. I'm planning to do a more in-depth investigation.

       auto rdma_tail_idx =
           is_token_in_rank_uint64 == 0 ? -1 : global_rdma_tail_idx - 1;
 
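The skip condition in the hunk above round-robins tokens across the RDMA sender warps, so hard-coding the modulus to 2 effectively sets kNumDispatchRDMASenderWarps = 2, the workaround discussed in the review thread. A minimal sketch of the assignment pattern follows; the kernel and parameter names are hypothetical, not the DeepEP implementation:

// Sketch: round-robin token-to-warp assignment, as in the hunk above.
// kNumSenderWarps plays the role of kNumDispatchRDMASenderWarps; every
// other name here is hypothetical. Each warp takes every k-th token, so
// lowering k to 2 reduces how many warps concurrently issue RDMA sends,
// the throttling effect hypothesized in the thread above.
__global__ void send_tokens(int token_start_idx, int token_end_idx,
                            int kNumSenderWarps) {
  int warp_id = threadIdx.x / warpSize;
  if (warp_id >= kNumSenderWarps) return;  // only sender warps participate
  for (int token_idx = token_start_idx; token_idx < token_end_idx;
       ++token_idx) {
    // Skip tokens that belong to other sender warps
    if ((token_idx - token_start_idx) % kNumSenderWarps != warp_id) continue;
    // ... build and commit the RDMA send for token_idx ...
  }
}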
@@ -1148,15 +1144,15 @@ __global__ void __launch_bounds__(
         // Move tail index
         __syncwarp();
         if (lane_id == 0) {
-          /*******************************************************************/
-          printf(
-              "DeepEP dispatch NVL forwarder, channel: %d, RDMA: %d, "
-              "src NVL: %d, dst NVL: %d, head: %d, tail: %d\n",
-              channel_id, rdma_rank, nvl_rank, target_rank,
-              cached_nvl_channel_head, cached_nvl_channel_tail);
+          // /*******************************************************************/
+          // printf(
+          //     "DeepEP dispatch NVL forwarder, channel: %d, RDMA: %d, "
+          //     "src NVL: %d, dst NVL: %d, head: %d, tail: %d\n",
+          //     channel_id, rdma_rank, nvl_rank, target_rank,
+          //     cached_nvl_channel_head, cached_nvl_channel_tail);
           st_release_sys_global(nvl_channel_tail.buffer(),
                                 cached_nvl_channel_tail);
-          /*******************************************************************/
+          // /*******************************************************************/
         }
       }
       // Retired
@@ -1268,19 +1264,19 @@ __global__ void __launch_bounds__(
 
       cached_channel_tail_idx = __shfl_sync(
           WARP_MASK, ld_acquire_sys_global(nvl_channel_tail.buffer()), 0);
-      if (lane_id == 0) {
-        /*******************************************************************/
-        printf(
-            "DeepEP dispatch NVL receiver check, channel: %d, RDMA: %d, src "
-            "NVL: %d, dst NVL: %d, head: %d, tail: %d, "
-            "num_tokens_to_recv_original: %d, "
-            "num_tokens_to_recv: %d\n",
-            channel_id, rdma_rank, target_rank, nvl_rank,
-            ld_acquire_sys_global(nvl_channel_head.buffer()),
-            ld_acquire_sys_global(nvl_channel_tail.buffer()),
-            num_tokens_to_recv_original, num_tokens_to_recv);
-        /*******************************************************************/
-      }
+      // if (lane_id == 0) {
+      //   /*******************************************************************/
+      //   printf(
+      //       "DeepEP dispatch NVL receiver check, channel: %d, RDMA: %d, src "
+      //       "NVL: %d, dst NVL: %d, head: %d, tail: %d, "
+      //       "num_tokens_to_recv_original: %d, "
+      //       "num_tokens_to_recv: %d\n",
+      //       channel_id, rdma_rank, target_rank, nvl_rank,
+      //       ld_acquire_sys_global(nvl_channel_head.buffer()),
+      //       ld_acquire_sys_global(nvl_channel_tail.buffer()),
+      //       num_tokens_to_recv_original, num_tokens_to_recv);
+      //   /*******************************************************************/
+      // }
       // Timeout check
       if (lane_id == 0 and clock64() - start_time > NUM_TIMEOUT_CYCLES) {
         printf(
@@ -1459,8 +1455,6 @@ void dispatch(void* recv_x, float* recv_x_scales, int64_t* recv_topk_idx,
   EP_HOST_ASSERT((topk_idx == nullptr) == (topk_weights == nullptr));
   EP_HOST_ASSERT((recv_topk_idx == nullptr) == (recv_topk_weights == nullptr));
 
-  printf("dispatch num_sms = %d, num_threads = %d", num_channels * 2,
-         (kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NVL_PEERS) * WARP_SIZE);
   SETUP_LAUNCH_CONFIG(
       num_channels * 2,
       (kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NVL_PEERS) * WARP_SIZE,
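One AMD-port subtlety in the launch config above: WARP_SIZE is 32 on NVIDIA GPUs but 64 on AMD CDNA GPUs, so the same warp-count formula yields a different block size. A small host-side sketch of the arithmetic; the constant values are assumptions for illustration (NUM_MAX_NVL_PEERS = 8, num_channels = 10, plus the kNumDispatchRDMASenderWarps = 2 value from the review thread):

#include <cstdio>
#include <initializer_list>

int main() {
  const int num_channels = 10;                // assumption
  const int kNumDispatchRDMASenderWarps = 2;  // value from the review thread
  const int NUM_MAX_NVL_PEERS = 8;            // assumption
  for (int warp_size : {32, 64}) {            // NVIDIA vs. AMD CDNA
    int num_blocks = num_channels * 2;        // two blocks per channel
    int num_threads =
        (kNumDispatchRDMASenderWarps + 1 + NUM_MAX_NVL_PEERS) * warp_size;
    std::printf("warp_size=%d -> grid=%d, block=%d\n", warp_size, num_blocks,
                num_threads);  // 352 threads on NVIDIA, 704 on AMD
  }
  return 0;
}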
@@ -1721,8 +1715,6 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx,
           ? cached_notify<true, kNumTMABytesPerWarp>
           : cached_notify<false, kNumTMABytesPerWarp>;
 
-  printf("cached_notify num_sms = %d, num_threads = %d", num_channels * 2,
-         num_threads);
   SETUP_LAUNCH_CONFIG(num_channels * 2, num_threads, stream);
   SET_SHARED_MEMORY_FOR_TMA(cached_notify_func);
   LAUNCH_KERNEL(&cfg, cached_notify_func, rdma_clean_meta.first,
@@ -2083,7 +2075,7 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * WARP_SIZE, 1)
             channel_id, rdma_rank, nvl_rank, dst_nvl_rank, lane_id,
             ld_volatile_global(nvl_channel_head.buffer() + lane_id),
             cached_channel_tail_idx, token_start_idx, token_end_idx);
-        trap();
+        // trap();
       }
     }
 
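The trap() calls commented out in this and the following hunks belong to a timeout watchdog: a spin loop compares clock64() against a cycle budget, prints diagnostics on timeout, and aborts the kernel. With trap() disabled, a timeout only prints and the loop keeps spinning, which keeps all ranks alive so a hang can be inspected. A minimal sketch of the pattern, with hypothetical names; __trap() is the CUDA intrinsic the project's trap() presumably wraps:

// Sketch: spin-wait with a cycle-budget watchdog.
__device__ int wait_for_tail(volatile int const* tail_ptr, int expected,
                             long long timeout_cycles) {
  long long start_time = clock64();
  int tail;
  while ((tail = *tail_ptr) < expected) {
    if (clock64() - start_time > timeout_cycles) {
      printf("timeout: tail=%d, expect=%d\n", tail, expected);
      __trap();  // abort the kernel; comment out to keep ranks alive
    }
  }
  return tail;
}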
@@ -2335,7 +2327,7 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * WARP_SIZE, 1)
             channel_id, rdma_rank, nvl_rank, dst_rdma_rank,
             ld_acquire_sys_global(rdma_channel_head.buffer(dst_rdma_rank)),
             token_start_idx, num_chunked_tokens);
-        trap();
+        // trap();
       }
     }
     sync_large_warp();
@@ -2372,7 +2364,7 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * WARP_SIZE, 1)
             channel_id, rdma_rank, nvl_rank, lane_id, dst_rdma_rank,
             cached_nvl_channel_tail_idx, token_idx, num_tokens_to_combine,
             sub_warp_id, kNumWarpsPerForwarder, expected_head);
-        trap();
+        // trap();
       }
     }
 
@@ -2516,7 +2508,7 @@ __global__ void __launch_bounds__((kNumForwarders + 1) * WARP_SIZE, 1)
             "nvl: %d, src RDMA: %d, tail: %d, waiting: %ld, expect: %d\n",
             channel_id, rdma_rank, nvl_rank, lane_id,
             cached_channel_tail_idx, token_idx, expected_head);
-        trap();
+        // trap();
       }
     }
     __syncwarp();