[EP] debugging amd normal #548
base: main
Changes from all commits
b7c1fb1
05e7df8
04c6a14
946db83
fbda03b
d6f3355
1903181
7324fcf
b136aaa
4d63369
de4abc4
49c0bab
e5b2ef3
a4d9d4b
026a836
b64412d
9cb051f
f19fd62
3a7d093
8c86136
fa1d720
b6451b4
a142d9e
a2d9f26
5d38ecb
33e9bb1
97d905e
4a44245
61eba6f
```diff
@@ -2,12 +2,12 @@
 import subprocess
 import setuptools
 from glob import glob
-import torch
 import shutil
 import site

+import re
+from pathlib import Path
+
+import torch
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 from setuptools.command.install import install
@@ -156,11 +156,37 @@ def run(self):
     if float(default_arch) >= 9.0:
         nvcc_flags.extend(["--ptxas-options=--register-usage-level=10"])

-    os.environ["TORCH_CUDA_ARCH_LIST"] = os.getenv(
-        "TORCH_CUDA_ARCH_LIST", default_arch
-    )
-    device_arch = os.environ["TORCH_CUDA_ARCH_LIST"]
+    # Set architecture environment variable before creating CUDAExtension
+    device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", default_arch)
+    os.environ["TORCH_CUDA_ARCH_LIST"] = device_arch
 else:
+    gpu_archs = os.getenv("TORCH_CUDA_ARCH_LIST", None)
+    if gpu_archs is None or gpu_archs.strip() == "":
+        # Detect GPU architecture on AMD
+        GPU_ARCH_PATTERN = re.compile(r"Name:\s*(gfx\d+\w*)")
+        try:
+            result = subprocess.run(
+                ["rocminfo"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True,
+            )
+        except Exception as e:
+            raise RuntimeError(f"rocminfo failed: {e}")
+
+        matches = set(GPU_ARCH_PATTERN.findall(result.stdout))
+
+        if not matches:
+            raise RuntimeError("No gfx architecture found in rocminfo output.")
+        arch_list = list(matches)
+    else:
+        gpu_archs = gpu_archs.split(",")
+
+    for arch in arch_list:
+        nvcc_flags.append(f"--offload-arch={arch.lower()}")
+
     # Disable SM90 features on AMD
     cxx_flags.append("-DDISABLE_SM90_FEATURES")
     nvcc_flags.append("-DDISABLE_SM90_FEATURES")
@@ -169,8 +195,11 @@ def run(self):
     cxx_flags.append("-DDISABLE_AGGRESSIVE_ATOMIC")
     nvcc_flags.append("-DDISABLE_AGGRESSIVE_ATOMIC")

+    device_arch = os.getenv("TORCH_CUDA_ARCH_LIST", "gfx942")
+    os.environ["PYTORCH_ROCM_ARCH"] = device_arch
     cxx_flags.append("-DUSE_GRACE_HOPPER")
     nvcc_flags.append("-DUSE_GRACE_HOPPER")
```
Comment on lines +198 to +199 — suggested change:

```diff
-cxx_flags.append("-DUSE_GRACE_HOPPER")
-nvcc_flags.append("-DUSE_GRACE_HOPPER")
+# Removed erroneous Grace Hopper flag for AMD/ROCm
+# (No action needed)
```
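As an illustration of the reviewer's point, the flag could also be gated on the build platform instead of being appended unconditionally. This is only a sketch, assuming `cxx_flags` and `nvcc_flags` are the flag lists from the diff above; `torch.version.hip` is `None` on CUDA builds of PyTorch and a version string on ROCm builds.

```python
import torch

cxx_flags = []
nvcc_flags = []

# Hypothetical guard: enable the Grace Hopper define only on CUDA (non-ROCm)
# builds, so the AMD/ROCm path never sees -DUSE_GRACE_HOPPER.
if torch.version.hip is None:
    cxx_flags.append("-DUSE_GRACE_HOPPER")
    nvcc_flags.append("-DUSE_GRACE_HOPPER")
```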
```diff
@@ -582,7 +582,12 @@ __global__ void __launch_bounds__(
   // RDMA sender warp synchronization
   // NOTES: `rdma_send_channel_tail` means the latest released tail
   // NOTES: `rdma_send_channel_window` means the ongoing 32 transactions' status
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  __shared__ int volatile rdma_send_next_token_idx;
+  __shared__ int volatile rdma_send_channel_next_tail[kNumRDMARanks];
+#else
   __shared__ int rdma_send_channel_lock[kNumRDMARanks];
+#endif
   __shared__ int rdma_send_channel_tail[kNumRDMARanks];
   __shared__ uint32_t rdma_send_channel_window[kNumRDMARanks];
@@ -629,6 +634,12 @@ __global__ void __launch_bounds__(
   get_channel_task_range(num_tokens, num_channels, channel_id,
                          token_start_idx, token_end_idx);

+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  (warp_id == 0 and lane_id == 0)
+      ? (rdma_send_next_token_idx = token_start_idx)
+      : 0;
+#endif
+
   // Send number of tokens in this channel by `-value - 1`
   EP_STATIC_ASSERT(NUM_MAX_NVL_PEERS * 2 + 2 <= WARP_SIZE,
                    "Invalid number of NVL peers");
@@ -694,13 +705,67 @@ __global__ void __launch_bounds__(
   auto send_buffer = lane_id == rdma_rank
                          ? rdma_channel_data.recv_buffer(lane_id)
                          : rdma_channel_data.send_buffer(lane_id);

+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  // NOTE: sequential lock works for amd.
+  int last_rdma_tail_idx = -1;
+  for (token_idx = token_start_idx + warp_id; token_idx < token_end_idx;
+       token_idx += kNumDispatchRDMASenderWarps) {
+    // Read RDMA rank existence
+    uint64_t is_token_in_rank_uint64 = 0;
+    if (lane_id < kNumRDMARanks) {
+      is_token_in_rank_uint64 = __ldg(reinterpret_cast<uint64_t const*>(
+          is_token_in_rank + token_idx * num_ranks +
+          lane_id * NUM_MAX_NVL_PEERS));
+    }
+
+    // Acquire sequential lock
+    while (lane_id == 0 and rdma_send_next_token_idx != token_idx)
+      ;
+    __syncwarp();
+
+    // Acquire next tail
+    int rdma_tail_idx = -1;
+    auto start_time = clock64();
+    if (is_token_in_rank_uint64 != 0) {
+      rdma_tail_idx = rdma_send_channel_next_tail[lane_id]++;
+      // Wait the remote buffer to be released
+      while (rdma_tail_idx - cached_rdma_channel_head >=
+             num_max_rdma_chunked_recv_tokens) {
+        cached_rdma_channel_head = static_cast<int>(
+            ld_acquire_sys_global(rdma_channel_head.buffer(lane_id)));
+
+        // Timeout check
+        if (clock64() - start_time >= NUM_TIMEOUT_CYCLES) {
+          printf(
+              "DeepEP dispatch RDMA sender timeout, channel: %d, RDMA: %d, "
+              "nvl: %d, dst RDMA lane: %d, head: %d, tail: %d\n",
+              channel_id, rdma_rank, nvl_rank, lane_id,
+              cached_rdma_channel_head, rdma_tail_idx);
+          trap();
+        }
+      }
+    }
+    __syncwarp();
+
+    // Update last token tail
+    if (last_rdma_tail_idx >= 0)
+      st_release_cta(const_cast<int const*>(rdma_send_channel_tail + lane_id),
+                     last_rdma_tail_idx + 1);
+    last_rdma_tail_idx = rdma_tail_idx;
+
+    // Release sequential lock
+    lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;
+
+#else
   for (token_idx = token_start_idx; token_idx < token_end_idx; ++token_idx) {
     // Read RDMA rank existence
     uint64_t is_token_in_rank_uint64 = 0;
     if (lane_id < kNumRDMARanks) {
       is_token_in_rank_uint64 = __ldg(reinterpret_cast<uint64_t const*>(
           is_token_in_rank + token_idx * num_ranks +
           lane_id * NUM_MAX_NVL_PEERS));

       global_rdma_tail_idx += (is_token_in_rank_uint64 != 0);
     }
     __syncwarp();
@@ -730,6 +795,7 @@ __global__ void __launch_bounds__(
         trap();
       }
     }
+#endif
     __syncwarp();

     // Store RDMA head for combine
@@ -813,6 +879,7 @@ __global__ void __launch_bounds__(
     }
     __syncwarp();

+#if defined(__NVCC__)
     // Release the transaction in the window
     if (is_token_in_rank_uint64 != 0) {
       // Acquire lock first
@@ -841,8 +908,25 @@ __global__ void __launch_bounds__(
       // Release lock
       release_lock(rdma_send_channel_lock + lane_id);
     }
+#endif
     __syncwarp();
   }

+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  // Epilogue
+  // Acquire sequential lock
+  while (lane_id == 0 and rdma_send_next_token_idx != token_idx)
+    ;
+  __syncwarp();
+
+  // Update last token tail
+  if (last_rdma_tail_idx >= 0)
+    st_release_cta(const_cast<int const*>(rdma_send_channel_tail + lane_id),
+                   last_rdma_tail_idx + 1);
+
+  // Release sequential lock
+  lane_id == 0 ? (rdma_send_next_token_idx += 1) : 0;
+#endif
 } else if (warp_role == WarpRole::kRDMASenderCoordinator) {
   // NOTES: in case of splitting, the issued put at the end of the buffer
   EP_DEVICE_ASSERT(num_max_rdma_chunked_recv_tokens %
@@ -852,7 +936,11 @@ __global__ void __launch_bounds__(
   // Clean shared memory
   EP_STATIC_ASSERT(kNumRDMARanks <= WARP_SIZE,
                    "Invalid number of RDMA ranks");
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  (lane_id < kNumRDMARanks) ? (rdma_send_channel_next_tail[lane_id] = 0) : 0;
+#else
   (lane_id < kNumRDMARanks) ? (rdma_send_channel_lock[lane_id] = 0) : 0;
+#endif
   (lane_id < kNumRDMARanks) ? (rdma_send_channel_tail[lane_id] = 0) : 0;
   (lane_id < kNumRDMARanks) ? (rdma_send_channel_window[lane_id] = 0) : 0;
@@ -1114,9 +1202,10 @@ __global__ void __launch_bounds__(
   // Copy data
 #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
-  UNROLLED_WARP_COPY(
-      5, lane_id, hidden_int4, reinterpret_cast<int4*>(dst_shifted),
-      reinterpret_cast<int4*>(shifted), ld_nc_global, st_na_global);
+  UNROLLED_WARP_COPY(5, lane_id, num_bytes_per_token / sizeof(int4),
+                     reinterpret_cast<int4*>(dst_shifted),
+                     reinterpret_cast<int4*>(shifted), ld_nc_global,
+                     st_na_global);
 #else
   if (lane_id == 0) {
     tma_load_1d(tma_buffer, shifted, tma_mbarrier, num_bytes_per_token,
@@ -1298,6 +1387,12 @@ __global__ void __launch_bounds__(
       5, lane_id, hidden_int4,
       reinterpret_cast<int4*>(recv_x + recv_token_idx * hidden_int4),
       reinterpret_cast<int4*>(shifted), ld_nc_global, st_na_global);
+  if (scale_aligned)
+    UNROLLED_WARP_COPY(1, lane_id, num_scales,
+                       recv_x_scales + recv_token_idx * num_scales,
+                       reinterpret_cast<float*>(shifted + hidden_bytes),
+                       ld_nc_global, st_na_global);
+
 #else
   if (lane_id == 0) {
     tma_load_1d(tma_buffer, shifted, tma_mbarrier, tma_load_bytes);
@@ -1660,7 +1755,12 @@ void cached_notify(int hidden_int4, int num_scales, int num_topk_idx,
                    bool is_cached_dispatch, bool low_latency_mode,
                    uint64_t const* d2h_channel_addrs, int num_d2h_channel_addrs,
                    void* atomic_buffer_ptr) {
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
+  int const num_threads =
+      std::max(128, WARP_SIZE * (is_cached_dispatch ? 2 : num_channels));
+#else
   int const num_threads = std::max(128, WARP_SIZE * num_channels);
+#endif
   int const num_warps = num_threads / WARP_SIZE;
   auto const num_rdma_ranks = num_ranks / NUM_MAX_NVL_PEERS;
   int const kNumTMABytesPerWarp = 8192;
```
Review comment on the architecture-detection change in the build script: the variable `arch_list` is only defined in the branch where `gpu_archs` is unset (after the `if not matches` check, around line 182 of the new file), but the `for arch in arch_list:` loop runs unconditionally. If `gpu_archs` is supplied via the `TORCH_CUDA_ARCH_LIST` environment variable, `arch_list` is never defined and the build fails with a NameError. The logic should use `gpu_archs` when it is set, or define `arch_list` in both branches.
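A minimal sketch of the fix the comment describes, reusing the names from the diff (`gpu_archs`, `arch_list`, `nvcc_flags`); the placement and surrounding details are illustrative, not the PR's actual code:

```python
import os
import re
import subprocess

nvcc_flags = []  # assumed to be defined earlier in the build script

gpu_archs = os.getenv("TORCH_CUDA_ARCH_LIST", "")
if gpu_archs.strip():
    # Respect the user-provided list, e.g. "gfx942" or "gfx90a,gfx942".
    arch_list = [a.strip() for a in gpu_archs.split(",") if a.strip()]
else:
    # Fall back to detecting installed GPUs via rocminfo.
    result = subprocess.run(["rocminfo"], capture_output=True, text=True, check=True)
    arch_list = sorted(set(re.findall(r"Name:\s*(gfx\d+\w*)", result.stdout)))
    if not arch_list:
        raise RuntimeError("No gfx architecture found in rocminfo output.")

# arch_list is defined on both paths, so this loop cannot raise a NameError.
for arch in arch_list:
    nvcc_flags.append(f"--offload-arch={arch.lower()}")
```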