Reproduced code:

```python
if rank % 2 == 0:
    # Even ranks: NCCL all-gather on the alternate stream, then DeepEP dispatch on the default stream
    with torch.cuda.stream(test_alt_stream):
        dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
    buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
                                cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
                                async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
if rank % 2 != 0:
    # Odd ranks: only the NCCL all-gather on the alternate stream, so the relative order of
    # NCCL and DeepEP launches differs from the even ranks
    with torch.cuda.stream(test_alt_stream):
        dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)
default_stream.wait_stream(test_alt_stream)
```
@sphish mentioned in #414: 'If each rank executes DeepEP and NCCL communications in the same order, there should be no conflicts.' What is the reason for this restriction, i.e., why is it unsafe for different ranks to issue the NCCL all-gather and the DeepEP dispatch in different relative orders, as in the reproduction above?
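For contrast, here is a minimal sketch of what the quoted restriction appears to require, reusing the variables from the reproduction above (this is my reading of #414, not a confirmed fix): every rank enqueues the NCCL all-gather and the DeepEP dispatch in the same relative order, so the rank-parity branch is removed.

```python
# Assumed-safe ordering per the quote in #414: all ranks issue the NCCL all-gather first,
# then the DeepEP dispatch, so NCCL and DeepEP launches are ordered identically everywhere.
with torch.cuda.stream(test_alt_stream):
    dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)

# Make the default stream wait for the all-gather before the dispatch is enqueued.
default_stream.wait_stream(test_alt_stream)

buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                            use_fp8=dispatch_use_fp8, round_scale=round_scale, use_ue8m0=use_ue8m0,
                            cumulative_local_expert_recv_stats=cumulative_local_expert_recv_stats,
                            async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
```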