Skip to content

Commit a97f82b

Browse files
committed
rdma: Support early completion of recv() requests
Plugin Optimization Details: This change enables early completion by default when the data progress model is FI_PROGRESS_AUTO. Receiver Side: - Marks request completion immediately after CTRL message send completion - Does not wait for RDMA write operation completion Sender Side: - Uses fi_write instead of fi_writedata, to eliminate unnecessary CQ entries on RX side Requirements: - Eager msg mode is disabled: OFI_NCCL_EAGER_MAX_SIZE == -1. (With the plugin version at the time of this PR, eager mode is disabled by default.) - Provider must use the FI_PROGRESS_AUTO data progress model
1 parent 0c463da commit a97f82b

File tree

4 files changed

+92
-22
lines changed

4 files changed

+92
-22
lines changed

include/nccl_ofi_param.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,14 @@ OFI_NCCL_PARAM_INT(use_low_lat_tc, "USE_LOW_LATENCY_TC", 1);
396396
*/
397397
OFI_NCCL_PARAM_INT(force_num_rails, "FORCE_NUM_RAILS", 0);
398398

399+
/*
400+
* 1 to enable early completion, 0 to disable it.
401+
* Default at -1 to follow the data progress model, given that
402+
* early completion feature is contingent on FI_PROGRESS_AUTO data progress model
403+
* i.e. enabled when FI_PROGRESS_AUTO, otherwise disabled
404+
*/
405+
OFI_NCCL_PARAM_INT(early_completion, "EARLY_COMPLETION", -1);
406+
399407
#ifdef __cplusplus
400408
} // End extern "C"
401409
#endif

include/nccl_ofi_rdma.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ enum nccl_ofi_rdma_msg_type {
104104
NCCL_OFI_RDMA_MSG_CTRL,
105105
NCCL_OFI_RDMA_MSG_EAGER,
106106
NCCL_OFI_RDMA_MSG_CLOSE,
107+
NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION,
107108
NCCL_OFI_RDMA_MSG_INVALID = 15,
108109
NCCL_OFI_RDMA_MSG_MAX = NCCL_OFI_RDMA_MSG_INVALID,
109110
};
@@ -260,6 +261,12 @@ typedef struct {
260261
/* Total number of completions. Expect one completion for receiving the
261262
* control message and one completion for each send segment. */
262263
int total_num_compls;
264+
/*
265+
* Flag to indicate target side early completion, so that sender side
266+
* uses the corresponding RMA write operation.
267+
* True to use fi_write instead of fi_writedata in send()
268+
*/
269+
bool no_target_completion;
263270
#if HAVE_NVTX_TRACING
264271
nvtxRangeId_t trace_id;
265272
nvtxRangeId_t seg_trace_id[MAX_NUM_RAILS];

src/nccl_ofi_api.c

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -789,14 +789,6 @@ ncclResult_t nccl_net_ofi_irecv_v9(void* recvComm, int n, void** data,
789789
return check_return(ncclInvalidArgument);
790790
}
791791

792-
/*
793-
* Reset to NULL for now until optional receive completion logic is
794-
* implemented
795-
*/
796-
if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) {
797-
*request = NULL;
798-
}
799-
800792
ncclResult_t validation_result = msg_length_verify_max_size(sizes, n);
801793
if (validation_result != ncclSuccess) {
802794
return check_return(validation_result);

src/nccl_ofi_rdma.c

Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ static bool is_max_write_inline_size_initialized = false;
107107
/* CPU cache line size */
108108
static ssize_t cpu_cache_line_size;
109109

110+
static bool early_completion = false;
111+
110112
/* Function prototypes */
111113
static int send_progress(nccl_net_ofi_rdma_req_t *req);
112114

@@ -935,6 +937,7 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_
935937
send_data->wdata =
936938
GET_RDMA_WRITE_IMM_DATA(s_comm->remote_comm_id, req->msg_seq_num, send_data->schedule->num_xfer_infos);
937939

940+
send_data->no_target_completion = (ctrl_msg->type == NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION);
938941
return 0;
939942
}
940943

@@ -1335,6 +1338,8 @@ static inline int handle_rx_buff_recv(nccl_net_ofi_rdma_device_t *device, int ra
13351338
goto exit;
13361339
}
13371340
break;
1341+
case NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION:
1342+
/* fall through to NCCL_OFI_RDMA_MSG_CTRL case */
13381343
case NCCL_OFI_RDMA_MSG_CTRL:
13391344
/* CTRL receive completion */
13401345
assert(cq_entry->len == nccl_net_ofi_rdma_ctrl_msg_size(ep->num_rails, ep->use_long_rkeys));
@@ -3205,7 +3210,8 @@ static inline int insert_send_ctrl_req(
32053210
int dev_id, uint16_t msg_seq_num, void *buff,
32063211
size_t size,
32073212
nccl_net_ofi_rdma_mr_handle_t *buff_mr_handle,
3208-
nccl_net_ofi_rdma_req_t *recv_req)
3213+
nccl_net_ofi_rdma_req_t *recv_req,
3214+
bool recv_completion_optional)
32093215
{
32103216
nccl_net_ofi_scheduler_t *scheduler = device->scheduler;
32113217
nccl_net_ofi_rdma_ep_t *ep = (nccl_net_ofi_rdma_ep_t *)r_comm->base.base.ep;
@@ -3267,7 +3273,8 @@ static inline int insert_send_ctrl_req(
32673273

32683274
nccl_net_ofi_rdma_ctrl_msg_t *ctrl_msg = rdma_send_ctrl_get_msg(send_ctrl_data);
32693275

3270-
ctrl_msg->type = NCCL_OFI_RDMA_MSG_CTRL;
3276+
/* If early completion is turned on, CTRL msg type will be NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION to influence send() behavior */
3277+
ctrl_msg->type = recv_completion_optional ? NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION : NCCL_OFI_RDMA_MSG_CTRL;
32713278
ctrl_msg->remote_comm_id = r_comm->remote_comm_id;
32723279
ctrl_msg->msg_seq_num = msg_seq_num;
32733280
ctrl_msg->buff_addr = (uint64_t)buff;
@@ -3343,7 +3350,8 @@ static inline int allocate_rdma_recv_req(
33433350
int dev_id, uint16_t msg_seq_num, void *buff,
33443351
size_t size,
33453352
nccl_net_ofi_rdma_mr_handle_t *buff_mr_handle,
3346-
nccl_net_ofi_rdma_req_t **ret_req)
3353+
nccl_net_ofi_rdma_req_t **ret_req,
3354+
bool recv_completion_optional)
33473355
{
33483356
int ret = 0;
33493357
rdma_req_recv_data_t *recv_data;
@@ -3364,14 +3372,15 @@ static inline int allocate_rdma_recv_req(
33643372
req->msg_seq_num = msg_seq_num;
33653373

33663374
recv_data = get_recv_data(req);
3367-
recv_data->total_num_compls = 2;
3375+
/* In the case of early completion, only expect the completion for control msg itself */
3376+
recv_data->total_num_compls = recv_completion_optional ? 1 : 2;
33683377
recv_data->eager_copy_req = NULL;
33693378
recv_data->dst_buff = buff;
33703379
recv_data->dst_len = size;
33713380
recv_data->dest_mr_handle = buff_mr_handle;
33723381

33733382
/* TODO consolidate arguments to insert_send_ctrl_req and insert_recv_segms_req */
3374-
ret = insert_send_ctrl_req(r_comm, device, dev_id, msg_seq_num, buff, size, buff_mr_handle, req);
3383+
ret = insert_send_ctrl_req(r_comm, device, dev_id, msg_seq_num, buff, size, buff_mr_handle, req, recv_completion_optional);
33753384
if (ret) {
33763385
NCCL_OFI_WARN("Failed to insert send ctrl request into recv request");
33773386
return ret;
@@ -3469,9 +3478,14 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
34693478
uint16_t msg_seq_num = 0;
34703479
bool eager = false;
34713480
int i;
3481+
bool recv_completion_optional = false;
34723482

34733483
assert(r_comm != NULL);
34743484

3485+
if (early_completion && *base_req == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) {
3486+
recv_completion_optional = true;
3487+
}
3488+
34753489
if (r_comm->comm_active == false) {
34763490
NCCL_OFI_WARN("Called irecv on inactive communicator");
34773491
ret = -EINVAL;
@@ -3562,7 +3576,7 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
35623576

35633577
ret = allocate_rdma_recv_req(r_comm, device, dev_id, msg_seq_num,
35643578
buffers[0], sizes[0],
3565-
mr_handles[0], &req);
3579+
mr_handles[0], &req, recv_completion_optional);
35663580
if (ret != 0) {
35673581
goto error;
35683582
}
@@ -5348,7 +5362,8 @@ static int post_rma_write(nccl_net_ofi_rdma_req_t *req)
53485362

53495363
static int post_rdma_write(nccl_net_ofi_rdma_req_t *req,
53505364
nccl_net_ofi_rdma_send_comm_rail_t *comm_rail,
5351-
nccl_net_ofi_xfer_info_t *xfer_info)
5365+
nccl_net_ofi_xfer_info_t *xfer_info,
5366+
bool no_target_completion)
53525367
{
53535368
rdma_req_send_data_t *send_data = get_send_data(req);
53545369
assert(xfer_info->rail_id < send_data->buff_mr_handle->num_rails);
@@ -5358,12 +5373,19 @@ static int post_rdma_write(nccl_net_ofi_rdma_req_t *req,
53585373

53595374
ssize_t rc;
53605375
/* Post RDMA write */
5361-
rc = fi_writedata(comm_rail->local_ep, (void*)((uintptr_t)send_data->buff + xfer_info->offset),
5362-
xfer_info->msg_size, desc, send_data->wdata,
5363-
comm_rail->remote_addr,
5364-
send_data->remote_buff + xfer_info->offset,
5365-
send_data->remote_mr_key[rail_id], (void *)&req->ctx[rail_id]);
5366-
5376+
if (no_target_completion) {
5377+
rc = fi_write(comm_rail->local_ep, (void*)((uintptr_t)send_data->buff + xfer_info->offset),
5378+
xfer_info->msg_size, desc,
5379+
comm_rail->remote_addr,
5380+
send_data->remote_buff + xfer_info->offset,
5381+
send_data->remote_mr_key[rail_id], (void *)&req->ctx[rail_id]);
5382+
} else {
5383+
rc = fi_writedata(comm_rail->local_ep, (void*)((uintptr_t)send_data->buff + xfer_info->offset),
5384+
xfer_info->msg_size, desc, send_data->wdata,
5385+
comm_rail->remote_addr,
5386+
send_data->remote_buff + xfer_info->offset,
5387+
send_data->remote_mr_key[rail_id], (void *)&req->ctx[rail_id]);
5388+
}
53675389
if ((rc != 0) && (rc != -FI_EAGAIN)) {
53685390
NCCL_OFI_WARN("fi_writedata failed; RC: %zd, Error: %s",
53695391
rc, fi_strerror(-rc));
@@ -5492,7 +5514,7 @@ static int send_progress(nccl_net_ofi_rdma_req_t *req)
54925514
nccl_net_ofi_rdma_send_comm_rail_t *comm_rail =
54935515
rdma_send_comm_get_rail(s_comm, xfer_info->rail_id);
54945516

5495-
ret = post_rdma_write(req, comm_rail, xfer_info);
5517+
ret = post_rdma_write(req, comm_rail, xfer_info, send_data->no_target_completion);
54965518

54975519
if (ret == 0) // Successfully sent the xfer with this rail
54985520
send_data->xferred_rail_id++;
@@ -7999,6 +8021,47 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
79998021
goto error;
80008022
}
80018023

8024+
/*
8025+
* NCCL Net v9 API Optimization for LL/LL128 Protocols
8026+
*
8027+
* Background:
8028+
* When using LL (Low Latency) or LL128 protocols, NCCL sets the request pointer
8029+
* to NCCL_NET_OPTIONAL_RECV_COMPLETION in irecv() calls. This indicates that
8030+
* the plugin can complete a receiver request early without plugin explicitly
8031+
* polling the CQ to validate data arrival. This is achievable because NCCL itself
8032+
* following LL protocol semantics will validate data arrival by checking the flag bytes.
8033+
*
8034+
* Plugin Optimization Details:
8035+
* 1. Receiver Side:
8036+
* - Marks request completion immediately after CTRL message send completion
8037+
* - Does not wait for RDMA write operation completion
8038+
*
8039+
* 2. Sender Side:
8040+
* - Uses fi_write instead of fi_writedata, to eliminate unnecessary CQ entries on RX side
8041+
*
8042+
* Requirements:
8043+
* - Eager msg mode is disabled: eager_max_size == -1
8044+
* - Provider must use FI_PROGRESS_AUTO data progress model
8045+
*/
8046+
if (ofi_nccl_early_completion() < 0) {
8047+
early_completion = data_progress_auto;
8048+
} else if (ofi_nccl_early_completion() == 0) {
8049+
early_completion = false;
8050+
} else {
8051+
if (!data_progress_auto) {
8052+
NCCL_OFI_WARN("Failed configuration of EARLY_COMPLETION due to provider data progress model is not FI_PROGRESS_AUTO");
8053+
ret = -ENOTSUP;
8054+
goto error;
8055+
}
8056+
early_completion = true;
8057+
}
8058+
8059+
if (early_completion && ofi_nccl_eager_max_size() != -1) {
8060+
NCCL_OFI_WARN("Conflicted configuration of EARLY_COMPLETION and EAGER_MAX_SIZE");
8061+
ret = -ENOTSUP;
8062+
goto error;
8063+
}
8064+
80028065
/* Create NCCL OFI topology */
80038066
topo = nccl_ofi_topo_create(provider_list);
80048067
if (!topo) {

0 commit comments

Comments
 (0)