@@ -107,6 +107,8 @@ static bool is_max_write_inline_size_initialized = false;
107107/* CPU cache line size */
108108static ssize_t cpu_cache_line_size ;
109109
110+ static bool early_completion = false;
111+
110112/* Function prototypes */
111113static int send_progress (nccl_net_ofi_rdma_req_t * req );
112114
@@ -935,6 +937,7 @@ static inline int update_send_data_from_remote(nccl_net_ofi_rdma_send_comm_t *s_
935937 send_data -> wdata =
936938 GET_RDMA_WRITE_IMM_DATA (s_comm -> remote_comm_id , req -> msg_seq_num , send_data -> schedule -> num_xfer_infos );
937939
940+ send_data -> no_target_completion = (ctrl_msg -> type == NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION );
938941 return 0 ;
939942}
940943
@@ -1335,6 +1338,8 @@ static inline int handle_rx_buff_recv(nccl_net_ofi_rdma_device_t *device, int ra
13351338 goto exit ;
13361339 }
13371340 break ;
1341+ case NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION :
1342+ /* fall through to NCCL_OFI_RDMA_MSG_CTRL case */
13381343 case NCCL_OFI_RDMA_MSG_CTRL :
13391344 /* CTRL receive completion */
13401345 assert (cq_entry -> len == nccl_net_ofi_rdma_ctrl_msg_size (ep -> num_rails , ep -> use_long_rkeys ));
@@ -3205,7 +3210,8 @@ static inline int insert_send_ctrl_req(
32053210 int dev_id , uint16_t msg_seq_num , void * buff ,
32063211 size_t size ,
32073212 nccl_net_ofi_rdma_mr_handle_t * buff_mr_handle ,
3208- nccl_net_ofi_rdma_req_t * recv_req )
3213+ nccl_net_ofi_rdma_req_t * recv_req ,
3214+ bool recv_completion_optional )
32093215{
32103216 nccl_net_ofi_scheduler_t * scheduler = device -> scheduler ;
32113217 nccl_net_ofi_rdma_ep_t * ep = (nccl_net_ofi_rdma_ep_t * )r_comm -> base .base .ep ;
@@ -3267,7 +3273,8 @@ static inline int insert_send_ctrl_req(
32673273
32683274 nccl_net_ofi_rdma_ctrl_msg_t * ctrl_msg = rdma_send_ctrl_get_msg (send_ctrl_data );
32693275
3270- ctrl_msg -> type = NCCL_OFI_RDMA_MSG_CTRL ;
3276+ /* If early completion is turned on, CTRL msg type will be NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION to influence send() behavior */
3277+ ctrl_msg -> type = recv_completion_optional ? NCCL_OFI_RDMA_MSG_CTRL_NO_COMPLETION : NCCL_OFI_RDMA_MSG_CTRL ;
32713278 ctrl_msg -> remote_comm_id = r_comm -> remote_comm_id ;
32723279 ctrl_msg -> msg_seq_num = msg_seq_num ;
32733280 ctrl_msg -> buff_addr = (uint64_t )buff ;
@@ -3343,7 +3350,8 @@ static inline int allocate_rdma_recv_req(
33433350 int dev_id , uint16_t msg_seq_num , void * buff ,
33443351 size_t size ,
33453352 nccl_net_ofi_rdma_mr_handle_t * buff_mr_handle ,
3346- nccl_net_ofi_rdma_req_t * * ret_req )
3353+ nccl_net_ofi_rdma_req_t * * ret_req ,
3354+ bool recv_completion_optional )
33473355{
33483356 int ret = 0 ;
33493357 rdma_req_recv_data_t * recv_data ;
@@ -3364,14 +3372,15 @@ static inline int allocate_rdma_recv_req(
33643372 req -> msg_seq_num = msg_seq_num ;
33653373
33663374 recv_data = get_recv_data (req );
3367- recv_data -> total_num_compls = 2 ;
3375+ /* In the case of early completion, only expect the completion for control msg itself */
3376+ recv_data -> total_num_compls = recv_completion_optional ? 1 : 2 ;
33683377 recv_data -> eager_copy_req = NULL ;
33693378 recv_data -> dst_buff = buff ;
33703379 recv_data -> dst_len = size ;
33713380 recv_data -> dest_mr_handle = buff_mr_handle ;
33723381
33733382 /* TODO consolidate arguments to insert_send_ctrl_req and insert_recv_segms_req */
3374- ret = insert_send_ctrl_req (r_comm , device , dev_id , msg_seq_num , buff , size , buff_mr_handle , req );
3383+ ret = insert_send_ctrl_req (r_comm , device , dev_id , msg_seq_num , buff , size , buff_mr_handle , req , recv_completion_optional );
33753384 if (ret ) {
33763385 NCCL_OFI_WARN ("Failed to insert send ctrl request into recv request" );
33773386 return ret ;
@@ -3469,9 +3478,14 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
34693478 uint16_t msg_seq_num = 0 ;
34703479 bool eager = false;
34713480 int i ;
3481+ bool recv_completion_optional = false;
34723482
34733483 assert (r_comm != NULL );
34743484
3485+ if (early_completion && * base_req == (void * )NCCL_NET_OPTIONAL_RECV_COMPLETION ) {
3486+ recv_completion_optional = true;
3487+ }
3488+
34753489 if (r_comm -> comm_active == false) {
34763490 NCCL_OFI_WARN ("Called irecv on inactive communicator" );
34773491 ret = - EINVAL ;
@@ -3562,7 +3576,7 @@ static int recv(nccl_net_ofi_recv_comm_t *recv_comm, int n, void **buffers,
35623576
35633577 ret = allocate_rdma_recv_req (r_comm , device , dev_id , msg_seq_num ,
35643578 buffers [0 ], sizes [0 ],
3565- mr_handles [0 ], & req );
3579+ mr_handles [0 ], & req , recv_completion_optional );
35663580 if (ret != 0 ) {
35673581 goto error ;
35683582 }
@@ -5348,7 +5362,8 @@ static int post_rma_write(nccl_net_ofi_rdma_req_t *req)
53485362
53495363static int post_rdma_write (nccl_net_ofi_rdma_req_t * req ,
53505364 nccl_net_ofi_rdma_send_comm_rail_t * comm_rail ,
5351- nccl_net_ofi_xfer_info_t * xfer_info )
5365+ nccl_net_ofi_xfer_info_t * xfer_info ,
5366+ bool no_target_completion )
53525367{
53535368 rdma_req_send_data_t * send_data = get_send_data (req );
53545369 assert (xfer_info -> rail_id < send_data -> buff_mr_handle -> num_rails );
@@ -5358,12 +5373,19 @@ static int post_rdma_write(nccl_net_ofi_rdma_req_t *req,
53585373
53595374 ssize_t rc ;
53605375 /* Post RDMA write */
5361- rc = fi_writedata (comm_rail -> local_ep , (void * )((uintptr_t )send_data -> buff + xfer_info -> offset ),
5362- xfer_info -> msg_size , desc , send_data -> wdata ,
5363- comm_rail -> remote_addr ,
5364- send_data -> remote_buff + xfer_info -> offset ,
5365- send_data -> remote_mr_key [rail_id ], (void * )& req -> ctx [rail_id ]);
5366-
5376+ if (no_target_completion ) {
5377+ rc = fi_write (comm_rail -> local_ep , (void * )((uintptr_t )send_data -> buff + xfer_info -> offset ),
5378+ xfer_info -> msg_size , desc ,
5379+ comm_rail -> remote_addr ,
5380+ send_data -> remote_buff + xfer_info -> offset ,
5381+ send_data -> remote_mr_key [rail_id ], (void * )& req -> ctx [rail_id ]);
5382+ } else {
5383+ rc = fi_writedata (comm_rail -> local_ep , (void * )((uintptr_t )send_data -> buff + xfer_info -> offset ),
5384+ xfer_info -> msg_size , desc , send_data -> wdata ,
5385+ comm_rail -> remote_addr ,
5386+ send_data -> remote_buff + xfer_info -> offset ,
5387+ send_data -> remote_mr_key [rail_id ], (void * )& req -> ctx [rail_id ]);
5388+ }
53675389 if ((rc != 0 ) && (rc != - FI_EAGAIN )) {
53685390 NCCL_OFI_WARN ("fi_writedata failed; RC: %zd, Error: %s" ,
53695391 rc , fi_strerror (- rc ));
@@ -5492,7 +5514,7 @@ static int send_progress(nccl_net_ofi_rdma_req_t *req)
54925514 nccl_net_ofi_rdma_send_comm_rail_t * comm_rail =
54935515 rdma_send_comm_get_rail (s_comm , xfer_info -> rail_id );
54945516
5495- ret = post_rdma_write (req , comm_rail , xfer_info );
5517+ ret = post_rdma_write (req , comm_rail , xfer_info , send_data -> no_target_completion );
54965518
54975519 if (ret == 0 ) // Successfully sent the xfer with this rail
54985520 send_data -> xferred_rail_id ++ ;
@@ -7999,6 +8021,47 @@ int nccl_net_ofi_rdma_init(const char *provider_filter,
79998021 goto error ;
80008022 }
80018023
8024+ /*
8025+ * NCCL Net v9 API Optimization for LL/LL128 Protocols
8026+ *
8027+ * Background:
8028+ * When using LL (Low Latency) or LL128 protocols, NCCL sets the request pointer
8029+ * to NCCL_NET_OPTIONAL_RECV_COMPLETION in irecv() calls. This indicates that
8030+ * the plugin can complete a receiver request early without the plugin explicitly
8031+ * polling the CQ to validate data arrival. This is achievable because NCCL itself,
8032+ * following LL protocol semantics, validates data arrival by checking the flag bytes.
8033+ *
8034+ * Plugin Optimization Details:
8035+ * 1. Receiver Side:
8036+ * - Marks request completion immediately after CTRL message send completion
8037+ * - Does not wait for RDMA write operation completion
8038+ *
8039+ * 2. Sender Side:
8040+ * - Uses fi_write instead of fi_writedata, to eliminate unnecessary CQ entries on RX side
8041+ *
8042+ * Requirements:
8043+ * - Eager msg mode is disabled: eager_max_size == -1
8044+ * - Provider must use FI_PROGRESS_AUTO data progress model
8045+ */
8046+ if (ofi_nccl_early_completion () < 0 ) {
8047+ early_completion = data_progress_auto ;
8048+ } else if (ofi_nccl_early_completion () == 0 ) {
8049+ early_completion = false;
8050+ } else {
8051+ if (!data_progress_auto ) {
8052+ NCCL_OFI_WARN ("Failed configuration of EARLY_COMPLETION due to provider data progress model is not FI_PROGRESS_AUTO" );
8053+ ret = - ENOTSUP ;
8054+ goto error ;
8055+ }
8056+ early_completion = true;
8057+ }
8058+
8059+ if (early_completion && ofi_nccl_eager_max_size () != -1 ) {
8060+ NCCL_OFI_WARN ("Conflicted configuration of EARLY_COMPLETION and EAGER_MAX_SIZE" );
8061+ ret = - ENOTSUP ;
8062+ goto error ;
8063+ }
8064+
80028065 /* Create NCCL OFI topology */
80038066 topo = nccl_ofi_topo_create (provider_list );
80048067 if (!topo ) {
0 commit comments