Skip to content

Commit d4f408a

Browse files
committed
MTL OFI: MTL_OFI_RETRY_UNTIL_DONE support for Resource overflow
- Added support in MTL_OFI_RETRY_UNTIL_DONE to handle -FI_EAGAIN from the provider and correctly attempt to progress the OFI Completion queue by calling ompi_mtl_ofi_progress. - If events were pending that blocked OFI operations from being enqueued, they will be completed and the OFI operation will be retried once ompi_mtl_ofi_progress has successfully completed. - Updated MTL_OFI_RETRY_UNTIL_DONE to take a RETURN variable instead of requiring the existence of a "ret" variable to pass back the return value from completing the OFI operation. Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
1 parent be3cb01 commit d4f408a

File tree

1 file changed

+28
-18
lines changed

1 file changed

+28
-18
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,6 @@
4040
#include "mtl_ofi_endpoint.h"
4141
#include "mtl_ofi_compat.h"
4242

43-
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC) \
44-
do { \
45-
do { \
46-
ret = FUNC; \
47-
if(OPAL_LIKELY(0 == ret)) {break;} \
48-
} while(-FI_EAGAIN == ret); \
49-
} while(0);
5043

5144
BEGIN_C_DECLS
5245

@@ -134,6 +127,24 @@ ompi_mtl_ofi_progress(void)
134127
return count;
135128
}
136129

130+
/**
131+
* When attempting to execute an OFI operation we need to handle
132+
* resource overrun cases. When a call to an OFI OP fails with -FI_EAGAIN
133+
* the OFI mtl will attempt to progress any pending Completion Queue
134+
* events that may prevent additional operations from being enqueued.
135+
* If the call to ofi progress is successful, then the function call
136+
* will be retried.
137+
*/
138+
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC, RETURN) \
139+
do { \
140+
do { \
141+
RETURN = FUNC; \
142+
if (OPAL_LIKELY(0 == RETURN)) {break;} \
143+
if (OPAL_LIKELY(RETURN == -FI_EAGAIN)) { \
144+
ompi_mtl_ofi_progress(); \
145+
} \
146+
} while (OPAL_LIKELY(-FI_EAGAIN == RETURN)); \
147+
} while (0);
137148

138149
/* MTL interface functions */
139150
int ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl);
@@ -281,7 +292,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
281292
src_addr,
282293
match_bits | ompi_mtl_ofi.sync_send_ack,
283294
0, /* Exact match, no ignore bits */
284-
(void *) &ack_req->ctx));
295+
(void *) &ack_req->ctx), ret);
285296
if (OPAL_UNLIKELY(0 > ret)) {
286297
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
287298
"%s:%d: fi_trecv failed: %s(%zd)",
@@ -302,15 +313,14 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
302313
length,
303314
comm->c_my_rank,
304315
endpoint->peer_fiaddr,
305-
match_bits));
316+
match_bits), ret);
306317
} else {
307318
MTL_OFI_RETRY_UNTIL_DONE(fi_tinject(ompi_mtl_ofi.ep,
308319
start,
309320
length,
310321
endpoint->peer_fiaddr,
311-
match_bits));
322+
match_bits), ret);
312323
}
313-
314324
if (OPAL_UNLIKELY(0 > ret)) {
315325
char *fi_api = ompi_mtl_ofi.fi_cq_data ? "fi_tinjectddata" : "fi_tinject";
316326
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
@@ -334,15 +344,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
334344
comm->c_my_rank,
335345
endpoint->peer_fiaddr,
336346
match_bits,
337-
(void *) &ofi_req->ctx));
347+
(void *) &ofi_req->ctx), ret);
338348
} else {
339349
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
340350
start,
341351
length,
342352
NULL,
343353
endpoint->peer_fiaddr,
344354
match_bits,
345-
(void *) &ofi_req->ctx));
355+
(void *) &ofi_req->ctx), ret);
346356
}
347357
if (OPAL_UNLIKELY(0 > ret)) {
348358
char *fi_api = ompi_mtl_ofi.fi_cq_data ? "fi_tsendddata" : "fi_send";
@@ -517,7 +527,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
517527
tagged_msg.data = 0;
518528

519529
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ep,
520-
&tagged_msg, 0));
530+
&tagged_msg, 0), ret);
521531
if (OPAL_UNLIKELY(0 > ret)) {
522532
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
523533
"%s:%d: fi_tsendmsg failed: %s(%zd)",
@@ -621,7 +631,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
621631
remote_addr,
622632
match_bits,
623633
mask_bits,
624-
(void *)&ofi_req->ctx));
634+
(void *)&ofi_req->ctx), ret);
625635
if (OPAL_UNLIKELY(0 > ret)) {
626636
if (NULL != ofi_req->buffer) {
627637
free(ofi_req->buffer);
@@ -734,7 +744,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
734744
msg.context = (void *)&ofi_req->ctx;
735745
msg.data = 0;
736746

737-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
747+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags), ret);
738748
if (OPAL_UNLIKELY(0 > ret)) {
739749
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
740750
"%s:%d: fi_trecvmsg failed: %s(%zd)",
@@ -833,7 +843,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
833843
ofi_req.completion_count = 1;
834844
ofi_req.match_state = 0;
835845

836-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
846+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags), ret);
837847
if (-FI_ENOMSG == ret) {
838848
/**
839849
* The search request completed but no matching message was found.
@@ -928,7 +938,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
928938
ofi_req->match_state = 0;
929939
ofi_req->mask_bits = mask_bits;
930940

931-
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags));
941+
MTL_OFI_RETRY_UNTIL_DONE(fi_trecvmsg(ompi_mtl_ofi.ep, &msg, msgflags), ret);
932942
if (-FI_ENOMSG == ret) {
933943
/**
934944
* The search request completed but no matching message was found.

0 commit comments

Comments
 (0)