40
40
#include "mtl_ofi_endpoint.h"
41
41
#include "mtl_ofi_compat.h"
42
42
43
- #define MTL_OFI_RETRY_UNTIL_DONE (FUNC ) \
44
- do { \
45
- do { \
46
- ret = FUNC; \
47
- if(OPAL_LIKELY(0 == ret)) {break;} \
48
- } while(-FI_EAGAIN == ret); \
49
- } while(0);
50
43
51
44
BEGIN_C_DECLS
52
45
@@ -134,6 +127,24 @@ ompi_mtl_ofi_progress(void)
134
127
return count ;
135
128
}
136
129
130
/**
 * Issue an OFI operation, retrying for as long as it reports -FI_EAGAIN.
 *
 * When attempting to execute an OFI operation we need to handle
 * resource overrun cases. When a call to an OFI operation fails with
 * -FI_EAGAIN, the OFI MTL progresses any pending Completion Queue
 * events that may be preventing additional operations from being
 * enqueued, and then retries the call. The result of
 * ompi_mtl_ofi_progress() is not inspected: the operation is retried
 * until it returns something other than -FI_EAGAIN, so there is no
 * upper bound on the number of attempts.
 *
 * @param FUNC   Expression performing the OFI call. It is re-evaluated on
 *               every attempt, so it must be safe to evaluate repeatedly.
 * @param RETURN Lvalue that receives the result of the last attempt:
 *               0 on success, otherwise the first value that is not
 *               -FI_EAGAIN.
 *
 * The expansion is a single statement with no trailing semicolon, so an
 * invocation is terminated with ';' like a function call and may be used
 * as the body of an unbraced if/else.
 */
#define MTL_OFI_RETRY_UNTIL_DONE(FUNC, RETURN)                        \
    do {                                                              \
        (RETURN) = (FUNC);                                            \
        if (OPAL_LIKELY(0 == (RETURN))) {                             \
            break; /* success: skip the retry test entirely */        \
        }                                                             \
        if (OPAL_LIKELY(-FI_EAGAIN == (RETURN))) {                    \
            /* Drain completions to free provider resources. */       \
            ompi_mtl_ofi_progress();                                  \
        }                                                             \
    } while (OPAL_LIKELY(-FI_EAGAIN == (RETURN)))
137
148
138
149
/* MTL interface functions */
139
150
int ompi_mtl_ofi_finalize (struct mca_mtl_base_module_t * mtl );
@@ -281,7 +292,7 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
281
292
src_addr ,
282
293
match_bits | ompi_mtl_ofi .sync_send_ack ,
283
294
0 , /* Exact match, no ignore bits */
284
- (void * ) & ack_req -> ctx ));
295
+ (void * ) & ack_req -> ctx ), ret );
285
296
if (OPAL_UNLIKELY (0 > ret )) {
286
297
opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
287
298
"%s:%d: fi_trecv failed: %s(%zd)" ,
@@ -302,15 +313,14 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
302
313
length ,
303
314
comm -> c_my_rank ,
304
315
endpoint -> peer_fiaddr ,
305
- match_bits ));
316
+ match_bits ), ret );
306
317
} else {
307
318
MTL_OFI_RETRY_UNTIL_DONE (fi_tinject (ompi_mtl_ofi .ep ,
308
319
start ,
309
320
length ,
310
321
endpoint -> peer_fiaddr ,
311
- match_bits ));
322
+ match_bits ), ret );
312
323
}
313
-
314
324
if (OPAL_UNLIKELY (0 > ret )) {
315
325
char * fi_api = ompi_mtl_ofi .fi_cq_data ? "fi_tinjectddata" : "fi_tinject" ;
316
326
opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
@@ -334,15 +344,15 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
334
344
comm -> c_my_rank ,
335
345
endpoint -> peer_fiaddr ,
336
346
match_bits ,
337
- (void * ) & ofi_req -> ctx ));
347
+ (void * ) & ofi_req -> ctx ), ret );
338
348
} else {
339
349
MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
340
350
start ,
341
351
length ,
342
352
NULL ,
343
353
endpoint -> peer_fiaddr ,
344
354
match_bits ,
345
- (void * ) & ofi_req -> ctx ));
355
+ (void * ) & ofi_req -> ctx ), ret );
346
356
}
347
357
if (OPAL_UNLIKELY (0 > ret )) {
348
358
char * fi_api = ompi_mtl_ofi .fi_cq_data ? "fi_tsendddata" : "fi_send" ;
@@ -517,7 +527,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
517
527
tagged_msg .data = 0 ;
518
528
519
529
MTL_OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ep ,
520
- & tagged_msg , 0 ));
530
+ & tagged_msg , 0 ), ret );
521
531
if (OPAL_UNLIKELY (0 > ret )) {
522
532
opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
523
533
"%s:%d: fi_tsendmsg failed: %s(%zd)" ,
@@ -621,7 +631,7 @@ ompi_mtl_ofi_irecv(struct mca_mtl_base_module_t *mtl,
621
631
remote_addr ,
622
632
match_bits ,
623
633
mask_bits ,
624
- (void * )& ofi_req -> ctx ));
634
+ (void * )& ofi_req -> ctx ), ret );
625
635
if (OPAL_UNLIKELY (0 > ret )) {
626
636
if (NULL != ofi_req -> buffer ) {
627
637
free (ofi_req -> buffer );
@@ -734,7 +744,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
734
744
msg .context = (void * )& ofi_req -> ctx ;
735
745
msg .data = 0 ;
736
746
737
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
747
+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
738
748
if (OPAL_UNLIKELY (0 > ret )) {
739
749
opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
740
750
"%s:%d: fi_trecvmsg failed: %s(%zd)" ,
@@ -833,7 +843,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
833
843
ofi_req .completion_count = 1 ;
834
844
ofi_req .match_state = 0 ;
835
845
836
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
846
+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
837
847
if (- FI_ENOMSG == ret ) {
838
848
/**
839
849
* The search request completed but no matching message was found.
@@ -928,7 +938,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
928
938
ofi_req -> match_state = 0 ;
929
939
ofi_req -> mask_bits = mask_bits ;
930
940
931
- MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ));
941
+ MTL_OFI_RETRY_UNTIL_DONE (fi_trecvmsg (ompi_mtl_ofi .ep , & msg , msgflags ), ret );
932
942
if (- FI_ENOMSG == ret ) {
933
943
/**
934
944
* The search request completed but no matching message was found.
0 commit comments