@@ -274,8 +274,6 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
274
274
275
275
ofi_req -> completion_count = 2 ;
276
276
277
- MTL_OFI_SET_SYNC_SEND (match_bits );
278
-
279
277
MTL_OFI_RETRY_UNTIL_DONE (fi_trecv (ompi_mtl_ofi .ep ,
280
278
NULL ,
281
279
0 ,
@@ -291,6 +289,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
291
289
free (ack_req );
292
290
return ompi_mtl_ofi_get_error (ret );
293
291
}
292
+ /* The SYNC_SEND tag bit is set for the send operation only.*/
293
+ MTL_OFI_SET_SYNC_SEND (match_bits );
294
294
} else {
295
295
ofi_req -> completion_count = 1 ;
296
296
}
@@ -423,20 +423,6 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
423
423
return ret ;
424
424
}
425
425
426
- /**
427
- * Called when a completion for SYNC ACK send is received.
428
- * This completes the synchronous recv operation. Thus, we
429
- * call the upper layer's completion function.
430
- */
431
- __opal_attribute_always_inline__ static inline int
432
- ompi_mtl_ofi_sync_recv_callback (struct fi_cq_tagged_entry * wc ,
433
- ompi_mtl_ofi_request_t * ofi_req )
434
- {
435
- ofi_req -> super .completion_callback (& ofi_req -> super );
436
-
437
- return OMPI_SUCCESS ;
438
- }
439
-
440
426
/**
441
427
* Called when a completion for a posted recv is received.
442
428
*/
@@ -450,6 +436,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
450
436
mca_mtl_ofi_endpoint_t * endpoint = NULL ;
451
437
int src = mtl_ofi_get_source (wc );
452
438
ompi_status_public_t * status = NULL ;
439
+ struct fi_msg_tagged tagged_msg ;
453
440
454
441
assert (ofi_req -> super .ompi_req );
455
442
status = & ofi_req -> super .ompi_req -> req_status ;
@@ -487,21 +474,25 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
487
474
}
488
475
489
476
/**
490
- * We do not want any SYNC_SEND_ACK here!
491
- * See mtl_ofi_send.c for details.
492
- */
477
+ * We can only accept MTL_OFI_SYNC_SEND in the standard recv callback.
478
+ * MTL_OFI_SYNC_SEND_ACK should only be received in the send_ack
479
+ * callback.
480
+ */
493
481
assert (!MTL_OFI_IS_SYNC_SEND_ACK (wc -> tag ));
494
482
495
483
/**
496
484
* If this recv is part of an MPI_Ssend operation, then we send an
497
- * acknowledgment back to the sender. The fi_context can be
498
- * re-used safely because the previous operation has completed.
499
- * This recv request will complete once we get a completion for
500
- * this send. See ompi_mtl_ofi_sync_recv_callback().
501
- * Otherwise, this request is now complete.
485
+ * acknowledgment back to the sender.
486
+ * The ack message is sent without generating a completion event in
487
+ * the completion queue by not setting FI_COMPLETION in the flags to
488
+ * fi_tsendmsg(FI_SELECTIVE_COMPLETION).
489
+ * This is done since the 0 byte message requires no
490
+ * notification on the send side for a successful completion.
491
+ * If a failure occurs the provider will notify the error
492
+ * in the cq_readerr during OFI progress. Once the message has been
493
+ * successfully processed the request is marked as completed.
502
494
*/
503
495
if (OPAL_UNLIKELY (MTL_OFI_IS_SYNC_SEND (wc -> tag ))) {
504
- ofi_req -> event_callback = ompi_mtl_ofi_sync_recv_callback ;
505
496
/**
506
497
* If the recv request was posted for any source,
507
498
* we need to extract the source's actual address.
@@ -511,23 +502,32 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
511
502
endpoint = ompi_mtl_ofi_get_endpoint (ofi_req -> mtl , ompi_proc );
512
503
ofi_req -> remote_addr = endpoint -> peer_fiaddr ;
513
504
}
514
- MTL_OFI_RETRY_UNTIL_DONE (fi_tsend (ompi_mtl_ofi .ep ,
515
- NULL ,
516
- 0 ,
517
- NULL ,
518
- ofi_req -> remote_addr ,
519
- wc -> tag | ompi_mtl_ofi .sync_send_ack ,
520
- (void * ) & ofi_req -> ctx ));
505
+
506
+ tagged_msg .msg_iov = NULL ;
507
+ tagged_msg .desc = NULL ;
508
+ tagged_msg .iov_count = 0 ;
509
+ tagged_msg .addr = ofi_req -> remote_addr ;
510
+ /**
511
+ * We must continue to use the user's original tag but remove the
512
+ * sync_send protocol tag bit and instead apply the sync_send_ack
513
+ * tag bit to complete the initiator's sync send receive.
514
+ */
515
+ tagged_msg .tag = (wc -> tag | ompi_mtl_ofi .sync_send_ack ) & ~ompi_mtl_ofi .sync_send ;
516
+ tagged_msg .context = NULL ;
517
+ tagged_msg .data = 0 ;
518
+
519
+ MTL_OFI_RETRY_UNTIL_DONE (fi_tsendmsg (ompi_mtl_ofi .ep ,
520
+ & tagged_msg , 0 ));
521
521
if (OPAL_UNLIKELY (0 > ret )) {
522
522
opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
523
- "%s:%d: fi_tsend failed: %s(%zd)" ,
523
+ "%s:%d: fi_tsendmsg failed: %s(%zd)" ,
524
524
__FILE__ , __LINE__ , fi_strerror (- ret ), ret );
525
525
status -> MPI_ERROR = OMPI_ERROR ;
526
526
}
527
- } else {
528
- ofi_req -> super .completion_callback (& ofi_req -> super );
529
527
}
530
528
529
+ ofi_req -> super .completion_callback (& ofi_req -> super );
530
+
531
531
return OMPI_SUCCESS ;
532
532
}
533
533
@@ -701,7 +701,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
701
701
struct fi_msg_tagged msg ;
702
702
int ompi_ret ;
703
703
ssize_t ret ;
704
- uint64_t msgflags = FI_CLAIM ;
704
+ uint64_t msgflags = FI_CLAIM | FI_COMPLETION ;
705
705
706
706
ompi_ret = ompi_mtl_datatype_recv_buf (convertor ,
707
707
& start ,
@@ -791,7 +791,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
791
791
uint64_t match_bits , mask_bits ;
792
792
ssize_t ret ;
793
793
struct fi_msg_tagged msg ;
794
- uint64_t msgflags = FI_PEEK ;
794
+ uint64_t msgflags = FI_PEEK | FI_COMPLETION ;
795
795
796
796
if (ompi_mtl_ofi .fi_cq_data ) {
797
797
/* If the source is known, use its peer_fiaddr. */
@@ -877,7 +877,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
877
877
uint64_t match_bits , mask_bits ;
878
878
ssize_t ret ;
879
879
struct fi_msg_tagged msg ;
880
- uint64_t msgflags = FI_PEEK | FI_CLAIM ;
880
+ uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION ;
881
881
882
882
ofi_req = malloc (sizeof * ofi_req );
883
883
if (NULL == ofi_req ) {
0 commit comments