Skip to content

Commit 9a17864

Browse files
committed
MTL OFI: Redesign sync send with reduced tag bits and quick ack
-Updated the design for sync send MPI calls to use 2 protocol bits for denoting "sync_send" or "sync_send_ack". -"Sync_send" is added to the send tag only and is masked out in receives such that it can be read by the original Recv posted in the send/recv operation. -"Sync_send_ack" is sent from the recv callback to the send side. This 0 byte send does not generate a completion entry and instead sends the message and immediately completes the opal completion in the recv. -Tag formats ofi_tag_1 and ofi_tag_2 have been updated to include 2 more tag bits per format type due to the reduced protocol bits required by OMPI. Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
1 parent c6ee8d2 commit 9a17864

File tree

4 files changed

+72
-69
lines changed

4 files changed

+72
-69
lines changed

ompi/mca/mtl/ofi/README

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ CQ.
2424
OFI TAG:
2525
MPI needs to send 96 bits of information per message (32 bits communicator id,
2626
32 bits source rank, 32 bits MPI tag) but OFI only offers 64 bits tags. In
27-
addition, the OFI MTL uses 4 bits of the OFI tag for the synchronous send protocol.
28-
Therefore, there are only 60 bits available in the OFI tag for message usage. The
27+
addition, the OFI MTL uses 2 bits of the OFI tag for the synchronous send protocol.
28+
Therefore, there are only 62 bits available in the OFI tag for message usage. The
2929
OFI MTL offers the mtl_ofi_tag_mode mca parameter with 4 modes to address this:
3030

3131
"auto" (Default):
@@ -36,19 +36,19 @@ fall back to "ofi_tag_1".
3636

3737
"ofi_tag_1":
3838
For providers that do not support FI_REMOTE_CQ_DATA, the OFI MTL will
39-
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 60
39+
trim the fields (Communicator ID, Source Rank, MPI tag) to make them fit the 62
4040
bits available in the OFI tag. There are two options available with different
4141
number of bits for the Communicator ID and MPI tag fields. This tag distribution
4242
offers: 12 bits for Communicator ID (max Communicator ID 4,095) subject to
43-
provider reserved bits (see mem_tag_format below), 16 bits for Source Rank (max
44-
Source Rank 65,535), 32 bits for MPI tag (max MPI tag is INT_MAX).
43+
provider reserved bits (see mem_tag_format below), 18 bits for Source Rank (max
44+
Source Rank 262,143), 32 bits for MPI tag (max MPI tag is INT_MAX).
4545

4646
"ofi_tag_2":
4747
Same as "ofi_tag_1" but offering a different OFI tag distribution for
4848
applications that may require a greater number of supported Communicators at the
4949
expense of fewer MPI tag bits. This tag distribution offers: 24 bits for
50-
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 16
51-
bits for Source Rank (max Source Rank 65,535), 20 bits for MPI tag (max MPI tag
50+
Communicator ID (max Communicator ID 16,777,215. See mem_tag_format below), 18
51+
bits for Source Rank (max Source Rank 262,143), 20 bits for MPI tag (max MPI tag
5252
524,287).
5353

5454
"ofi_tag_full":

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,6 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
274274

275275
ofi_req->completion_count = 2;
276276

277-
MTL_OFI_SET_SYNC_SEND(match_bits);
278-
279277
MTL_OFI_RETRY_UNTIL_DONE(fi_trecv(ompi_mtl_ofi.ep,
280278
NULL,
281279
0,
@@ -291,6 +289,8 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
291289
free(ack_req);
292290
return ompi_mtl_ofi_get_error(ret);
293291
}
292+
/* The SYNC_SEND tag bit is set for the send operation only.*/
293+
MTL_OFI_SET_SYNC_SEND(match_bits);
294294
} else {
295295
ofi_req->completion_count = 1;
296296
}
@@ -423,20 +423,6 @@ ompi_mtl_ofi_isend(struct mca_mtl_base_module_t *mtl,
423423
return ret;
424424
}
425425

426-
/**
427-
* Called when a completion for SYNC ACK send is received.
428-
* This completes the synchronous recv operation. Thus, we
429-
* call the upper layer's completion function.
430-
*/
431-
__opal_attribute_always_inline__ static inline int
432-
ompi_mtl_ofi_sync_recv_callback(struct fi_cq_tagged_entry *wc,
433-
ompi_mtl_ofi_request_t *ofi_req)
434-
{
435-
ofi_req->super.completion_callback(&ofi_req->super);
436-
437-
return OMPI_SUCCESS;
438-
}
439-
440426
/**
441427
* Called when a completion for a posted recv is received.
442428
*/
@@ -450,6 +436,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
450436
mca_mtl_ofi_endpoint_t *endpoint = NULL;
451437
int src = mtl_ofi_get_source(wc);
452438
ompi_status_public_t *status = NULL;
439+
struct fi_msg_tagged tagged_msg;
453440

454441
assert(ofi_req->super.ompi_req);
455442
status = &ofi_req->super.ompi_req->req_status;
@@ -487,21 +474,25 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
487474
}
488475

489476
/**
490-
* We do not want any SYNC_SEND_ACK here!
491-
* See mtl_ofi_send.c for details.
492-
*/
477+
* We can only accept MTL_OFI_SYNC_SEND in the standard recv callback.
478+
* MTL_OFI_SYNC_SEND_ACK should only be received in the send_ack
479+
* callback.
480+
*/
493481
assert(!MTL_OFI_IS_SYNC_SEND_ACK(wc->tag));
494482

495483
/**
496484
* If this recv is part of an MPI_Ssend operation, then we send an
497-
* acknowledgment back to the sender. The fi_context can be
498-
* re-used safely because the previous operation has completed.
499-
* This recv request will complete once we get a completion for
500-
* this send. See ompi_mtl_ofi_sync_recv_callback().
501-
* Otherwise, this request is now complete.
485+
* acknowledgment back to the sender.
486+
* The ack message is sent without generating a completion event in
487+
* the completion queue by not setting FI_COMPLETION in the flags to
488+
* fi_tsendmsg(FI_SELECTIVE_COMPLETION).
489+
* This is done since the 0 byte message requires no
490+
* notification on the send side for a successful completion.
491+
* If a failure occurs the provider will notify the error
492+
* in the cq_readerr during OFI progress. Once the message has been
493+
* successfully processed the request is marked as completed.
502494
*/
503495
if (OPAL_UNLIKELY(MTL_OFI_IS_SYNC_SEND(wc->tag))) {
504-
ofi_req->event_callback = ompi_mtl_ofi_sync_recv_callback;
505496
/**
506497
* If the recv request was posted for any source,
507498
* we need to extract the source's actual address.
@@ -511,23 +502,32 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
511502
endpoint = ompi_mtl_ofi_get_endpoint(ofi_req->mtl, ompi_proc);
512503
ofi_req->remote_addr = endpoint->peer_fiaddr;
513504
}
514-
MTL_OFI_RETRY_UNTIL_DONE(fi_tsend(ompi_mtl_ofi.ep,
515-
NULL,
516-
0,
517-
NULL,
518-
ofi_req->remote_addr,
519-
wc->tag | ompi_mtl_ofi.sync_send_ack,
520-
(void *) &ofi_req->ctx));
505+
506+
tagged_msg.msg_iov = NULL;
507+
tagged_msg.desc = NULL;
508+
tagged_msg.iov_count = 0;
509+
tagged_msg.addr = ofi_req->remote_addr;
510+
/**
511+
* We must continue to use the user's original tag but remove the
512+
* sync_send protocol tag bit and instead apply the sync_send_ack
513+
* tag bit to complete the initiator's sync send receive.
514+
*/
515+
tagged_msg.tag = (wc->tag | ompi_mtl_ofi.sync_send_ack) & ~ompi_mtl_ofi.sync_send;
516+
tagged_msg.context = NULL;
517+
tagged_msg.data = 0;
518+
519+
MTL_OFI_RETRY_UNTIL_DONE(fi_tsendmsg(ompi_mtl_ofi.ep,
520+
&tagged_msg, 0));
521521
if (OPAL_UNLIKELY(0 > ret)) {
522522
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
523-
"%s:%d: fi_tsend failed: %s(%zd)",
523+
"%s:%d: fi_tsendmsg failed: %s(%zd)",
524524
__FILE__, __LINE__, fi_strerror(-ret), ret);
525525
status->MPI_ERROR = OMPI_ERROR;
526526
}
527-
} else {
528-
ofi_req->super.completion_callback(&ofi_req->super);
529527
}
530528

529+
ofi_req->super.completion_callback(&ofi_req->super);
530+
531531
return OMPI_SUCCESS;
532532
}
533533

@@ -701,7 +701,7 @@ ompi_mtl_ofi_imrecv(struct mca_mtl_base_module_t *mtl,
701701
struct fi_msg_tagged msg;
702702
int ompi_ret;
703703
ssize_t ret;
704-
uint64_t msgflags = FI_CLAIM;
704+
uint64_t msgflags = FI_CLAIM | FI_COMPLETION;
705705

706706
ompi_ret = ompi_mtl_datatype_recv_buf(convertor,
707707
&start,
@@ -791,7 +791,7 @@ ompi_mtl_ofi_iprobe(struct mca_mtl_base_module_t *mtl,
791791
uint64_t match_bits, mask_bits;
792792
ssize_t ret;
793793
struct fi_msg_tagged msg;
794-
uint64_t msgflags = FI_PEEK;
794+
uint64_t msgflags = FI_PEEK | FI_COMPLETION;
795795

796796
if (ompi_mtl_ofi.fi_cq_data) {
797797
/* If the source is known, use its peer_fiaddr. */
@@ -877,7 +877,7 @@ ompi_mtl_ofi_improbe(struct mca_mtl_base_module_t *mtl,
877877
uint64_t match_bits, mask_bits;
878878
ssize_t ret;
879879
struct fi_msg_tagged msg;
880-
uint64_t msgflags = FI_PEEK | FI_CLAIM;
880+
uint64_t msgflags = FI_PEEK | FI_CLAIM | FI_COMPLETION;
881881

882882
ofi_req = malloc(sizeof *ofi_req);
883883
if (NULL == ofi_req) {

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
467467
hints->caps = FI_TAGGED; /* Tag matching interface */
468468
hints->tx_attr->msg_order = FI_ORDER_SAS;
469469
hints->rx_attr->msg_order = FI_ORDER_SAS;
470+
hints->rx_attr->op_flags = FI_COMPLETION;
471+
hints->tx_attr->op_flags = FI_COMPLETION;
470472

471473
hints->domain_attr->threading = FI_THREAD_UNSPEC;
472474

@@ -691,7 +693,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
691693
*/
692694
ret = fi_ep_bind(ompi_mtl_ofi.ep,
693695
(fid_t)ompi_mtl_ofi.cq,
694-
FI_SEND | FI_RECV);
696+
FI_TRANSMIT | FI_RECV | FI_SELECTIVE_COMPLETION);
695697
if (0 != ret) {
696698
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
697699
"%s:%d: fi_bind CQ-EP failed: %s\n",

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,19 @@ typedef struct mca_mtl_ofi_component_t {
8989
*/
9090

9191
/* Support FI_REMOTE_CQ_DATA, send the source rank in the CQ data (4 Bytes is the minimum)
92-
* 01234567 01234567 01234567 0123 4567 01234567 01234567 01234567 01234567
93-
* | |
94-
* context_id |prot| message tag
92+
* 01234567 01234567 01234567 012345 67 01234567 01234567 01234567 01234567
93+
* | |
94+
* context_id |prot| message tag
9595
*/
96-
#define MTL_OFI_PROTO_BIT_COUNT (4)
96+
#define MTL_OFI_PROTO_BIT_COUNT (2)
9797

98-
#define MTL_OFI_CID_BIT_COUNT_DATA (28)
98+
#define MTL_OFI_CID_MASK_DATA (0xFFFFFFFC00000000ULL)
99+
#define MTL_OFI_CID_BIT_COUNT_DATA (30)
99100
#define MTL_OFI_TAG_MASK_DATA (0x00000000FFFFFFFFULL)
100101
#define MTL_OFI_TAG_BIT_COUNT_DATA (32)
101-
#define MTL_OFI_PROTO_MASK_DATA (0x0000000F00000000ULL)
102+
#define MTL_OFI_PROTO_MASK_DATA (0x0000000300000000ULL)
102103
#define MTL_OFI_SYNC_SEND_DATA (0x0000000100000000ULL)
103-
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000900000000ULL)
104+
#define MTL_OFI_SYNC_SEND_ACK_DATA (0x0000000200000000ULL)
104105

105106
/* Send tag with CQ_DATA */
106107
__opal_attribute_always_inline__ static inline uint64_t
@@ -136,38 +137,38 @@ mtl_ofi_create_recv_tag_CQD(uint64_t *match_bits, uint64_t *mask_bits,
136137
/*
137138
* ofi_tag_1: fallback when no FI_REMOTE_CQ_DATA is supported
138139
*
139-
* 01234567 0123 4567 01234567 0123 4567 01234567 01234567 01234567 01234567
140-
* | | |
141-
* Comm id | source |prot| message tag
140+
* 01234567 0123 4567 01234567 012345 67 01234567 01234567 01234567 01234567
141+
* | | |
142+
* Comm id | source |prot| message tag
142143
*/
143144

144145
#define MTL_OFI_CID_BIT_COUNT_1 (12)
145-
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFF000000000ULL)
146-
#define MTL_OFI_SOURCE_BIT_COUNT_1 (16)
147-
#define MTL_OFI_SOURCE_MASK_1 (0x000000000000FFFFULL)
146+
#define MTL_OFI_SOURCE_TAG_MASK_1 (0x000FFFFC00000000ULL)
147+
#define MTL_OFI_SOURCE_BIT_COUNT_1 (18)
148+
#define MTL_OFI_SOURCE_MASK_1 (0x000000000003FFFFULL)
148149
#define MTL_OFI_TAG_MASK_1 (0x00000000FFFFFFFFULL)
149150
#define MTL_OFI_TAG_BIT_COUNT_1 (32)
150-
#define MTL_OFI_PROTO_MASK_1 (0x0000000F00000000ULL)
151+
#define MTL_OFI_PROTO_MASK_1 (0x0000000300000000ULL)
151152
#define MTL_OFI_SYNC_SEND_1 (0x0000000100000000ULL)
152-
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000900000000ULL)
153+
#define MTL_OFI_SYNC_SEND_ACK_1 (0x0000000200000000ULL)
153154

154155
/*
155156
* ofi_tag_2: Alternative tag when no FI_REMOTE_CQ_DATA is supported
156157
*
157-
* 01234567 01234567 01234567 01234567 01234567 0123 4567 01234567 01234567
158-
* | | |
159-
* Comm id | source |prot| message tag
158+
* 01234567 01234567 01234567 01234567 01234567 01 23 4567 01234567 01234567
159+
* | | |
160+
* Comm id | source |prot| message tag
160161
*/
161162

162163
#define MTL_OFI_CID_BIT_COUNT_2 (24)
163-
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFF000000ULL)
164-
#define MTL_OFI_SOURCE_BIT_COUNT_2 (16)
165-
#define MTL_OFI_SOURCE_MASK_2 (0x000000000000FFFFULL)
164+
#define MTL_OFI_SOURCE_TAG_MASK_2 (0x000000FFFFC00000ULL)
165+
#define MTL_OFI_SOURCE_BIT_COUNT_2 (18)
166+
#define MTL_OFI_SOURCE_MASK_2 (0x000000000003FFFFULL)
166167
#define MTL_OFI_TAG_MASK_2 (0x00000000000FFFFFULL)
167168
#define MTL_OFI_TAG_BIT_COUNT_2 (20)
168-
#define MTL_OFI_PROTO_MASK_2 (0x0000000000F00000ULL)
169+
#define MTL_OFI_PROTO_MASK_2 (0x0000000000300000ULL)
169170
#define MTL_OFI_SYNC_SEND_2 (0x0000000000100000ULL)
170-
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000900000ULL)
171+
#define MTL_OFI_SYNC_SEND_ACK_2 (0x0000000000200000ULL)
171172

172173
/* Send tag */
173174
__opal_attribute_always_inline__ static inline uint64_t

0 commit comments

Comments
 (0)