Skip to content

Commit 44be7f1

Browse files
committed
mtl/ofi: Provide av count hint during initialization
Provide the av_attr.count hint (the number of addresses that will be inserted into the address vector over the life of the process) when the address vector is initialized. It's OK for the hint to be a bit wrong, but some endpoints (RxR) benefit from not going through the slow-growth realloc churn.

Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent 352b667 commit 44be7f1

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 34 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -491,7 +491,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
491491
} \
492492
} while(0);
493493

494-
static int ompi_mtl_ofi_init_sep(struct fi_info *prov)
494+
static int ompi_mtl_ofi_init_sep(struct fi_info *prov, int universe_size)
495495
{
496496
int ret = OMPI_SUCCESS, num_ofi_ctxts;
497497
struct fi_av_attr av_attr = {0};
@@ -513,7 +513,7 @@ static int ompi_mtl_ofi_init_sep(struct fi_info *prov)
513513

514514
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
515515
av_attr.rx_ctx_bits = ompi_mtl_ofi.rx_ctx_bits;
516-
av_attr.count = ompi_mtl_ofi.num_ofi_contexts;
516+
av_attr.count = ompi_mtl_ofi.num_ofi_contexts * universe_size;
517517
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
518518

519519
if (0 != ret) {
@@ -546,7 +546,7 @@ static int ompi_mtl_ofi_init_sep(struct fi_info *prov)
546546
return ret;
547547
}
548548

549-
static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov)
549+
static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov, int universe_size)
550550
{
551551
int ret = OMPI_SUCCESS;
552552
struct fi_av_attr av_attr = {0};
@@ -574,6 +574,7 @@ static int ompi_mtl_ofi_init_regular_ep(struct fi_info * prov)
574574
* - address vector and completion queues
575575
*/
576576
av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;
577+
av_attr.count = universe_size;
577578
ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
578579
if (ret) {
579580
MTL_OFI_LOG_FI_ERR(ret, "fi_av_open failed");
@@ -626,6 +627,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
626627
struct fi_info *prov_cq_data = NULL;
627628
char ep_name[FI_NAME_MAX] = {0};
628629
size_t namelen;
630+
int universe_size;
631+
char *univ_size_str;
629632

630633
/**
631634
* Hints to filter providers
@@ -897,21 +900,35 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
897900
* vectors, completion counters or event queues etc, and enabled.
898901
* See man fi_endpoint for more details.
899902
*/
900-
max_ofi_ctxts = (prov->domain_attr->max_ep_tx_ctx <
901-
prov->domain_attr->max_ep_rx_ctx) ?
902-
prov->domain_attr->max_ep_tx_ctx :
903-
prov->domain_attr->max_ep_rx_ctx;
904-
905-
num_local_ranks = 1 + ompi_process_info.num_local_peers;
906-
if ((max_ofi_ctxts <= num_local_ranks) &&
907-
(1 == ompi_mtl_ofi.enable_sep)) {
908-
opal_show_help("help-mtl-ofi.txt", "Local ranks exceed ofi contexts",
909-
true, prov->fabric_attr->prov_name,
910-
ompi_process_info.nodename, __FILE__, __LINE__);
911-
goto error;
903+
904+
/* use the universe size as a rough guess on the address vector
905+
* size hint that should be passed to fi_av_open(). For regular
906+
* endpoints, the count will be the universe size. For scalable
907+
* endpoints, the count will be the universe size multiplied by
908+
* the number of contexts. In either case, if the universe grows
909+
* (via dynamic processes), the count is a hint, not a hard limit,
910+
* so libfabric will just be slightly less efficient.
911+
*/
912+
univ_size_str = getenv("OMPI_UNIVERSE_SIZE");
913+
if (NULL == univ_size_str ||
914+
(universe_size = strtol(univ_size_str, NULL, 0)) <= 0) {
915+
universe_size = ompi_proc_world_size();
912916
}
913917

914918
if (1 == ompi_mtl_ofi.enable_sep) {
919+
max_ofi_ctxts = (prov->domain_attr->max_ep_tx_ctx <
920+
prov->domain_attr->max_ep_rx_ctx) ?
921+
prov->domain_attr->max_ep_tx_ctx :
922+
prov->domain_attr->max_ep_rx_ctx;
923+
924+
num_local_ranks = 1 + ompi_process_info.num_local_peers;
925+
if (max_ofi_ctxts <= num_local_ranks) {
926+
opal_show_help("help-mtl-ofi.txt", "Local ranks exceed ofi contexts",
927+
true, prov->fabric_attr->prov_name,
928+
ompi_process_info.nodename, __FILE__, __LINE__);
929+
goto error;
930+
}
931+
915932
/* Provision enough contexts to service all ranks in a node */
916933
max_ofi_ctxts /= num_local_ranks;
917934

@@ -926,9 +943,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
926943
ompi_mtl_ofi.num_ofi_contexts = max_ofi_ctxts;
927944
}
928945

929-
ret = ompi_mtl_ofi_init_sep(prov);
946+
ret = ompi_mtl_ofi_init_sep(prov, universe_size);
930947
} else {
931-
ret = ompi_mtl_ofi_init_regular_ep(prov);
948+
ret = ompi_mtl_ofi_init_regular_ep(prov, universe_size);
932949
}
933950

934951
if (OMPI_SUCCESS != ret) {

0 commit comments

Comments
 (0)