Skip to content

Commit 1cd1f4a

Browse files
authored
Merge pull request #6192 from aravindksg/master
Fix for SEP when num local procs is greater than available contexts
2 parents 14d1325 + e5e19df commit 1cd1f4a

File tree

3 files changed

+26
-21
lines changed

3 files changed

+26
-21
lines changed

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1391,7 +1391,7 @@ ompi_mtl_ofi_add_comm(struct mca_mtl_base_module_t *mtl,
13911391
struct ompi_communicator_t *comm)
13921392
{
13931393
int ret;
1394-
mca_mtl_ofi_ep_type ep_type = (false == ompi_mtl_ofi.sep_supported) ?
1394+
mca_mtl_ofi_ep_type ep_type = (0 == ompi_mtl_ofi.enable_sep) ?
13951395
OFI_REGULAR_EP : OFI_SCALABLE_EP;
13961396

13971397
/*
@@ -1422,7 +1422,7 @@ ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl,
14221422
struct ompi_communicator_t *comm)
14231423
{
14241424
int ret = OMPI_SUCCESS;
1425-
mca_mtl_ofi_ep_type ep_type = (false == ompi_mtl_ofi.sep_supported) ?
1425+
mca_mtl_ofi_ep_type ep_type = (0 == ompi_mtl_ofi.enable_sep) ?
14261426
OFI_REGULAR_EP : OFI_SCALABLE_EP;
14271427

14281428
/*

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -473,13 +473,6 @@ static int ompi_mtl_ofi_init_sep(struct fi_info *prov)
473473
int ret = OMPI_SUCCESS, num_ofi_ctxts;
474474
struct fi_av_attr av_attr = {0};
475475

476-
ompi_mtl_ofi.max_ctx_cnt = (prov->domain_attr->max_ep_tx_ctx <
477-
prov->domain_attr->max_ep_rx_ctx) ?
478-
prov->domain_attr->max_ep_tx_ctx :
479-
prov->domain_attr->max_ep_rx_ctx;
480-
481-
/* Provision enough contexts to service all ranks in a node */
482-
ompi_mtl_ofi.max_ctx_cnt /= (1 + ompi_process_info.num_local_peers);
483476
prov->ep_attr->tx_ctx_cnt = prov->ep_attr->rx_ctx_cnt =
484477
ompi_mtl_ofi.max_ctx_cnt;
485478

@@ -601,15 +594,14 @@ static mca_mtl_base_module_t*
601594
ompi_mtl_ofi_component_init(bool enable_progress_threads,
602595
bool enable_mpi_threads)
603596
{
604-
int ret, fi_version;
597+
int ret, fi_version, num_local_ranks;
598+
int ofi_tag_leading_zeros, ofi_tag_bits_for_cid;
605599
struct fi_info *hints;
606600
struct fi_info *providers = NULL;
607601
struct fi_info *prov = NULL;
608602
struct fi_info *prov_cq_data = NULL;
609603
char ep_name[FI_NAME_MAX] = {0};
610604
size_t namelen;
611-
int ofi_tag_leading_zeros;
612-
int ofi_tag_bits_for_cid;
613605

614606
/**
615607
* Hints to filter providers
@@ -774,10 +766,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
774766
ompi_mtl_ofi.num_peers = 0;
775767

776768
/* Check if Scalable Endpoints can be enabled for the provider */
777-
ompi_mtl_ofi.sep_supported = false;
769+
ompi_mtl_ofi.enable_sep = 0;
778770
if ((prov->domain_attr->max_ep_tx_ctx > 1) ||
779771
(prov->domain_attr->max_ep_rx_ctx > 1)) {
780-
ompi_mtl_ofi.sep_supported = true;
772+
ompi_mtl_ofi.enable_sep = 1;
781773
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
782774
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
783775
__FILE__, __LINE__, prov->fabric_attr->prov_name);
@@ -786,7 +778,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
786778
/*
787779
* Scalable Endpoints is required for Thread Grouping feature
788780
*/
789-
if (!ompi_mtl_ofi.sep_supported && ompi_mtl_ofi.thread_grouping) {
781+
if (!ompi_mtl_ofi.enable_sep && ompi_mtl_ofi.thread_grouping) {
790782
opal_show_help("help-mtl-ofi.txt", "SEP unavailable", true,
791783
prov->fabric_attr->prov_name,
792784
ompi_process_info.nodename, __FILE__, __LINE__,
@@ -848,7 +840,20 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
848840
* vectors, completion counters or event queues etc, and enabled.
849841
* See man fi_endpoint for more details.
850842
*/
851-
if (true == ompi_mtl_ofi.sep_supported) {
843+
ompi_mtl_ofi.max_ctx_cnt = (prov->domain_attr->max_ep_tx_ctx <
844+
prov->domain_attr->max_ep_rx_ctx) ?
845+
prov->domain_attr->max_ep_tx_ctx :
846+
prov->domain_attr->max_ep_rx_ctx;
847+
848+
num_local_ranks = 1 + ompi_process_info.num_local_peers;
849+
if (ompi_mtl_ofi.max_ctx_cnt <= num_local_ranks) {
850+
ompi_mtl_ofi.enable_sep = 0;
851+
}
852+
853+
if (1 == ompi_mtl_ofi.enable_sep) {
854+
/* Provision enough contexts to service all ranks in a node */
855+
ompi_mtl_ofi.max_ctx_cnt /= num_local_ranks;
856+
852857
ret = ompi_mtl_ofi_init_sep(prov);
853858
} else {
854859
ret = ompi_mtl_ofi_init_regular_ep(prov);
@@ -926,7 +931,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
926931
if (ompi_mtl_ofi.av) {
927932
(void) fi_close((fid_t)ompi_mtl_ofi.av);
928933
}
929-
if ((false == ompi_mtl_ofi.sep_supported) &&
934+
if ((0 == ompi_mtl_ofi.enable_sep) &&
930935
ompi_mtl_ofi.ofi_ctxt != NULL &&
931936
ompi_mtl_ofi.ofi_ctxt[0].cq) {
932937
/* Check if CQ[0] was created for non-SEP case and close if needed */
@@ -964,9 +969,9 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
964969
goto finalize_err;
965970
}
966971

967-
if (false == ompi_mtl_ofi.sep_supported) {
972+
if (0 == ompi_mtl_ofi.enable_sep) {
968973
/*
969-
* CQ[0] is bound to SEP object when SEP is not supported by a
974+
* CQ[0] is bound to SEP object when SEP is not supported by a
970975
* provider. OFI spec requires that we close the Endpoint that is bound
971976
* to the CQ before closing the CQ itself. So, for the non-SEP case, we
972977
* handle the closing of CQ[0] here.

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ typedef struct mca_mtl_ofi_module_t {
7171
/* MCA parameter for Thread grouping feature */
7272
int thread_grouping;
7373

74-
/* Boolean value to indicate if provider supports Scalable EP or not */
75-
bool sep_supported;
74+
/* If SEP is used by OFI MTL */
75+
int enable_sep;
7676

7777
/* Numbers of bits used for rx contexts */
7878
int rx_ctx_bits;

0 commit comments

Comments
 (0)