Skip to content

Commit f744668

Browse files
authored
Merge pull request #7646 from hppritcha/topic/ofi_common_wl
add a common ofi whitelist/blacklist
2 parents 4a5622a + 9f1081a commit f744668

File tree

8 files changed

+186
-75
lines changed

8 files changed

+186
-75
lines changed

ompi/mca/mtl/ofi/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
# Copyright (c) 2017 IBM Corporation. All rights reserved.
88
# Copyright (c) 2019 Research Organization for Information Science
99
# and Technology (RIST). All rights reserved.
10+
# Copyright (c) 2020 Triad National Security, LLC. All rights
11+
# reserved.
1012
# $COPYRIGHT$
1113
#
1214
# Additional copyrights may follow

ompi/mca/mtl/ofi/mtl_ofi.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
113113
*/
114114
count = fi_av_insert(ompi_mtl_ofi.av, ep_names, nprocs, fi_addrs, 0, NULL);
115115
if ((count < 0) || (nprocs != (size_t)count)) {
116-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
116+
opal_output_verbose(1, opal_common_ofi.output,
117117
"%s:%d: fi_av_insert failed: %d\n",
118118
__FILE__, __LINE__, count);
119119
ret = OMPI_ERROR;
@@ -126,7 +126,7 @@ ompi_mtl_ofi_add_procs(struct mca_mtl_base_module_t *mtl,
126126
for (i = 0; i < nprocs; ++i) {
127127
endpoint = OBJ_NEW(mca_mtl_ofi_endpoint_t);
128128
if (NULL == endpoint) {
129-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
129+
opal_output_verbose(1, opal_common_ofi.output,
130130
"%s:%d: mtl/ofi: could not allocate endpoint"
131131
" structure\n",
132132
__FILE__, __LINE__);
@@ -171,7 +171,7 @@ ompi_mtl_ofi_del_procs(struct mca_mtl_base_module_t *mtl,
171171
endpoint = procs[i]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_MTL];
172172
ret = fi_av_remove(ompi_mtl_ofi.av, &endpoint->peer_fiaddr, 1, 0);
173173
if (ret) {
174-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
174+
opal_output_verbose(1, opal_common_ofi.output,
175175
"%s:%d: fi_av_remove failed: %s\n", __FILE__, __LINE__, fi_strerror(errno));
176176
return ret;
177177
}

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2013-2018 Intel, Inc. All rights reserved
33
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5-
* Copyright (c) 2019 Triad National Security, LLC. All rights
5+
* Copyright (c) 2019-2020 Triad National Security, LLC. All rights
66
* reserved.
77
* Copyright (c) 2018-2020 Amazon.com, Inc. or its affiliates. All rights
88
* reserved.
@@ -39,6 +39,7 @@
3939
#include "ompi/mca/mtl/base/base.h"
4040
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
4141
#include "ompi/message/message.h"
42+
#include "opal/mca/common/ofi/common_ofi.h"
4243

4344
#include "mtl_ofi_opt.h"
4445
#include "mtl_ofi_types.h"
@@ -236,7 +237,7 @@ ompi_mtl_ofi_progress(void)
236237

237238
#define MTL_OFI_LOG_FI_ERR(err, string) \
238239
do { \
239-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
240+
opal_output_verbose(1, opal_common_ofi.output, \
240241
"%s:%d:%s: %s\n", \
241242
__FILE__, __LINE__, string, fi_strerror(-err)); \
242243
} while(0);
@@ -378,7 +379,7 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
378379
0, /* Exact match, no ignore bits */
379380
(void *) &ack_req->ctx), ret);
380381
if (OPAL_UNLIKELY(0 > ret)) {
381-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
382+
opal_output_verbose(1, opal_common_ofi.output,
382383
"%s:%d: fi_trecv failed: %s(%zd)",
383384
__FILE__, __LINE__, fi_strerror(-ret), ret);
384385
free(ack_req);
@@ -664,7 +665,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
664665
status->_ucount = wc->len;
665666

666667
if (OPAL_UNLIKELY(wc->len > ofi_req->length)) {
667-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
668+
opal_output_verbose(1, opal_common_ofi.output,
668669
"truncate expected: %ld %ld",
669670
wc->len, ofi_req->length);
670671
status->MPI_ERROR = MPI_ERR_TRUNCATE;
@@ -678,7 +679,7 @@ ompi_mtl_ofi_recv_callback(struct fi_cq_tagged_entry *wc,
678679
ofi_req->buffer,
679680
wc->len);
680681
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
681-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
682+
opal_output_verbose(1, opal_common_ofi.output,
682683
"%s:%d: ompi_mtl_datatype_unpack failed: %d",
683684
__FILE__, __LINE__, ompi_ret);
684685
status->MPI_ERROR = ompi_ret;
@@ -1331,7 +1332,7 @@ static int ompi_mtl_ofi_init_contexts(struct mca_mtl_base_module_t *mtl,
13311332
if (MPI_COMM_WORLD == comm) {
13321333
ret = opal_progress_register(ompi_mtl_ofi_progress_no_inline);
13331334
if (OMPI_SUCCESS != ret) {
1334-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
1335+
opal_output_verbose(1, opal_common_ofi.output,
13351336
"%s:%d: opal_progress_register failed: %d\n",
13361337
__FILE__, __LINE__, ret);
13371338
goto init_error;

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 28 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
77
* reserved.
88
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
9+
* Copyright (c) 2020 Triad National Security, LLC. All rights
10+
* reserved.
911
* $COPYRIGHT$
1012
*
1113
* Additional copyrights may follow
@@ -28,8 +30,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
2830
bool enable_mpi_threads);
2931

3032
static int param_priority;
31-
static char *prov_include;
32-
static char *prov_exclude;
3333
static int control_progress;
3434
static int data_progress;
3535
static int av_type;
@@ -131,24 +131,6 @@ ompi_mtl_ofi_component_register(void)
131131
MCA_BASE_VAR_SCOPE_READONLY,
132132
&param_priority);
133133

134-
prov_include = NULL;
135-
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
136-
"provider_include",
137-
"Comma-delimited list of OFI providers that are considered for use (e.g., \"psm,psm2\"; an empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_exclude.",
138-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
139-
OPAL_INFO_LVL_1,
140-
MCA_BASE_VAR_SCOPE_READONLY,
141-
&prov_include);
142-
143-
prov_exclude = "shm,sockets,tcp,udp,rstream";
144-
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
145-
"provider_exclude",
146-
"Comma-delimited list of OFI providers that are not considered for use (default: \"sockets,mxm\"; empty value means that all providers will be considered). Mutually exclusive with mtl_ofi_provider_include.",
147-
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0,
148-
OPAL_INFO_LVL_1,
149-
MCA_BASE_VAR_SCOPE_READONLY,
150-
&prov_exclude);
151-
152134
ompi_mtl_ofi.ofi_progress_event_count = MTL_OFI_MAX_PROG_EVENT_COUNT;
153135
opal_asprintf(&desc, "Max number of events to read each call to OFI progress (default: %d events will be read per OFI progress call)", ompi_mtl_ofi.ofi_progress_event_count);
154136
mca_base_component_var_register(&mca_mtl_ofi_component.super.mtl_version,
@@ -268,6 +250,8 @@ ompi_mtl_ofi_component_register(void)
268250
MCA_BASE_VAR_SCOPE_READONLY,
269251
&ompi_mtl_ofi.num_ofi_contexts);
270252

253+
opal_common_ofi_register_mca_variables(&mca_mtl_ofi_component.super.mtl_version);
254+
271255
return OMPI_SUCCESS;
272256
}
273257

@@ -312,6 +296,7 @@ ompi_mtl_ofi_component_query(mca_base_module_t **module, int *priority)
312296
static int
313297
ompi_mtl_ofi_component_close(void)
314298
{
299+
opal_common_ofi_mca_deregister();
315300
return OMPI_SUCCESS;
316301
}
317302

@@ -350,7 +335,7 @@ select_ofi_provider(struct fi_info *providers,
350335
if (NULL != include_list) {
351336
while ((NULL != prov) &&
352337
(!is_in_list(include_list, prov->fabric_attr->prov_name))) {
353-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
338+
opal_output_verbose(1, opal_common_ofi.output,
354339
"%s:%d: mtl:ofi: \"%s\" not in include list\n",
355340
__FILE__, __LINE__,
356341
prov->fabric_attr->prov_name);
@@ -359,15 +344,15 @@ select_ofi_provider(struct fi_info *providers,
359344
} else if (NULL != exclude_list) {
360345
while ((NULL != prov) &&
361346
(is_in_list(exclude_list, prov->fabric_attr->prov_name))) {
362-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
347+
opal_output_verbose(1, opal_common_ofi.output,
363348
"%s:%d: mtl:ofi: \"%s\" in exclude list\n",
364349
__FILE__, __LINE__,
365350
prov->fabric_attr->prov_name);
366351
prov = prov->next;
367352
}
368353
}
369354

370-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
355+
opal_output_verbose(1, opal_common_ofi.output,
371356
"%s:%d: mtl:ofi:prov: %s\n",
372357
__FILE__, __LINE__,
373358
(prov ? prov->fabric_attr->prov_name : "none"));
@@ -397,6 +382,7 @@ select_ofi_provider(struct fi_info *providers,
397382
return prov;
398383
}
399384

385+
400386
/* Check if FI_REMOTE_CQ_DATA is supported, if so send the source rank there
401387
* FI_DIRECTED_RECV is also needed so receives can discriminate the source
402388
*/
@@ -482,7 +468,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
482468
do { \
483469
ompi_mtl_ofi.comm_to_context = calloc(arr_size, sizeof(int)); \
484470
if (OPAL_UNLIKELY(!ompi_mtl_ofi.comm_to_context)) { \
485-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
471+
opal_output_verbose(1, opal_common_ofi.output, \
486472
"%s:%d: alloc of comm_to_context array failed: %s\n",\
487473
__FILE__, __LINE__, strerror(errno)); \
488474
return ret; \
@@ -494,7 +480,7 @@ ompi_mtl_ofi_define_tag_mode(int ofi_tag_mode, int *bits_for_cid) {
494480
ompi_mtl_ofi.ofi_ctxt = (mca_mtl_ofi_context_t *) malloc(ompi_mtl_ofi.num_ofi_contexts * \
495481
sizeof(mca_mtl_ofi_context_t)); \
496482
if (OPAL_UNLIKELY(!ompi_mtl_ofi.ofi_ctxt)) { \
497-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output, \
483+
opal_output_verbose(1, opal_common_ofi.output, \
498484
"%s:%d: alloc of ofi_ctxt array failed: %s\n", \
499485
__FILE__, __LINE__, strerror(errno)); \
500486
return ret; \
@@ -642,17 +628,19 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
642628
int universe_size;
643629
char *univ_size_str;
644630

645-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
631+
opal_common_ofi_mca_register();
632+
633+
opal_output_verbose(1, opal_common_ofi.output,
646634
"%s:%d: mtl:ofi:provider_include = \"%s\"\n",
647-
__FILE__, __LINE__, prov_include);
648-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
635+
__FILE__, __LINE__, *opal_common_ofi.prov_include);
636+
opal_output_verbose(1, opal_common_ofi.output,
649637
"%s:%d: mtl:ofi:provider_exclude = \"%s\"\n",
650-
__FILE__, __LINE__, prov_exclude);
638+
__FILE__, __LINE__, *opal_common_ofi.prov_exclude);
651639

652-
if (NULL != prov_include) {
653-
include_list = opal_argv_split(prov_include, ',');
654-
} else if (NULL != prov_exclude) {
655-
exclude_list = opal_argv_split(prov_exclude, ',');
640+
if (NULL != *opal_common_ofi.prov_include) {
641+
include_list = opal_argv_split(*opal_common_ofi.prov_include, ',');
642+
} else if (NULL != *opal_common_ofi.prov_exclude) {
643+
exclude_list = opal_argv_split(*opal_common_ofi.prov_exclude, ',');
656644
}
657645

658646
/**
@@ -667,7 +655,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
667655
*/
668656
hints = fi_allocinfo();
669657
if (!hints) {
670-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
658+
opal_output_verbose(1, opal_common_ofi.output,
671659
"%s:%d: Could not allocate fi_info\n",
672660
__FILE__, __LINE__);
673661
goto error;
@@ -753,7 +741,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
753741

754742
ret = fi_getinfo(fi_version, NULL, NULL, 0ULL, hints_dup, &providers);
755743

756-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
744+
opal_output_verbose(1, opal_common_ofi.output,
757745
"%s:%d: EFA specific fi_getinfo(): %s\n",
758746
__FILE__, __LINE__, fi_strerror(-ret));
759747

@@ -790,7 +778,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
790778
hints, /* In: Hints to filter providers */
791779
&providers); /* Out: List of matching providers */
792780

793-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
781+
opal_output_verbose(1, opal_common_ofi.output,
794782
"%s:%d: fi_getinfo(): %s\n",
795783
__FILE__, __LINE__, fi_strerror(-ret));
796784

@@ -811,7 +799,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
811799
*/
812800
prov = select_ofi_provider(providers, include_list, exclude_list);
813801
if (!prov) {
814-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
802+
opal_output_verbose(1, opal_common_ofi.output,
815803
"%s:%d: select_ofi_provider: no provider found\n",
816804
__FILE__, __LINE__);
817805
goto error;
@@ -840,7 +828,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
840828
/* Fallback to MTL_OFI_TAG_1 */
841829
ompi_mtl_ofi_define_tag_mode(MTL_OFI_TAG_1, &ofi_tag_bits_for_cid);
842830
} else { /* MTL_OFI_TAG_FULL */
843-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
831+
opal_output_verbose(1, opal_common_ofi.output,
844832
"%s:%d: OFI provider %s does not support FI_REMOTE_CQ_DATA\n",
845833
__FILE__, __LINE__, prov->fabric_attr->prov_name);
846834
goto error;
@@ -920,7 +908,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
920908
ompi_process_info.nodename, __FILE__, __LINE__);
921909
goto error;
922910
} else if (1 == sep_support_in_provider) {
923-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
911+
opal_output_verbose(1, opal_common_ofi.output,
924912
"%s:%d: Scalable EP supported in %s provider. Enabling in MTL.\n",
925913
__FILE__, __LINE__, prov->fabric_attr->prov_name);
926914
}
@@ -1079,7 +1067,7 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
10791067
&ep_name,
10801068
namelen);
10811069
if (OMPI_SUCCESS != ret) {
1082-
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
1070+
opal_output_verbose(1, opal_common_ofi.output,
10831071
"%s:%d: modex_send failed: %d\n",
10841072
__FILE__, __LINE__, ret);
10851073
goto error;

opal/mca/btl/ofi/Makefile.am

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
# Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
1414
# Copyright (c) 2017 IBM Corporation. All rights reserved.
1515
# Copyright (c) 2018 Intel, inc. All rights reserved
16+
# Copyright (c) 2020 Triad National Security, LLC. All rights
17+
# reserved.
1618
# $COPYRIGHT$
1719
#
1820
# Additional copyrights may follow

0 commit comments

Comments
 (0)