Skip to content

Commit 091e07a

Browse files
authored
Merge pull request #10763 from jsquyres/pr/show-load-errors----or-not
Update to "show load errors" functionality
2 parents 63f7768 + 20bbf27 commit 091e07a

File tree

9 files changed

+369
-37
lines changed

9 files changed

+369
-37
lines changed

config/opal_configure_options.m4

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ dnl Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
1010
dnl University of Stuttgart. All rights reserved.
1111
dnl Copyright (c) 2004-2005 The Regents of the University of California.
1212
dnl All rights reserved.
13-
dnl Copyright (c) 2006-2020 Cisco Systems, Inc. All rights reserved
13+
dnl Copyright (c) 2006-2022 Cisco Systems, Inc. All rights reserved
1414
dnl Copyright (c) 2007 Sun Microsystems, Inc. All rights reserved.
1515
dnl Copyright (c) 2009 IBM Corporation. All rights reserved.
1616
dnl Copyright (c) 2009 Los Alamos National Security, LLC. All rights
@@ -327,25 +327,24 @@ fi
327327
#
328328

329329
AC_MSG_CHECKING([for default value of mca_base_component_show_load_errors])
330-
AC_ARG_ENABLE([show-load-errors-by-default],
331-
[AS_HELP_STRING([--enable-show-load-errors-by-default],
332-
[Set the default value for the MCA parameter
333-
mca_base_component_show_load_errors (but can be
334-
overridden at run time by the usual
335-
MCA-variable-setting mechansism). This MCA variable
336-
controls whether warnings are displayed when an MCA
337-
component fails to load at run time due to an error.
338-
(default: enabled, meaning that
339-
mca_base_component_show_load_errors is enabled
340-
by default])])
341-
if test "$enable_show_load_errors_by_default" = "no" ; then
342-
OPAL_SHOW_LOAD_ERRORS_DEFAULT=0
343-
AC_MSG_RESULT([disabled by default])
344-
else
345-
OPAL_SHOW_LOAD_ERRORS_DEFAULT=1
346-
AC_MSG_RESULT([enabled by default])
347-
fi
348-
AC_DEFINE_UNQUOTED(OPAL_SHOW_LOAD_ERRORS_DEFAULT, $OPAL_SHOW_LOAD_ERRORS_DEFAULT,
330+
AC_ARG_WITH([show-load-errors],
331+
[AS_HELP_STRING([--with-show-load-errors],
332+
[Set the default value for the MCA
333+
parameter
334+
mca_base_component_show_load_errors (but
335+
can be overridden at run time by the usual
336+
MCA-variable-setting mechanism).
337+
(default: "all")])])
338+
339+
AS_IF([test -z "$with_show_load_errors" -o "$with_show_load_errors" = "yes"],
340+
[with_show_load_errors=all
341+
AC_MSG_RESULT([enabled for all])],
342+
[AS_IF([test "$with_show_load_errors" = "no"],
343+
[with_show_load_errors=none
344+
AC_MSG_RESULT([disabled for all])],
345+
[AC_MSG_RESULT([$with_show_load_errors])])])
346+
347+
AC_DEFINE_UNQUOTED(OPAL_SHOW_LOAD_ERRORS_DEFAULT, ["$with_show_load_errors"],
349348
[Default value for mca_base_component_show_load_errors MCA variable])
350349

351350

docs/running-apps/tuning.rst

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -445,3 +445,66 @@ presented here so that they can easily be found via internet searches:
445445
.. important:: You can only use the "include" *or* the "exclude"
446446
parameter |mdash| they are mutually exclusive from each
447447
other.
448+
* ``opal_mca_base_component_show_load_errors``: By default, Open MPI
449+
emits a warning message if it fails to open a DSO component at run
450+
time. This typically happens when a shared library that the DSO
451+
requires is not available.
452+
453+
.. admonition:: Rationale
454+
:class: tip
455+
456+
In prior versions of Open MPI, components defaulted to building
457+
as DSOs (vs. being included in their parent libraries, such as
458+
``libmpi.so``). On misconfigured systems, sometimes network
459+
acceleration libraries would not be present, meaning that
460+
HPC-class networking components failed to open at run time. As
461+
such, Open MPI would typically fall back to TCP as a network
462+
transport, which usually led to poor performance of end-user
463+
applications.
464+
465+
Having Open MPI warn about such failures to load was useful
466+
because it alerted users to the misconfiguration.
467+
468+
.. note:: By default, Open MPI |ompi_ver| includes all components in
469+
its base libraries (e.g., on Linux, ``libmpi.so`` includes
470+
all the components that were built with Open MPI, and
471+
therefore no component needs to be opened dynamically), and
472+
does not build its components as DSOs.
473+
474+
This MCA parameter *only* affects what happens when a
475+
component DSO fails to open.
476+
477+
This MCA parameter can take four general values:
478+
479+
#. ``yes`` or a boolean "true" value (e.g., ``1``): Open MPI will
480+
emit a warning about every component DSO that fails to load.
481+
482+
#. ``no`` or a boolean "false" value (e.g., ``0``): Open MPI will
483+
never emit warnings about component DSOs that fail to load.
484+
485+
#. A comma-delimited list of frameworks and/or components: Open MPI
486+
will emit a warning about any dynamic component that fails to
487+
open and matches a token in the list. "Match" is defined as:
488+
489+
* If a token in the list is only a framework name, then any
490+
component in that framework will match.
491+
* If a token in the list specifies both a framework name and a
492+
component name (in the form ``framework/component``), then
493+
only the specified component in the specified framework will
494+
match.
495+
496+
For example, if the value of this MCA parameter is
497+
``accelerator,btl/uct``, then Open MPI will warn if any component in
498+
the accelerator framework or if the UCT BTL fails to load at run
499+
time.
500+
501+
#. The value can also be a ``^`` character followed by a
502+
comma-delimited list of ``framework[/component]`` values: This
503+
is similar to the comma-delimited list of tokens, except it will
504+
only emit warnings about dynamic components that fail to load
505+
and do *not* match a token in the list.
506+
507+
For example, if the value of this MCA parameter is
508+
``^accelerator,btl/uct``, then Open MPI will only warn about the
509+
failure to load DSOs that are neither in the accelerator
510+
framework nor are the UCT BTL.

opal/mca/base/base.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2009-2022 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
1616
* Copyright (c) 2015 Research Organization for Information Science
@@ -69,7 +69,7 @@ OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_base_component_priority_list_item_t);
6969
* Public variables
7070
*/
7171
OPAL_DECLSPEC extern char *mca_base_component_path;
72-
OPAL_DECLSPEC extern bool mca_base_component_show_load_errors;
72+
OPAL_DECLSPEC extern char *mca_base_component_show_load_errors;
7373
OPAL_DECLSPEC extern bool mca_base_component_track_load_errors;
7474
OPAL_DECLSPEC extern bool mca_base_component_disable_dlopen;
7575
OPAL_DECLSPEC extern char *mca_base_system_default_path;
@@ -214,6 +214,10 @@ OPAL_DECLSPEC int mca_base_framework_components_register(struct mca_base_framewo
214214
mca_base_register_flag_t flags);
215215

216216
/* mca_base_components_open.c */
217+
OPAL_DECLSPEC int mca_base_show_load_errors_init(void);
218+
OPAL_DECLSPEC int mca_base_show_load_errors_finalize(void);
219+
OPAL_DECLSPEC bool mca_base_show_load_errors(const char *framework_name,
220+
const char *component_name);
217221
OPAL_DECLSPEC int mca_base_framework_components_open(struct mca_base_framework_t *framework,
218222
mca_base_open_flag_t flags);
219223

opal/mca/base/help-mca-base.txt

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
# University of Stuttgart. All rights reserved.
1111
# Copyright (c) 2004-2005 The Regents of the University of California.
1212
# All rights reserved.
13-
# Copyright (c) 2008-2014 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved
1414
# $COPYRIGHT$
1515
#
1616
# Additional copyrights may follow
@@ -59,3 +59,23 @@ all components *except* a and b", while "c,d" specifies the inclusive
5959
behavior and means "use *only* components c and d."
6060

6161
You cannot mix inclusive and exclusive behavior.
62+
#
63+
[internal error during init]
64+
An internal error has occurred during the startup of Open MPI. This
65+
is highly unusual and shouldn't happen. Open MPI will now abort your
66+
job.
67+
68+
The following message may provide additional insight into the error:
69+
70+
Failure at: %s (%s:%d)
71+
Error: %d (%s)
72+
#
73+
[show_load_errors: too many /]
74+
The opal_mca_base_component_show_load_errors MCA variable cannot
75+
contain a token that has more than one "/" character in it.
76+
77+
The opal_mca_base_component_show_load_errors MCA variable can only
78+
contain the values: all, none, or a comma-delimited list of tokens in
79+
the form of "framework" or "framework/component".
80+
81+
Erroneous value: %s

opal/mca/base/mca_base_close.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2009-2022 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
1616
* $COPYRIGHT$
@@ -61,6 +61,9 @@ void mca_base_close(void)
6161
/* Shut down the dynamic component finder */
6262
mca_base_component_find_finalize();
6363

64+
/* Shut down the show_load_errors processing */
65+
mca_base_show_load_errors_finalize();
66+
6467
/* Close opal output stream 0 */
6568
opal_output_close(0);
6669
}

opal/mca/base/mca_base_component_repository.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* University of Stuttgart. All rights reserved.
1111
* Copyright (c) 2004-2005 The Regents of the University of California.
1212
* All rights reserved.
13-
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2008-2022 Cisco Systems, Inc. All rights reserved
1414
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
1616
* Copyright (c) 2015 Research Organization for Information Science
@@ -372,7 +372,8 @@ int mca_base_component_repository_open(mca_base_framework_t *framework,
372372
"%s MCA component \"%s\" at path %s",
373373
ri->ri_type, ri->ri_name, ri->ri_path);
374374

375-
vl = mca_base_component_show_load_errors ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_INFO;
375+
vl = mca_base_show_load_errors(ri->ri_type,
376+
ri->ri_name) ? MCA_BASE_VERBOSE_ERROR : MCA_BASE_VERBOSE_INFO;
376377

377378
/* Ensure that this component is not already loaded (should only happen
378379
if it was statically loaded). It's an error if it's already

0 commit comments

Comments
 (0)