Skip to content

Commit 02d91b5

Browse files
authored
Merge pull request #10321 from bwbarrett/bugfix/mpirun-prterun-search-fix
mpirun: Refactor to better play with prefixes
2 parents 6f7e9bc + 4a008c1 commit 02d91b5

File tree

4 files changed

+128
-89
lines changed

4 files changed

+128
-89
lines changed

config/ompi_setup_prrte.m4

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@ dnl external PRRTE and the internal one fails to configure, abort.
3232
dnl
3333
dnl This macro wil change the environment in the following way:
3434
dnl
35-
dnl * PRTE_PATH will be AC_SUBST'ed to the full path (minus the EXE
36-
dnl extension) of the prte binary.
37-
dnl
3835
dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the
3936
dnl results of the build.
4037
AC_DEFUN([OMPI_SETUP_PRRTE],[
@@ -44,6 +41,10 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
4441
4542
OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1])
4643
44+
AC_ARG_WITH([prrte-bindir],
45+
[AC_HELP_STRING([--with-prrte-bindir=DIR],
46+
[Search for PRRTE binaries in DIR. Defaults to PRRTE_DIR/bin if not specified])])
47+
4748
prrte_setup_internal_happy=0
4849
m4_ifdef([package_prrte],
4950
[OMPI_PRRTE_ADD_ARGS
@@ -89,20 +90,13 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
8990
OMPI_HAVE_PRRTE=1],
9091
[OMPI_HAVE_PRRTE=0])
9192
92-
AC_SUBST([PRTE_PATH])
93-
9493
AM_CONDITIONAL([OMPI_WANT_PRRTE],
9594
[test "$prrte_setup_internal_happy" = "1" -o "$prrte_setup_external_happy" = "1"])
9695
9796
AC_DEFINE_UNQUOTED([OMPI_HAVE_PRRTE],
9897
[$OMPI_HAVE_PRRTE],
9998
[Whether or not PRRTE is available])
10099
101-
AS_IF([test "$opal_prrte_mode" = "external"],
102-
[AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH],
103-
["$PRTE_PATH"],
104-
[Path to prterun])])
105-
106100
AC_DEFINE_UNQUOTED([OMPI_USING_INTERNAL_PRRTE],
107101
[$OMPI_USING_INTERNAL_PRRTE],
108102
[Whether or not we are using the internal PRRTE])
@@ -239,16 +233,12 @@ dnl caller configured libprrte configure, and the configure script
239233
dnl succeeded.
240234
AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL_POST], [
241235
OPAL_3RDPARTY_SUBDIRS="$OPAL_3RDPARTY_SUBDIRS prrte"
242-
243-
PRTE_PATH="prte"
244236
])
245237

246238

247239
dnl _OMPI_SETUP_PRRTE_EXTERNAL([action if success], [action if not success])
248240
dnl
249-
dnl Try to find an external prrte with sufficient version. Since we
250-
dnl don't link against prrte, only output environment variable is
251-
dnl PRTE_PATH.
241+
dnl Try to find an external prrte with sufficient version.
252242
AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
253243
OPAL_VAR_SCOPE_PUSH([setup_prrte_external_happy opal_prrte_CPPFLAGS_save])
254244
@@ -276,18 +266,21 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
276266
277267
CPPFLAGS="$opal_prrte_CPPFLAGS_save"
278268
269+
# If an external build and the user told us where to find PRRTE,
270+
# find prterun and save that path.
271+
prterun_path=
279272
AS_IF([test "$setup_prrte_external_happy" = "yes"],
280-
[AS_IF([test -n "$with_prrte"],
281-
[PRTE_PATH="${with_prrte}/bin/prte"
282-
AS_IF([test ! -r ${PRTE_PATH}],
283-
[AC_MSG_ERROR([Could not find prte binary at $PRTE_PATH])],
284-
[PRTE_PATH="${with_prrte}/bin"])],
285-
[PRTE_PATH=""
286-
OPAL_WHICH([prte], [PRTE_PATH])
287-
AS_IF([tets -z "$PRTE_PATH"],
288-
[AC_MSG_WARN([Could not find prte in PATH])
289-
setup_prrte_external_happy=no],
290-
[PRTE_PATH="`echo $PRTE_PATH | sed -e 's/\/prte//'`"])])])
273+
[AS_IF([test "${with_prrte_bindir}" = "yes" -o "${with_prrte_bindir}" = "no"],
274+
[AC_MSG_ERROR(["yes" and "no" are not valid arguments for --with-prrte-bindir])])
275+
AS_IF([test -z "${with_prrte_bindir}" -a -n "${with_prrte}"],
276+
[with_prrte_bindir="${with_prrte}/bin"])
277+
AS_IF([test -n "${with_prrte_bindir}"],
278+
[AS_IF([test -x ${with_prrte_bindir}/prterun],
279+
[prterun_path="${with_prrte_bindir}/prterun"],
280+
[AC_MSG_ERROR([Could not find executable prterun: ${with_prrte_bindir}/prterun])])])])
281+
AS_IF([test -n "${prterun_path}"],
282+
[AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])])
283+
291284
AS_IF([test "$setup_prrte_external_happy" = "yes"],
292285
[$1], [$2])
293286

ompi/tools/mpirun/Makefile.am

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#
22
# Copyright (c) 2019-2020 Intel, Inc. All rights reserved.
33
# Copyright (c) 2020 IBM Corporation. All rights reserved.
4-
# Copyright (c) 2021 Amazon.com, Inc. or its affiliates.
5-
# All Rights reserved.
4+
# Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
65
# Copyright (c) 2021 Nanook Consulting. All rights reserved.
76
# $COPYRIGHT$
87
#
@@ -15,6 +14,8 @@ if OMPI_WANT_PRRTE
1514

1615
bin_PROGRAMS = mpirun
1716

17+
dist_ompidata_DATA = help-mpirun.txt
18+
1819
mpirun_SOURCES = \
1920
main.c
2021

ompi/tools/mpirun/help-mpirun.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- text -*-
2+
#
3+
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
4+
# $COPYRIGHT$
5+
#
6+
# Additional copyrights may follow
7+
#
8+
# $HEADER$
9+
#
10+
# This is the US/English help file for Open MPI wrapper compiler error
11+
# messages.
12+
#
13+
[no-prterun-found]
14+
Open MPI's mpirun command was unable to find an underlying prterun
15+
command to execute. Consider setting the OMPI_PRTERUN environment
16+
variable to help mpirun find the correct underlying prterun.
17+
[prterun-exec-failed]
18+
Open MPI's mpirun command could not execute the underlying prterun
19+
command. The prterun command we tried to execute and the error
20+
message from exec() are below:
21+
22+
Command: %s
23+
Error Message: %s

ompi/tools/mpirun/main.c

Lines changed: 83 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -14,100 +14,121 @@
1414
#if HAVE_SYS_STAT_H
1515
# include <sys/stat.h>
1616
#endif /* HAVE_SYS_STAT_H */
17+
#include <string.h>
1718

1819
#include "opal/mca/base/base.h"
1920
#include "opal/mca/installdirs/base/base.h"
21+
#include "opal/runtime/opal.h"
2022
#include "opal/util/argv.h"
2123
#include "opal/util/basename.h"
2224
#include "opal/util/opal_environ.h"
2325
#include "opal/util/os_dirpath.h"
2426
#include "opal/util/os_path.h"
2527
#include "opal/util/path.h"
2628
#include "opal/util/printf.h"
29+
#include "opal/util/show_help.h"
30+
#include "ompi/constants.h"
2731

28-
int main(int argc, char *argv[])
32+
static char *find_prterun(void)
2933
{
30-
char *evar;
31-
char **pargs = NULL;
32-
char *pfx = NULL;
33-
int m, param_len;
34-
char *truepath = NULL;
34+
char *filename = NULL;
35+
char *prrte_prefix = NULL;
3536

36-
if (NULL != (evar = getenv("OPAL_PREFIX"))) {
37+
/* 1) Did the user tell us exactly where to find prterun? */
38+
filename = getenv("OMPI_PRTERUN");
39+
if (NULL != filename) {
40+
return filename;
41+
}
3742

3843
#if OMPI_USING_INTERNAL_PRRTE
39-
setenv("PRTE_PREFIX", evar, 1);
40-
#endif
44+
/* 2) If using internal PRRTE, use our bindir. Note that this
45+
* will obey OPAL_PREFIX and OPAL_DESTDIR */
46+
opal_asprintf(&filename, "%s%sprterun", opal_install_dirs.bindir, OPAL_PATH_SEP);
47+
return filename;
48+
#else
4149

42-
#if OPAL_USING_INTERNAL_PMIX
43-
setenv("PMIX_PREFIX", evar, 1);
50+
/* 3) Look in ${PRTE_PREFIX}/bin */
51+
prrte_prefix = getenv("PRTE_PREFIX");
52+
if (NULL != prrte_prefix) {
53+
opal_asprintf(&filename, "%s%sbin%sprterun", prrte_prefix, OPAL_PATH_SEP, OPAL_PATH_SEP);
54+
return filename;
55+
}
56+
57+
/* 4) See if configure told us where to look, if set */
58+
#if defined(OMPI_PRTERUN_PATH)
59+
return strdup(OMPI_PRTERUN_PATH);
60+
#else
61+
62+
/* 5) Use path search */
63+
filename = opal_find_absolute_path("prterun");
64+
65+
return filename;
66+
#endif
4467
#endif
68+
}
69+
70+
71+
int main(int argc, char *argv[])
72+
{
73+
char *opal_prefix = getenv("OPAL_PREFIX");
74+
char *full_prterun_path = NULL;
75+
char **prterun_args = NULL;
76+
int ret;
77+
size_t i;
78+
79+
ret = opal_init_util(&argc, &argv);
80+
if (OMPI_SUCCESS != ret) {
81+
fprintf(stderr, "Failed initializing opal: %d\n", ret);
82+
exit(1);
4583
}
84+
85+
/* note that we just modify our environment rather than create a
86+
* child environment because it is easier and we're not going to
87+
* be around long enough for it to matter (since we exec prterun
88+
* asap */
4689
setenv("PRTE_MCA_schizo_proxy", "ompi", 1);
4790
setenv("OMPI_VERSION", OMPI_VERSION, 1);
4891
char *base_tool_name = opal_basename(argv[0]);
4992
setenv("OMPI_TOOL_NAME", base_tool_name, 1);
5093
free(base_tool_name);
5194

95+
/* TODO: look for --prefix and compare with OPAL_PREFIX and pick
96+
* one */
5297

53-
opal_argv_append_nosize(&pargs, "prterun");
54-
for (m=1; NULL != argv[m]; m++) {
55-
opal_argv_append_nosize(&pargs, argv[m]);
56-
/* Did the user specify a prefix, or want prefix by default? */
57-
if (0 == strcmp(argv[m], "--prefix")) {
58-
opal_asprintf(&pfx, "%s%s", argv[m+1], "/bin");
59-
}
60-
}
61-
62-
if (NULL != pfx) {
63-
/* "Parse" the param, aka remove superfluous path_sep. */
64-
param_len = strlen(pfx);
65-
while (0 == strcmp(OPAL_PATH_SEP, &(pfx[param_len - 1]))) {
66-
pfx[param_len - 1] = '\0';
67-
param_len--;
68-
if (0 == param_len) {
69-
fprintf(stderr, "A prefix was supplied to mpirun that only contained slashes.\n"
70-
"This is a fatal error; mpirun will now abort.\nNo processes were launched.\n");
71-
exit(1);
72-
}
73-
}
74-
} else if (opal_path_is_absolute(argv[0])) {
75-
/* Check if called with fully-qualified path to mpirun.
76-
* (Note: Put this second so can override with --prefix (above). */
77-
pfx = opal_dirname(argv[0]);
98+
/* as a special case, if OPAL_PREFIX was set and either PRRTE or
99+
* PMIx are internal builds, set their prefix variables as well */
100+
if (NULL != opal_prefix) {
78101
#if OMPI_USING_INTERNAL_PRRTE
79-
} else {
80-
/* in case --enable-prefix-by-default was given */
81-
mca_base_framework_open(&opal_installdirs_base_framework, 0); // fill in the installdirs
82-
if (NULL != opal_install_dirs.bindir) {
83-
pfx = strdup(opal_install_dirs.bindir);
84-
}
102+
setenv("PRTE_PREFIX", opal_prefix, 1);
85103
#endif
86-
}
87-
88-
if (NULL == pfx) {
89-
truepath = opal_path_findv("prterun", X_OK, environ, NULL);
90-
#if !OMPI_USING_INTERNAL_PRRTE
91-
// if OMPI_PRTERUN_PATH is available, try that
92-
// for external builds if the user didn't explictly
93-
// add a prefix and it isn't in the users path.
94-
if((NULL == truepath) && (0 != strlen(OMPI_PRTERUN_PATH))) {
95-
truepath = opal_os_path(0, OMPI_PRTERUN_PATH, "prterun", NULL);
96-
}
104+
#if OPAL_USING_INTERNAL_PMIX
105+
setenv("PMIX_PREFIX", opal_prefix, 1);
97106
#endif
98-
} else {
99-
truepath = opal_os_path(0, pfx, "prterun", NULL);
100-
free(pfx);
101107
}
102108

103-
if (NULL == truepath) {
104-
fprintf(stderr, "prterun executable could not be found - unable to run\n");
109+
full_prterun_path = find_prterun();
110+
if (NULL == full_prterun_path) {
111+
opal_show_help("help-mpirun.txt", "no-prterun-found", 1);
105112
exit(1);
106113
}
107114

108-
execve(truepath, pargs, environ);
109-
fprintf(stderr, "The mpirun (\"%s\") cmd failed to exec its actual executable - your application will NOT execute. Error: %s\n",
110-
truepath ? truepath : "NULL", strerror(errno));
115+
/* calling mpirun (and now prterun) with a full path has a special
116+
* meaning in terms of -prefix behavior, so copy that behavior
117+
* into prterun */
118+
if (opal_path_is_absolute(argv[0])) {
119+
opal_argv_append_nosize(&prterun_args, full_prterun_path);
120+
} else {
121+
opal_argv_append_nosize(&prterun_args, "prterun");
122+
}
123+
124+
/* Copy all the mpirun arguments to prterun.
125+
* TODO: Need to handle --prefix rationally here. */
126+
for (i = 1; NULL != argv[i]; i++) {
127+
opal_argv_append_nosize(&prterun_args, argv[i]);
128+
}
129+
ret = execv(full_prterun_path, prterun_args);
130+
opal_show_help("help-mpirun.txt", "prterun-exec-failed",
131+
1, full_prterun_path, strerror(errno));
111132
exit(1);
112133
}
113134

@@ -125,6 +146,7 @@ int main(int argc, char *argv[])
125146
* Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
126147
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
127148
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
149+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
128150
* $COPYRIGHT$
129151
*
130152
* Additional copyrights may follow

0 commit comments

Comments
 (0)