Skip to content

Commit 4a008c1

Browse files
committed
mpirun: Refactor to better play with prefixes
Rework mpirun to not use OMPI prefixes to find prterun unless OMPI was built with an internal PRRTE. The design for this work is documented in #10252. Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent ab812d4 commit 4a008c1

File tree

4 files changed

+128
-89
lines changed

4 files changed

+128
-89
lines changed

config/ompi_setup_prrte.m4

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@ dnl external PRRTE and the internal one fails to configure, abort.
3232
dnl
3333
dnl This macro wil change the environment in the following way:
3434
dnl
35-
dnl * PRTE_PATH will be AC_SUBST'ed to the full path (minus the EXE
36-
dnl extension) of the prte binary.
37-
dnl
3835
dnl A Makefile conditional OMPI_WANT_PRRTE will be defined based on the
3936
dnl results of the build.
4037
AC_DEFUN([OMPI_SETUP_PRRTE],[
@@ -44,6 +41,10 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
4441
4542
OPAL_3RDPARTY_WITH([prrte], [prrte], [package_prrte], [1])
4643
44+
AC_ARG_WITH([prrte-bindir],
45+
[AC_HELP_STRING([--with-prrte-bindir=DIR],
46+
[Search for PRRTE binaries in DIR. Defaults to PRRTE_DIR/bin if not specified])])
47+
4748
prrte_setup_internal_happy=0
4849
m4_ifdef([package_prrte],
4950
[OMPI_PRRTE_ADD_ARGS
@@ -89,20 +90,13 @@ AC_DEFUN([OMPI_SETUP_PRRTE],[
8990
OMPI_HAVE_PRRTE=1],
9091
[OMPI_HAVE_PRRTE=0])
9192
92-
AC_SUBST([PRTE_PATH])
93-
9493
AM_CONDITIONAL([OMPI_WANT_PRRTE],
9594
[test "$prrte_setup_internal_happy" = "1" -o "$prrte_setup_external_happy" = "1"])
9695
9796
AC_DEFINE_UNQUOTED([OMPI_HAVE_PRRTE],
9897
[$OMPI_HAVE_PRRTE],
9998
[Whether or not PRRTE is available])
10099
101-
AS_IF([test "$opal_prrte_mode" = "external"],
102-
[AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH],
103-
["$PRTE_PATH"],
104-
[Path to prterun])])
105-
106100
AC_DEFINE_UNQUOTED([OMPI_USING_INTERNAL_PRRTE],
107101
[$OMPI_USING_INTERNAL_PRRTE],
108102
[Whether or not we are using the internal PRRTE])
@@ -239,16 +233,12 @@ dnl caller configured libprrte configure, and the configure script
239233
dnl succeeded.
240234
AC_DEFUN([_OMPI_SETUP_PRRTE_INTERNAL_POST], [
241235
OPAL_3RDPARTY_SUBDIRS="$OPAL_3RDPARTY_SUBDIRS prrte"
242-
243-
PRTE_PATH="prte"
244236
])
245237

246238

247239
dnl _OMPI_SETUP_PRRTE_EXTERNAL([action if success], [action if not success])
248240
dnl
249-
dnl Try to find an external prrte with sufficient version. Since we
250-
dnl don't link against prrte, only output environment variable is
251-
dnl PRTE_PATH.
241+
dnl Try to find an external prrte with sufficient version.
252242
AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
253243
OPAL_VAR_SCOPE_PUSH([setup_prrte_external_happy opal_prrte_CPPFLAGS_save])
254244
@@ -276,18 +266,21 @@ AC_DEFUN([_OMPI_SETUP_PRRTE_EXTERNAL], [
276266
277267
CPPFLAGS="$opal_prrte_CPPFLAGS_save"
278268
269+
# If an external build and the user told us where to find PRRTE,
270+
# find prterun and save that path.
271+
prterun_path=
279272
AS_IF([test "$setup_prrte_external_happy" = "yes"],
280-
[AS_IF([test -n "$with_prrte"],
281-
[PRTE_PATH="${with_prrte}/bin/prte"
282-
AS_IF([test ! -r ${PRTE_PATH}],
283-
[AC_MSG_ERROR([Could not find prte binary at $PRTE_PATH])],
284-
[PRTE_PATH="${with_prrte}/bin"])],
285-
[PRTE_PATH=""
286-
OPAL_WHICH([prte], [PRTE_PATH])
287-
AS_IF([tets -z "$PRTE_PATH"],
288-
[AC_MSG_WARN([Could not find prte in PATH])
289-
setup_prrte_external_happy=no],
290-
[PRTE_PATH="`echo $PRTE_PATH | sed -e 's/\/prte//'`"])])])
273+
[AS_IF([test "${with_prrte_bindir}" = "yes" -o "${with_prrte_bindir}" = "no"],
274+
[AC_MSG_ERROR(["yes" and "no" are not valid arguments for --with-prrte-bindir])])
275+
AS_IF([test -z "${with_prrte_bindir}" -a -n "${with_prrte}"],
276+
[with_prrte_bindir="${with_prrte}/bin"])
277+
AS_IF([test -n "${with_prrte_bindir}"],
278+
[AS_IF([test -x ${with_prrte_bindir}/prterun],
279+
[prterun_path="${with_prrte_bindir}/prterun"],
280+
[AC_MSG_ERROR([Could not find executable prterun: ${with_prrte_bindir}/prterun])])])])
281+
AS_IF([test -n "${prterun_path}"],
282+
[AC_DEFINE_UNQUOTED([OMPI_PRTERUN_PATH], ["${prterun_path}"], [Path to prterun])])
283+
291284
AS_IF([test "$setup_prrte_external_happy" = "yes"],
292285
[$1], [$2])
293286

ompi/tools/mpirun/Makefile.am

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#
22
# Copyright (c) 2019-2020 Intel, Inc. All rights reserved.
33
# Copyright (c) 2020 IBM Corporation. All rights reserved.
4-
# Copyright (c) 2021 Amazon.com, Inc. or its affiliates.
5-
# All Rights reserved.
4+
# Copyright (c) 2021-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
65
# Copyright (c) 2021 Nanook Consulting. All rights reserved.
76
# $COPYRIGHT$
87
#
@@ -15,6 +14,8 @@ if OMPI_WANT_PRRTE
1514

1615
bin_PROGRAMS = mpirun
1716

17+
dist_ompidata_DATA = help-mpirun.txt
18+
1819
mpirun_SOURCES = \
1920
main.c
2021

ompi/tools/mpirun/help-mpirun.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# -*- text -*-
2+
#
3+
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
4+
# $COPYRIGHT$
5+
#
6+
# Additional copyrights may follow
7+
#
8+
# $HEADER$
9+
#
10+
# This is the US/English help file for Open MPI wrapper compiler error
11+
# messages.
12+
#
13+
[no-prterun-found]
14+
Open MPI's mpirun command was unable to find an underlying prterun
15+
command to execute. Consider setting the OMPI_PRTERUN environment
16+
variable to help mpirun find the correct underlying prterun.
17+
[prterun-exec-failed]
18+
Open MPI's mpirun command could not execute the underlying prterun
19+
command. The prterun command we tried to execute and the error
20+
message from exec() are below:
21+
22+
Command: %s
23+
Error Message: %s

ompi/tools/mpirun/main.c

Lines changed: 83 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -14,100 +14,121 @@
1414
#if HAVE_SYS_STAT_H
1515
# include <sys/stat.h>
1616
#endif /* HAVE_SYS_STAT_H */
17+
#include <string.h>
1718

1819
#include "opal/mca/base/base.h"
1920
#include "opal/mca/installdirs/base/base.h"
21+
#include "opal/runtime/opal.h"
2022
#include "opal/util/argv.h"
2123
#include "opal/util/basename.h"
2224
#include "opal/util/opal_environ.h"
2325
#include "opal/util/os_dirpath.h"
2426
#include "opal/util/os_path.h"
2527
#include "opal/util/path.h"
2628
#include "opal/util/printf.h"
29+
#include "opal/util/show_help.h"
30+
#include "ompi/constants.h"
2731

28-
int main(int argc, char *argv[])
32+
static char *find_prterun(void)
2933
{
30-
char *evar;
31-
char **pargs = NULL;
32-
char *pfx = NULL;
33-
int m, param_len;
34-
char *truepath = NULL;
34+
char *filename = NULL;
35+
char *prrte_prefix = NULL;
3536

36-
if (NULL != (evar = getenv("OPAL_PREFIX"))) {
37+
/* 1) Did the user tell us exactly where to find prterun? */
38+
filename = getenv("OMPI_PRTERUN");
39+
if (NULL != filename) {
40+
return filename;
41+
}
3742

3843
#if OMPI_USING_INTERNAL_PRRTE
39-
setenv("PRTE_PREFIX", evar, 1);
40-
#endif
44+
/* 2) If using internal PRRTE, use our bindir. Note that this
45+
* will obey OPAL_PREFIX and OPAL_DESTDIR */
46+
opal_asprintf(&filename, "%s%sprterun", opal_install_dirs.bindir, OPAL_PATH_SEP);
47+
return filename;
48+
#else
4149

42-
#if OPAL_USING_INTERNAL_PMIX
43-
setenv("PMIX_PREFIX", evar, 1);
50+
/* 3) Look in ${PRTE_PREFIX}/bin */
51+
prrte_prefix = getenv("PRTE_PREFIX");
52+
if (NULL != prrte_prefix) {
53+
opal_asprintf(&filename, "%s%sbin%sprterun", prrte_prefix, OPAL_PATH_SEP, OPAL_PATH_SEP);
54+
return filename;
55+
}
56+
57+
/* 4) See if configure told us where to look, if set */
58+
#if defined(OMPI_PRTERUN_PATH)
59+
return strdup(OMPI_PRTERUN_PATH);
60+
#else
61+
62+
/* 5) Use path search */
63+
filename = opal_find_absolute_path("prterun");
64+
65+
return filename;
66+
#endif
4467
#endif
68+
}
69+
70+
71+
int main(int argc, char *argv[])
72+
{
73+
char *opal_prefix = getenv("OPAL_PREFIX");
74+
char *full_prterun_path = NULL;
75+
char **prterun_args = NULL;
76+
int ret;
77+
size_t i;
78+
79+
ret = opal_init_util(&argc, &argv);
80+
if (OMPI_SUCCESS != ret) {
81+
fprintf(stderr, "Failed initializing opal: %d\n", ret);
82+
exit(1);
4583
}
84+
85+
/* note that we just modify our environment rather than create a
86+
* child environment because it is easier and we're not going to
87+
* be around long enough for it to matter (since we exec prterun
88+
* asap */
4689
setenv("PRTE_MCA_schizo_proxy", "ompi", 1);
4790
setenv("OMPI_VERSION", OMPI_VERSION, 1);
4891
char *base_tool_name = opal_basename(argv[0]);
4992
setenv("OMPI_TOOL_NAME", base_tool_name, 1);
5093
free(base_tool_name);
5194

95+
/* TODO: look for --prefix and compare with OPAL_PREFIX and pick
96+
* one */
5297

53-
opal_argv_append_nosize(&pargs, "prterun");
54-
for (m=1; NULL != argv[m]; m++) {
55-
opal_argv_append_nosize(&pargs, argv[m]);
56-
/* Did the user specify a prefix, or want prefix by default? */
57-
if (0 == strcmp(argv[m], "--prefix")) {
58-
opal_asprintf(&pfx, "%s%s", argv[m+1], "/bin");
59-
}
60-
}
61-
62-
if (NULL != pfx) {
63-
/* "Parse" the param, aka remove superfluous path_sep. */
64-
param_len = strlen(pfx);
65-
while (0 == strcmp(OPAL_PATH_SEP, &(pfx[param_len - 1]))) {
66-
pfx[param_len - 1] = '\0';
67-
param_len--;
68-
if (0 == param_len) {
69-
fprintf(stderr, "A prefix was supplied to mpirun that only contained slashes.\n"
70-
"This is a fatal error; mpirun will now abort.\nNo processes were launched.\n");
71-
exit(1);
72-
}
73-
}
74-
} else if (opal_path_is_absolute(argv[0])) {
75-
/* Check if called with fully-qualified path to mpirun.
76-
* (Note: Put this second so can override with --prefix (above). */
77-
pfx = opal_dirname(argv[0]);
98+
/* as a special case, if OPAL_PREFIX was set and either PRRTE or
99+
* PMIx are internal builds, set their prefix variables as well */
100+
if (NULL != opal_prefix) {
78101
#if OMPI_USING_INTERNAL_PRRTE
79-
} else {
80-
/* in case --enable-prefix-by-default was given */
81-
mca_base_framework_open(&opal_installdirs_base_framework, 0); // fill in the installdirs
82-
if (NULL != opal_install_dirs.bindir) {
83-
pfx = strdup(opal_install_dirs.bindir);
84-
}
102+
setenv("PRTE_PREFIX", opal_prefix, 1);
85103
#endif
86-
}
87-
88-
if (NULL == pfx) {
89-
truepath = opal_path_findv("prterun", X_OK, environ, NULL);
90-
#if !OMPI_USING_INTERNAL_PRRTE
91-
// if OMPI_PRTERUN_PATH is available, try that
92-
// for external builds if the user didn't explictly
93-
// add a prefix and it isn't in the users path.
94-
if((NULL == truepath) && (0 != strlen(OMPI_PRTERUN_PATH))) {
95-
truepath = opal_os_path(0, OMPI_PRTERUN_PATH, "prterun", NULL);
96-
}
104+
#if OPAL_USING_INTERNAL_PMIX
105+
setenv("PMIX_PREFIX", opal_prefix, 1);
97106
#endif
98-
} else {
99-
truepath = opal_os_path(0, pfx, "prterun", NULL);
100-
free(pfx);
101107
}
102108

103-
if (NULL == truepath) {
104-
fprintf(stderr, "prterun executable could not be found - unable to run\n");
109+
full_prterun_path = find_prterun();
110+
if (NULL == full_prterun_path) {
111+
opal_show_help("help-mpirun.txt", "no-prterun-found", 1);
105112
exit(1);
106113
}
107114

108-
execve(truepath, pargs, environ);
109-
fprintf(stderr, "The mpirun (\"%s\") cmd failed to exec its actual executable - your application will NOT execute. Error: %s\n",
110-
truepath ? truepath : "NULL", strerror(errno));
115+
/* calling mpirun (and now prterun) with a full path has a special
116+
* meaning in terms of -prefix behavior, so copy that behavior
117+
* into prterun */
118+
if (opal_path_is_absolute(argv[0])) {
119+
opal_argv_append_nosize(&prterun_args, full_prterun_path);
120+
} else {
121+
opal_argv_append_nosize(&prterun_args, "prterun");
122+
}
123+
124+
/* Copy all the mpirun arguments to prterun.
125+
* TODO: Need to handle --prefix rationally here. */
126+
for (i = 1; NULL != argv[i]; i++) {
127+
opal_argv_append_nosize(&prterun_args, argv[i]);
128+
}
129+
ret = execv(full_prterun_path, prterun_args);
130+
opal_show_help("help-mpirun.txt", "prterun-exec-failed",
131+
1, full_prterun_path, strerror(errno));
111132
exit(1);
112133
}
113134

@@ -125,6 +146,7 @@ int main(int argc, char *argv[])
125146
* Copyright (c) 2017-2020 Intel, Inc. All rights reserved.
126147
* Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
127148
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
149+
* Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
128150
* $COPYRIGHT$
129151
*
130152
* Additional copyrights may follow

0 commit comments

Comments
 (0)