Skip to content

Commit c74b711

Browse files
authored
Merge pull request #11036 from hppritcha/singleton_noise_reduction_under_slurm
singleton: reduce chattiness under slurm
2 parents b4c9286 + 6a787fb commit c74b711

File tree

2 files changed

+28
-14
lines changed

2 files changed

+28
-14
lines changed

ompi/runtime/help-mpi-runtime.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -120,5 +120,5 @@ that either you are operating in a PMIx-enabled environment, or use "mpirun"
120120
to execute the job.
121121
#
122122
[no-pmix-but]
123-
No PMIx server was reachable, but a PMI1/2 or SLURM environment was detected.
124-
Open MPI will start %d singletons
123+
No PMIx server was reachable, but a PMI1/2 environment was detected.
124+
If srun is being used to launch the application, %d singletons will be started.

ompi/runtime/ompi_rte.c

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,9 @@ int ompi_rte_init(int *pargc, char ***pargv)
541541
pmix_status_t rc;
542542
char **tmp;
543543
bool singleton = false;
544+
const static char *pmi_sentinels[] = {"PMI_FD", /* SLURM PMI1,2 */
545+
"PMI_CONTROL_PORT", /* Cray Shasta */
546+
NULL};
544547

545548
u32ptr = &u32;
546549
u16ptr = &u16;
@@ -569,21 +572,32 @@ int ompi_rte_init(int *pargc, char ***pargv)
569572
/* if we get PMIX_ERR_UNREACH indicating that we cannot reach the
570573
* server, then we assume we are operating as a singleton */
571574
if (PMIX_ERR_UNREACH == ret) {
572-
/* if we are in a PMI or SLURM environment with two tasks or more,
575+
bool found_a_pmi = false;
576+
int n = 0;
577+
/* if we are in a PMI environment with two tasks or more,
573578
* we probably do not want to start singletons */
574-
char *size_str = getenv("PMI_SIZE");
575-
if (NULL == size_str) {
576-
size_str = getenv("SLURM_NPROCS");
579+
while (pmi_sentinels[n] != NULL) {
580+
if (NULL != getenv(pmi_sentinels[n])) {
581+
found_a_pmi = true;
582+
break;
583+
}
584+
n++;
577585
}
578-
int size = (NULL != size_str)?atoi(size_str):1;
579-
if (1 < size) {
580-
char *rank_str = getenv("PMI_RANK");
581-
if (NULL == rank_str) {
582-
rank_str = getenv("SLURM_PROCID");
586+
if (found_a_pmi) {
587+
char *size_str = getenv("PMI_SIZE");
588+
if (NULL == size_str) {
589+
size_str = getenv("SLURM_NPROCS");
583590
}
584-
int rank = (NULL != rank_str)?atoi(rank_str):0;
585-
if (0 == rank) {
586-
opal_show_help("help-mpi-runtime.txt", "no-pmix-but", false, size);
591+
int size = (NULL != size_str)?atoi(size_str):1;
592+
if (1 < size) {
593+
char *rank_str = getenv("PMI_RANK");
594+
if (NULL == rank_str) {
595+
rank_str = getenv("SLURM_PROCID");
596+
}
597+
int rank = (NULL != rank_str)?atoi(rank_str):0;
598+
if (0 == rank) {
599+
opal_show_help("help-mpi-runtime.txt", "no-pmix-but", false, size);
600+
}
587601
}
588602
}
589603
singleton = true;

0 commit comments

Comments
 (0)