Skip to content

Commit 06baa51

Browse files
committed
rte/pmix: fill in opal_process_info when using prrte/pmix
This commit fixes a bug where, when launching with prun, the opal_process_info fields used by the BTLs were not populated. Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent 804a517 commit 06baa51

File tree

2 files changed

+27
-9
lines changed

2 files changed

+27
-9
lines changed

ompi/mca/rte/pmix/rte_pmix.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
34
* All rights reserved.
@@ -7,6 +8,8 @@
78
* and Technology (RIST). All rights reserved.
89
* Copyright (c) 2015 Intel, Inc. All rights reserved.
910
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
11+
* Copyright (c) 2018 Triad National Security, LLC. All rights
12+
* reserved.
1013
* $COPYRIGHT$
1114
*
1215
* Additional copyrights may follow
@@ -115,6 +118,7 @@ typedef struct {
115118
int32_t num_local_peers;
116119
uint32_t num_procs;
117120
uint32_t app_num;
121+
char *cpuset;
118122
} pmix_process_info_t;
119123
OMPI_DECLSPEC extern pmix_process_info_t pmix_process_info;
120124
#define ompi_process_info pmix_process_info

ompi/mca/rte/pmix/rte_pmix_module.c

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2012-2013 Los Alamos National Security, LLC.
34
* All rights reserved.
@@ -7,6 +8,8 @@
78
* reserved.
89
* Copyright (c) 2014-2018 Cisco Systems, Inc. All rights reserved
910
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
11+
* Copyright (c) 2018 Triad National Security, LLC. All rights
12+
* reserved.
1013
* $COPYRIGHT$
1114
*/
1215
#include "ompi_config.h"
@@ -500,7 +503,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
500503
opal_proc_t *myproc;
501504
int u32, *u32ptr;
502505
uint16_t u16, *u16ptr;
503-
char **peers=NULL, *mycpuset;
506+
char **peers=NULL;
504507
char *envar, *ev1, *ev2;
505508
opal_value_t *kv;
506509
char *val;
@@ -684,9 +687,9 @@ int ompi_rte_init(int *pargc, char ***pargv)
684687
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
685688
&pmix_process_info.my_name, &val, OPAL_STRING);
686689
if (OPAL_SUCCESS == ret && NULL != val) {
687-
mycpuset = val;
690+
pmix_process_info.cpuset = val;
688691
} else {
689-
mycpuset = NULL;
692+
pmix_process_info.cpuset = NULL;
690693
}
691694
pname.jobid = pmix_process_info.my_name.jobid;
692695
for (i=0; NULL != peers[i]; i++) {
@@ -699,7 +702,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
699702
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCALITY_STRING,
700703
&pname, &val, OPAL_STRING);
701704
if (OPAL_SUCCESS == ret && NULL != val) {
702-
u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
705+
u16 = opal_hwloc_compute_relative_locality(pmix_process_info.cpuset, val);
703706
free(val);
704707
} else {
705708
/* all we can say is that it shares our node */
@@ -718,24 +721,31 @@ int ompi_rte_init(int *pargc, char ***pargv)
718721
if (OPAL_SUCCESS != ret) {
719722
error = "local store of locality";
720723
opal_argv_free(peers);
721-
if (NULL != mycpuset) {
722-
free(mycpuset);
724+
if (NULL != pmix_process_info.cpuset) {
725+
free(pmix_process_info.cpuset);
723726
}
724727
goto error;
725728
}
726729
OBJ_RELEASE(kv);
727730
}
728731
opal_argv_free(peers);
729-
if (NULL != mycpuset) {
730-
free(mycpuset);
731-
}
732732
}
733733

734734
/* poor attempt to detect we are bound */
735735
if (NULL != getenv("SLURM_CPU_BIND_TYPE")) {
736736
pmix_proc_is_bound = true;
737737
}
738738

739+
/* set the remaining opal_process_info fields. Note that
740+
* the OPAL layer will have initialized these to NULL, and
741+
* anyone between us would not have strdup'd the string, so
742+
* we cannot free it here */
743+
opal_process_info.job_session_dir = pmix_process_info.job_session_dir;
744+
opal_process_info.proc_session_dir = pmix_process_info.proc_session_dir;
745+
opal_process_info.num_local_peers = (int32_t)pmix_process_info.num_local_peers;
746+
opal_process_info.my_local_rank = (int32_t)pmix_process_info.my_local_rank;
747+
opal_process_info.cpuset = pmix_process_info.cpuset;
748+
739749
/* push our hostname so others can find us, if they need to - the
740750
* native PMIx component will ignore this request as the hostname
741751
* is provided by the system */
@@ -807,6 +817,10 @@ int ompi_rte_finalize(void)
807817
false, check_file);
808818
free(pmix_process_info.job_session_dir);
809819
}
820+
821+
free (pmix_process_info.cpuset);
822+
pmix_process_info.cpuset = NULL;
823+
810824
return OMPI_SUCCESS;
811825
}
812826

0 commit comments

Comments
 (0)