
Commit 2f7f1fe

Fix confusion between cpuset and locality
Ensure we correctly collect and save the cpuset of the process separately from its locality string. Ensure we use the correct one when computing things like relative locality between processes.

Signed-off-by: Ralph Castain <rhc@pmix.org>
1 parent 57ccb83 commit 2f7f1fe
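
For readers skimming the diff: the two strings this commit separates play different roles. The cpuset (PMIX_CPUSET) is the string form of the binding bitmap of one process; the locality string (PMIX_LOCALITY_STRING) summarizes which topology objects the process occupies, and only locality strings are meaningful to compare across processes. A minimal sketch of the intended usage, assuming the usual OPAL hwloc header for the prototype and flags (the include path and the fallback flag are assumptions, not part of this commit):

    #include "opal/util/proc.h"            /* opal_process_info (field added below) */
    #include "opal/mca/hwloc/base/base.h"  /* assumed location of the prototype and flags */

    /* Sketch only, not part of the commit: compute this process's relative
     * locality to a peer, given the peer's PMIx locality string. */
    static uint16_t relative_locality_to_peer(char *peer_locality)
    {
        if (NULL == opal_process_info.locality || NULL == peer_locality) {
            /* all we can say is that the peer shares our node */
            return OPAL_PROC_ON_NODE;  /* assumed fallback flag, mirroring the hunks below */
        }
        /* compare locality strings, never the cpuset, to get the bit mask of
         * shared topology levels (node, package/socket, caches, ...) */
        return opal_hwloc_compute_relative_locality(opal_process_info.locality,
                                                    peer_locality);
    }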

5 files changed: 18 additions & 23 deletions
ompi/dpm/dpm.c

Lines changed: 3 additions & 16 deletions
@@ -355,7 +355,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
     if (0 < opal_list_get_size(&ilist)) {
         uint32_t *peer_ranks = NULL;
         int prn, nprn = 0;
-        char *val, *mycpuset;
+        char *val;
         uint16_t u16;
         opal_process_name_t wildcard_rank;
         /* convert the list of new procs to a proc_t array */
@@ -380,16 +380,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
             opal_argv_free(peers);
         }
 
-        /* get my locality string */
-        val = NULL;
-        OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
-                                       OMPI_PROC_MY_NAME, &val, PMIX_STRING);
-        if (OPAL_SUCCESS == rc && NULL != val) {
-            mycpuset = val;
-        } else {
-            mycpuset = NULL;
-        }
-
         i = 0;
         OPAL_LIST_FOREACH(cd, &ilist, ompi_dpm_proct_caddy_t) {
             proc = cd->p;
@@ -406,8 +396,8 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
             val = NULL;
             OPAL_MODEX_RECV_VALUE_IMMEDIATE(rc, PMIX_LOCALITY_STRING,
                                             &proc->super.proc_name, &val, OPAL_STRING);
-            if (OPAL_SUCCESS == rc && NULL != val) {
-                u16 = opal_hwloc_compute_relative_locality(mycpuset, val);
+            if (OPAL_SUCCESS == rc && NULL != ompi_process_info.locality) {
+                u16 = opal_hwloc_compute_relative_locality(ompi_process_info.locality, val);
                 free(val);
             } else {
                 /* all we can say is that it shares our node */
@@ -425,9 +415,6 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
             }
             ++i;
         }
-        if (NULL != mycpuset) {
-            free(mycpuset);
-        }
         if (NULL != peer_ranks) {
             free(peer_ranks);
         }

ompi/runtime/ompi_rte.c

Lines changed: 11 additions & 5 deletions
@@ -764,7 +764,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
 
     /* identify our location */
     val = NULL;
-    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_CPUSET,
                                    &opal_process_info.my_name, &val, PMIX_STRING);
     if (PMIX_SUCCESS == rc && NULL != val) {
         opal_process_info.cpuset = val;
@@ -774,6 +774,15 @@ int ompi_rte_init(int *pargc, char ***pargv)
         opal_process_info.cpuset = NULL;
         opal_process_info.proc_is_bound = false;
     }
+    val = NULL;
+    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
+                                   &opal_process_info.my_name, &val, PMIX_STRING);
+    if (PMIX_SUCCESS == rc && NULL != val) {
+        opal_process_info.locality = val;
+        val = NULL;  // protect the string
+    } else {
+        opal_process_info.locality = NULL;
+    }
 
     /* retrieve the local peers - defaults to local node */
     val = NULL;
@@ -811,7 +820,7 @@ int ompi_rte_init(int *pargc, char ***pargv)
             OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
                                            &pname, &val, PMIX_STRING);
             if (PMIX_SUCCESS == rc && NULL != val) {
-                u16 = opal_hwloc_compute_relative_locality(opal_process_info.cpuset, val);
+                u16 = opal_hwloc_compute_relative_locality(opal_process_info.locality, val);
                 free(val);
             } else {
                 /* all we can say is that it shares our node */
@@ -826,9 +835,6 @@ int ompi_rte_init(int *pargc, char ***pargv)
                 ret = opal_pmix_convert_status(rc);
                 error = "local store of locality";
                 opal_argv_free(peers);
-                if (NULL != opal_process_info.cpuset) {
-                    free(opal_process_info.cpuset);
-                }
                 goto error;
             }
         }
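
One detail in the new block above deserves a note: after handing val to opal_process_info.locality, the code clears val ("protect the string"). A minimal sketch of that handoff pattern, assuming later code in ompi_rte_init() reuses and frees the same scratch pointer:

    /* Sketch of the handoff pattern used above, not additional committed code. */
    char *val = NULL;
    OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCALITY_STRING,
                                   &opal_process_info.my_name, &val, PMIX_STRING);
    if (PMIX_SUCCESS == rc && NULL != val) {
        opal_process_info.locality = val;  /* ownership moves to the global struct */
        val = NULL;                        /* so a later free(val) cannot release it */
    } else {
        opal_process_info.locality = NULL;
    }
    /* subsequent queries can now safely reuse and free val without touching
     * the stored locality string */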

opal/mca/common/ofi/common_ofi.c

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015      Intel, Inc. All rights reserved.
+ * Copyright (c) 2015-2020 Intel, Inc. All rights reserved.
  * Copyright (c) 2017      Los Alamos National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2020      Triad National Security, LLC. All rights
@@ -345,7 +345,7 @@ static uint32_t get_package_rank(opal_process_info_t *process_info)
     }
 
     // compute relative locality
-    relative_locality = opal_hwloc_compute_relative_locality(process_info->cpuset, locality_string);
+    relative_locality = opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
     free(locality_string);
 
     if (relative_locality & OPAL_PROC_ON_SOCKET) {
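
For context, the value feeding the branch above is a bit mask of shared topology levels, so package-level sharing is a simple bit test. A hedged sketch of how get_package_rank()-style code consumes it; the counter is illustrative, not the actual implementation:

    /* Sketch (illustrative only): consuming the relative-locality bit mask. */
    uint16_t relative_locality;
    uint32_t peers_on_my_package = 0;   /* hypothetical counter */

    relative_locality =
        opal_hwloc_compute_relative_locality(process_info->locality, locality_string);
    free(locality_string);

    if (relative_locality & OPAL_PROC_ON_SOCKET) {
        peers_on_my_package++;          /* peer shares our package/socket */
    }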

opal/util/proc.c

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ opal_process_info_t opal_process_info = {
     .my_local_rank = 0, /* I'm the only process around here */
     .my_node_rank = 0,
     .cpuset = NULL,
+    .locality = NULL,
     .pid = 0,
     .num_procs = 0,
     .app_num = 0,

opal/util/proc.h

Lines changed: 1 addition & 0 deletions
@@ -115,6 +115,7 @@ typedef struct opal_process_info_t {
     uint16_t my_local_rank;     /**< local rank on this node within my job */
     uint16_t my_node_rank;
     char *cpuset;               /**< String-representation of bitmap where we are bound */
+    char *locality;             /**< String-representation of process locality */
     pid_t pid;
     uint32_t num_procs;
     uint32_t app_num;
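
A short recap of the division of labor between the two fields, paraphrasing the hunks above rather than adding new behavior:

    /* cpuset   - filled from PMIX_CPUSET in ompi_rte_init(); the string form of
     *            the binding bitmap, and the query whose failure clears
     *            proc_is_bound.
     * locality - filled from PMIX_LOCALITY_STRING in ompi_rte_init(); the only
     *            field that should be passed to
     *            opal_hwloc_compute_relative_locality() when comparing procs. */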
