Skip to content

Commit e534c57

Browse files
authored
Merge pull request #8472 from awlauria/fix_mpiext_segv
Fix segv in OMPI_Affinity_str().
2 parents 4573f90 + 17f72b1 commit e534c57

File tree

3 files changed

+52
-70
lines changed

3 files changed

+52
-70
lines changed

ompi/mpiext/affinity/c/mpiext_affinity_str.c

Lines changed: 20 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
* Copyright (c) 2015-2017 Research Organization for Information Science
1010
* and Technology (RIST). All rights reserved.
1111
* Copyright (c) 2015-2019 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1213
* $COPYRIGHT$
1314
*
1415
* Additional copyrights may follow
@@ -98,26 +99,24 @@ int OMPI_Affinity_str(ompi_affinity_fmt_t fmt_type,
9899
*/
99100
static int get_rsrc_ompi_bound(char str[OMPI_AFFINITY_STRING_MAX])
100101
{
101-
int ret;
102-
103102
/* If OMPI did not bind, indicate that */
104103
if (!ompi_rte_proc_is_bound) {
105104
opal_string_copy(str, ompi_nobind_str, OMPI_AFFINITY_STRING_MAX);
106105
return OMPI_SUCCESS;
107106
}
108107

109-
if (NULL == ompi_proc_applied_binding) {
110-
ret = OPAL_ERR_NOT_BOUND;
111-
} else {
112-
ret = opal_hwloc_base_cset2str(str, OMPI_AFFINITY_STRING_MAX,
113-
opal_hwloc_topology,
114-
ompi_proc_applied_binding);
108+
hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
109+
hwloc_bitmap_list_sscanf(cpuset, opal_process_info.cpuset);
110+
if(OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2str(str,
111+
OMPI_AFFINITY_STRING_MAX,
112+
opal_hwloc_topology,
113+
cpuset))
114+
{
115+
opal_string_copy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX);
115116
}
116-
if (OPAL_ERR_NOT_BOUND == ret) {
117-
opal_string_copy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX);
118-
ret = OMPI_SUCCESS;
119-
}
120-
return ret;
117+
hwloc_bitmap_free(cpuset);
118+
119+
return OMPI_SUCCESS;
121120
}
122121

123122

@@ -290,28 +289,23 @@ static int get_rsrc_exists(char str[OMPI_AFFINITY_STRING_MAX])
290289
*/
291290
static int get_layout_ompi_bound(char str[OMPI_AFFINITY_STRING_MAX])
292291
{
293-
int ret;
294-
295292
/* If OMPI did not bind, indicate that */
296293
if (!ompi_rte_proc_is_bound) {
297294
opal_string_copy(str, ompi_nobind_str, OMPI_AFFINITY_STRING_MAX);
298295
return OMPI_SUCCESS;
299296
}
300297

301-
/* Find out what OMPI bound us to and prettyprint it */
302-
if (NULL == ompi_proc_applied_binding) {
303-
ret = OPAL_ERR_NOT_BOUND;
304-
} else {
305-
ret = opal_hwloc_base_cset2mapstr(str, OMPI_AFFINITY_STRING_MAX,
298+
hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
299+
hwloc_bitmap_list_sscanf(cpuset, opal_process_info.cpuset);
300+
if(OPAL_ERR_NOT_BOUND == opal_hwloc_base_cset2mapstr(str,
301+
OMPI_AFFINITY_STRING_MAX,
306302
opal_hwloc_topology,
307-
ompi_proc_applied_binding);
308-
}
309-
if (OPAL_ERR_NOT_BOUND == ret) {
303+
cpuset))
304+
{
310305
opal_string_copy(str, not_bound_str, OMPI_AFFINITY_STRING_MAX);
311-
ret = OMPI_SUCCESS;
312306
}
313-
314-
return ret;
307+
hwloc_bitmap_free(cpuset);
308+
return OMPI_SUCCESS;
315309
}
316310

317311
/*

ompi/runtime/ompi_rte.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates. All Rights
1616
* reserved.
1717
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
18+
* Copyright (c) 2021 IBM Corporation. All rights reserved.
1819
* $COPYRIGHT$
1920
*/
2021
#include "ompi_config.h"
@@ -62,7 +63,6 @@
6263
/* storage to support OMPI */
6364
opal_process_name_t pmix_name_wildcard = {UINT32_MAX-1, UINT32_MAX-1};
6465
opal_process_name_t pmix_name_invalid = {UINT32_MAX, UINT32_MAX};
65-
hwloc_cpuset_t ompi_proc_applied_binding = NULL;
6666
bool ompi_singleton = false;
6767

6868
static int _setup_top_session_dir(char **sdir);

opal/mca/hwloc/base/hwloc_base_util.c

Lines changed: 31 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
* Copyright (C) 2018 Mellanox Technologies, Ltd.
2020
* All rights reserved.
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
22-
* Copyright (c) 2019 IBM Corporation. All rights reserved.
22+
* Copyright (c) 2019-2021 IBM Corporation. All rights reserved.
2323
* Copyright (c) 2019-2020 Inria. All rights reserved.
2424
* $COPYRIGHT$
2525
*
@@ -294,6 +294,18 @@ int opal_hwloc_base_get_topology(void)
294294
wildcard_rank.jobid = OPAL_PROC_MY_NAME.jobid;
295295
wildcard_rank.vpid = OPAL_VPID_WILDCARD;
296296

297+
// Did the user request a topology file via an MCA parameter?
298+
// If so, load it first, before trying the other topology discovery methods.
299+
if(NULL != opal_hwloc_base_topo_file) {
300+
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
301+
"hwloc:base loading topology from file %s",
302+
opal_hwloc_base_topo_file);
303+
if(OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file))) {
304+
return rc;
305+
}
306+
goto done;
307+
}
308+
297309
#if HWLOC_API_VERSION >= 0x20000
298310
opal_output_verbose(2, opal_hwloc_base_framework.framework_output,
299311
"hwloc:base: looking for topology in shared memory");
@@ -337,7 +349,7 @@ int opal_hwloc_base_get_topology(void)
337349
opal_output_verbose(2, opal_hwloc_base_framework.framework_output,
338350
"hwloc:base: topology in shared memory");
339351
topo_in_shmem = true;
340-
return OPAL_SUCCESS;
352+
goto done;
341353
}
342354
}
343355
#endif
@@ -394,7 +406,7 @@ int opal_hwloc_base_get_topology(void)
394406
hwloc_topology_destroy(opal_hwloc_topology);
395407
return rc;
396408
}
397-
} else if (NULL == opal_hwloc_base_topo_file) {
409+
} else {
398410
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
399411
"hwloc:base discovering topology");
400412
if (0 != hwloc_topology_init(&opal_hwloc_topology) ||
@@ -408,15 +420,10 @@ int opal_hwloc_base_get_topology(void)
408420
hwloc_topology_destroy(opal_hwloc_topology);
409421
return rc;
410422
}
411-
} else {
412-
opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
413-
"hwloc:base loading topology from file %s",
414-
opal_hwloc_base_topo_file);
415-
if (OPAL_SUCCESS != (rc = opal_hwloc_base_set_topology(opal_hwloc_base_topo_file))) {
416-
return rc;
417-
}
418423
}
419424

425+
done:
426+
420427
/* fill opal_cache_line_size global with the smallest L1 cache
421428
line size */
422429
fill_cache_line_size();
@@ -659,9 +666,11 @@ static hwloc_obj_t df_search(hwloc_topology_t topo,
659666
// available = opal_hwloc_base_get_available_cpus(topo, start)
660667
// and skipped objs that had hwloc_bitmap_iszero(available)
661668
hwloc_obj_t root;
662-
opal_hwloc_topo_data_t *rdata;
669+
opal_hwloc_topo_data_t *rdata = NULL;
663670
root = hwloc_get_root_obj(topo);
664-
rdata = (opal_hwloc_topo_data_t*)root->userdata;
671+
if(false == topo_in_shmem) {
672+
rdata = (opal_hwloc_topo_data_t*)root->userdata;
673+
}
665674
hwloc_cpuset_t constrained_cpuset;
666675

667676
constrained_cpuset = hwloc_bitmap_alloc();
@@ -696,7 +705,7 @@ unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
696705
unsigned int num_objs;
697706
hwloc_obj_t obj;
698707
opal_hwloc_summary_t *sum;
699-
opal_hwloc_topo_data_t *data;
708+
opal_hwloc_topo_data_t *data = NULL;
700709
int rc;
701710

702711
/* bozo check */
@@ -728,10 +737,17 @@ unsigned int opal_hwloc_base_get_nbobjs_by_type(hwloc_topology_t topo,
728737
obj = hwloc_get_root_obj(topo);
729738

730739
/* first see if the topology already has this summary */
731-
data = (opal_hwloc_topo_data_t*)obj->userdata;
740+
if(false == topo_in_shmem) {
741+
data = (opal_hwloc_topo_data_t*)obj->userdata;
742+
}
732743
if (NULL == data) {
733744
data = OBJ_NEW(opal_hwloc_topo_data_t);
734-
obj->userdata = (void*)data;
745+
if(false == topo_in_shmem) {
746+
// Can't touch userdata if in read-only shmem!
747+
// We have to protect here for the case where obj->userdata
748+
// is in shmem and it is NULL.
749+
obj->userdata = (void*) data;
750+
}
735751
} else {
736752
OPAL_LIST_FOREACH(sum, &data->summaries, opal_hwloc_summary_t) {
737753
if (target == sum->type &&
@@ -1167,8 +1183,6 @@ int opal_hwloc_base_cset2str(char *str, int len,
11671183
char tmp[BUFSIZ];
11681184
const int stmp = sizeof(tmp) - 1;
11691185
int **map=NULL;
1170-
hwloc_obj_t root;
1171-
opal_hwloc_topo_data_t *sum;
11721186

11731187
str[0] = tmp[stmp] = '\0';
11741188

@@ -1177,18 +1191,6 @@ int opal_hwloc_base_cset2str(char *str, int len,
11771191
return OPAL_ERR_NOT_BOUND;
11781192
}
11791193

1180-
/* if the cpuset includes all available cpus, then we are unbound */
1181-
root = hwloc_get_root_obj(topo);
1182-
if (NULL != root->userdata) {
1183-
sum = (opal_hwloc_topo_data_t*)root->userdata;
1184-
if (NULL == sum->available) {
1185-
return OPAL_ERROR;
1186-
}
1187-
if (0 != hwloc_bitmap_isincluded(sum->available, cpuset)) {
1188-
return OPAL_ERR_NOT_BOUND;
1189-
}
1190-
}
1191-
11921194
if (OPAL_SUCCESS != (ret = build_map(&num_sockets, &num_cores, cpuset, &map, topo))) {
11931195
return ret;
11941196
}
@@ -1235,8 +1237,6 @@ int opal_hwloc_base_cset2mapstr(char *str, int len,
12351237
int core_index, pu_index;
12361238
const int stmp = sizeof(tmp) - 1;
12371239
hwloc_obj_t socket, core, pu;
1238-
hwloc_obj_t root;
1239-
opal_hwloc_topo_data_t *sum;
12401240

12411241
str[0] = tmp[stmp] = '\0';
12421242

@@ -1245,18 +1245,6 @@ int opal_hwloc_base_cset2mapstr(char *str, int len,
12451245
return OPAL_ERR_NOT_BOUND;
12461246
}
12471247

1248-
/* if the cpuset includes all available cpus, then we are unbound */
1249-
root = hwloc_get_root_obj(topo);
1250-
if (NULL != root->userdata) {
1251-
sum = (opal_hwloc_topo_data_t*)root->userdata;
1252-
if (NULL == sum->available) {
1253-
return OPAL_ERROR;
1254-
}
1255-
if (0 != hwloc_bitmap_isincluded(sum->available, cpuset)) {
1256-
return OPAL_ERR_NOT_BOUND;
1257-
}
1258-
}
1259-
12601248
/* Iterate over all existing sockets */
12611249
for (socket = hwloc_get_obj_by_type(topo, HWLOC_OBJ_SOCKET, 0);
12621250
NULL != socket;

0 commit comments

Comments
 (0)