Skip to content

Commit f5a6b7f

Browse files
author
Ralph Castain
committed
Fix -H operations for multi-app case
Correctly aggregate slots across -H entries from each app. Take into account any -H entry when computing nprocs when no value was given. Signed-off-by: Ralph Castain <rhc@open-mpi.org>
1 parent 5f767e1 commit f5a6b7f

File tree

4 files changed

+46
-35
lines changed

4 files changed

+46
-35
lines changed

orte/mca/rmaps/base/rmaps_base_map_job.c

Lines changed: 1 addition & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
4949
orte_job_t *jdata;
5050
orte_node_t *node;
5151
int rc, i, ppx = 0;
52-
bool did_map, given, pernode = false, persocket = false;
52+
bool did_map, pernode = false, persocket = false;
5353
orte_rmaps_base_selected_module_t *mod;
5454
orte_job_t *parent;
5555
orte_vpid_t nprocs;
@@ -105,7 +105,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
105105
orte_std_cntr_t slots;
106106
OBJ_CONSTRUCT(&nodes, opal_list_t);
107107
orte_rmaps_base_get_target_nodes(&nodes, &slots, app, ORTE_MAPPING_BYNODE, true, true);
108-
slots = 0;
109108
if (pernode) {
110109
slots = ppx * opal_list_get_size(&nodes);
111110
} else if (persocket) {
@@ -115,34 +114,6 @@ void orte_rmaps_base_map_job(int fd, short args, void *cbdata)
115114
HWLOC_OBJ_SOCKET, 0,
116115
OPAL_HWLOC_AVAILABLE);
117116
}
118-
} else {
119-
/* if we are in a managed allocation, then all is good - otherwise,
120-
* we have to do a little more checking */
121-
if (!orte_managed_allocation) {
122-
/* if all the nodes have their slots given, then we are okay */
123-
given = true;
124-
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
125-
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
126-
given = false;
127-
break;
128-
}
129-
}
130-
/* if -host or -hostfile was given, and the slots were not,
131-
* then this is no longer allowed */
132-
if (!given &&
133-
(orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, NULL, OPAL_STRING) ||
134-
orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, NULL, OPAL_STRING))) {
135-
/* inform the user of the error */
136-
orte_show_help("help-orte-rmaps-base.txt", "num-procs-not-specified", true);
137-
OPAL_LIST_DESTRUCT(&nodes);
138-
OBJ_RELEASE(caddy);
139-
ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP_FAILED);
140-
return;
141-
}
142-
OPAL_LIST_FOREACH(node, &nodes, orte_node_t) {
143-
slots += node->slots;
144-
}
145-
}
146117
}
147118
app->num_procs = slots;
148119
OPAL_LIST_DESTRUCT(&nodes);

orte/mca/rmaps/base/rmaps_base_support_fns.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -469,12 +469,19 @@ int orte_rmaps_base_get_target_nodes(opal_list_t *allocated_nodes, orte_std_cntr
469469
continue;
470470
}
471471
if (node->slots > node->slots_inuse) {
472+
orte_std_cntr_t s;
473+
/* check for any -host allocations */
474+
if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
475+
s = orte_util_dash_host_compute_slots(node, hosts);
476+
} else {
477+
s = node->slots - node->slots_inuse;
478+
}
472479
/* add the available slots */
473480
OPAL_OUTPUT_VERBOSE((5, orte_rmaps_base_framework.framework_output,
474481
"%s node %s has %d slots available",
475482
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
476-
node->name, node->slots - node->slots_inuse));
477-
num_slots += node->slots - node->slots_inuse;
483+
node->name, s));
484+
num_slots += s;
478485
continue;
479486
}
480487
if (!(ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(policy))) {

orte/util/dash_host/dash_host.c

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
13-
* Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
13+
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
1414
* Copyright (c) 2015 Research Organization for Information Science
1515
* and Technology (RIST). All rights reserved.
1616
* Copyright (c) 2016 IBM Corporation. All rights reserved.
@@ -42,6 +42,36 @@
4242

4343
#include "dash_host.h"
4444

45+
/**
 * Count the slots that a -host specification assigns to a given node.
 *
 * The comma-separated list in @p hosts is scanned for entries that name
 * @p node. Each matching entry contributes:
 *   - "name"        -> 1 slot
 *   - "name:N"      -> N slots (non-positive or unparsable values add nothing)
 *   - "name:*" or
 *     "name:auto"   -> the node's currently unused slots
 *                      (node->slots - node->slots_inuse)
 * Contributions from repeated entries for the same node are summed.
 *
 * @param node   node whose name is looked up in the list
 * @param hosts  comma-separated -host string; it is not modified
 *
 * @return total number of slots assigned to @p node, or 0 if the node
 *         does not appear in the list or the inputs are NULL/empty
 */
int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts)
{
    char **specs, *cptr;
    size_t namelen;
    char boundary;
    long given;
    int slots = 0;
    int n;

    if (NULL == node || NULL == node->name || NULL == hosts) {
        return 0;
    }

    specs = opal_argv_split(hosts, ',');
    if (NULL == specs) {
        /* empty host list - nothing was assigned to this node */
        return 0;
    }

    /* the node name does not change while we scan the list */
    namelen = strlen(node->name);

    /* see if this node appears in the list */
    for (n=0; NULL != specs[n]; n++) {
        if (0 != strncmp(node->name, specs[n], namelen)) {
            continue;
        }
        /* a bare prefix match is not enough: "node1" must not match
         * "node10". Accept the entry only if the node name is followed
         * by the end of the entry, a slot suffix, or a domain part */
        boundary = specs[n][namelen];
        if ('\0' != boundary && ':' != boundary && '.' != boundary) {
            continue;
        }
        /* check if the #slots was specified */
        if (NULL != (cptr = strchr(specs[n], ':'))) {
            *cptr = '\0';
            ++cptr;
            if ('*' == *cptr || 0 == strcmp(cptr, "auto")) {
                slots += node->slots - node->slots_inuse;
            } else {
                /* ignore negative counts so that a malformed entry
                 * cannot reduce the total */
                given = strtol(cptr, NULL, 10);
                if (0 < given) {
                    slots += (int)given;
                }
            }
        } else {
            ++slots;
        }
    }
    opal_argv_free(specs);
    return slots;
}
74+
4575
/* we can only enter this routine if no other allocation
4676
* was found, so we only need to know that finding any
4777
* relative node syntax should generate an immediate error
@@ -289,7 +319,7 @@ int orte_util_add_dash_host_nodes(opal_list_t *nodes,
289319
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->name));
290320
if (ORTE_FLAG_TEST(nd, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
291321
/* transfer across the number of slots */
292-
node->slots = nd->slots;
322+
node->slots += nd->slots;
293323
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_SLOTS_GIVEN);
294324
}
295325
break;

orte/util/dash_host/dash_host.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
12+
* Copyright (c) 2014-2018 Intel, Inc. All rights reserved.
1313
* $COPYRIGHT$
1414
*
1515
* Additional copyrights may follow
@@ -27,6 +27,7 @@
2727

2828
#include "opal/class/opal_list.h"
2929

30+
#include "orte/runtime/orte_globals.h"
3031

3132
BEGIN_C_DECLS
3233

@@ -41,6 +42,8 @@ ORTE_DECLSPEC int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
4142
ORTE_DECLSPEC int orte_util_get_ordered_dash_host_list(opal_list_t *nodes,
4243
char *hosts);
4344

45+
/**
 * Compute how many slots a -host specification assigns to a node.
 *
 * @p hosts is a comma-separated list of entries, each optionally carrying
 * a ":<slots>" suffix. For every entry that matches the name of @p node:
 * an entry without a suffix counts as one slot, a numeric suffix counts as
 * that many slots, and a suffix of "*" or "auto" counts as the node's
 * slots minus its slots in use. The contributions are summed.
 *
 * @param node   node to look up in the list
 * @param hosts  comma-separated -host string
 * @return the summed slot count for @p node (0 if no entry matches)
 */
ORTE_DECLSPEC int orte_util_dash_host_compute_slots(orte_node_t *node, char *hosts);
46+
4447
END_C_DECLS
4548

4649
#endif

0 commit comments

Comments
 (0)