Skip to content

Commit 2794ae4

Browse files
committed
Update nidmap
Signed-off-by: Ralph Castain <rhc@pmix.org>
1 parent 35a5971 commit 2794ae4

File tree

1 file changed

+95
-48
lines changed

1 file changed

+95
-48
lines changed

orte/util/nidmap.c

Lines changed: 95 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -385,6 +385,7 @@ int orte_util_decode_nidmap(opal_buffer_t *buf)
385385
/* add this name to the pool */
386386
nd = OBJ_NEW(orte_node_t);
387387
nd->name = strdup(names[n]);
388+
nd->index = n;
388389
opal_pointer_array_set_item(orte_node_pool, n, nd);
389390
/* set the topology - always default to homogeneous
390391
* as that is the most common scenario */
@@ -409,7 +410,6 @@ int orte_util_decode_nidmap(opal_buffer_t *buf)
409410
daemons->num_procs++;
410411
opal_pointer_array_set_item(daemons->procs, proc->name.vpid, proc);
411412
}
412-
nd->index = proc->name.vpid;
413413
OBJ_RETAIN(nd);
414414
proc->node = nd;
415415
OBJ_RETAIN(proc);
@@ -945,8 +945,9 @@ int orte_util_parse_node_info(opal_buffer_t *buf)
945945
int orte_util_generate_ppn(orte_job_t *jdata,
946946
opal_buffer_t *buf)
947947
{
948-
uint16_t *ppn=NULL;
949-
size_t nbytes;
948+
uint16_t ppn;
949+
uint8_t *bytes;
950+
int32_t nbytes;
950951
int rc = ORTE_SUCCESS;
951952
orte_app_idx_t i;
952953
int j, k;
@@ -955,40 +956,47 @@ int orte_util_generate_ppn(orte_job_t *jdata,
955956
orte_node_t *nptr;
956957
orte_proc_t *proc;
957958
size_t sz;
959+
opal_buffer_t bucket;
958960

959-
/* make room for the number of procs on each node */
960-
nbytes = sizeof(uint16_t) * orte_node_pool->size;
961-
ppn = (uint16_t*)malloc(nbytes);
961+
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
962962

963963
for (i=0; i < jdata->num_apps; i++) {
964-
/* reset the #procs */
965-
memset(ppn, 0, nbytes);
966-
/* for each app_context, compute the #procs on
967-
* each node of the allocation */
968-
for (j=0; j < orte_node_pool->size; j++) {
969-
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
964+
/* for each app_context */
965+
for (j=0; j < jdata->map->nodes->size; j++) {
966+
if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, j))) {
970967
continue;
971968
}
972969
if (NULL == nptr->daemon) {
973970
continue;
974971
}
972+
ppn = 0;
975973
for (k=0; k < nptr->procs->size; k++) {
976974
if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(nptr->procs, k))) {
977975
if (proc->name.jobid == jdata->jobid) {
978-
++ppn[j];
976+
++ppn;
979977
}
980978
}
981979
}
980+
if (0 < ppn) {
981+
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &nptr->index, 1, ORTE_STD_CNTR))) {
982+
goto cleanup;
983+
}
984+
if (ORTE_SUCCESS != (rc = opal_dss.pack(&bucket, &ppn, 1, OPAL_UINT16))) {
985+
goto cleanup;
986+
}
987+
}
982988
}
983-
if (opal_compress.compress_block((uint8_t*)ppn, nbytes,
989+
opal_dss.unload(&bucket, (void**)&bytes, &nbytes);
990+
991+
if (opal_compress.compress_block(bytes, (size_t)nbytes,
984992
(uint8_t**)&bo.bytes, &sz)) {
985993
/* mark that this was compressed */
986994
compressed = true;
987995
bo.size = sz;
988996
} else {
989997
/* mark that this was not compressed */
990998
compressed = false;
991-
bo.bytes = (uint8_t*)ppn;
999+
bo.bytes = bytes;
9921000
bo.size = nbytes;
9931001
}
9941002
/* indicate compression */
@@ -1015,21 +1023,31 @@ int orte_util_generate_ppn(orte_job_t *jdata,
10151023
}
10161024

10171025
cleanup:
1018-
free(ppn);
1026+
OBJ_DESTRUCT(&bucket);
10191027
return rc;
10201028
}
10211029

10221030
int orte_util_decode_ppn(orte_job_t *jdata,
10231031
opal_buffer_t *buf)
10241032
{
1033+
orte_std_cntr_t index;
10251034
orte_app_idx_t n;
1026-
int m, cnt, rc;
1035+
int cnt, rc;
10271036
opal_byte_object_t *boptr;
10281037
bool compressed;
1038+
uint8_t *bytes;
10291039
size_t sz;
1030-
uint16_t *ppn, k;
1040+
uint16_t ppn, k;
10311041
orte_node_t *node;
10321042
orte_proc_t *proc;
1043+
opal_buffer_t bucket;
1044+
1045+
/* reset any flags */
1046+
for (n=0; n < orte_node_pool->size; n++) {
1047+
if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, n))) {
1048+
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
1049+
}
1050+
}
10331051

10341052
for (n=0; n < jdata->num_apps; n++) {
10351053
/* unpack the compression flag */
@@ -1062,14 +1080,14 @@ int orte_util_decode_ppn(orte_job_t *jdata,
10621080

10631081
/* decompress if required */
10641082
if (compressed) {
1065-
if (!opal_compress.decompress_block((uint8_t**)&ppn, sz,
1083+
if (!opal_compress.decompress_block(&bytes, sz,
10661084
boptr->bytes, boptr->size)) {
10671085
ORTE_ERROR_LOG(ORTE_ERROR);
10681086
OBJ_RELEASE(boptr);
10691087
return ORTE_ERROR;
10701088
}
10711089
} else {
1072-
ppn = (uint16_t*)boptr->bytes;
1090+
bytes = boptr->bytes;
10731091
boptr->bytes = NULL;
10741092
boptr->size = 0;
10751093
}
@@ -1078,38 +1096,67 @@ int orte_util_decode_ppn(orte_job_t *jdata,
10781096
}
10791097
free(boptr);
10801098

1081-
/* cycle thru the node pool */
1082-
for (m=0; m < orte_node_pool->size; m++) {
1083-
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, m))) {
1084-
continue;
1099+
/* setup to unpack */
1100+
OBJ_CONSTRUCT(&bucket, opal_buffer_t);
1101+
opal_dss.load(&bucket, bytes, sz);
1102+
1103+
/* unpack each node and its ppn */
1104+
cnt = 1;
1105+
while (OPAL_SUCCESS == (rc = opal_dss.unpack(&bucket, &index, &cnt, ORTE_STD_CNTR))) {
1106+
/* get the corresponding node object */
1107+
if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, index))) {
1108+
rc = ORTE_ERR_NOT_FOUND;
1109+
ORTE_ERROR_LOG(rc);
1110+
goto error;
10851111
}
1086-
if (0 < ppn[m]) {
1087-
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
1088-
OBJ_RETAIN(node);
1089-
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
1090-
opal_pointer_array_add(jdata->map->nodes, node);
1091-
}
1092-
/* create a proc object for each one */
1093-
for (k=0; k < ppn[m]; k++) {
1094-
proc = OBJ_NEW(orte_proc_t);
1095-
proc->name.jobid = jdata->jobid;
1096-
/* leave the vpid undefined as this will be determined
1097-
* later when we do the overall ranking */
1098-
proc->app_idx = n;
1099-
proc->parent = node->daemon->name.vpid;
1100-
OBJ_RETAIN(node);
1101-
proc->node = node;
1102-
/* flag the proc as ready for launch */
1103-
proc->state = ORTE_PROC_STATE_INIT;
1104-
opal_pointer_array_add(node->procs, proc);
1105-
/* we will add the proc to the jdata array when we
1106-
* compute its rank */
1107-
}
1108-
node->num_procs += ppn[m];
1112+
/* add the node to the job map if not already assigned */
1113+
if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
1114+
OBJ_RETAIN(node);
1115+
opal_pointer_array_add(jdata->map->nodes, node);
1116+
ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
1117+
}
1118+
/* get the ppn */
1119+
cnt = 1;
1120+
if (OPAL_SUCCESS != (rc = opal_dss.unpack(&bucket, &ppn, &cnt, OPAL_UINT16))) {
1121+
ORTE_ERROR_LOG(rc);
1122+
goto error;
1123+
}
1124+
/* create a proc object for each one */
1125+
for (k=0; k < ppn; k++) {
1126+
proc = OBJ_NEW(orte_proc_t);
1127+
proc->name.jobid = jdata->jobid;
1128+
/* leave the vpid undefined as this will be determined
1129+
* later when we do the overall ranking */
1130+
proc->app_idx = n;
1131+
proc->parent = node->daemon->name.vpid;
1132+
OBJ_RETAIN(node);
1133+
proc->node = node;
1134+
/* flag the proc as ready for launch */
1135+
proc->state = ORTE_PROC_STATE_INIT;
1136+
opal_pointer_array_add(node->procs, proc);
1137+
/* we will add the proc to the jdata array when we
1138+
* compute its rank */
11091139
}
1140+
node->num_procs += ppn;
1141+
cnt = 1;
11101142
}
1111-
free(ppn);
1143+
OBJ_DESTRUCT(&bucket);
1144+
}
1145+
1146+
/* reset any flags */
1147+
for (n=0; n < jdata->map->nodes->size; n++) {
1148+
node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n);
1149+
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
11121150
}
11131151

11141152
return ORTE_SUCCESS;
1153+
1154+
error:
1155+
OBJ_DESTRUCT(&bucket);
1156+
/* reset any flags */
1157+
for (n=0; n < jdata->map->nodes->size; n++) {
1158+
node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n);
1159+
ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
1160+
}
1161+
return rc;
11151162
}

0 commit comments

Comments
 (0)