Skip to content

Commit 3ae723f

Browse files
authored
Merge pull request #10886 from hppritcha/dworldify_pset_lookup
sessions: deworldify behavior of pmix pset lookup
2 parents 1af4b98 + 541a17b commit 3ae723f

File tree

1 file changed

+67
-54
lines changed

1 file changed

+67
-54
lines changed

ompi/instance/instance.c

Lines changed: 67 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,7 +1091,8 @@ int ompi_instance_get_num_psets (ompi_instance_t *instance, int *npset_names)
10911091

10921092
int ompi_instance_get_nth_pset (ompi_instance_t *instance, int n, int *len, char *pset_name)
10931093
{
1094-
if (NULL == ompi_mpi_instance_pmix_psets && n >= ompi_instance_builtin_count) {
1094+
if (NULL == ompi_mpi_instance_pmix_psets ||
1095+
(size_t) n >= (ompi_instance_builtin_count + ompi_mpi_instance_num_pmix_psets)) {
10951096
ompi_instance_refresh_pmix_psets (PMIX_QUERY_PSET_NAMES);
10961097
}
10971098

@@ -1229,71 +1230,83 @@ static int ompi_instance_group_self (ompi_instance_t *instance, ompi_group_t **g
12291230

12301231
static int ompi_instance_group_pmix_pset (ompi_instance_t *instance, const char *pset_name, ompi_group_t **group_out)
12311232
{
1233+
int ret = OMPI_SUCCESS;
1234+
size_t i,n;
1235+
bool isnew, try_again = false, refresh = true;
12321236
pmix_status_t rc;
1233-
pmix_proc_t p;
1234-
ompi_group_t *group;
1235-
pmix_value_t *pval = NULL;
1236-
char *stmp = NULL;
1237-
size_t size = 0;
1238-
1239-
/* make the group large enough to hold world */
1240-
group = ompi_group_allocate (NULL, ompi_process_info.num_procs);
1241-
if (OPAL_UNLIKELY(NULL == group)) {
1242-
return OMPI_ERR_OUT_OF_RESOURCE;
1243-
}
1237+
ompi_group_t *group = NULL;
1238+
pmix_query_t query;
1239+
pmix_info_t *info = NULL;
1240+
size_t ninfo;
1241+
opal_process_name_t pname;
12441242

1243+
PMIX_QUERY_CONSTRUCT(&query);
1244+
PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
1245+
PMIX_INFO_CREATE(query.qualifiers, 1);
1246+
query.nqual = 1;
1247+
PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);
12451248

1246-
for (size_t i = 0 ; i < ompi_process_info.num_procs ; ++i) {
1247-
opal_process_name_t name = {.vpid = i, .jobid = OMPI_PROC_MY_NAME->jobid};
1249+
/*
1250+
* First try to find the pset in the local PMIx cache; if it is not found, retry with a cache refresh
1251+
*/
1252+
fn_try_again:
1253+
rc = PMIx_Query_info(&query, 1, &info, &ninfo);
1254+
if (PMIX_SUCCESS != (rc = PMIx_Query_info(&query, 1, &info, &ninfo)) || 0 == ninfo) {
1255+
if ((PMIX_ERR_NOT_FOUND == rc) && (false == try_again)) {
1256+
try_again = true;
1257+
PMIX_QUERY_DESTRUCT(&query);
1258+
PMIX_QUERY_CONSTRUCT(&query);
1259+
PMIX_ARGV_APPEND(rc, query.keys, PMIX_QUERY_PSET_MEMBERSHIP);
1260+
PMIX_INFO_CREATE(query.qualifiers, 2);
1261+
PMIX_INFO_LOAD(&query.qualifiers[0], PMIX_PSET_NAME, pset_name, PMIX_STRING);
1262+
PMIX_INFO_LOAD(&query.qualifiers[1], PMIX_QUERY_REFRESH_CACHE, &refresh, PMIX_BOOL);
1263+
goto fn_try_again;
1264+
}
1265+
ret = opal_pmix_convert_status(rc);
1266+
ompi_instance_print_error ("PMIx_Query_info() failed", ret);
1267+
goto fn_w_query;
1268+
}
12481269

1249-
OPAL_PMIX_CONVERT_NAME(&p, &name);
1250-
rc = PMIx_Get(&p, PMIX_PSET_NAME, NULL, 0, &pval);
1251-
if (OPAL_UNLIKELY(PMIX_SUCCESS != rc)) {
1252-
OBJ_RELEASE(group);
1253-
return opal_pmix_convert_status(rc);
1254-
}
1270+
for(n = 0; n < ninfo; n++){
1271+
if(0 == strcmp(info[n].key, PMIX_QUERY_PSET_MEMBERSHIP)){
1272+
1273+
pmix_data_array_t *data_array = info[n].value.data.darray;
1274+
pmix_proc_t *members_array = (pmix_proc_t*) data_array->array;
12551275

1256-
PMIX_VALUE_UNLOAD(rc,
1257-
pval,
1258-
(void **)&stmp,
1259-
&size);
1260-
if (0 != strcmp (pset_name, stmp)) {
1261-
PMIX_VALUE_RELEASE(pval);
1262-
free(stmp);
1263-
continue;
1264-
}
1265-
PMIX_VALUE_RELEASE(pval);
1266-
free(stmp);
1276+
group = ompi_group_allocate (NULL, data_array->size);
1277+
if (OPAL_UNLIKELY(NULL == group)) {
1278+
ret = OMPI_ERR_OUT_OF_RESOURCE;
1279+
goto fn_w_info;
1280+
}
12671281

1268-
/* look for existing ompi_proc_t that matches this name */
1269-
group->grp_proc_pointers[size] = (ompi_proc_t *) ompi_proc_lookup (name);
1270-
if (NULL == group->grp_proc_pointers[size]) {
1271-
/* set sentinel value */
1272-
group->grp_proc_pointers[size] = (ompi_proc_t *) ompi_proc_name_to_sentinel (name);
1273-
} else {
1274-
OBJ_RETAIN (group->grp_proc_pointers[size]);
1282+
for(i = 0; i < data_array->size; i++){
1283+
OPAL_PMIX_CONVERT_PROCT(ret, &pname, &members_array[i]);
1284+
if (OPAL_SUCCESS == rc) {
1285+
group->grp_proc_pointers[i] = ompi_proc_find_and_add(&pname,&isnew);
1286+
} else {
1287+
ompi_instance_print_error ("OPAL_PMIX_CONVERT_PROCT failed %d", ret);
1288+
ompi_group_free(&group);
1289+
goto fn_w_info;
1290+
}
1291+
}
1292+
break;
12751293
}
1276-
++size;
12771294
}
12781295

1279-
/* shrink the proc array if needed */
1280-
if (size < (size_t) group->grp_proc_count) {
1281-
void *tmp = realloc (group->grp_proc_pointers, size * sizeof (group->grp_proc_pointers[0]));
1282-
if (OPAL_UNLIKELY(NULL == tmp)) {
1283-
OBJ_RELEASE(group);
1284-
return OMPI_ERR_OUT_OF_RESOURCE;
1285-
}
1286-
1287-
group->grp_proc_pointers = (ompi_proc_t **) tmp;
1288-
group->grp_proc_count = (int) size;
1296+
if (NULL != group) {
1297+
ompi_set_group_rank (group, ompi_proc_local());
1298+
group->grp_instance = instance;
1299+
*group_out = group;
1300+
} else {
1301+
ret = OMPI_ERR_NOT_FOUND;
12891302
}
12901303

1291-
ompi_set_group_rank (group, ompi_proc_local());
1292-
1293-
group->grp_instance = instance;
1304+
fn_w_info:
1305+
PMIX_INFO_DESTRUCT(info);
1306+
fn_w_query:
1307+
PMIX_QUERY_DESTRUCT(&query);
12941308

1295-
*group_out = group;
1296-
return OMPI_SUCCESS;
1309+
return ret;
12971310
}
12981311

12991312
static int ompi_instance_get_pmix_pset_size (ompi_instance_t *instance, const char *pset_name, size_t *size_out)

0 commit comments

Comments
 (0)