Skip to content

Commit 6741344

Browse files
committed
Fix singleton operation
OpenPMIx fills in a variety of info when it detects that we are in singleton mode. The best way of detecting singleton mode is to check whether the returned nspace begins with "singleton". Make the modex recvs optional so that we don't bounce up to the server, and then to the host, trying to retrieve job-level info that must be given to us at job start. Signed-off-by: Ralph Castain <rhc@pmix.org>
1 parent 2770974 commit 6741344

File tree

1 file changed

+24
-16
lines changed

1 file changed

+24
-16
lines changed

ompi/runtime/ompi_rte.c

Lines changed: 24 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -543,6 +543,10 @@ int ompi_rte_init(int *pargc, char ***pargv)
543543
opal_show_help("help-mpi-runtime.txt", "no-pmi", true, PMIx_Error_string(ret));
544544
return OPAL_ERR_SILENT;
545545
}
546+
/* if our nspace starts with "singleton", then we are a singleton */
547+
if (0 == strncmp(myprocid.nspace, "singleton", strlen("singleton"))) {
548+
ompi_singleton = true;
549+
}
546550

547551
/* setup the process name fields - also registers the new nspace */
548552
OPAL_PMIX_CONVERT_PROCT(rc, &pname, &myprocid);
@@ -566,32 +570,32 @@ int ompi_rte_init(int *pargc, char ***pargv)
566570
ompi_process_info.nodename = opal_process_info.nodename;
567571

568572
/* get our local rank from PMI */
569-
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_RANK,
570-
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
573+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_RANK,
574+
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
571575
if (PMIX_SUCCESS != rc) {
572-
/* assume we are a singleton */
573-
u16 = 0;
574-
ompi_singleton = true;
576+
ret = opal_pmix_convert_status(rc);
577+
error = "local rank";
578+
goto error;
575579
}
576580
pmix_process_info.my_local_rank = u16;
577581

578582
/* get our node rank from PMI */
579-
OPAL_MODEX_RECV_VALUE(rc, PMIX_NODE_RANK,
580-
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
583+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_NODE_RANK,
584+
&pmix_process_info.my_name, &u16ptr, PMIX_UINT16);
581585
if (PMIX_SUCCESS != rc) {
582-
/* assume we are a singleton */
583586
u16 = 0;
584587
}
585588
pmix_process_info.my_node_rank = u16;
586589

587590
/* get job size */
588591
pname.jobid = pmix_process_info.my_name.jobid;
589592
pname.vpid = OPAL_VPID_WILDCARD;
590-
OPAL_MODEX_RECV_VALUE(rc, PMIX_JOB_SIZE,
591-
&pname, &u32ptr, PMIX_UINT32);
593+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_JOB_SIZE,
594+
&pname, &u32ptr, PMIX_UINT32);
592595
if (PMIX_SUCCESS != rc) {
593-
/* assume we are a singleton */
594-
u32 = 1;
596+
ret = opal_pmix_convert_status(rc);
597+
error = "job size";
598+
goto error;
595599
}
596600
pmix_process_info.num_procs = u32;
597601

@@ -620,12 +624,14 @@ int ompi_rte_init(int *pargc, char ***pargv)
620624

621625
/* get the number of local peers - required for wireup of
622626
* shared memory BTL */
623-
OPAL_MODEX_RECV_VALUE(rc, PMIX_LOCAL_SIZE,
624-
&pname, &u32ptr, PMIX_UINT32);
627+
OPAL_MODEX_RECV_VALUE_OPTIONAL(rc, PMIX_LOCAL_SIZE,
628+
&pname, &u32ptr, PMIX_UINT32);
625629
if (PMIX_SUCCESS == rc) {
626630
pmix_process_info.num_local_peers = u32 - 1; // want number besides ourselves
627631
} else {
628-
pmix_process_info.num_local_peers = 0;
632+
ret = opal_pmix_convert_status(rc);
633+
error = "local size";
634+
goto error;
629635
}
630636

631637
/* setup transport keys in case the MPI layer needs them -
@@ -711,7 +717,9 @@ int ompi_rte_init(int *pargc, char ***pargv)
711717
peers = opal_argv_split(val, ',');
712718
free(val);
713719
} else {
714-
peers = NULL;
720+
ret = opal_pmix_convert_status(rc);
721+
error = "local peers";
722+
goto error;
715723
}
716724
} else {
717725
peers = NULL;

0 commit comments

Comments
 (0)