Skip to content

Commit 21d7433

Browse files
committed
Make sure the PML is consistent across the world.
Temporary solution for the PML inconsistency issue discussed in #7475. This patch addresses two things: first, it makes the PMIx key optional, so that if we are not in full modex mode we don't do a direct modex; second, it gets the PML info from vpid 0 instead of from the local rank. Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent 0276679 commit 21d7433

File tree

2 files changed

+49
-8
lines changed

2 files changed

+49
-8
lines changed

ompi/mca/pml/base/pml_base_select.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,7 @@ mca_pml_base_pml_check_selected(const char *my_pml,
324324
size_t size;
325325
int ret;
326326
char *remote_pml;
327+
opal_process_name_t rank0 = {.jobid = ompi_proc_local()->super.proc_name.jobid, .vpid = 0};
327328

328329
/* if no modex was required by the PML, then
329330
* we can assume success
@@ -342,13 +343,13 @@ mca_pml_base_pml_check_selected(const char *my_pml,
342343
}
343344

344345
/* get the name of the PML module selected by rank=0 */
345-
OPAL_MODEX_RECV(ret, &pml_base_component,
346-
&procs[0]->super.proc_name, (void**) &remote_pml, &size);
346+
OPAL_MODEX_RECV_STRING_OPTIONAL(ret, mca_base_component_to_string(&pml_base_component),
347+
&rank0, (void**) &remote_pml, &size);
347348

348349
/* if this key wasn't found, then just assume all is well... */
349-
if (OMPI_SUCCESS != ret) {
350+
if (PMIX_ERR_NOT_FOUND != ret) {
350351
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
351-
"check:select: modex data not found");
352+
"check:select: PML modex for vpid 0 data not found");
352353
return OMPI_SUCCESS;
353354
}
354355

opal/mca/pmix/pmix-internal.h

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -301,14 +301,14 @@ typedef struct {
301301
* is to be returned
302302
* t - the expected data type
303303
*/
304-
#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \
305-
do { \
304+
#define OPAL_MODEX_RECV_VALUE_IMMEDIATE(r, s, p, d, t) \
305+
do { \
306306
pmix_proc_t _proc; \
307307
pmix_value_t *_kv = NULL; \
308308
pmix_info_t _info; \
309309
size_t _sz; \
310310
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
311-
"%s[%s:%d] MODEX RECV VALUE OPTIONAL FOR PROC %s KEY %s", \
311+
"%s[%s:%d] MODEX RECV VALUE IMMEDIATE FOR PROC %s KEY %s", \
312312
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
313313
__FILE__, __LINE__, \
314314
OPAL_NAME_PRINT(*(p)), (s))); \
@@ -349,7 +349,7 @@ typedef struct {
349349
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
350350
__FILE__, __LINE__, \
351351
OPAL_NAME_PRINT(*(p)), (s))); \
352-
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
352+
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
353353
(r) = PMIx_Get(&(_proc), (s), NULL, 0, &(_kv)); \
354354
if (NULL == _kv) { \
355355
(r) = PMIX_ERR_NOT_FOUND; \
@@ -363,6 +363,46 @@ typedef struct {
363363
} \
364364
} while(0);
365365

366+
/**
367+
* Provide a simplified macro for retrieving modex data
368+
* from another process, passing PMIX_OPTIONAL to PMIx_Get:
369+
*
370+
* r - the integer return status from the modex op (int)
371+
* s - string key (char*)
372+
* p - pointer to the opal_process_name_t of the proc that posted
373+
* the data (opal_process_name_t*)
374+
* d - pointer to a location wherein the data object
375+
* is to be returned (char**)
376+
* sz - pointer to a location wherein the number of bytes
377+
* in the data object can be returned (size_t*)
378+
*/
379+
#define OPAL_MODEX_RECV_STRING_OPTIONAL(r, s, p, d, sz) \
380+
do { \
381+
pmix_proc_t _proc; \
382+
pmix_value_t *_kv = NULL; \
383+
pmix_info_t _info; \
384+
OPAL_OUTPUT_VERBOSE((1, opal_pmix_verbose_output, \
385+
"%s[%s:%d] MODEX RECV STRING OPTIONAL FOR PROC %s KEY %s", \
386+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \
387+
__FILE__, __LINE__, \
388+
OPAL_NAME_PRINT(*(p)), (s))); \
389+
*(d) = NULL; \
390+
*(sz) = 0; \
391+
OPAL_PMIX_CONVERT_NAME(&_proc, (p)); \
392+
PMIX_INFO_LOAD(&_info, PMIX_OPTIONAL, NULL, PMIX_BOOL); \
393+
(r) = PMIx_Get(&(_proc), (s), &(_info), 1, &(_kv)); \
394+
if (NULL == _kv) { \
395+
(r) = PMIX_ERR_NOT_FOUND; \
396+
} else if (PMIX_SUCCESS == (r)) { \
397+
*(d) = (uint8_t*)_kv->data.bo.bytes; \
398+
*(sz) = _kv->data.bo.size; \
399+
_kv->data.bo.bytes = NULL; /* protect the data */ \
400+
} \
401+
if (NULL != _kv) { \
402+
PMIX_VALUE_RELEASE(_kv); \
403+
} \
404+
} while(0);
405+
366406
/**
367407
* Provide a simplified macro for retrieving modex data
368408
* from another process:

0 commit comments

Comments
 (0)