@@ -259,6 +259,7 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
259
259
* fail. */
260
260
if (0 >= rportlen ) {
261
261
rc = rportlen ;
262
+ /* no need to free here, the root has already done it and everyone else has not yet allocated the rport array */
262
263
goto exit ;
263
264
}
264
265
@@ -406,72 +407,85 @@ int ompi_dpm_connect_accept(ompi_communicator_t *comm, int root,
406
407
OPAL_LIST_DESTRUCT (& rlist );
407
408
goto exit ;
408
409
}
409
- if (0 < opal_list_get_size (& ilist )) {
410
- uint32_t * peer_ranks = NULL ;
410
+ if (!opal_list_is_empty (& ilist )) {
411
411
int prn , nprn = 0 ;
412
412
char * val ;
413
- uint16_t u16 ;
414
413
opal_process_name_t wildcard_rank ;
414
+ i = 0 ; /* start from the beginning */
415
+
415
416
/* convert the list of new procs to a proc_t array */
416
417
new_proc_list = (ompi_proc_t * * )calloc (opal_list_get_size (& ilist ),
417
418
sizeof (ompi_proc_t * ));
418
- /* get the list of local peers for the new procs */
419
- cd = (ompi_dpm_proct_caddy_t * )opal_list_get_first (& ilist );
420
- proc = cd -> p ;
421
- wildcard_rank .jobid = proc -> super .proc_name .jobid ;
422
- wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
423
- /* retrieve the local peers */
424
- OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_LOCAL_PEERS ,
425
- & wildcard_rank , & val , PMIX_STRING );
426
- if (OPAL_SUCCESS == rc && NULL != val ) {
427
- char * * peers = opal_argv_split (val , ',' );
428
- free (val );
429
- nprn = opal_argv_count (peers );
430
- peer_ranks = (uint32_t * )calloc (nprn , sizeof (uint32_t ));
431
- for (prn = 0 ; NULL != peers [prn ]; prn ++ ) {
432
- peer_ranks [prn ] = strtoul (peers [prn ], NULL , 10 );
433
- }
434
- opal_argv_free (peers );
435
- }
436
-
437
- i = 0 ;
438
- OPAL_LIST_FOREACH (cd , & ilist , ompi_dpm_proct_caddy_t ) {
419
+ /* Extract the modex info for the first proc on the ilist, and then
420
+ * remove all processors in the same jobid from the list by getting
421
+ * their connection information and moving them into the proc array.
422
+ */
423
+ do {
424
+ uint32_t * local_ranks_in_jobid = NULL ;
425
+ ompi_dpm_proct_caddy_t * next = NULL ;
426
+ cd = (ompi_dpm_proct_caddy_t * )opal_list_get_first (& ilist );
439
427
proc = cd -> p ;
440
- new_proc_list [i ] = proc ;
441
- /* ompi_proc_complete_init_single() initializes and optionally retrieves
442
- * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
443
- * them, we are just fine */
444
- ompi_proc_complete_init_single (proc );
445
- /* if this proc is local, then get its locality */
446
- if (NULL != peer_ranks ) {
447
- for (prn = 0 ; prn < nprn ; prn ++ ) {
448
- if (peer_ranks [prn ] == proc -> super .proc_name .vpid ) {
449
- /* get their locality string */
450
- val = NULL ;
451
- OPAL_MODEX_RECV_VALUE_IMMEDIATE (rc , PMIX_LOCALITY_STRING ,
452
- & proc -> super .proc_name , & val , PMIX_STRING );
453
- if (OPAL_SUCCESS == rc && NULL != ompi_process_info .locality ) {
454
- u16 = opal_hwloc_compute_relative_locality (ompi_process_info .locality , val );
455
- free (val );
456
- } else {
457
- /* all we can say is that it shares our node */
458
- u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE ;
428
+ wildcard_rank .jobid = proc -> super .proc_name .jobid ;
429
+ wildcard_rank .vpid = OMPI_NAME_WILDCARD -> vpid ;
430
+ /* retrieve the local peers for the specified jobid */
431
+ OPAL_MODEX_RECV_VALUE_OPTIONAL (rc , PMIX_LOCAL_PEERS ,
432
+ & wildcard_rank , & val , PMIX_STRING );
433
+ if (OPAL_SUCCESS == rc && NULL != val ) {
434
+ char * * peers = opal_argv_split (val , ',' );
435
+ free (val );
436
+ nprn = opal_argv_count (peers );
437
+ local_ranks_in_jobid = (uint32_t * )calloc (nprn , sizeof (uint32_t ));
438
+ for (prn = 0 ; NULL != peers [prn ]; prn ++ ) {
439
+ local_ranks_in_jobid [prn ] = strtoul (peers [prn ], NULL , 10 );
440
+ }
441
+ opal_argv_free (peers );
442
+ }
443
+
444
+ OPAL_LIST_FOREACH_SAFE (cd , next , & ilist , ompi_dpm_proct_caddy_t ) {
445
+ proc = cd -> p ;
446
+ if ( proc -> super .proc_name .jobid != wildcard_rank .jobid )
447
+ continue ; /* not a proc from this jobid */
448
+
449
+ new_proc_list [i ] = proc ;
450
+ opal_list_remove_item (& ilist , (opal_list_item_t * )cd ); /* the caddy's reference is released below now that proc has moved into the array */
451
+ OBJ_RELEASE (cd );
452
+ /* ompi_proc_complete_init_single() initializes and optionally retrieves
453
+ * OPAL_PMIX_LOCALITY and OPAL_PMIX_HOSTNAME. since we can live without
454
+ * them, we are just fine */
455
+ ompi_proc_complete_init_single (proc );
456
+ /* if this proc is local, then get its locality */
457
+ if (NULL != local_ranks_in_jobid ) {
458
+ uint16_t u16 ;
459
+ for (prn = 0 ; prn < nprn ; prn ++ ) {
460
+ if (local_ranks_in_jobid [prn ] == proc -> super .proc_name .vpid ) {
461
+ /* get their locality string */
462
+ val = NULL ;
463
+ OPAL_MODEX_RECV_VALUE_IMMEDIATE (rc , PMIX_LOCALITY_STRING ,
464
+ & proc -> super .proc_name , & val , PMIX_STRING );
465
+ if (OPAL_SUCCESS == rc && NULL != ompi_process_info .locality ) {
466
+ u16 = opal_hwloc_compute_relative_locality (ompi_process_info .locality , val );
467
+ free (val );
468
+ } else {
469
+ /* all we can say is that it shares our node */
470
+ u16 = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE ;
471
+ }
472
+ proc -> super .proc_flags = u16 ;
473
+ /* save the locality for later */
474
+ OPAL_PMIX_CONVERT_NAME (& pxproc , & proc -> super .proc_name );
475
+ pval .type = PMIX_UINT16 ;
476
+ pval .data .uint16 = proc -> super .proc_flags ;
477
+ PMIx_Store_internal (& pxproc , PMIX_LOCALITY , & pval );
478
+ break ;
459
479
}
460
- proc -> super .proc_flags = u16 ;
461
- /* save the locality for later */
462
- OPAL_PMIX_CONVERT_NAME (& pxproc , & proc -> super .proc_name );
463
- pval .type = PMIX_UINT16 ;
464
- pval .data .uint16 = proc -> super .proc_flags ;
465
- PMIx_Store_internal (& pxproc , PMIX_LOCALITY , & pval );
466
- break ;
467
480
}
468
481
}
482
+ ++ i ;
469
483
}
470
- ++ i ;
471
- }
472
- if ( NULL != peer_ranks ) {
473
- free ( peer_ranks );
474
- }
484
+ if ( NULL != local_ranks_in_jobid ) {
485
+ free ( local_ranks_in_jobid );
486
+ }
487
+ } while (! opal_list_is_empty ( & ilist ) );
488
+
475
489
/* call add_procs on the new ones */
476
490
rc = MCA_PML_CALL (add_procs (new_proc_list , opal_list_get_size (& ilist )));
477
491
free (new_proc_list );
0 commit comments