Skip to content

Commit 478b6b2

Browse files
authored
Merge pull request #11305 from hppritcha/pmix_fence_removal_v3
PMIx_Fences - remove unneeded ones during MPI initialization
2 parents a73399e + 712f0e4 commit 478b6b2

File tree

2 files changed

+32
-160
lines changed

2 files changed

+32
-160
lines changed

ompi/instance/instance.c

Lines changed: 27 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
3+
* Copyright (c) 2018-2023 Triad National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
66
* Copyright (c) 2022 The University of Tennessee and The University
@@ -21,6 +21,14 @@
2121
#include "opal/util/show_help.h"
2222
#include "opal/util/argv.h"
2323
#include "opal/runtime/opal_params.h"
24+
#include "opal/util/timings.h"
25+
#include "opal/mca/allocator/base/base.h"
26+
#include "opal/mca/rcache/base/base.h"
27+
#include "opal/mca/mpool/base/base.h"
28+
#include "opal/mca/smsc/base/base.h"
29+
#include "opal/mca/mpool/base/mpool_base_tree.h"
30+
#include "opal/mca/pmix/pmix-internal.h"
31+
#include "opal/mca/pmix/base/base.h"
2432

2533
#include "ompi/mca/pml/pml.h"
2634
#include "ompi/runtime/params.h"
@@ -36,26 +44,19 @@
3644
#include "ompi/dpm/dpm.h"
3745
#include "ompi/file/file.h"
3846
#include "ompi/mpiext/mpiext.h"
47+
#include "ompi/util/timings.h"
3948

4049
#include "ompi/mca/hook/base/base.h"
4150
#include "ompi/mca/op/base/base.h"
42-
#include "opal/mca/allocator/base/base.h"
43-
#include "opal/mca/rcache/base/base.h"
44-
#include "opal/mca/mpool/base/base.h"
45-
#include "opal/mca/smsc/base/base.h"
4651
#include "ompi/mca/bml/base/base.h"
4752
#include "ompi/mca/pml/base/base.h"
4853
#include "ompi/mca/coll/base/base.h"
4954
#include "ompi/mca/osc/base/base.h"
5055
#include "ompi/mca/part/base/base.h"
5156
#include "ompi/mca/io/base/base.h"
5257
#include "ompi/mca/topo/base/base.h"
53-
#include "opal/mca/pmix/base/base.h"
5458

55-
#include "opal/mca/mpool/base/mpool_base_tree.h"
5659
#include "ompi/mca/pml/base/pml_base_bsend.h"
57-
#include "ompi/util/timings.h"
58-
#include "opal/mca/pmix/pmix-internal.h"
5960

6061
ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}};
6162

@@ -344,7 +345,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
344345
pmix_info_t info[2];
345346
pmix_status_t rc;
346347
opal_pmix_lock_t mylock;
347-
OMPI_TIMING_INIT(64);
348+
349+
OPAL_TIMING_ENV_INIT(init_common);
348350

349351
ret = ompi_mpi_instance_retain ();
350352
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -385,13 +387,15 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
385387
mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
386388
}
387389

388-
OMPI_TIMING_NEXT("initialization");
390+
OPAL_TIMING_ENV_NEXT(init_common, "initialization");
389391

390392
/* Setup RTE */
391393
if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) {
392394
return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret);
393395
}
394396

397+
OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init");
398+
395399
/* open the ompi hook framework */
396400
for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) {
397401
ret = mca_base_framework_open (ompi_framework_dependencies[i], 0);
@@ -404,10 +408,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
404408
}
405409
}
406410

407-
OMPI_TIMING_NEXT("rte_init");
408-
OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup");
409-
OMPI_TIMING_IMPORT_OPAL("rte_init");
410-
411411
ompi_rte_initialized = true;
412412
/* if we are oversubscribed, then set yield_when_idle
413413
* accordingly */
@@ -509,9 +509,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
509509
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
510510
}
511511

512-
OMPI_TIMING_IMPORT_OPAL("orte_init");
513-
OMPI_TIMING_NEXT("rte_init-commit");
514-
515512
/* exchange connection info - this function may also act as a barrier
516513
* if data exchange is required. The modex occurs solely across procs
517514
* in our job. If a barrier is required, the "modex" function will
@@ -522,19 +519,20 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
522519
return ret; /* TODO: need to fix this */
523520
}
524521

525-
OMPI_TIMING_NEXT("commit");
522+
OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit");
523+
526524
#if (OPAL_ENABLE_TIMING)
527525
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
528526
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
529527
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
530528
ret = opal_pmix_convert_status(rc);
531529
return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret);
532530
}
533-
OMPI_TIMING_NEXT("pmix-barrier-1");
531+
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1");
534532
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
535533
return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret);
536534
}
537-
OMPI_TIMING_NEXT("pmix-barrier-2");
535+
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2");
538536
}
539537
#endif
540538

@@ -579,7 +577,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
579577
}
580578
}
581579

582-
OMPI_TIMING_NEXT("modex");
580+
OPAL_TIMING_ENV_NEXT(init_common, "modex");
583581

584582
/* select buffered send allocator component to be used */
585583
if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) {
@@ -627,14 +625,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
627625
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
628626
}
629627

630-
if (mca_pml_base_requires_world ()) {
631-
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
632-
* be the case. */
633-
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
634-
return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
635-
}
636-
}
637-
638628
/* initialize file handles */
639629
if (OMPI_SUCCESS != (ret = ompi_file_init ())) {
640630
return ompi_instance_print_error ("ompi_file_init() failed", ret);
@@ -711,11 +701,12 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
711701
return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret);
712702
}
713703

714-
/* Do we need to wait for a debugger? */
715-
ompi_rte_wait_for_debugger();
704+
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
705+
return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
706+
}
716707

717708
/* Next timing measurement */
718-
OMPI_TIMING_NEXT("modex-barrier");
709+
OPAL_TIMING_ENV_NEXT(init_common, "modex-barrier");
719710

720711
if (!opal_process_info.is_singleton) {
721712
/* if we executed the above fence in the background, then
@@ -740,9 +731,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
740731
}
741732
}
742733

743-
/* check for timing request - get stop time and report elapsed
744-
time if so, then start the clock again */
745-
OMPI_TIMING_NEXT("barrier");
734+
OPAL_TIMING_ENV_NEXT(init_common, "barrier");
746735

747736
#if OPAL_ENABLE_PROGRESS_THREADS == 0
748737
/* Start setting up the event engine for MPI operations. Don't
@@ -751,7 +740,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
751740
CPU utilization for the remainder of MPI_INIT when we are
752741
blocking on RTE-level events, but may greatly reduce non-TCP
753742
latency. */
754-
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
743+
int old_event_flags = opal_progress_set_event_flag(0);
744+
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
755745
#endif
756746

757747
/* Undo OPAL calling opal_progress_event_users_increment() during

ompi/runtime/ompi_mpi_init.c

Lines changed: 5 additions & 123 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
2828
* All Rights reserved.
2929
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
30-
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
30+
* Copyright (c) 2021-2023 Triad National Security, LLC. All rights
3131
* reserved.
3232
* $COPYRIGHT$
3333
*
@@ -291,14 +291,6 @@ void ompi_mpi_thread_level(int requested, int *provided)
291291
MPI_THREAD_MULTIPLE);
292292
}
293293

294-
static void fence_release(pmix_status_t status, void *cbdata)
295-
{
296-
volatile bool *active = (volatile bool*)cbdata;
297-
OPAL_ACQUIRE_OBJECT(active);
298-
*active = false;
299-
OPAL_POST_OBJECT(active);
300-
}
301-
302294
int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
303295
bool reinit_ok)
304296
{
@@ -307,9 +299,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
307299
#if OPAL_USING_INTERNAL_PMIX
308300
char *evar;
309301
#endif
310-
volatile bool active;
311-
bool background_fence = false;
312-
pmix_info_t info[2];
313302
pmix_status_t rc;
314303
OMPI_TIMING_INIT(64);
315304

@@ -364,12 +353,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
364353

365354
ompi_hook_base_mpi_init_top_post_opal(argc, argv, requested, provided);
366355

367-
/* initialize communicator subsystem */
368-
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
369-
error = "ompi_mpi_init: ompi_comm_init_mpi3 failed";
370-
goto error;
371-
}
372-
373356
/* Bozo argument check */
374357
if (NULL == argv && argc > 1) {
375358
ret = OMPI_ERR_BAD_PARAM;
@@ -392,69 +375,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
392375
free(tmp);
393376
}
394377

395-
#if (OPAL_ENABLE_TIMING)
396-
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
397-
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
398-
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
399-
ret = opal_pmix_convert_status(rc);
400-
error = "timing: pmix-barrier-1 failed";
401-
goto error;
402-
}
403-
OMPI_TIMING_NEXT("pmix-barrier-1");
404-
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
405-
ret = opal_pmix_convert_status(rc);
406-
error = "timing: pmix-barrier-2 failed";
407-
goto error;
408-
}
409-
OMPI_TIMING_NEXT("pmix-barrier-2");
410-
}
411-
#endif
412-
413-
if (!opal_process_info.is_singleton) {
414-
if (opal_pmix_base_async_modex) {
415-
/* if we are doing an async modex, but we are collecting all
416-
* data, then execute the non-blocking modex in the background.
417-
* All calls to modex_recv will be cached until the background
418-
* modex completes. If collect_all_data is false, then we skip
419-
* the fence completely and retrieve data on-demand from the
420-
* source node.
421-
*/
422-
if (opal_pmix_collect_all_data) {
423-
/* execute the fence_nb in the background to collect
424-
* the data */
425-
background_fence = true;
426-
active = true;
427-
OPAL_POST_OBJECT(&active);
428-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
429-
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
430-
fence_release,
431-
(void*)&active))) {
432-
ret = opal_pmix_convert_status(rc);
433-
error = "PMIx_Fence_nb() failed";
434-
goto error;
435-
}
436-
}
437-
} else {
438-
/* we want to do the modex - we block at this point, but we must
439-
* do so in a manner that allows us to call opal_progress so our
440-
* event library can be cycled as we have tied PMIx to that
441-
* event base */
442-
active = true;
443-
OPAL_POST_OBJECT(&active);
444-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
445-
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
446-
if( PMIX_SUCCESS != rc) {
447-
ret = opal_pmix_convert_status(rc);
448-
error = "PMIx_Fence() failed";
449-
goto error;
450-
}
451-
/* cannot just wait on thread as we need to call opal_progress */
452-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
453-
}
454-
}
455-
456-
OMPI_TIMING_NEXT("modex");
457-
458378
MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
459379
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
460380

@@ -491,48 +411,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
491411
/* Do we need to wait for a debugger? */
492412
ompi_rte_wait_for_debugger();
493413

494-
/* Next timing measurement */
495-
OMPI_TIMING_NEXT("modex-barrier");
496-
497-
if (!opal_process_info.is_singleton) {
498-
/* if we executed the above fence in the background, then
499-
* we have to wait here for it to complete. However, there
500-
* is no reason to do two barriers! */
501-
if (background_fence) {
502-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
503-
} else if (!ompi_async_mpi_init) {
504-
/* wait for everyone to reach this point - this is a hard
505-
* barrier requirement at this time, though we hope to relax
506-
* it at a later point */
507-
bool flag = false;
508-
active = true;
509-
OPAL_POST_OBJECT(&active);
510-
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
511-
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
512-
fence_release, (void*)&active))) {
513-
ret = opal_pmix_convert_status(rc);
514-
error = "PMIx_Fence_nb() failed";
515-
goto error;
516-
}
517-
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
518-
}
519-
}
520-
521-
/* check for timing request - get stop time and report elapsed
522-
time if so, then start the clock again */
523-
OMPI_TIMING_NEXT("barrier");
524-
525-
#if OPAL_ENABLE_PROGRESS_THREADS == 0
526-
/* Start setting up the event engine for MPI operations. Don't
527-
block in the event library, so that communications don't take
528-
forever between procs in the dynamic code. This will increase
529-
CPU utilization for the remainder of MPI_INIT when we are
530-
blocking on RTE-level events, but may greatly reduce non-TCP
531-
latency. */
532-
int old_event_flags = opal_progress_set_event_flag(0);
533-
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
534-
#endif
535-
536414
/* wire up the mpi interface, if requested. Do this after the
537415
non-block switch for non-TCP performance. Do before the
538416
polling change as anyone with a complex wire-up is going to be
@@ -592,6 +470,10 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
592470
opal_atomic_wmb();
593471
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);
594472

473+
OMPI_TIMING_IMPORT_OPAL("opal_init_util");
474+
OMPI_TIMING_IMPORT_OPAL("opal_init");
475+
OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common");
476+
595477
/* Finish last measurement, output results
596478
* and clear timing structure */
597479
OMPI_TIMING_NEXT("barrier-finish");

0 commit comments

Comments
 (0)