Skip to content

Commit 211fc6d

Browse files
committed
Revert "PMIx_Fences - remove unneeded ones during MPI initialization"
This reverts commit b3ae758.
1 parent b8f18a3 commit 211fc6d

File tree

2 files changed

+155
-29
lines changed

2 files changed

+155
-29
lines changed

ompi/instance/instance.c

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22
/*
3-
* Copyright (c) 2018-2023 Triad National Security, LLC. All rights
3+
* Copyright (c) 2018-2022 Triad National Security, LLC. All rights
44
* reserved.
55
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
66
* Copyright (c) 2022 The University of Tennessee and The University
@@ -21,14 +21,6 @@
2121
#include "opal/util/show_help.h"
2222
#include "opal/util/argv.h"
2323
#include "opal/runtime/opal_params.h"
24-
#include "opal/util/timings.h"
25-
#include "opal/mca/allocator/base/base.h"
26-
#include "opal/mca/rcache/base/base.h"
27-
#include "opal/mca/mpool/base/base.h"
28-
#include "opal/mca/smsc/base/base.h"
29-
#include "opal/mca/mpool/base/mpool_base_tree.h"
30-
#include "opal/mca/pmix/pmix-internal.h"
31-
#include "opal/mca/pmix/base/base.h"
3224

3325
#include "ompi/mca/pml/pml.h"
3426
#include "ompi/runtime/params.h"
@@ -44,19 +36,26 @@
4436
#include "ompi/dpm/dpm.h"
4537
#include "ompi/file/file.h"
4638
#include "ompi/mpiext/mpiext.h"
47-
#include "ompi/util/timings.h"
4839

4940
#include "ompi/mca/hook/base/base.h"
5041
#include "ompi/mca/op/base/base.h"
42+
#include "opal/mca/allocator/base/base.h"
43+
#include "opal/mca/rcache/base/base.h"
44+
#include "opal/mca/mpool/base/base.h"
45+
#include "opal/mca/smsc/base/base.h"
5146
#include "ompi/mca/bml/base/base.h"
5247
#include "ompi/mca/pml/base/base.h"
5348
#include "ompi/mca/coll/base/base.h"
5449
#include "ompi/mca/osc/base/base.h"
5550
#include "ompi/mca/part/base/base.h"
5651
#include "ompi/mca/io/base/base.h"
5752
#include "ompi/mca/topo/base/base.h"
53+
#include "opal/mca/pmix/base/base.h"
5854

55+
#include "opal/mca/mpool/base/mpool_base_tree.h"
5956
#include "ompi/mca/pml/base/pml_base_bsend.h"
57+
#include "ompi/util/timings.h"
58+
#include "opal/mca/pmix/pmix-internal.h"
6059

6160
ompi_predefined_instance_t ompi_mpi_instance_null = {{{{0}}}};
6261

@@ -345,8 +344,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
345344
pmix_info_t info[2];
346345
pmix_status_t rc;
347346
opal_pmix_lock_t mylock;
348-
349-
OPAL_TIMING_ENV_INIT(init_common);
347+
OMPI_TIMING_INIT(64);
350348

351349
ret = ompi_mpi_instance_retain ();
352350
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
@@ -387,15 +385,13 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
387385
mca_base_var_set_value(ret, allvalue, 4, MCA_BASE_VAR_SOURCE_DEFAULT, NULL);
388386
}
389387

390-
OPAL_TIMING_ENV_NEXT(init_common, "initialization");
388+
OMPI_TIMING_NEXT("initialization");
391389

392390
/* Setup RTE */
393391
if (OMPI_SUCCESS != (ret = ompi_rte_init (&argc, &argv))) {
394392
return ompi_instance_print_error ("ompi_mpi_init: ompi_rte_init failed", ret);
395393
}
396394

397-
OPAL_TIMING_ENV_NEXT(init_common, "ompi_rte_init");
398-
399395
/* open the ompi hook framework */
400396
for (int i = 0 ; ompi_framework_dependencies[i] ; ++i) {
401397
ret = mca_base_framework_open (ompi_framework_dependencies[i], 0);
@@ -408,6 +404,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
408404
}
409405
}
410406

407+
OMPI_TIMING_NEXT("rte_init");
408+
OMPI_TIMING_IMPORT_OPAL("orte_ess_base_app_setup");
409+
OMPI_TIMING_IMPORT_OPAL("rte_init");
410+
411411
ompi_rte_initialized = true;
412412
/* if we are oversubscribed, then set yield_when_idle
413413
* accordingly */
@@ -509,6 +509,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
509509
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
510510
}
511511

512+
OMPI_TIMING_IMPORT_OPAL("orte_init");
513+
OMPI_TIMING_NEXT("rte_init-commit");
514+
512515
/* exchange connection info - this function may also act as a barrier
513516
* if data exchange is required. The modex occurs solely across procs
514517
* in our job. If a barrier is required, the "modex" function will
@@ -519,20 +522,19 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
519522
return ret; /* TODO: need to fix this */
520523
}
521524

522-
OPAL_TIMING_ENV_NEXT(init_common, "PMIx_Commit");
523-
525+
OMPI_TIMING_NEXT("commit");
524526
#if (OPAL_ENABLE_TIMING)
525527
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
526528
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
527529
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
528530
ret = opal_pmix_convert_status(rc);
529531
return ompi_instance_print_error ("timing: pmix-barrier-1 failed", ret);
530532
}
531-
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-1");
533+
OMPI_TIMING_NEXT("pmix-barrier-1");
532534
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
533535
return ompi_instance_print_error ("timing: pmix-barrier-2 failed", ret);
534536
}
535-
OPAL_TIMING_ENV_NEXT(init_common, "pmix-barrier-2");
537+
OMPI_TIMING_NEXT("pmix-barrier-2");
536538
}
537539
#endif
538540

@@ -577,7 +579,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
577579
}
578580
}
579581

580-
OPAL_TIMING_ENV_NEXT(init_common, "modex");
582+
OMPI_TIMING_NEXT("modex");
581583

582584
/* select buffered send allocator component to be used */
583585
if (OMPI_SUCCESS != (ret = mca_pml_base_bsend_init ())) {
@@ -625,6 +627,14 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
625627
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
626628
}
627629

630+
if (mca_pml_base_requires_world ()) {
631+
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
632+
* be the case. */
633+
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
634+
return ompi_instance_print_error ("ompi_comm_init_mpi3 () failed", ret);
635+
}
636+
}
637+
628638
/* initialize file handles */
629639
if (OMPI_SUCCESS != (ret = ompi_file_init ())) {
630640
return ompi_instance_print_error ("ompi_file_init() failed", ret);
@@ -701,8 +711,11 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
701711
return ompi_instance_print_error ("ompi_mpi_init: ompi_comm_cid_init failed", ret);
702712
}
703713

714+
/* Do we need to wait for a debugger? */
715+
ompi_rte_wait_for_debugger();
716+
704717
/* Next timing measurement */
705-
OPAL_TIMING_ENV_NEXT(init_common, "modex-barrier");
718+
OMPI_TIMING_NEXT("modex-barrier");
706719

707720
if (!opal_process_info.is_singleton) {
708721
/* if we executed the above fence in the background, then
@@ -727,7 +740,9 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
727740
}
728741
}
729742

730-
OPAL_TIMING_ENV_NEXT(init_common, "barrier");
743+
/* check for timing request - get stop time and report elapsed
744+
time if so, then start the clock again */
745+
OMPI_TIMING_NEXT("barrier");
731746

732747
#if OPAL_ENABLE_PROGRESS_THREADS == 0
733748
/* Start setting up the event engine for MPI operations. Don't
@@ -736,8 +751,7 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
736751
CPU utilization for the remainder of MPI_INIT when we are
737752
blocking on RTE-level events, but may greatly reduce non-TCP
738753
latency. */
739-
int old_event_flags = opal_progress_set_event_flag(0);
740-
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
754+
opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
741755
#endif
742756

743757
/* Undo OPAL calling opal_progress_event_users_increment() during

ompi/runtime/ompi_mpi_init.c

Lines changed: 117 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
* Copyright (c) 2020 Amazon.com, Inc. or its affiliates.
2828
* All Rights reserved.
2929
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
30-
* Copyright (c) 2021-2023 Triad National Security, LLC. All rights
30+
* Copyright (c) 2021-2022 Triad National Security, LLC. All rights
3131
* reserved.
3232
* $COPYRIGHT$
3333
*
@@ -291,6 +291,14 @@ void ompi_mpi_thread_level(int requested, int *provided)
291291
MPI_THREAD_MULTIPLE);
292292
}
293293

294+
static void fence_release(pmix_status_t status, void *cbdata)
295+
{
296+
volatile bool *active = (volatile bool*)cbdata;
297+
OPAL_ACQUIRE_OBJECT(active);
298+
*active = false;
299+
OPAL_POST_OBJECT(active);
300+
}
301+
294302
int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
295303
bool reinit_ok)
296304
{
@@ -299,6 +307,9 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
299307
#if OPAL_USING_INTERNAL_PMIX
300308
char *evar;
301309
#endif
310+
volatile bool active;
311+
bool background_fence = false;
312+
pmix_info_t info[2];
302313
pmix_status_t rc;
303314
OMPI_TIMING_INIT(64);
304315

@@ -381,6 +392,69 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
381392
free(tmp);
382393
}
383394

395+
#if (OPAL_ENABLE_TIMING)
396+
if (OMPI_TIMING_ENABLED && !opal_pmix_base_async_modex &&
397+
opal_pmix_collect_all_data && !opal_process_info.is_singleton) {
398+
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
399+
ret = opal_pmix_convert_status(rc);
400+
error = "timing: pmix-barrier-1 failed";
401+
goto error;
402+
}
403+
OMPI_TIMING_NEXT("pmix-barrier-1");
404+
if (PMIX_SUCCESS != (rc = PMIx_Fence(NULL, 0, NULL, 0))) {
405+
ret = opal_pmix_convert_status(rc);
406+
error = "timing: pmix-barrier-2 failed";
407+
goto error;
408+
}
409+
OMPI_TIMING_NEXT("pmix-barrier-2");
410+
}
411+
#endif
412+
413+
if (!opal_process_info.is_singleton) {
414+
if (opal_pmix_base_async_modex) {
415+
/* if we are doing an async modex, but we are collecting all
416+
* data, then execute the non-blocking modex in the background.
417+
* All calls to modex_recv will be cached until the background
418+
* modex completes. If collect_all_data is false, then we skip
419+
* the fence completely and retrieve data on-demand from the
420+
* source node.
421+
*/
422+
if (opal_pmix_collect_all_data) {
423+
/* execute the fence_nb in the background to collect
424+
* the data */
425+
background_fence = true;
426+
active = true;
427+
OPAL_POST_OBJECT(&active);
428+
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
429+
if( PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, NULL, 0,
430+
fence_release,
431+
(void*)&active))) {
432+
ret = opal_pmix_convert_status(rc);
433+
error = "PMIx_Fence_nb() failed";
434+
goto error;
435+
}
436+
}
437+
} else {
438+
/* we want to do the modex - we block at this point, but we must
439+
* do so in a manner that allows us to call opal_progress so our
440+
* event library can be cycled as we have tied PMIx to that
441+
* event base */
442+
active = true;
443+
OPAL_POST_OBJECT(&active);
444+
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &opal_pmix_collect_all_data, PMIX_BOOL);
445+
rc = PMIx_Fence_nb(NULL, 0, info, 1, fence_release, (void*)&active);
446+
if( PMIX_SUCCESS != rc) {
447+
ret = opal_pmix_convert_status(rc);
448+
error = "PMIx_Fence() failed";
449+
goto error;
450+
}
451+
/* cannot just wait on thread as we need to call opal_progress */
452+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
453+
}
454+
}
455+
456+
OMPI_TIMING_NEXT("modex");
457+
384458
MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
385459
MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));
386460

@@ -417,6 +491,48 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
417491
/* Do we need to wait for a debugger? */
418492
ompi_rte_wait_for_debugger();
419493

494+
/* Next timing measurement */
495+
OMPI_TIMING_NEXT("modex-barrier");
496+
497+
if (!opal_process_info.is_singleton) {
498+
/* if we executed the above fence in the background, then
499+
* we have to wait here for it to complete. However, there
500+
* is no reason to do two barriers! */
501+
if (background_fence) {
502+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
503+
} else if (!ompi_async_mpi_init) {
504+
/* wait for everyone to reach this point - this is a hard
505+
* barrier requirement at this time, though we hope to relax
506+
* it at a later point */
507+
bool flag = false;
508+
active = true;
509+
OPAL_POST_OBJECT(&active);
510+
PMIX_INFO_LOAD(&info[0], PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
511+
if (PMIX_SUCCESS != (rc = PMIx_Fence_nb(NULL, 0, info, 1,
512+
fence_release, (void*)&active))) {
513+
ret = opal_pmix_convert_status(rc);
514+
error = "PMIx_Fence_nb() failed";
515+
goto error;
516+
}
517+
OMPI_LAZY_WAIT_FOR_COMPLETION(active);
518+
}
519+
}
520+
521+
/* check for timing request - get stop time and report elapsed
522+
time if so, then start the clock again */
523+
OMPI_TIMING_NEXT("barrier");
524+
525+
#if OPAL_ENABLE_PROGRESS_THREADS == 0
526+
/* Start setting up the event engine for MPI operations. Don't
527+
block in the event library, so that communications don't take
528+
forever between procs in the dynamic code. This will increase
529+
CPU utilization for the remainder of MPI_INIT when we are
530+
blocking on RTE-level events, but may greatly reduce non-TCP
531+
latency. */
532+
int old_event_flags = opal_progress_set_event_flag(0);
533+
opal_progress_set_event_flag(old_event_flags | OPAL_EVLOOP_NONBLOCK);
534+
#endif
535+
420536
/* wire up the mpi interface, if requested. Do this after the
421537
non-block switch for non-TCP performance. Do before the
422538
polling change as anyone with a complex wire-up is going to be
@@ -476,10 +592,6 @@ int ompi_mpi_init(int argc, char **argv, int requested, int *provided,
476592
opal_atomic_wmb();
477593
opal_atomic_swap_32(&ompi_mpi_state, OMPI_MPI_STATE_INIT_COMPLETED);
478594

479-
OMPI_TIMING_IMPORT_OPAL("opal_init_util");
480-
OMPI_TIMING_IMPORT_OPAL("opal_init");
481-
OMPI_TIMING_IMPORT_OPAL("ompi_mpi_instance_init_common");
482-
483595
/* Finish last measurement, output results
484596
* and clear timing structure */
485597
OMPI_TIMING_NEXT("barrier-finish");

0 commit comments

Comments
 (0)