Skip to content

Commit 068ece3

Browse files
committed
ulfm/ishrink: updates for new cid model
Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
1 parent ad5a52e commit 068ece3

File tree

1 file changed: 49 additions and 27 deletions.

ompi/communicator/ft/comm_ft.c

Lines changed: 49 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -244,11 +244,9 @@ int ompi_comm_shrink_internal(ompi_communicator_t* comm, ompi_communicator_t** n
244244
opal_mutex_unlock(&ompi_group_afp_mutex);
245245
#if OPAL_ENABLE_DEBUG
246246
stop = PMPI_Wtime();
247-
#endif
248247
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
249248
"%s ompi: comm_shrink: group_inter: %g seconds",
250249
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-start));
251-
#if OPAL_ENABLE_DEBUG
252250
start = PMPI_Wtime();
253251
#endif
254252
do {
@@ -266,10 +264,10 @@ int ompi_comm_shrink_internal(ompi_communicator_t* comm, ompi_communicator_t** n
266264
} while( MPI_ERR_PROC_FAILED == rc );
267265
#if OPAL_ENABLE_DEBUG
268266
stop = PMPI_Wtime();
269-
#endif
270267
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
271268
"%s ompi: comm_shrink: AGREE: %g seconds",
272269
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-start));
270+
#endif
273271
if(OMPI_SUCCESS != rc) {
274272
opal_output(0, "%s:%d Agreement failure: %d\n", __FILE__, __LINE__, rc);
275273
exit_status = rc;
@@ -313,7 +311,7 @@ int ompi_comm_shrink_internal(ompi_communicator_t* comm, ompi_communicator_t** n
313311
comm->c_keyhash, /* attrs */
314312
comm->error_handler, /* error handler */
315313
alive_group, /* local group */
316-
alive_rgroup /* remote group */
314+
alive_rgroup, /* remote group */
317315
0 /* flags */
318316
);
319317
if( OMPI_SUCCESS != rc ) {
@@ -326,10 +324,10 @@ int ompi_comm_shrink_internal(ompi_communicator_t* comm, ompi_communicator_t** n
326324
}
327325
#if OPAL_ENABLE_DEBUG
328326
stop = PMPI_Wtime();
329-
#endif
330327
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
331328
"%s ompi: comm_shrink: GRP COMPUTATION: %g seconds\n",
332329
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-start));
330+
#endif
333331
/*
334332
* Step 3: Determine context id
335333
*/
@@ -356,10 +354,10 @@ int ompi_comm_shrink_internal(ompi_communicator_t* comm, ompi_communicator_t** n
356354
}
357355
#if OPAL_ENABLE_DEBUG
358356
stop = PMPI_Wtime();
359-
#endif
360357
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
361358
"%s ompi: comm_shrink: NEXT CID: %g seconds\n",
362359
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-start));
360+
#endif
363361
/*
364362
* Step 4: Activate the communicator
365363
*/
@@ -426,15 +424,14 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request);
426424
static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request);
427425
static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request);
428426
static int ompi_comm_ishrink_check_activate(ompi_comm_request_t *request);
429-
static int ompi_comm_ishrink_check_finished(ompi_comm_request_t *request) {
430-
return OMPI_SUCCESS;
431-
}
432427

433428
int ompi_comm_ishrink_internal(ompi_communicator_t* comm, ompi_communicator_t** newcomm, ompi_request_t** req)
434429
{
435430
int rc;
436431
int flag = 1;
432+
#if OPAL_ENABLE_DEBUG
437433
double stop;
434+
#endif
438435
ompi_comm_request_t *request;
439436
ompi_comm_ishrink_context_t *context;
440437
ompi_request_t *subreq[1];
@@ -466,13 +463,17 @@ int ompi_comm_ishrink_internal(ompi_communicator_t* comm, ompi_communicator_t**
466463
OPAL_OUTPUT_VERBOSE((5, ompi_ftmpi_output_handle,
467464
"%s ompi: comm_ishrink: Agreement on failed processes",
468465
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME) ));
469-
context->start = MPI_Wtime();
466+
#if OPAL_ENABLE_DEBUG
467+
context->start = PMPI_Wtime();
468+
#endif
470469
ompi_group_intersection(comm->c_remote_group, ompi_group_all_failed_procs, &context->failed_group);
471-
stop = MPI_Wtime();
470+
#if OPAL_ENABLE_DEBUG
471+
stop = PMPI_Wtime();
472472
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
473473
"%s ompi: comm_ishrink: group_inter: %g seconds",
474474
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-context->start));
475-
context->start = MPI_Wtime();
475+
context->start = PMPI_Wtime();
476+
#endif
476477
/* We need to create the list of alive processes. Thus, we don't care about
477478
* the value of flag, instead we are only using the globally consistent
478479
* return value.
@@ -505,15 +506,19 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
505506
(ompi_comm_ishrink_context_t *)request->context;
506507
ompi_communicator_t *comm = context->comm;
507508
ompi_request_t *subreq[1];
508-
int rc, mode, flag = 1;
509-
double stop;
510509
ompi_group_t *comm_group = NULL;
510+
int rc, flag = 1;
511+
#if OPAL_ENABLE_DEBUG
512+
double stop;
513+
#endif
511514

512-
stop = MPI_Wtime();
513-
rc = request->super.req_status.MPI_ERROR;
515+
#if OPAL_ENABLE_DEBUG
516+
stop = PMPI_Wtime();
514517
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
515518
"%s ompi: comm_ishrink: AGREE: %g seconds",
516519
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-context->start));
520+
#endif
521+
rc = request->super.req_status.MPI_ERROR;
517522
if( (OMPI_SUCCESS != rc) && (MPI_ERR_PROC_FAILED != rc) ) {
518523
opal_output(0, "%s:%d Agreement failure: %d\n", __FILE__, __LINE__, rc);
519524
return rc;
@@ -546,10 +551,11 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
546551
OPAL_OUTPUT_VERBOSE((5, ompi_ftmpi_output_handle,
547552
"%s ompi: comm_ishrink: Determine ranking for new communicator",
548553
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME) ));
549-
context->start = MPI_Wtime();
554+
#if OPAL_ENABLE_DEBUG
555+
context->start = PMPI_Wtime();
556+
#endif
550557

551558
/* Create 'alive' groups */
552-
mode = OMPI_COMM_CID_INTRA_FT;
553559
comm_group = comm->c_local_group;
554560
rc = ompi_group_difference(comm_group, context->failed_group, &context->alive_group);
555561
if( OMPI_SUCCESS != rc ) {
@@ -558,7 +564,6 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
558564
return rc;
559565
}
560566
if( OMPI_COMM_IS_INTER(comm) ) {
561-
mode = OMPI_COMM_CID_INTER_FT;
562567
comm_group = comm->c_remote_group;
563568
rc = ompi_group_difference(comm_group, context->failed_group, &context->alive_rgroup);
564569
if( OMPI_SUCCESS != rc ) {
@@ -579,9 +584,9 @@ static int ompi_comm_ishrink_check_agree(ompi_comm_request_t *request) {
579584
NULL, /* remote_ranks */
580585
comm->c_keyhash, /* attrs */
581586
comm->error_handler, /* error handler */
582-
false, /* topo component */
583587
context->alive_group, /* local group */
584588
context->alive_rgroup, /* remote group */
589+
0, /* flags */
585590
subreq
586591
);
587592
if( OMPI_SUCCESS != rc ) {
@@ -602,7 +607,9 @@ static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request) {
602607
(ompi_comm_ishrink_context_t *)request->context;
603608
ompi_request_t *subreq[1];
604609
int rc, mode;
610+
#if OPAL_ENABLE_DEBUG
605611
double stop;
612+
#endif
606613

607614
/* cleanup temporary groups */
608615
OBJ_RELEASE(context->alive_group);
@@ -620,10 +627,12 @@ static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request) {
620627
return rc;
621628
}
622629

623-
stop = MPI_Wtime();
630+
#if OPAL_ENABLE_DEBUG
631+
stop = PMPI_Wtime();
624632
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
625633
"%s ompi: comm_ishrink: GRP COMPUTATION: %g seconds\n",
626634
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-context->start));
635+
#endif
627636

628637
/*
629638
* Step 3: Determine context id
@@ -636,7 +645,9 @@ static int ompi_comm_ishrink_check_setrank(ompi_comm_request_t *request) {
636645
OPAL_OUTPUT_VERBOSE((5, ompi_ftmpi_output_handle,
637646
"%s ompi: comm_ishrink: Determine context id",
638647
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME) ));
639-
context->start = MPI_Wtime();
648+
#if OPAL_ENABLE_DEBUG
649+
context->start = PMPI_Wtime();
650+
#endif
640651
rc = ompi_comm_nextcid_nb( *context->newcomm, /* new communicator */
641652
context->comm, /* old comm */
642653
NULL, /* bridge comm */
@@ -661,7 +672,9 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
661672
(ompi_comm_ishrink_context_t *)request->context;
662673
ompi_request_t *subreq[1];
663674
int rc, mode;
675+
#if OPAL_ENABLE_DEBUG
664676
double stop;
677+
#endif
665678

666679
rc = request->super.req_status.MPI_ERROR;
667680
if( OMPI_SUCCESS != rc ) {
@@ -672,10 +685,13 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
672685
OBJ_RELEASE(*context->newcomm);
673686
return rc;
674687
}
675-
stop = MPI_Wtime();
688+
#if OPAL_ENABLE_DEBUG
689+
stop = PMPI_Wtime();
676690
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
677691
"%s ompi: comm_ishrink: NEXT CID: %g seconds\n",
678692
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-context->start));
693+
#endif
694+
679695
/*
680696
* Step 4: Activate the communicator
681697
*/
@@ -687,8 +703,11 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
687703
/* Set name for debugging purposes */
688704
ompi_communicator_t *newcomp = *context->newcomm;
689705
snprintf(newcomp->c_name, MPI_MAX_OBJECT_NAME, "MPI COMMUNICATOR %d SHRUNK FROM %d",
690-
newcomp->c_contextid, context->comm->c_contextid );
691-
context->start = MPI_Wtime();
706+
ompi_comm_get_local_cid(newcomp),
707+
ompi_comm_get_local_cid(context->comm));
708+
#if OPAL_ENABLE_DEBUG
709+
context->start = PMPI_Wtime();
710+
#endif
692711
/* activate communicator and init coll-module */
693712
rc = ompi_comm_activate_nb( context->newcomm, /* new communicator */
694713
context->comm,
@@ -711,18 +730,21 @@ static int ompi_comm_ishrink_check_cid(ompi_comm_request_t *request) {
711730
static int ompi_comm_ishrink_check_activate(ompi_comm_request_t *request) {
712731
ompi_comm_ishrink_context_t *context =
713732
(ompi_comm_ishrink_context_t *)request->context;
714-
double stop;
715733
int rc;
734+
#if OPAL_ENABLE_DEBUG
735+
double stop;
736+
#endif
716737

717738
rc = request->super.req_status.MPI_ERROR;
718739
if( OMPI_SUCCESS != rc ) {
719740
return rc;
720741
}
721-
742+
#if OPAL_ENABLE_DEBUG
722743
stop = MPI_Wtime();
723744
OPAL_OUTPUT_VERBOSE((10, ompi_ftmpi_output_handle,
724745
"%s ompi: comm_ishrink: COLL SELECT: %g seconds\n",
725746
OMPI_NAME_PRINT(OMPI_PROC_MY_NAME), stop-context->start));
747+
#endif
726748

727749
return OMPI_SUCCESS;
728750
}

0 commit comments

Comments
 (0)