Commit 96c91e9

Manage errors in communicator creations (cid)

In order for this to work, error management needs to also be added to NBC, from a separate PR.

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

The error field of requests needs to be rearmed at start, not at create.

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

1 parent 1b96be5 commit 96c91e9

File tree

6 files changed: +62 −29 lines changed
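The recurring pattern across the files below: a fallible step's return code is propagated to the caller instead of being swallowed, and any half-built communicator is released before returning. A minimal standalone sketch of that pattern, using hypothetical stand-in names (my_comm_t, next_step, MY_ERR_OUT_OF_RESOURCE) rather than the real OMPI objects:

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for OMPI's return codes and refcounted objects. */
#define MY_SUCCESS               0
#define MY_ERR_OUT_OF_RESOURCE (-2)

typedef struct { int refcount; } my_comm_t;

static void my_comm_release(my_comm_t *comm) {
    /* stands in for OBJ_RELEASE: free when the last reference drops */
    if (0 == --comm->refcount) free(comm);
}

/* A step that can fail, standing in for ompi_comm_nextcid()/ompi_comm_activate(). */
static int next_step(my_comm_t *comm) {
    (void) comm;
    return MY_ERR_OUT_OF_RESOURCE; /* simulate a failure */
}

static int build_comm(my_comm_t **ncomm) {
    my_comm_t *comm = malloc(sizeof(*comm));
    if (NULL == comm) return MY_ERR_OUT_OF_RESOURCE;
    comm->refcount = 1;

    int rc = next_step(comm);
    if (MY_SUCCESS != rc) {
        my_comm_release(comm); /* release the half-built object... */
        return rc;             /* ...and forward the real error code */
    }
    *ncomm = comm;
    return MY_SUCCESS;
}

int main(void) {
    my_comm_t *comm = NULL;
    printf("build_comm -> %d\n", build_comm(&comm));
    return 0;
}
```

In the actual diff, OBJ_RELEASE(newcomp) plays the role of my_comm_release, and the forwarded codes come from ompi_comm_nextcid() and ompi_comm_activate().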

ompi/communicator/comm.c

Lines changed: 25 additions & 16 deletions

```diff
@@ -120,10 +120,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
     }
 
     if (NULL != req) {
-        ompi_request_wait( &req, MPI_STATUS_IGNORE);
+        rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
     }
 
-    return OMPI_SUCCESS;
+    return rc;
 }
 
 /*
@@ -1006,6 +1006,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1022,6 +1023,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1138,6 +1140,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
                                  NULL, false, mode, subreq);
     if (OMPI_SUCCESS != rc) {
         ompi_comm_request_return (request);
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
 
@@ -1166,6 +1169,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(context->newcomp);
         return rc;
     }
 
@@ -1208,6 +1212,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* Determine context id. It is identical to f_2_c_handle */
     rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1218,6 +1223,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
     /* activate communicator and init coll-module */
     rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
     if ( OMPI_SUCCESS != rc ) {
+        OBJ_RELEASE(newcomp);
         return rc;
     }
 
@@ -1517,16 +1523,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
 /**********************************************************************/
 /**********************************************************************/
 /**********************************************************************/
-ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
-                                     ompi_communicator_t *bridge_comm,
-                                     int local_leader,
-                                     int remote_leader,
-                                     int tag,
-                                     int rsize)
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+                           ompi_communicator_t *bridge_comm,
+                           int local_leader,
+                           int remote_leader,
+                           int tag,
+                           int rsize,
+                           ompi_proc_t ***prprocs )
 {
-
     MPI_Request req;
-    int rc;
+    int rc = OMPI_SUCCESS;
     int local_rank, local_size;
     ompi_proc_t **rprocs=NULL;
     int32_t size_len;
@@ -1543,7 +1549,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
     if (local_rank == local_leader) {
         sbuf = OBJ_NEW(opal_buffer_t);
         if (NULL == sbuf) {
-            rc = OMPI_ERROR;
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
             goto err_exit;
         }
         if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1595,6 +1601,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
         /* Allocate temporary buffer */
         recvbuf = (char *)malloc(rlen);
         if ( NULL == recvbuf ) {
+            rc = OMPI_ERR_OUT_OF_RESOURCE;
             goto err_exit;
         }
 
@@ -1626,19 +1633,20 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
 
     rbuf = OBJ_NEW(opal_buffer_t);
     if (NULL == rbuf) {
-        rc = OMPI_ERROR;
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
         goto err_exit;
     }
 
     if (OMPI_SUCCESS != (rc = opal_dss.load(rbuf, recvbuf, rlen))) {
         goto err_exit;
     }
 
-    /* decode the names into a proc-list */
+    /* decode the names into a proc-list -- will never add a new proc
+       as the result of this operation, so no need to get the newprocs
+       list or call PML add_procs(). */
     rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
     OBJ_RELEASE(rbuf);
     if (OMPI_SUCCESS != rc) {
-        OMPI_ERROR_LOG(rc);
         goto err_exit;
     }
 
@@ -1658,14 +1666,14 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
 
     /* And now add the information into the database */
     if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
-        OMPI_ERROR_LOG(rc);
         goto err_exit;
     }
 
  err_exit:
     /* rprocs isn't freed unless we have an error,
        since it is used in the communicator */
     if ( OMPI_SUCCESS != rc ) {
+        OMPI_ERROR_LOG(rc);
         opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
         if ( NULL != rprocs ) {
             free ( rprocs );
@@ -1686,7 +1694,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
         free ( sendbuf );
     }
 
-    return rprocs;
+    *prprocs = rprocs;
+    return rc;
 }
 /**********************************************************************/
 /**********************************************************************/
```
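The signature change to ompi_comm_get_rprocs above swaps a pointer return (NULL on failure) for an int status code, with the proc array delivered through an out-parameter, so callers learn why the lookup failed. A standalone sketch of that calling convention, with hypothetical stand-ins (get_rprocs, my_proc_t, MY_ERR_UNREACH) in place of the real API:

```c
#include <stdio.h>

/* Hypothetical stand-ins; not the real OMPI API. */
#define MY_SUCCESS      0
#define MY_ERR_UNREACH (-12)

typedef struct { int rank; } my_proc_t;

/* New-style shape: the status code is the return value and the proc array
 * comes back through an out-parameter, so a failed lookup reports its
 * cause instead of an indistinct NULL. */
static int get_rprocs(int rsize, my_proc_t ***prprocs) {
    (void) rsize;
    *prprocs = NULL;        /* nothing is produced on failure */
    return MY_ERR_UNREACH;  /* simulate an unreachable peer */
}

int main(void) {
    my_proc_t **rprocs = NULL;
    int rc = get_rprocs(4, &rprocs);
    if (MY_SUCCESS != rc) {
        fprintf(stderr, "remote proc lookup failed: %d\n", rc);
        return 1;
    }
    return 0;
}
```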

ompi/communicator/comm_cid.c

Lines changed: 16 additions & 2 deletions

```diff
@@ -370,6 +370,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
     int ret;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
     }
@@ -407,11 +414,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
     int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
 
+    if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
+        if (participate) {
+            opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
+        }
+        return request->super.req_status.MPI_ERROR;
+    }
+
     if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
         return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
     }
 
-    if (1 == context->rflag) {
+    if (0 != context->rflag) {
         if( !participate ) {
             /* we need to provide something sane here
              * but we cannot use `nextcid` as we may have it
@@ -442,7 +456,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
         return OMPI_SUCCESS;
     }
 
-    if (participate && (1 == context->flag)) {
+    if (participate && (0 != context->flag)) {
         /* we could use this cid, but other don't agree */
         opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
         context->start = context->nextcid + 1; /* that's where we can start the next round */
```
ompi/communicator/comm_request.c

Lines changed: 9 additions & 1 deletion

```diff
@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
         while (request_item->subreq_count) {
             ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
             if( REQUEST_COMPLETE(subreq) ) {
+                if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
+                    /* Let it continue but mark it as failed, so
+                     * that it does some subreqs cleanup */
+                    request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
+                }
                 ompi_request_free (&subreq);
                 request_item->subreq_count--;
             } else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
         if (item_complete) {
             if (request_item->callback) {
                 opal_mutex_unlock (&ompi_comm_request_mutex);
+                /* the callback should check for errors in the request
+                 * status. */
                 rc = request_item->callback (request);
                 opal_mutex_lock (&ompi_comm_request_mutex);
             }
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
             /* if the request schedule is empty then the request is complete */
             if (0 == opal_list_get_size (&request->schedule)) {
                 opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
-                request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
+                request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
                 ompi_request_complete (&request->super, true);
             }
         }
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
     }
 
     request->super.req_state = OMPI_REQUEST_ACTIVE;
+    request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
 
     opal_mutex_unlock (&ompi_comm_request_mutex);
 }
```
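The last hunk implements the commit-message note that the error field is rearmed at start, not at create: a request object recycled from a free list would otherwise still carry its previous failure. A standalone toy illustration (hypothetical my_request_t, not the real ompi_request_t):

```c
#include <stdio.h>

#define MY_SUCCESS 0

/* Hypothetical toy request; the real ompi_request_t carries much more. */
typedef struct { int mpi_error; } my_request_t;

/* If the error field were cleared only at create time, a request object
 * recycled from a free list would still carry its previous failure. */
static void request_start(my_request_t *req) {
    req->mpi_error = MY_SUCCESS; /* rearm on every start */
}

int main(void) {
    my_request_t req = { 15 };   /* stale error from an earlier use */
    request_start(&req);
    printf("error after start: %d\n", req.mpi_error); /* prints 0 */
    return 0;
}
```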

ompi/communicator/communicator.h

Lines changed: 7 additions & 6 deletions

```diff
@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
  * The routine makes sure, that all processes have afterwards
  * a list of ompi_proc_t pointers for the remote group.
  */
-struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
-                                            ompi_communicator_t *bridge_comm,
-                                            int local_leader,
-                                            int remote_leader,
-                                            int tag,
-                                            int rsize);
+int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
+                           ompi_communicator_t *bridge_comm,
+                           int local_leader,
+                           int remote_leader,
+                           int tag,
+                           int rsize,
+                           struct ompi_proc_t ***prprocs );
 
 /**
  * This routine verifies, whether local_group and remote group are overlapping
```

ompi/errhandler/errhandler_invoke.c

Lines changed: 1 addition & 0 deletions

```diff
@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
     /* Invoke the exception */
     switch (type) {
     case OMPI_REQUEST_PML:
+    case OMPI_REQUEST_COLL:
         return ompi_errhandler_invoke(mpi_object.comm->error_handler,
                                       mpi_object.comm,
                                       mpi_object.comm->errhandler_type,
```

ompi/mpi/c/intercomm_create.c

Lines changed: 4 additions & 4 deletions

```diff
@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
         goto err_exit;
     }
 
-    rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
-                                   remote_leader, tag, rsize );
-    if ( NULL == rprocs ) {
+    rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
+                               remote_leader, tag, rsize, &rprocs );
+    if ( OMPI_SUCCESS != rc ) {
         goto err_exit;
     }
 
@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
     }
     if ( OMPI_SUCCESS != rc ) {
         *newintercomm = MPI_COMM_NULL;
-        return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
+        return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
                                       FUNC_NAME);
     }
 
```
