Skip to content

Commit e54496b

Browse files
authored
Merge pull request #6087 from ICLDisco/export/errors_cid
Manage errors in communicator creations (cid)
2 parents 17be4c6 + 96c91e9 commit e54496b

File tree

6 files changed

+62
-29
lines changed

6 files changed

+62
-29
lines changed

ompi/communicator/comm.c

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,10 @@ int ompi_comm_set ( ompi_communicator_t **ncomm,
121121
}
122122

123123
if (NULL != req) {
124-
ompi_request_wait( &req, MPI_STATUS_IGNORE);
124+
rc = ompi_request_wait( &req, MPI_STATUS_IGNORE);
125125
}
126126

127-
return OMPI_SUCCESS;
127+
return rc;
128128
}
129129

130130
/*
@@ -1007,6 +1007,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
10071007
/* Determine context id. It is identical to f_2_c_handle */
10081008
rc = ompi_comm_nextcid (newcomp, comm, NULL, NULL, NULL, false, mode);
10091009
if ( OMPI_SUCCESS != rc ) {
1010+
OBJ_RELEASE(newcomp);
10101011
return rc;
10111012
}
10121013

@@ -1023,6 +1024,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp
10231024
/* activate communicator and init coll-module */
10241025
rc = ompi_comm_activate (&newcomp, comm, NULL, NULL, NULL, false, mode);
10251026
if ( OMPI_SUCCESS != rc ) {
1027+
OBJ_RELEASE(newcomp);
10261028
return rc;
10271029
}
10281030

@@ -1139,6 +1141,7 @@ static int ompi_comm_idup_getcid (ompi_comm_request_t *request)
11391141
NULL, false, mode, subreq);
11401142
if (OMPI_SUCCESS != rc) {
11411143
ompi_comm_request_return (request);
1144+
OBJ_RELEASE(context->newcomp);
11421145
return rc;
11431146
}
11441147

@@ -1167,6 +1170,7 @@ static int ompi_comm_idup_with_info_activate (ompi_comm_request_t *request)
11671170
/* activate communicator and init coll-module */
11681171
rc = ompi_comm_activate_nb (&context->newcomp, context->comm, NULL, NULL, NULL, false, mode, subreq);
11691172
if ( OMPI_SUCCESS != rc ) {
1173+
OBJ_RELEASE(context->newcomp);
11701174
return rc;
11711175
}
11721176

@@ -1209,6 +1213,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
12091213
/* Determine context id. It is identical to f_2_c_handle */
12101214
rc = ompi_comm_nextcid (newcomp, comm, NULL, &tag, NULL, false, mode);
12111215
if ( OMPI_SUCCESS != rc ) {
1216+
OBJ_RELEASE(newcomp);
12121217
return rc;
12131218
}
12141219

@@ -1219,6 +1224,7 @@ int ompi_comm_create_group (ompi_communicator_t *comm, ompi_group_t *group, int
12191224
/* activate communicator and init coll-module */
12201225
rc = ompi_comm_activate (&newcomp, comm, NULL, &tag, NULL, false, mode);
12211226
if ( OMPI_SUCCESS != rc ) {
1227+
OBJ_RELEASE(newcomp);
12221228
return rc;
12231229
}
12241230

@@ -1516,16 +1522,16 @@ int ompi_comm_free( ompi_communicator_t **comm )
15161522
/**********************************************************************/
15171523
/**********************************************************************/
15181524
/**********************************************************************/
1519-
ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
1520-
ompi_communicator_t *bridge_comm,
1521-
int local_leader,
1522-
int remote_leader,
1523-
int tag,
1524-
int rsize)
1525+
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
1526+
ompi_communicator_t *bridge_comm,
1527+
int local_leader,
1528+
int remote_leader,
1529+
int tag,
1530+
int rsize,
1531+
ompi_proc_t ***prprocs )
15251532
{
1526-
15271533
MPI_Request req;
1528-
int rc;
1534+
int rc = OMPI_SUCCESS;
15291535
int local_rank, local_size;
15301536
ompi_proc_t **rprocs=NULL;
15311537
int32_t size_len;
@@ -1542,7 +1548,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
15421548
if (local_rank == local_leader) {
15431549
sbuf = OBJ_NEW(opal_buffer_t);
15441550
if (NULL == sbuf) {
1545-
rc = OMPI_ERROR;
1551+
rc = OMPI_ERR_OUT_OF_RESOURCE;
15461552
goto err_exit;
15471553
}
15481554
if(OMPI_GROUP_IS_DENSE(local_comm->c_local_group)) {
@@ -1594,6 +1600,7 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
15941600
/* Allocate temporary buffer */
15951601
recvbuf = (char *)malloc(rlen);
15961602
if ( NULL == recvbuf ) {
1603+
rc = OMPI_ERR_OUT_OF_RESOURCE;
15971604
goto err_exit;
15981605
}
15991606

@@ -1625,19 +1632,20 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
16251632

16261633
rbuf = OBJ_NEW(opal_buffer_t);
16271634
if (NULL == rbuf) {
1628-
rc = OMPI_ERROR;
1635+
rc = OMPI_ERR_OUT_OF_RESOURCE;
16291636
goto err_exit;
16301637
}
16311638

16321639
if (OMPI_SUCCESS != (rc = opal_dss.load(rbuf, recvbuf, rlen))) {
16331640
goto err_exit;
16341641
}
16351642

1636-
/* decode the names into a proc-list */
1643+
/* decode the names into a proc-list -- will never add a new proc
1644+
as the result of this operation, so no need to get the newprocs
1645+
list or call PML add_procs(). */
16371646
rc = ompi_proc_unpack(rbuf, rsize, &rprocs, NULL, NULL);
16381647
OBJ_RELEASE(rbuf);
16391648
if (OMPI_SUCCESS != rc) {
1640-
OMPI_ERROR_LOG(rc);
16411649
goto err_exit;
16421650
}
16431651

@@ -1657,14 +1665,14 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
16571665

16581666
/* And now add the information into the database */
16591667
if (OMPI_SUCCESS != (rc = MCA_PML_CALL(add_procs(rprocs, rsize)))) {
1660-
OMPI_ERROR_LOG(rc);
16611668
goto err_exit;
16621669
}
16631670

16641671
err_exit:
16651672
/* rprocs isn't freed unless we have an error,
16661673
since it is used in the communicator */
16671674
if ( OMPI_SUCCESS != rc ) {
1675+
OMPI_ERROR_LOG(rc);
16681676
opal_output(0, "%d: Error in ompi_get_rprocs\n", local_rank);
16691677
if ( NULL != rprocs ) {
16701678
free ( rprocs );
@@ -1685,7 +1693,8 @@ ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
16851693
free ( sendbuf );
16861694
}
16871695

1688-
return rprocs;
1696+
*prprocs = rprocs;
1697+
return rc;
16891698
}
16901699
/**********************************************************************/
16911700
/**********************************************************************/

ompi/communicator/comm_cid.c

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,13 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
372372
int ret;
373373
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
374374

375+
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
376+
if (participate) {
377+
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
378+
}
379+
return request->super.req_status.MPI_ERROR;
380+
}
381+
375382
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
376383
return ompi_comm_request_schedule_append (request, ompi_comm_checkcid, NULL, 0);
377384
}
@@ -409,11 +416,18 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
409416
ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
410417
int participate = (context->newcomm->c_local_group->grp_my_rank != MPI_UNDEFINED);
411418

419+
if (OMPI_SUCCESS != request->super.req_status.MPI_ERROR) {
420+
if (participate) {
421+
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextcid, NULL);
422+
}
423+
return request->super.req_status.MPI_ERROR;
424+
}
425+
412426
if (OPAL_THREAD_TRYLOCK(&ompi_cid_lock)) {
413427
return ompi_comm_request_schedule_append (request, ompi_comm_nextcid_check_flag, NULL, 0);
414428
}
415429

416-
if (1 == context->rflag) {
430+
if (0 != context->rflag) {
417431
if( !participate ) {
418432
/* we need to provide something sane here
419433
* but we cannot use `nextcid` as we may have it
@@ -444,7 +458,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
444458
return OMPI_SUCCESS;
445459
}
446460

447-
if (participate && (1 == context->flag)) {
461+
if (participate && (0 != context->flag)) {
448462
/* we could use this cid, but other don't agree */
449463
opal_pointer_array_set_item (&ompi_mpi_communicators, context->nextcid, NULL);
450464
context->start = context->nextcid + 1; /* that's where we can start the next round */

ompi/communicator/comm_request.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ static int ompi_comm_request_progress (void)
119119
while (request_item->subreq_count) {
120120
ompi_request_t *subreq = request_item->subreqs[request_item->subreq_count-1];
121121
if( REQUEST_COMPLETE(subreq) ) {
122+
if (OMPI_SUCCESS != subreq->req_status.MPI_ERROR) {
123+
/* Let it continue but mark it as failed, so
124+
* that it does some subreqs cleanup */
125+
request->super.req_status.MPI_ERROR = subreq->req_status.MPI_ERROR;
126+
}
122127
ompi_request_free (&subreq);
123128
request_item->subreq_count--;
124129
} else {
@@ -130,6 +135,8 @@ static int ompi_comm_request_progress (void)
130135
if (item_complete) {
131136
if (request_item->callback) {
132137
opal_mutex_unlock (&ompi_comm_request_mutex);
138+
/* the callback should check for errors in the request
139+
* status. */
133140
rc = request_item->callback (request);
134141
opal_mutex_lock (&ompi_comm_request_mutex);
135142
}
@@ -142,7 +149,7 @@ static int ompi_comm_request_progress (void)
142149
/* if the request schedule is empty then the request is complete */
143150
if (0 == opal_list_get_size (&request->schedule)) {
144151
opal_list_remove_item (&ompi_comm_requests_active, (opal_list_item_t *) request);
145-
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : MPI_ERR_INTERN;
152+
request->super.req_status.MPI_ERROR = (OMPI_SUCCESS == rc) ? MPI_SUCCESS : rc;
146153
ompi_request_complete (&request->super, true);
147154
}
148155
}
@@ -171,6 +178,7 @@ void ompi_comm_request_start (ompi_comm_request_t *request)
171178
}
172179

173180
request->super.req_state = OMPI_REQUEST_ACTIVE;
181+
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
174182

175183
opal_mutex_unlock (&ompi_comm_request_mutex);
176184
}

ompi/communicator/communicator.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -649,12 +649,13 @@ OMPI_DECLSPEC int ompi_comm_set_nb ( ompi_communicator_t **ncomm,
649649
* The routine makes sure, that all processes have afterwards
650650
* a list of ompi_proc_t pointers for the remote group.
651651
*/
652-
struct ompi_proc_t **ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
653-
ompi_communicator_t *bridge_comm,
654-
int local_leader,
655-
int remote_leader,
656-
int tag,
657-
int rsize);
652+
int ompi_comm_get_rprocs ( ompi_communicator_t *local_comm,
653+
ompi_communicator_t *bridge_comm,
654+
int local_leader,
655+
int remote_leader,
656+
int tag,
657+
int rsize,
658+
struct ompi_proc_t ***prprocs );
658659

659660
/**
660661
* This routine verifies, whether local_group and remote group are overlapping

ompi/errhandler/errhandler_invoke.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ int ompi_errhandler_request_invoke(int count,
160160
/* Invoke the exception */
161161
switch (type) {
162162
case OMPI_REQUEST_PML:
163+
case OMPI_REQUEST_COLL:
163164
return ompi_errhandler_invoke(mpi_object.comm->error_handler,
164165
mpi_object.comm,
165166
mpi_object.comm->errhandler_type,

ompi/mpi/c/intercomm_create.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,9 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
137137
goto err_exit;
138138
}
139139

140-
rprocs = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
141-
remote_leader, tag, rsize );
142-
if ( NULL == rprocs ) {
140+
rc = ompi_comm_get_rprocs( local_comm, bridge_comm, lleader,
141+
remote_leader, tag, rsize, &rprocs );
142+
if ( OMPI_SUCCESS != rc ) {
143143
goto err_exit;
144144
}
145145

@@ -222,7 +222,7 @@ int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader,
222222
}
223223
if ( OMPI_SUCCESS != rc ) {
224224
*newintercomm = MPI_COMM_NULL;
225-
return OMPI_ERRHANDLER_INVOKE(local_comm, MPI_ERR_INTERN,
225+
return OMPI_ERRHANDLER_INVOKE(local_comm, rc,
226226
FUNC_NAME);
227227
}
228228

0 commit comments

Comments
 (0)