Skip to content

Commit 6b48e41

Browse files
bosilca and abouteiller
authored and committed
Catch and gracefully handle error during HAN initialization.
Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent 996c0dc commit 6b48e41

File tree

1 file changed

+36
-15
lines changed

1 file changed

+36
-15
lines changed

ompi/mca/coll/han/coll_han_subcomms.c

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
5252
ompi_communicator_t **low_comm = &(han_module->sub_comm[INTRA_NODE]);
5353
ompi_communicator_t **up_comm = &(han_module->sub_comm[INTER_NODE]);
5454
mca_coll_han_collectives_fallback_t fallbacks;
55-
int vrank, *vranks;
55+
int rc = OMPI_SUCCESS, vrank, *vranks;
5656
opal_info_t comm_info;
5757

5858
/* The sub communicators have already been created */
@@ -91,9 +91,12 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
9191
* all participants.
9292
*/
9393
int local_procs = ompi_group_count_local_peers(comm->c_local_group);
94-
comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
95-
MPI_MAX, comm,
96-
comm->c_coll->coll_allreduce_module);
94+
rc = comm->c_coll->coll_allreduce(MPI_IN_PLACE, &local_procs, 1, MPI_INT,
95+
MPI_MAX, comm,
96+
comm->c_coll->coll_allreduce_module);
97+
if( OMPI_SUCCESS != rc ) {
98+
goto return_with_error;
99+
}
97100
if( local_procs == 1 ) {
98101
/* restore saved collectives */
99102
HAN_SUBCOM_LOAD_COLLECTIVE(fallbacks, comm, han_module, allgatherv);
@@ -118,8 +121,12 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
118121
*/
119122
opal_info_set(&comm_info, "ompi_comm_coll_preference", "han");
120123
opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTRA_NODE");
121-
ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
122-
&comm_info, low_comm);
124+
rc = ompi_comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0,
125+
&comm_info, low_comm);
126+
if( OMPI_SUCCESS != rc ) {
127+
/* cannot create subcommunicators. Return the error upstream */
128+
goto return_with_error;
129+
}
123130

124131
/*
125132
* Get my local rank and the local size
@@ -132,7 +139,11 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
132139
* same intra-node rank id share such a sub-communicator
133140
*/
134141
opal_info_set(&comm_info, "ompi_comm_coll_han_topo_level", "INTER_NODE");
135-
ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false);
142+
rc = ompi_comm_split_with_info(comm, low_rank, w_rank, &comm_info, up_comm, false);
143+
if( OMPI_SUCCESS != rc ) {
144+
/* cannot create subcommunicators. Return the error upstream */
145+
goto return_with_error;
146+
}
136147

137148
up_rank = ompi_comm_rank(*up_comm);
138149

@@ -150,14 +161,13 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
150161
* gather vrank from each process so every process will know other processes
151162
* vrank
152163
*/
153-
comm->c_coll->coll_allgather(&vrank,
154-
1,
155-
MPI_INT,
156-
vranks,
157-
1,
158-
MPI_INT,
159-
comm,
160-
comm->c_coll->coll_allgather_module);
164+
rc = comm->c_coll->coll_allgather(&vrank, 1, MPI_INT,
165+
vranks, 1, MPI_INT,
166+
comm, comm->c_coll->coll_allgather_module);
167+
if( OMPI_SUCCESS != rc ) {
168+
/* cannot create subcommunicators. Return the error upstream */
169+
goto return_with_error;
170+
}
161171

162172
/*
163173
* Set the cached info
@@ -175,6 +185,17 @@ int mca_coll_han_comm_create_new(struct ompi_communicator_t *comm,
175185

176186
OBJ_DESTRUCT(&comm_info);
177187
return OMPI_SUCCESS;
188+
189+
return_with_error:
190+
if( NULL != *low_comm ) {
191+
ompi_comm_free(low_comm);
192+
*low_comm = NULL; /* don't leave the MPI_COMM_NULL set by ompi_comm_free */
193+
}
194+
if( NULL != *up_comm ) {
195+
ompi_comm_free(up_comm);
196+
*up_comm = NULL; /* don't leave the MPI_COMM_NULL set by ompi_comm_free */
197+
}
198+
return rc;
178199
}
179200

180201
/*

0 commit comments

Comments (0)