Skip to content

Commit 4b5fad4

Browse files
authored
Merge pull request #12388 from wenduwan/mtl_ofi_null_err_cxt
mtl/ofi: bail gracefully if completion error context is null
2 parents: a166ad7 + 0cde19e; commit: 4b5fad4

File tree

1 file changed: 18 additions (+18) and 10 deletions (-10).

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 18 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -158,8 +158,7 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
158158
"%s:%d: Error returned by request (type: %d) event callback: %zd.\n"
159159
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
160160
__FILE__, __LINE__, req_type, ret);
161-
fflush(stderr);
162-
exit(1);
161+
goto bail;
163162
}
164163
}
165164
}
@@ -181,16 +180,23 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
181180
* thread fetches the entry while others get -FI_EAGAIN
182181
* indicating an empty queue, which is not erroneous.
183182
*/
184-
if (ret == -FI_EAGAIN)
183+
if (ret == -FI_EAGAIN) {
185184
return count;
185+
}
186186
opal_output(0, "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
187187
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
188188
__FILE__, __LINE__, fi_strerror(-ret), ret);
189-
fflush(stderr);
190-
exit(1);
189+
goto bail;
190+
}
191+
192+
if (!error.op_context) {
193+
opal_output(0, "%s:%d: Error returned from fi_cq_readerr with null context. "
194+
"Completion flags: %016lx\n"
195+
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
196+
__FILE__, __LINE__, error.flags);
197+
goto bail;
191198
}
192199

193-
assert(error.op_context);
194200
ofi_req = TO_OFI_REQ(error.op_context);
195201
assert(ofi_req);
196202
req_type = ofi_req->type;
@@ -200,18 +206,20 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
200206
"%s:%d: Error returned by request (type: %d) error callback: %zd.\n"
201207
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
202208
__FILE__, __LINE__, req_type, ret);
203-
fflush(stderr);
204-
exit(1);
209+
goto bail;
205210
}
206211
} else if (ret != -FI_EAGAIN && ret != -EINTR) {
207212
opal_output(0, "%s:%d: Error returned from fi_cq_read: %s(%zd).\n"
208213
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
209214
__FILE__, __LINE__, fi_strerror(-ret), ret);
210-
fflush(stderr);
211-
exit(1);
215+
goto bail;
212216
}
213217

214218
return count;
219+
220+
bail:
221+
fflush(stderr);
222+
exit(1);
215223
}
216224

217225
__opal_attribute_always_inline__ static inline int

0 commit comments

Comments
 (0)