File tree Expand file tree Collapse file tree 1 file changed +11
-0
lines changed Expand file tree Collapse file tree 1 file changed +11
-0
lines changed Original file line number Diff line number Diff line change @@ -137,6 +137,17 @@ ompi_mtl_ofi_context_progress(int ctxt_id)
137
137
& error ,
138
138
0 );
139
139
if (0 > ret ) {
140
+ /*
141
+ * In multi-threaded scenarios, any thread that attempts to read
142
+ * a CQ when there's a pending error CQ entry gets an
143
+ * -FI_EAVAIL. Without any serialization here (which is okay,
144
+ * since libfabric will protect access to critical CQ objects),
145
+ * all threads proceed to read from the error CQ, but only one
146
+ * thread fetches the entry while others get -FI_EAGAIN
147
+ * indicating an empty queue, which is not erroneous.
148
+ */
149
+ if (ret == - FI_EAGAIN )
150
+ return count ;
140
151
opal_output (0 , "%s:%d: Error returned from fi_cq_readerr: %s(%zd).\n"
141
152
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n" ,
142
153
__FILE__ , __LINE__ , fi_strerror (- ret ), ret );
You can’t perform that action at this time.
0 commit comments