Skip to content

Commit b6b300d

Browse files
authored
Merge pull request #6784 from abouteiller/export/event-infloop
Address a race condition in libevent select.
2 parents 4358e75 + c39fb57 commit b6b300d

File tree

1 file changed

+24
-5
lines changed
  • opal/mca/event/libevent2022/libevent

1 file changed

+24
-5
lines changed

opal/mca/event/libevent2022/libevent/select.c

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <string.h>
4343
#include <unistd.h>
4444
#include <errno.h>
45+
#include <fcntl.h>
4546

4647
#include "event-internal.h"
4748
#include "evsignal-internal.h"
@@ -166,12 +167,30 @@ select_dispatch(struct event_base *base, struct timeval *tv)
166167
check_selectop(sop);
167168

168169
if (res == -1) {
169-
if (errno != EINTR) {
170-
event_warn("select");
171-
return (-1);
170+
if (errno == EINTR) {
171+
return (0);
172172
}
173-
174-
return (0);
173+
/* There seems to be a very subtle race condition between the
174+
* event_del and the select, where the fd is still active on the
175+
* event_readset_in but no libevent structure makes reference
176+
* to it. Thus, any call to progress will do nothing more
177+
* than print a warning, leading to deadlocks.
178+
* If we force remove the problematic fd, we get the warning only
179+
* once, and things work as expected.
180+
*/
181+
event_warn("select");
182+
for (j = 0; j < nfds; ++j) {
183+
if (FD_ISSET(j, sop->event_readset_in) ||
184+
FD_ISSET(j, sop->event_writeset_in)) {
185+
res = fcntl(j, F_GETFL);
186+
if( res == -1 ) {
187+
event_warn("bad file descriptor %d/%d\n", j, nfds);
188+
FD_CLR(j, sop->event_readset_in);
189+
FD_CLR(j, sop->event_writeset_in);
190+
}
191+
}
192+
}
193+
return (-1);
175194
}
176195

177196
event_debug(("%s: select reports %d", __func__, res));

0 commit comments

Comments
 (0)