Skip to content

Commit 9996b9f

Browse files
authored
Merge pull request #7720 from abouteiller/bugfix/tcp-failed-lock
Race condition when closing TCP endpoint with error
2 parents f744668 + 0e93d0f commit 9996b9f

File tree

2 files changed

+16
-11
lines changed

2 files changed

+16
-11
lines changed

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,8 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
388388
{
389389
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
390390
if (ret < 0) {
391+
/* send-lock not needed because never called when the socket is in the
392+
* event set. */
391393
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
392394
mca_btl_tcp_endpoint_close(btl_endpoint);
393395
}
@@ -1077,6 +1079,7 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
10771079
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
10781080
int btl_ownership = (frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP);
10791081

1082+
assert(btl_endpoint->endpoint_state == MCA_BTL_TCP_CONNECTED);
10801083
if(mca_btl_tcp_frag_send(frag, btl_endpoint->endpoint_sd) == false) {
10811084
break;
10821085
}

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "opal/util/show_help.h"
5050

5151
#include "btl_tcp_frag.h"
52+
#include "btl_tcp_proc.h"
5253
#include "btl_tcp_endpoint.h"
5354
#include "btl_tcp_proc.h"
5455

@@ -130,13 +131,15 @@ bool mca_btl_tcp_frag_send(mca_btl_tcp_frag_t* frag, int sd)
130131
BTL_ERROR(("mca_btl_tcp_frag_send: writev error (%p, %lu)\n\t%s(%lu)\n",
131132
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
132133
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
134+
/* send_lock held by caller */
133135
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
134136
mca_btl_tcp_endpoint_close(frag->endpoint);
135137
return false;
136138
default:
137139
BTL_ERROR(("mca_btl_tcp_frag_send: writev failed: %s (%d)",
138140
strerror(opal_socket_errno),
139141
opal_socket_errno));
142+
/* send_lock held by caller */
140143
frag->endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
141144
mca_btl_tcp_endpoint_close(frag->endpoint);
142145
return false;
@@ -215,9 +218,11 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
215218
cnt = readv(sd, frag->iov_ptr, num_vecs);
216219
if( 0 < cnt ) goto advance_iov_position;
217220
if( cnt == 0 ) {
221+
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
218222
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
219223
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
220224
mca_btl_tcp_endpoint_close(btl_endpoint);
225+
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
221226
return false;
222227
}
223228
switch(opal_socket_errno) {
@@ -229,28 +234,25 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
229234
BTL_ERROR(("mca_btl_tcp_frag_recv: readv error (%p, %lu)\n\t%s(%lu)\n",
230235
frag->iov_ptr[0].iov_base, (unsigned long) frag->iov_ptr[0].iov_len,
231236
strerror(opal_socket_errno), (unsigned long) frag->iov_cnt));
232-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
233-
mca_btl_tcp_endpoint_close(btl_endpoint);
234-
return false;
235-
237+
break;
236238
case ECONNRESET:
237239
errhost = opal_get_proc_hostname(btl_endpoint->endpoint_proc->proc_opal);
238240
opal_show_help("help-mpi-btl-tcp.txt", "peer hung up",
239241
true, opal_process_info.nodename,
240242
getpid(), errhost);
241243
free(errhost);
242-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
243-
mca_btl_tcp_endpoint_close(btl_endpoint);
244-
return false;
245-
244+
break;
246245
default:
247246
BTL_ERROR(("mca_btl_tcp_frag_recv: readv failed: %s (%d)",
248247
strerror(opal_socket_errno),
249248
opal_socket_errno));
250-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
251-
mca_btl_tcp_endpoint_close(btl_endpoint);
252-
return false;
249+
break;
253250
}
251+
OPAL_THREAD_LOCK(&btl_endpoint->endpoint_send_lock);
252+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
253+
mca_btl_tcp_endpoint_close(btl_endpoint);
254+
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
255+
return false;
254256
} while( cnt < 0 );
255257

256258
advance_iov_position:

0 commit comments

Comments
 (0)