Skip to content

Commit 5162011

Browse files
committed
Revert "Handle error cases in TCP BTL"
This reverts commit 6acebc4. That patch was causing numerous "Socket closed" messages, which were responsible for most of the failures on Cisco's MTT run. See #5849 for more information.

Signed-off-by: Brian Barrett <bbarrett@amazon.com>
1 parent ee6cb4b commit 5162011

File tree

3 files changed

+12
-42
lines changed

3 files changed

+12
-42
lines changed

opal/mca/btl/tcp/btl_tcp.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2017 The University of Tennessee and The University
6+
* Copyright (c) 2004-2014 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -38,8 +38,6 @@
3838
#include "btl_tcp_proc.h"
3939
#include "btl_tcp_endpoint.h"
4040

41-
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
42-
mca_btl_base_module_error_cb_fn_t cbfunc);
4341

4442
mca_btl_tcp_module_t mca_btl_tcp_module = {
4543
.super = {
@@ -53,20 +51,11 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
5351
.btl_send = mca_btl_tcp_send,
5452
.btl_put = mca_btl_tcp_put,
5553
.btl_dump = mca_btl_base_dump,
56-
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
5754
.btl_ft_event = mca_btl_tcp_ft_event
5855
},
5956
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
6057
};
6158

62-
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
63-
mca_btl_base_module_error_cb_fn_t cbfunc)
64-
{
65-
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*)btl;
66-
tcp_btl->tcp_error_cb = cbfunc;
67-
return OPAL_SUCCESS;
68-
}
69-
7059
/**
7160
*
7261
*/

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2017 The University of Tennessee and The University
5+
* Copyright (c) 2004-2016 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -387,7 +387,6 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
387387
{
388388
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
389389
if (ret < 0) {
390-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
391390
mca_btl_tcp_endpoint_close(btl_endpoint);
392391
}
393392
return ret;
@@ -537,30 +536,20 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
537536
btl_endpoint->endpoint_sd = -1;
538537
/**
539538
* If we keep failing to connect to the peer let the caller know about
540-
* this situation by triggering the callback on all pending fragments and
541-
* reporting the error. The upper layer has then the opportunity to
542-
* re-route or re-schedule the fragments.
539+
* this situation by triggering all the pending fragments callback and
540+
* reporting the error.
543541
*/
544542
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
545543
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
546544
if( NULL == frag )
547545
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
548546
while(NULL != frag) {
549547
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
550-
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
551-
MCA_BTL_TCP_FRAG_RETURN(frag);
552-
}
548+
553549
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
554550
}
555-
btl_endpoint->endpoint_send_frag = NULL;
556-
/* Let's report the error upstream */
557-
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
558-
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
559-
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
560-
}
561-
} else {
562-
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
563551
}
552+
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
564553
}
565554

566555
/*
@@ -617,6 +606,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
617606
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
618607
true, opal_process_info.nodename,
619608
getpid(), "did not receive entire connect ACK from peer");
609+
620610
return OPAL_ERR_BAD_PARAM;
621611
}
622612
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@@ -636,7 +626,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
636626
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
637627
BTL_ERROR(("received unexpected process identifier %s",
638628
OPAL_NAME_PRINT(guid)));
639-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
640629
mca_btl_tcp_endpoint_close(btl_endpoint);
641630
return OPAL_ERR_UNREACH;
642631
}
@@ -843,7 +832,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
843832
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
844833
((struct sockaddr_in*) &endpoint_addr)->sin_port,
845834
strerror(opal_socket_errno), opal_socket_errno));
846-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
847835
mca_btl_tcp_endpoint_close(btl_endpoint);
848836
return OPAL_ERROR;
849837
}
@@ -860,7 +848,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
860848
getpid(), msg,
861849
strerror(opal_socket_errno), opal_socket_errno);
862850
free(msg);
863-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
864851
mca_btl_tcp_endpoint_close(btl_endpoint);
865852
return OPAL_ERROR;
866853
}
@@ -932,15 +919,12 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
932919
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
933920
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
934921
}
935-
else if (OPAL_ERR_BAD_PARAM == rc
936-
|| OPAL_ERROR == rc) {
922+
else if (OPAL_ERR_BAD_PARAM == rc) {
937923
/* If we get a BAD_PARAM, it means that it probably wasn't
938924
an OMPI process on the other end of the socket (e.g.,
939-
the magic string ID failed). recv_connect_ack already cleaned
940-
up the socket. */
941-
/* If we get OPAL_ERROR, the other end closed the connection
942-
* because it has initiated a symetrical connexion on its end.
943-
* recv_connect_ack already cleaned up the socket. */
925+
the magic string ID failed). So we can probably just
926+
close the socket and ignore this connection. */
927+
CLOSE_THE_SOCKET(sd);
944928
}
945929
else {
946930
/* Otherwise, it probably *was* an OMPI peer process on
@@ -1079,8 +1063,6 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
10791063
opal_event_del(&btl_endpoint->endpoint_send_event);
10801064
}
10811065
break;
1082-
case MCA_BTL_TCP_FAILED:
1083-
break;
10841066
default:
10851067
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
10861068
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
212212
cnt = readv(sd, frag->iov_ptr, num_vecs);
213213
if( 0 < cnt ) goto advance_iov_position;
214214
if( cnt == 0 ) {
215-
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
216-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
215+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
217216
mca_btl_tcp_endpoint_close(btl_endpoint);
218217
return false;
219218
}

0 commit comments

Comments (0)