Skip to content

Commit da1189d

Browse files
authored
Merge pull request #5916 from bwbarrett/revert/6acebc4
Revert "Handle error cases in TCP BTL"
2 parents 069084e + 5162011 commit da1189d

File tree

3 files changed

+12
-42
lines changed

3 files changed

+12
-42
lines changed

opal/mca/btl/tcp/btl_tcp.c

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2007 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2017 The University of Tennessee and The University
6+
* Copyright (c) 2004-2014 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -38,8 +38,6 @@
3838
#include "btl_tcp_proc.h"
3939
#include "btl_tcp_endpoint.h"
4040

41-
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
42-
mca_btl_base_module_error_cb_fn_t cbfunc);
4341

4442
mca_btl_tcp_module_t mca_btl_tcp_module = {
4543
.super = {
@@ -53,20 +51,11 @@ mca_btl_tcp_module_t mca_btl_tcp_module = {
5351
.btl_send = mca_btl_tcp_send,
5452
.btl_put = mca_btl_tcp_put,
5553
.btl_dump = mca_btl_base_dump,
56-
.btl_register_error = mca_btl_tcp_register_error_cb, /* register error */
5754
.btl_ft_event = mca_btl_tcp_ft_event
5855
},
5956
.tcp_endpoints_mutex = OPAL_MUTEX_STATIC_INIT
6057
};
6158

62-
static int mca_btl_tcp_register_error_cb(struct mca_btl_base_module_t* btl,
63-
mca_btl_base_module_error_cb_fn_t cbfunc)
64-
{
65-
mca_btl_tcp_module_t* tcp_btl = (mca_btl_tcp_module_t*)btl;
66-
tcp_btl->tcp_error_cb = cbfunc;
67-
return OPAL_SUCCESS;
68-
}
69-
7059
/**
7160
*
7261
*/

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 10 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2017 The University of Tennessee and The University
5+
* Copyright (c) 2004-2016 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -388,7 +388,6 @@ mca_btl_tcp_endpoint_send_blocking(mca_btl_base_endpoint_t* btl_endpoint,
388388
{
389389
int ret = mca_btl_tcp_send_blocking(btl_endpoint->endpoint_sd, data, size);
390390
if (ret < 0) {
391-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
392391
mca_btl_tcp_endpoint_close(btl_endpoint);
393392
}
394393
return ret;
@@ -539,30 +538,20 @@ void mca_btl_tcp_endpoint_close(mca_btl_base_endpoint_t* btl_endpoint)
539538
btl_endpoint->endpoint_sd = -1;
540539
/**
541540
* If we keep failing to connect to the peer let the caller know about
542-
* this situation by triggering the callback on all pending fragments and
543-
* reporting the error. The upper layer has then the opportunity to
544-
* re-route or re-schedule the fragments.
541+
* this situation by triggering the callbacks of all pending fragments and
542+
* reporting the error.
545543
*/
546544
if( MCA_BTL_TCP_FAILED == btl_endpoint->endpoint_state ) {
547545
mca_btl_tcp_frag_t* frag = btl_endpoint->endpoint_send_frag;
548546
if( NULL == frag )
549547
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
550548
while(NULL != frag) {
551549
frag->base.des_cbfunc(&frag->btl->super, frag->endpoint, &frag->base, OPAL_ERR_UNREACH);
552-
if( frag->base.des_flags & MCA_BTL_DES_FLAGS_BTL_OWNERSHIP ) {
553-
MCA_BTL_TCP_FRAG_RETURN(frag);
554-
}
550+
555551
frag = (mca_btl_tcp_frag_t*)opal_list_remove_first(&btl_endpoint->endpoint_frags);
556552
}
557-
btl_endpoint->endpoint_send_frag = NULL;
558-
/* Let's report the error upstream */
559-
if(NULL != btl_endpoint->endpoint_btl->tcp_error_cb) {
560-
btl_endpoint->endpoint_btl->tcp_error_cb((mca_btl_base_module_t*)btl_endpoint->endpoint_btl, 0,
561-
btl_endpoint->endpoint_proc->proc_opal, "Socket closed");
562-
}
563-
} else {
564-
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
565553
}
554+
btl_endpoint->endpoint_state = MCA_BTL_TCP_CLOSED;
566555
}
567556

568557
/*
@@ -619,6 +608,7 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
619608
opal_show_help("help-mpi-btl-tcp.txt", "client handshake fail",
620609
true, opal_process_info.nodename,
621610
getpid(), "did not receive entire connect ACK from peer");
611+
622612
return OPAL_ERR_BAD_PARAM;
623613
}
624614
if (0 != strncmp(hs_msg.magic_id, mca_btl_tcp_magic_id_string, len)) {
@@ -638,7 +628,6 @@ static int mca_btl_tcp_endpoint_recv_connect_ack(mca_btl_base_endpoint_t* btl_en
638628
if (0 != opal_compare_proc(btl_proc->proc_opal->proc_name, guid)) {
639629
BTL_ERROR(("received unexpected process identifier %s",
640630
OPAL_NAME_PRINT(guid)));
641-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
642631
mca_btl_tcp_endpoint_close(btl_endpoint);
643632
return OPAL_ERR_UNREACH;
644633
}
@@ -845,7 +834,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
845834
opal_net_get_hostname((struct sockaddr*) &endpoint_addr),
846835
((struct sockaddr_in*) &endpoint_addr)->sin_port,
847836
strerror(opal_socket_errno), opal_socket_errno));
848-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
849837
mca_btl_tcp_endpoint_close(btl_endpoint);
850838
return OPAL_ERROR;
851839
}
@@ -862,7 +850,6 @@ static int mca_btl_tcp_endpoint_complete_connect(mca_btl_base_endpoint_t* btl_en
862850
getpid(), msg,
863851
strerror(opal_socket_errno), opal_socket_errno);
864852
free(msg);
865-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
866853
mca_btl_tcp_endpoint_close(btl_endpoint);
867854
return OPAL_ERROR;
868855
}
@@ -934,15 +921,12 @@ static void mca_btl_tcp_endpoint_recv_handler(int sd, short flags, void* user)
934921
OPAL_THREAD_UNLOCK(&btl_endpoint->endpoint_send_lock);
935922
MCA_BTL_TCP_ENDPOINT_DUMP(10, btl_endpoint, true, "connected");
936923
}
937-
else if (OPAL_ERR_BAD_PARAM == rc
938-
|| OPAL_ERROR == rc) {
924+
else if (OPAL_ERR_BAD_PARAM == rc) {
939925
/* If we get a BAD_PARAM, it means that it probably wasn't
940926
an OMPI process on the other end of the socket (e.g.,
941-
the magic string ID failed). recv_connect_ack already cleaned
942-
up the socket. */
943-
/* If we get OPAL_ERROR, the other end closed the connection
944-
* because it has initiated a symmetrical connection on its end.
945-
* recv_connect_ack already cleaned up the socket. */
927+
the magic string ID failed). So we can probably just
928+
close the socket and ignore this connection. */
929+
CLOSE_THE_SOCKET(sd);
946930
}
947931
else {
948932
/* Otherwise, it probably *was* an OMPI peer process on
@@ -1081,8 +1065,6 @@ static void mca_btl_tcp_endpoint_send_handler(int sd, short flags, void* user)
10811065
opal_event_del(&btl_endpoint->endpoint_send_event);
10821066
}
10831067
break;
1084-
case MCA_BTL_TCP_FAILED:
1085-
break;
10861068
default:
10871069
BTL_ERROR(("invalid connection state (%d)", btl_endpoint->endpoint_state));
10881070
MCA_BTL_TCP_ENDPOINT_DUMP(1, btl_endpoint, true, "event_del(send) [endpoint_send_handler:error]");

opal/mca/btl/tcp/btl_tcp_frag.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -212,8 +212,7 @@ bool mca_btl_tcp_frag_recv(mca_btl_tcp_frag_t* frag, int sd)
212212
cnt = readv(sd, frag->iov_ptr, num_vecs);
213213
if( 0 < cnt ) goto advance_iov_position;
214214
if( cnt == 0 ) {
215-
if(MCA_BTL_TCP_CONNECTED == btl_endpoint->endpoint_state)
216-
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
215+
btl_endpoint->endpoint_state = MCA_BTL_TCP_FAILED;
217216
mca_btl_tcp_endpoint_close(btl_endpoint);
218217
return false;
219218
}

0 commit comments

Comments
 (0)