Skip to content

Commit 2f0e815

Browse files
authored
Merge pull request #4247 from jocherry/btlTcpLinksBugFix
tcp btl: Fix multiple-link connection establishment.
2 parents 04ec013 + d7e7e3a commit 2f0e815

File tree

4 files changed

+64
-5
lines changed

4 files changed

+64
-5
lines changed

opal/mca/btl/tcp/btl_tcp.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,10 @@ struct mca_btl_tcp_module_t {
167167
#if 0
168168
int tcp_ifindex; /**< BTL interface index */
169169
#endif
170-
struct sockaddr_storage tcp_ifaddr; /**< BTL interface address */
170+
struct sockaddr_storage tcp_ifaddr; /**< First IPv4 address discovered for this interface, bound as sending address for this BTL */
171+
#if OPAL_ENABLE_IPV6
172+
struct sockaddr_storage tcp_ifaddr_6; /**< First IPv6 address discovered for this interface, bound as sending address for this BTL */
173+
#endif
171174
uint32_t tcp_ifmask; /**< BTL interface netmask */
172175

173176
opal_mutex_t tcp_endpoints_mutex;

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,17 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name)
511511
btl->tcp_send_handler = 0;
512512
#endif
513513

514+
struct sockaddr_storage addr;
515+
opal_ifkindextoaddr(if_kindex, (struct sockaddr*) &addr,
516+
sizeof (struct sockaddr_storage));
517+
#if OPAL_ENABLE_IPV6
518+
if (addr.ss_family == AF_INET6) {
519+
btl->tcp_ifaddr_6 = addr;
520+
}
521+
#endif
522+
if (addr.ss_family == AF_INET) {
523+
btl->tcp_ifaddr = addr;
524+
}
514525
/* allow user to specify interface bandwidth */
515526
sprintf(param, "bandwidth_%s", if_name);
516527
mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth);

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -717,7 +717,35 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
717717

718718
/* start the connect - will likely fail with EINPROGRESS */
719719
mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);
720-
720+
721+
/* Bind the socket to one of the addresses associated with
722+
* this btl module. This sets the source IP to one of the
723+
* addresses shared in modex, so that the destination rank
724+
* can properly pair btl modules, even in cases where Linux
725+
* might do something unexpected with routing */
726+
opal_socklen_t sockaddr_addrlen = sizeof(struct sockaddr_storage);
727+
if (endpoint_addr.ss_family == AF_INET) {
728+
assert(NULL != &btl_endpoint->endpoint_btl->tcp_ifaddr);
729+
if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr,
730+
sockaddr_addrlen) < 0) {
731+
BTL_ERROR(("bind() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
732+
733+
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
734+
return OPAL_ERROR;
735+
}
736+
}
737+
#if OPAL_ENABLE_IPV6
738+
if (endpoint_addr.ss_family == AF_INET6) {
739+
assert(NULL != &btl_endpoint->endpoint_btl->tcp_ifaddr_6);
740+
if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr_6,
741+
sockaddr_addrlen) < 0) {
742+
BTL_ERROR(("bind() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));
743+
744+
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
745+
return OPAL_ERROR;
746+
}
747+
}
748+
#endif
721749
opal_output_verbose(10, opal_btl_base_framework.framework_output,
722750
"btl: tcp: attempting to connect() to %s address %s on port %d",
723751
OPAL_NAME_PRINT(btl_endpoint->endpoint_proc->proc_opal->proc_name),

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
486486
}
487487

488488
/*
489-
* in case the peer address has all intended connections,
489+
* in case the peer address has created all intended connections,
490490
* mark the complete peer interface as 'not available'
491491
*/
492492
if(endpoint_addr->addr_inuse >= mca_btl_tcp_component.tcp_num_links) {
@@ -810,12 +810,15 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name)
810810
void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr, int sd)
811811
{
812812
OPAL_THREAD_LOCK(&btl_proc->proc_lock);
813+
int found_match = 0;
814+
mca_btl_base_endpoint_t* match_btl_endpoint;
815+
813816
for( size_t i = 0; i < btl_proc->proc_endpoint_count; i++ ) {
814817
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
815818
/* We are not here to make a decision about what is good socket
816819
* and what is not. We simply check that this socket fit the endpoint
817-
* end we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
818-
if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family ) {
820+
* and we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
821+
if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family) {
819822
continue;
820823
}
821824
switch (addr->sa_family) {
@@ -833,6 +836,10 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
833836
tmp[1], 16),
834837
(int)i, (int)btl_proc->proc_endpoint_count);
835838
continue;
839+
} else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
840+
found_match = 1;
841+
match_btl_endpoint = btl_endpoint;
842+
continue;
836843
}
837844
break;
838845
#if OPAL_ENABLE_IPV6
@@ -857,10 +864,20 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
857864
;
858865
}
859866

867+
/* Set state to CONNECTING to ensure that subsequent connections do not attempt to re-use this endpoint in the num_links > 1 case */
868+
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
860869
(void)mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd);
861870
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
862871
return;
863872
}
873+
/* In this case the connection was inbound to an exported address, but the matching endpoint was not in a CLOSED state.
874+
* mca_btl_tcp_endpoint_accept() has logic to deal with the race condition that has likely caused this
875+
* scenario, so call it here.*/
876+
if (found_match) {
877+
(void)mca_btl_tcp_endpoint_accept(match_btl_endpoint, addr, sd);
878+
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
879+
return;
880+
}
864881
/* No further use of this socket. Close it */
865882
CLOSE_THE_SOCKET(sd);
866883
{

0 commit comments

Comments
 (0)