Skip to content

Commit 0ddbc75

Browse files
author
Ralph Castain
authored
Merge pull request #4930 from kizill/fix-ipv6
fixed ipv6 OOB connection problems (fix issue #1585)
2 parents abb87f9 + c2bfca1 commit 0ddbc75

File tree

5 files changed

+51
-24
lines changed

5 files changed

+51
-24
lines changed

opal/mca/btl/tcp/btl_tcp_proc.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -873,6 +873,10 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
873873
tmp[1], INET6_ADDRSTRLEN),
874874
(int)i, (int)btl_proc->proc_endpoint_count);
875875
continue;
876+
} else if (btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED) {
877+
found_match = 1;
878+
match_btl_endpoint = btl_endpoint;
879+
continue;
876880
}
877881
break;
878882
#endif

opal/mca/if/linux_ipv6/if_linux_ipv6.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -118,8 +118,8 @@ static int if_linux_ipv6_open(void)
118118
addrbyte[8], addrbyte[9], addrbyte[10], addrbyte[11],
119119
addrbyte[12], addrbyte[13], addrbyte[14], addrbyte[15], scope);
120120

121-
/* we don't want any other scope less than link-local */
122-
if (scope < 0x20) {
121+
/* Only interested in global (0x00) scope */
122+
if (scope != 0x00) {
123123
opal_output_verbose(1, opal_if_base_framework.framework_output,
124124
"skipping interface %2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x scope %x\n",
125125
addrbyte[0], addrbyte[1], addrbyte[2], addrbyte[3],

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,8 @@
5252
#include <netdb.h>
5353
#endif
5454
#include <ctype.h>
55+
#include <sys/socket.h>
56+
#include <arpa/inet.h>
5557

5658
#include "opal/util/show_help.h"
5759
#include "opal/util/error.h"
@@ -84,6 +86,8 @@
8486
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
8587
#include "orte/mca/oob/tcp/oob_tcp_connection.h"
8688
#include "orte/mca/oob/tcp/oob_tcp_listener.h"
89+
#include "oob_tcp_peer.h"
90+
8791
/*
8892
* Local utility functions
8993
*/
@@ -843,6 +847,8 @@ static int parse_uri(const uint16_t af_family,
843847
opal_output (0, "oob_tcp_parse_uri: Could not convert %s\n", host);
844848
return ORTE_ERR_BAD_PARAM;
845849
}
850+
in6->sin6_family = AF_INET6;
851+
in6->sin6_port = htons(atoi(port));
846852
}
847853
#endif
848854
else {
@@ -973,6 +979,7 @@ static int component_set_addr(orte_process_name_t *peer,
973979
}
974980

975981
maddr = OBJ_NEW(mca_oob_tcp_addr_t);
982+
((struct sockaddr_storage*) &(maddr->addr))->ss_family = af_family;
976983
if (ORTE_SUCCESS != (rc = parse_uri(af_family, host, ports, (struct sockaddr_storage*) &(maddr->addr)))) {
977984
ORTE_ERROR_LOG(rc);
978985
OBJ_RELEASE(maddr);

orte/mca/oob/tcp/oob_tcp_connection.c

Lines changed: 36 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,8 @@
3030
#include <unistd.h>
3131
#endif
3232
#include <fcntl.h>
33+
#include <sys/socket.h>
34+
3335
#ifdef HAVE_SYS_UIO_H
3436
#include <sys/uio.h>
3537
#endif
@@ -77,6 +79,9 @@
7779
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
7880
#include "orte/mca/oob/tcp/oob_tcp_common.h"
7981
#include "orte/mca/oob/tcp/oob_tcp_connection.h"
82+
#include "oob_tcp_peer.h"
83+
#include "oob_tcp_common.h"
84+
#include "oob_tcp_connection.h"
8085

8186
static void tcp_peer_event_init(mca_oob_tcp_peer_t* peer);
8287
static int tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer);
@@ -86,7 +91,7 @@ static bool tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd,
8691
void* data, size_t size);
8792
static void tcp_peer_connected(mca_oob_tcp_peer_t* peer);
8893

89-
static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
94+
static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer, sa_family_t family)
9095
{
9196
int flags;
9297

@@ -98,8 +103,7 @@ static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
98103
"%s oob:tcp:peer creating socket to %s",
99104
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
100105
ORTE_NAME_PRINT(&(peer->name))));
101-
102-
peer->sd = socket(AF_INET, SOCK_STREAM, 0);
106+
peer->sd = socket(family, SOCK_STREAM, 0);
103107
if (peer->sd < 0) {
104108
opal_output(0, "%s-%s tcp_peer_create_socket: socket() failed: %s (%d)\n",
105109
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -155,6 +159,7 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
155159
{
156160
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
157161
mca_oob_tcp_peer_t *peer;
162+
int current_socket_family = 0;
158163
int rc;
159164
opal_socklen_t addrlen = 0;
160165
mca_oob_tcp_addr_t *addr;
@@ -171,30 +176,12 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
171176
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
172177
ORTE_NAME_PRINT(&(peer->name)));
173178

174-
rc = tcp_peer_create_socket(peer);
175-
if (ORTE_SUCCESS != rc) {
176-
/* FIXME: we cannot create a TCP socket - this spans
177-
* all interfaces, so all we can do is report
178-
* back to the component that this peer is
179-
* unreachable so it can remove the peer
180-
* from its list and report back to the base
181-
* NOTE: this could be a reconnect attempt,
182-
* so we also need to mark any queued messages
183-
* and return them as "unreachable"
184-
*/
185-
opal_output(0, "%s CANNOT CREATE SOCKET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
186-
ORTE_FORCED_TERMINATE(1);
187-
OBJ_RELEASE(op);
188-
return;
189-
}
190-
191179
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
192180
"%s orte_tcp_peer_try_connect: "
193181
"attempting to connect to proc %s on socket %d",
194182
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
195183
ORTE_NAME_PRINT(&(peer->name)), peer->sd);
196184

197-
addrlen = sizeof(struct sockaddr_in);
198185
peer->active_addr = NULL;
199186
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
200187
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
@@ -222,9 +209,36 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
222209
continue;
223210
}
224211
peer->active_addr = addr; // record the one we are using
212+
addrlen = addr->addr.ss_family == AF_INET6 ? sizeof(struct sockaddr_in6)
213+
: sizeof(struct sockaddr_in);
214+
if (addr->addr.ss_family != current_socket_family) {
215+
if (peer->sd >= 0) {
216+
CLOSE_THE_SOCKET(peer->sd);
217+
peer->sd = -1;
218+
}
219+
rc = tcp_peer_create_socket(peer, addr->addr.ss_family);
220+
current_socket_family = addr->addr.ss_family;
221+
222+
if (ORTE_SUCCESS != rc) {
223+
/* FIXME: we cannot create a TCP socket - this spans
224+
* all interfaces, so all we can do is report
225+
* back to the component that this peer is
226+
* unreachable so it can remove the peer
227+
* from its list and report back to the base
228+
* NOTE: this could be a reconnect attempt,
229+
* so we also need to mark any queued messages
230+
* and return them as "unreachable"
231+
*/
232+
opal_output(0, "%s CANNOT CREATE SOCKET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
233+
ORTE_FORCED_TERMINATE(1);
234+
goto cleanup;
235+
}
236+
}
225237
retry_connect:
226238
addr->retries++;
227-
if (connect(peer->sd, (struct sockaddr*)&addr->addr, addrlen) < 0) {
239+
240+
rc = connect(peer->sd, (struct sockaddr*) &addr->addr, addrlen);
241+
if (rc < 0) {
228242
/* non-blocking so wait for completion */
229243
if (opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
230244
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

orte/mca/oob/tcp/oob_tcp_listener.c

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -277,6 +277,7 @@ static int create_listen(void)
277277
}
278278

279279
/* get the address info for this interface */
280+
memset(&inaddr, 0, sizeof(inaddr));
280281
((struct sockaddr_in*) &inaddr)->sin_family = AF_INET;
281282
((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;
282283
addrlen = sizeof(struct sockaddr_in);
@@ -529,6 +530,7 @@ static int create_listen6(void)
529530
}
530531

531532
/* get the address info for this interface */
533+
memset(&inaddr, 0, sizeof(inaddr));
532534
((struct sockaddr_in6*) &inaddr)->sin6_family = AF_INET6;
533535
((struct sockaddr_in6*) &inaddr)->sin6_addr = in6addr_any;
534536
addrlen = sizeof(struct sockaddr_in6);

0 commit comments

Comments
 (0)