Skip to content

Commit 86061fb

Browse files
committed
Fixed IPv6 OOB connection problems
Signed-off-by: Stanislav Kirillov <staskirillof@yandex.ru>
1 parent dd62004 commit 86061fb

File tree

4 files changed

+47
-24
lines changed

4 files changed

+47
-24
lines changed

opal/mca/if/linux_ipv6/if_linux_ipv6.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,8 @@ static int if_linux_ipv6_open(void)
118118
addrbyte[8], addrbyte[9], addrbyte[10], addrbyte[11],
119119
addrbyte[12], addrbyte[13], addrbyte[14], addrbyte[15], scope);
120120

121-
/* we don't want any other scope less than link-local */
122-
if (scope < 0x20) {
121+
/* Only global-scope (0x00) addresses are of interest; skip all others */
122+
if (scope != 0x00) {
123123
opal_output_verbose(1, opal_if_base_framework.framework_output,
124124
"skipping interface %2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x scope %x\n",
125125
addrbyte[0], addrbyte[1], addrbyte[2], addrbyte[3],

orte/mca/oob/tcp/oob_tcp_component.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
#include <netdb.h>
5353
#endif
5454
#include <ctype.h>
55+
#include <sys/socket.h>
56+
#include <arpa/inet.h>
5557

5658
#include "opal/util/show_help.h"
5759
#include "opal/util/error.h"
@@ -84,6 +86,8 @@
8486
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
8587
#include "orte/mca/oob/tcp/oob_tcp_connection.h"
8688
#include "orte/mca/oob/tcp/oob_tcp_listener.h"
89+
#include "oob_tcp_peer.h"
90+
8791
/*
8892
* Local utility functions
8993
*/
@@ -843,6 +847,8 @@ static int parse_uri(const uint16_t af_family,
843847
opal_output (0, "oob_tcp_parse_uri: Could not convert %s\n", host);
844848
return ORTE_ERR_BAD_PARAM;
845849
}
850+
in6->sin6_family = AF_INET6;
851+
in6->sin6_port = htons(atoi(port));
846852
}
847853
#endif
848854
else {
@@ -973,6 +979,7 @@ static int component_set_addr(orte_process_name_t *peer,
973979
}
974980

975981
maddr = OBJ_NEW(mca_oob_tcp_addr_t);
982+
((struct sockaddr_storage*) &(maddr->addr))->ss_family = af_family;
976983
if (ORTE_SUCCESS != (rc = parse_uri(af_family, host, ports, (struct sockaddr_storage*) &(maddr->addr)))) {
977984
ORTE_ERROR_LOG(rc);
978985
OBJ_RELEASE(maddr);

orte/mca/oob/tcp/oob_tcp_connection.c

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include <unistd.h>
3131
#endif
3232
#include <fcntl.h>
33+
#include <sys/socket.h>
34+
3335
#ifdef HAVE_SYS_UIO_H
3436
#include <sys/uio.h>
3537
#endif
@@ -76,6 +78,9 @@
7678
#include "orte/mca/oob/tcp/oob_tcp_peer.h"
7779
#include "orte/mca/oob/tcp/oob_tcp_common.h"
7880
#include "orte/mca/oob/tcp/oob_tcp_connection.h"
81+
#include "oob_tcp_peer.h"
82+
#include "oob_tcp_common.h"
83+
#include "oob_tcp_connection.h"
7984

8085
static void tcp_peer_event_init(mca_oob_tcp_peer_t* peer);
8186
static int tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer);
@@ -85,7 +90,7 @@ static bool tcp_peer_recv_blocking(mca_oob_tcp_peer_t* peer, int sd,
8590
void* data, size_t size);
8691
static void tcp_peer_connected(mca_oob_tcp_peer_t* peer);
8792

88-
static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
93+
static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer, sa_family_t family)
8994
{
9095
int flags;
9196

@@ -97,8 +102,7 @@ static int tcp_peer_create_socket(mca_oob_tcp_peer_t* peer)
97102
"%s oob:tcp:peer creating socket to %s",
98103
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
99104
ORTE_NAME_PRINT(&(peer->name))));
100-
101-
peer->sd = socket(AF_INET, SOCK_STREAM, 0);
105+
peer->sd = socket(family, SOCK_STREAM, 0);
102106
if (peer->sd < 0) {
103107
opal_output(0, "%s-%s tcp_peer_create_socket: socket() failed: %s (%d)\n",
104108
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -154,6 +158,7 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
154158
{
155159
mca_oob_tcp_conn_op_t *op = (mca_oob_tcp_conn_op_t*)cbdata;
156160
mca_oob_tcp_peer_t *peer;
161+
int current_socket_family = 0;
157162
int rc;
158163
opal_socklen_t addrlen = 0;
159164
mca_oob_tcp_addr_t *addr;
@@ -170,30 +175,12 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
170175
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
171176
ORTE_NAME_PRINT(&(peer->name)));
172177

173-
rc = tcp_peer_create_socket(peer);
174-
if (ORTE_SUCCESS != rc) {
175-
/* FIXME: we cannot create a TCP socket - this spans
176-
* all interfaces, so all we can do is report
177-
* back to the component that this peer is
178-
* unreachable so it can remove the peer
179-
* from its list and report back to the base
180-
* NOTE: this could be a reconnect attempt,
181-
* so we also need to mark any queued messages
182-
* and return them as "unreachable"
183-
*/
184-
opal_output(0, "%s CANNOT CREATE SOCKET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
185-
ORTE_FORCED_TERMINATE(1);
186-
OBJ_RELEASE(op);
187-
return;
188-
}
189-
190178
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
191179
"%s orte_tcp_peer_try_connect: "
192180
"attempting to connect to proc %s on socket %d",
193181
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
194182
ORTE_NAME_PRINT(&(peer->name)), peer->sd);
195183

196-
addrlen = sizeof(struct sockaddr_in);
197184
peer->active_addr = NULL;
198185
OPAL_LIST_FOREACH(addr, &peer->addrs, mca_oob_tcp_addr_t) {
199186
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
@@ -221,9 +208,36 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
221208
continue;
222209
}
223210
peer->active_addr = addr; // record the one we are using
211+
addrlen = addr->addr.ss_family == AF_INET6 ? sizeof(struct sockaddr_in6)
212+
: sizeof(struct sockaddr_in);
213+
if (addr->addr.ss_family != current_socket_family) {
214+
if (peer->sd >= 0) {
215+
CLOSE_THE_SOCKET(peer->sd);
216+
peer->sd = -1;
217+
}
218+
rc = tcp_peer_create_socket(peer, addr->addr.ss_family);
219+
current_socket_family = addr->addr.ss_family;
220+
221+
if (ORTE_SUCCESS != rc) {
222+
/* FIXME: we cannot create a TCP socket - this spans
223+
* all interfaces, so all we can do is report
224+
* back to the component that this peer is
225+
* unreachable so it can remove the peer
226+
* from its list and report back to the base
227+
* NOTE: this could be a reconnect attempt,
228+
* so we also need to mark any queued messages
229+
* and return them as "unreachable"
230+
*/
231+
opal_output(0, "%s CANNOT CREATE SOCKET", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
232+
ORTE_FORCED_TERMINATE(1);
233+
goto cleanup;
234+
}
235+
}
224236
retry_connect:
225237
addr->retries++;
226-
if (connect(peer->sd, (struct sockaddr*)&addr->addr, addrlen) < 0) {
238+
239+
rc = connect(peer->sd, (struct sockaddr*) &addr->addr, addrlen);
240+
if (rc < 0) {
227241
/* non-blocking so wait for completion */
228242
if (opal_socket_errno == EINPROGRESS || opal_socket_errno == EWOULDBLOCK) {
229243
opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,

orte/mca/oob/tcp/oob_tcp_listener.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ static int create_listen(void)
277277
}
278278

279279
/* get the address info for this interface */
280+
memset(&inaddr, 0, sizeof(inaddr));
280281
((struct sockaddr_in*) &inaddr)->sin_family = AF_INET;
281282
((struct sockaddr_in*) &inaddr)->sin_addr.s_addr = INADDR_ANY;
282283
addrlen = sizeof(struct sockaddr_in);
@@ -529,6 +530,7 @@ static int create_listen6(void)
529530
}
530531

531532
/* get the address info for this interface */
533+
memset(&inaddr, 0, sizeof(inaddr));
532534
((struct sockaddr_in6*) &inaddr)->sin6_family = AF_INET6;
533535
((struct sockaddr_in6*) &inaddr)->sin6_addr = in6addr_any;
534536
addrlen = sizeof(struct sockaddr_in6);

0 commit comments

Comments
 (0)