Skip to content

Commit 6900249

Browse files
authored
Merge pull request #5938 from bwbarrett/bugfix/tcp-multi-address
Always use same IP address for module and modex
2 parents a663734 + 457f058 commit 6900249

File tree

5 files changed

+230
-187
lines changed

5 files changed

+230
-187
lines changed

opal/mca/btl/tcp/btl_tcp.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,13 +164,11 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
164164
struct mca_btl_tcp_module_t {
165165
mca_btl_base_module_t super; /**< base BTL interface */
166166
uint16_t tcp_ifkindex; /** <BTL kernel interface index */
167-
#if 0
168-
int tcp_ifindex; /**< BTL interface index */
169-
#endif
170-
struct sockaddr_storage tcp_ifaddr; /**< First IPv4 address discovered for this interface, bound as sending address for this BTL */
171-
#if OPAL_ENABLE_IPV6
172-
struct sockaddr_storage tcp_ifaddr_6; /**< First IPv6 address discovered for this interface, bound as sending address for this BTL */
173-
#endif
167+
struct sockaddr_storage tcp_ifaddr; /**< First address
168+
discovered for this
169+
interface, bound as
170+
sending address for this
171+
BTL */
174172
uint32_t tcp_ifmask; /**< BTL interface netmask */
175173

176174
opal_mutex_t tcp_endpoints_mutex;

opal/mca/btl/tcp/btl_tcp_addr.h

Lines changed: 36 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,40 +33,52 @@
3333

3434

3535
/**
36-
* Structure used to publish TCP connection information to peers.
36+
* Modex address structure.
37+
*
38+
* One of these structures will be sent for every btl module in use by
39+
* the local BTL TCP component.
3740
*/
41+
struct mca_btl_tcp_modex_addr_t {
42+
uint8_t addr[16]; /* endpoint address. for addr_family
43+
of MCA_BTL_TCP_AF_INET, only the
44+
first 4 bytes have meaning. */
45+
uint32_t addr_ifkindex; /* endpoint kernel index */
46+
uint16_t addr_port; /* endpoint listen port */
47+
uint8_t addr_family; /* endpoint address family. Note that
48+
this is
49+
MCA_BTL_TCP_AF_{INET,INET6}, not
50+
the traditional
51+
AF_INET/AF_INET6. */
52+
uint8_t padding[1]; /* padd out to an 8-byte word */
53+
};
54+
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;
3855

56+
57+
/**
58+
* Remote peer address structure
59+
*
60+
* One of these structures will be allocated for every remote endpoint
61+
* associated with a remote proc. The data is pulled from the
62+
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
63+
* field, which is local.
64+
*/
3965
struct mca_btl_tcp_addr_t {
40-
/* the following information is exchanged between different
41-
machines (read: byte order), so use network byte order
42-
for everything and don't add padding
43-
*/
66+
union {
67+
struct in_addr addr_inet; /* IPv6 listen address */
4468
#if OPAL_ENABLE_IPV6
45-
struct in6_addr addr_inet; /**< IPv4/IPv6 listen address > */
46-
#else
47-
/* Bug, FIXME: needs testing */
48-
struct my_in6_addr {
49-
union {
50-
uint32_t u6_addr32[4];
51-
struct _my_in6_addr {
52-
struct in_addr _addr_inet;
53-
uint32_t _pad[3];
54-
} _addr__inet;
55-
} _union_inet;
56-
} addr_inet;
69+
struct in6_addr addr_inet6; /* IPv6 listen address */
5770
#endif
58-
in_port_t addr_port; /**< listen port */
59-
uint16_t addr_ifkindex; /**< remote interface index assigned with
71+
};
72+
in_port_t addr_port; /**< listen port */
73+
int addr_ifkindex; /**< remote interface index assigned with
6074
this address */
61-
unsigned short addr_inuse; /**< local meaning only */
62-
uint8_t addr_family; /**< AF_INET or AF_INET6 */
75+
uint8_t addr_family; /**< AF_INET or AF_INET6 */
76+
bool addr_inuse; /**< local meaning only */
6377
};
6478
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;
6579

6680
#define MCA_BTL_TCP_AF_INET 0
67-
#if OPAL_ENABLE_IPV6
68-
# define MCA_BTL_TCP_AF_INET6 1
69-
#endif
81+
#define MCA_BTL_TCP_AF_INET6 1
7082

7183
#endif
7284

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 109 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -490,11 +490,59 @@ static int mca_btl_tcp_component_close(void)
490490
* Create a btl instance and add to modules list.
491491
*/
492492

493-
static int mca_btl_tcp_create(int if_kindex, const char* if_name)
493+
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
494494
{
495495
struct mca_btl_tcp_module_t* btl;
496496
char param[256];
497497
int i;
498+
struct sockaddr_storage addr;
499+
bool found = false;
500+
501+
/*
502+
* Look for an address on the given device (ie, kindex) which
503+
* isn't disabled by the disable_family option. If there isn't
504+
* one, skip creating the modules for this interface. We store
505+
* the address on the module both to publish in the modex and to
506+
* use as the source address of all packets sent by this module.
507+
*
508+
* This still isn't quite right. Better would be to pull apart
509+
* split_and_resolve and pass the address used to select the
510+
* device into mca_btl_tcp_create(). This is a cleanup of the
511+
* logic that's been in use for years, but the case it doesn't
512+
* cover is (say) only specifing mca_btl_if_include 10.0.0.0/16
513+
* when the interface has addresses of both 10.0.0.1 and 10.1.0.1;
514+
* there's absolutely nothing that keeps this code from picking
515+
* 10.1.0.1 as the one that is published in the modex and used for
516+
* connection.
517+
*/
518+
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
519+
int ret;
520+
521+
if (if_kindex != opal_ifindextokindex(i)) {
522+
continue;
523+
}
524+
525+
ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
526+
sizeof(struct sockaddr_storage));
527+
if (OPAL_SUCCESS != ret) {
528+
return ret;
529+
}
530+
531+
if (addr.ss_family == AF_INET &&
532+
4 != mca_btl_tcp_component.tcp_disable_family) {
533+
found = true;
534+
break;
535+
} else if (addr.ss_family == AF_INET6 &&
536+
6 != mca_btl_tcp_component.tcp_disable_family) {
537+
found = true;
538+
break;
539+
}
540+
}
541+
/* if we didn't find an address that works for us on this
542+
interface, just move on. */
543+
if (!found) {
544+
return OPAL_SUCCESS;
545+
}
498546

499547
for( i = 0; i < (int)mca_btl_tcp_component.tcp_num_links; i++ ) {
500548
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
@@ -513,17 +561,8 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name)
513561
btl->tcp_send_handler = 0;
514562
#endif
515563

516-
struct sockaddr_storage addr;
517-
opal_ifkindextoaddr(if_kindex, (struct sockaddr*) &addr,
518-
sizeof (struct sockaddr_storage));
519-
#if OPAL_ENABLE_IPV6
520-
if (addr.ss_family == AF_INET6) {
521-
btl->tcp_ifaddr_6 = addr;
522-
}
523-
#endif
524-
if (addr.ss_family == AF_INET) {
525-
btl->tcp_ifaddr = addr;
526-
}
564+
memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
565+
527566
/* allow user to specify interface bandwidth */
528567
sprintf(param, "bandwidth_%s", if_name);
529568
mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth);
@@ -567,8 +606,8 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name)
567606
opal_output_verbose(5, opal_btl_base_framework.framework_output,
568607
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
569608
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
570-
opal_net_get_hostname((struct sockaddr*)&addr),
571-
(addr.ss_family == AF_INET) ? "IPv4" : "IPv6",
609+
opal_net_get_hostname((struct sockaddr*)&btl->tcp_ifaddr),
610+
(btl->tcp_ifaddr.ss_family == AF_INET) ? "IPv4" : "IPv6",
572611
btl->super.btl_bandwidth, btl->super.btl_latency);
573612
}
574613
return OPAL_SUCCESS;
@@ -721,6 +760,7 @@ static int mca_btl_tcp_component_create_instances(void)
721760
*/
722761
for(if_index = opal_ifbegin(); if_index >= 0; if_index = opal_ifnext(if_index)){
723762
int index = opal_ifindextokindex (if_index);
763+
724764
if (index > 0) {
725765
bool want_this_if = true;
726766

@@ -1121,97 +1161,64 @@ static int mca_btl_tcp_component_create_listen(uint16_t af_family)
11211161

11221162
static int mca_btl_tcp_component_exchange(void)
11231163
{
1124-
int rc = 0, index;
1125-
size_t i = 0;
1126-
size_t size = mca_btl_tcp_component.tcp_addr_count *
1127-
mca_btl_tcp_component.tcp_num_links * sizeof(mca_btl_tcp_addr_t);
1128-
/* adi@2007-04-12:
1129-
*
1130-
* We'll need to explain things a bit here:
1131-
* 1. We normally have as many BTLs as physical NICs.
1132-
* 2. With num_links, we now have num_btl = num_links * #NICs
1133-
* 3. we might have more than one address per NIC
1134-
*/
1135-
size_t xfer_size = 0; /* real size to transfer (may differ from 'size') */
1136-
size_t current_addr = 0;
1137-
1138-
if(mca_btl_tcp_component.tcp_num_btls != 0) {
1139-
char ifn[32];
1140-
mca_btl_tcp_addr_t *addrs = (mca_btl_tcp_addr_t *)malloc(size);
1141-
memset(addrs, 0, size);
1142-
1143-
/* here we start populating our addresses */
1144-
for( i = 0; i < mca_btl_tcp_component.tcp_num_btls; i++ ) {
1145-
for (index = opal_ifbegin(); index >= 0;
1146-
index = opal_ifnext(index)) {
1147-
struct sockaddr_storage my_ss;
1148-
1149-
/* Look if the module's address belongs to this
1150-
* kernel IP interface. If not, go to next address.
1151-
*/
1152-
if (opal_ifindextokindex (index) !=
1153-
mca_btl_tcp_component.tcp_btls[i]->tcp_ifkindex) {
1154-
continue;
1155-
}
1156-
1157-
opal_ifindextoname(index, ifn, sizeof(ifn));
1158-
opal_output_verbose(30, opal_btl_base_framework.framework_output,
1159-
"btl: tcp: component_exchange: examining interface %s",
1160-
ifn);
1161-
if (OPAL_SUCCESS !=
1162-
opal_ifindextoaddr(index, (struct sockaddr*) &my_ss,
1163-
sizeof (my_ss))) {
1164-
opal_output (0,
1165-
"btl_tcp_component: problems getting address for index %i (kernel index %i)\n",
1166-
index, opal_ifindextokindex (index));
1167-
continue;
1168-
}
1164+
int rc;
1165+
size_t i;
1166+
size_t num_btls = mca_btl_tcp_component.tcp_num_btls;
1167+
size_t size = num_btls * sizeof(mca_btl_tcp_modex_addr_t);
1168+
mca_btl_tcp_modex_addr_t *addrs;
1169+
1170+
if (num_btls <= 0) {
1171+
return 0;
1172+
}
1173+
1174+
addrs = (mca_btl_tcp_modex_addr_t*)malloc(size);
1175+
if (NULL == addrs) {
1176+
return OPAL_ERR_OUT_OF_RESOURCE;
1177+
}
1178+
memset(addrs, 0, size);
1179+
1180+
for (i = 0 ; i < num_btls ; i++) {
1181+
struct mca_btl_tcp_module_t *btl = mca_btl_tcp_component.tcp_btls[i];
1182+
struct sockaddr *addr = (struct sockaddr*)&(btl->tcp_ifaddr);
11691183

11701184
#if OPAL_ENABLE_IPV6
1171-
if ((AF_INET6 == my_ss.ss_family) &&
1172-
(6 != mca_btl_tcp_component.tcp_disable_family)) {
1173-
memcpy(&addrs[current_addr].addr_inet,
1174-
&((struct sockaddr_in6*)&my_ss)->sin6_addr,
1175-
sizeof(addrs[0].addr_inet));
1176-
addrs[current_addr].addr_port =
1177-
mca_btl_tcp_component.tcp6_listen_port;
1178-
addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET6;
1179-
xfer_size += sizeof (mca_btl_tcp_addr_t);
1180-
addrs[current_addr].addr_inuse = 0;
1181-
addrs[current_addr].addr_ifkindex =
1182-
opal_ifindextokindex (index);
1183-
current_addr++;
1184-
opal_output_verbose(30, opal_btl_base_framework.framework_output,
1185-
"btl: tcp: component_exchange: "
1186-
"%s IPv6 %s", ifn,
1187-
opal_net_get_hostname((struct sockaddr*)&my_ss));
1188-
} else
1185+
if (AF_INET6 == addr->sa_family) {
1186+
struct sockaddr_in6 *inaddr6 = (struct sockaddr_in6*)addr;
1187+
1188+
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
1189+
sizeof(struct in6_addr));
1190+
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
1191+
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
1192+
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
1193+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
1194+
"btl: tcp: exchange: %d %d IPv6 %s",
1195+
(int)i, btl->tcp_ifkindex,
1196+
opal_net_get_hostname(addr));
1197+
} else
11891198
#endif
1190-
if ((AF_INET == my_ss.ss_family) &&
1191-
(4 != mca_btl_tcp_component.tcp_disable_family)) {
1192-
memcpy(&addrs[current_addr].addr_inet,
1193-
&((struct sockaddr_in*)&my_ss)->sin_addr,
1194-
sizeof(struct in_addr));
1195-
addrs[current_addr].addr_port =
1196-
mca_btl_tcp_component.tcp_listen_port;
1197-
addrs[current_addr].addr_family = MCA_BTL_TCP_AF_INET;
1198-
xfer_size += sizeof (mca_btl_tcp_addr_t);
1199-
addrs[current_addr].addr_inuse = 0;
1200-
addrs[current_addr].addr_ifkindex =
1201-
opal_ifindextokindex (index);
1202-
current_addr++;
1203-
opal_output_verbose(30, opal_btl_base_framework.framework_output,
1204-
"btl: tcp: component_exchange: "
1205-
"%s IPv4 %s", ifn,
1206-
opal_net_get_hostname((struct sockaddr*)&my_ss));
1207-
}
1208-
} /* end of for opal_ifbegin() */
1209-
} /* end of for tcp_num_btls */
1210-
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
1211-
&mca_btl_tcp_component.super.btl_version,
1212-
addrs, xfer_size);
1213-
free(addrs);
1214-
} /* end if */
1199+
if (AF_INET == addr->sa_family) {
1200+
struct sockaddr_in *inaddr = (struct sockaddr_in*)addr;
1201+
1202+
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
1203+
sizeof(struct in_addr));
1204+
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
1205+
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
1206+
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
1207+
opal_output_verbose(5, opal_btl_base_framework.framework_output,
1208+
"btl: tcp: exchange: %d %d IPv4 %s",
1209+
(int)i, btl->tcp_ifkindex,
1210+
opal_net_get_hostname(addr));
1211+
} else {
1212+
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
1213+
return OPAL_ERR_BAD_PARAM;
1214+
}
1215+
}
1216+
1217+
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
1218+
&mca_btl_tcp_component.super.btl_version,
1219+
addrs, size);
1220+
free(addrs);
1221+
12151222
return rc;
12161223
}
12171224

opal/mca/btl/tcp/btl_tcp_endpoint.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -743,11 +743,11 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
743743
#if OPAL_ENABLE_IPV6
744744
if (endpoint_addr.ss_family == AF_INET6) {
745745
assert(NULL != &btl_endpoint->endpoint_btl->tcp_ifaddr_6);
746-
if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr_6,
746+
if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr,
747747
sizeof(struct sockaddr_in6)) < 0) {
748748
BTL_ERROR(("bind on local address (%s:%d) failed: %s (%d)",
749749
opal_net_get_hostname((struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr),
750-
htons(((struct sockaddr_in*)&btl_endpoint->endpoint_btl->tcp_ifaddr)->sin_port),
750+
htons(((struct sockaddr_in6*)&btl_endpoint->endpoint_btl->tcp_ifaddr)->sin6_port),
751751
strerror(opal_socket_errno), opal_socket_errno));
752752

753753
CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);

0 commit comments

Comments
 (0)