@@ -486,7 +486,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
486
486
}
487
487
488
488
/*
489
- * in case the peer address has all intended connections,
489
+ * in case the peer address has created all intended connections,
490
490
* mark the complete peer interface as 'not available'
491
491
*/
492
492
if (endpoint_addr -> addr_inuse >= mca_btl_tcp_component .tcp_num_links ) {
@@ -810,12 +810,15 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t *name)
810
810
void mca_btl_tcp_proc_accept (mca_btl_tcp_proc_t * btl_proc , struct sockaddr * addr , int sd )
811
811
{
812
812
OPAL_THREAD_LOCK (& btl_proc -> proc_lock );
813
+ int found_match = 0 ;
814
+ mca_btl_base_endpoint_t * match_btl_endpoint ;
815
+
813
816
for ( size_t i = 0 ; i < btl_proc -> proc_endpoint_count ; i ++ ) {
814
817
mca_btl_base_endpoint_t * btl_endpoint = btl_proc -> proc_endpoints [i ];
815
818
/* We are not here to make a decision about what is good socket
816
819
* and what is not. We simply check that this socket fit the endpoint
817
- * end we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
818
- if ( btl_endpoint -> endpoint_addr -> addr_family != addr -> sa_family ) {
820
+ * and we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
821
+ if ( btl_endpoint -> endpoint_addr -> addr_family != addr -> sa_family ) {
819
822
continue ;
820
823
}
821
824
switch (addr -> sa_family ) {
@@ -833,6 +836,10 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
833
836
tmp [1 ], 16 ),
834
837
(int )i , (int )btl_proc -> proc_endpoint_count );
835
838
continue ;
839
+ } else if (btl_endpoint -> endpoint_state != MCA_BTL_TCP_CLOSED ) {
840
+ found_match = 1 ;
841
+ match_btl_endpoint = btl_endpoint ;
842
+ continue ;
836
843
}
837
844
break ;
838
845
#if OPAL_ENABLE_IPV6
@@ -857,10 +864,20 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
857
864
;
858
865
}
859
866
867
+ /* Set state to CONNECTING to ensure that subsequent connections do not attempt to re-use the endpoint in the num_links > 1 case */
868
+ btl_endpoint -> endpoint_state = MCA_BTL_TCP_CONNECTING ;
860
869
(void )mca_btl_tcp_endpoint_accept (btl_endpoint , addr , sd );
861
870
OPAL_THREAD_UNLOCK (& btl_proc -> proc_lock );
862
871
return ;
863
872
}
873
+ /* In this case the connection was inbound to an exported address, but the matching endpoint was not in a CLOSED state.
874
+ * mca_btl_tcp_endpoint_accept() has logic to deal with the race condition that has likely caused this
875
+ * scenario, so call it here.*/
876
+ if (found_match ) {
877
+ (void )mca_btl_tcp_endpoint_accept (match_btl_endpoint , addr , sd );
878
+ OPAL_THREAD_UNLOCK (& btl_proc -> proc_lock );
879
+ return ;
880
+ }
864
881
/* No further use of this socket. Close it */
865
882
CLOSE_THE_SOCKET (sd );
866
883
{
0 commit comments