@@ -32,6 +32,8 @@ __thread int initialized = 0;
 #endif
 
 bool opal_common_ucx_thread_enabled = false;
+opal_atomic_int64_t opal_common_ucx_ep_counts = 0;
+opal_atomic_int64_t opal_common_ucx_unpacked_rkey_counts = 0;
 
 static _ctx_record_t *_tlocal_add_ctx_rec(opal_common_ucx_ctx_t *ctx);
 static inline _ctx_record_t *_tlocal_get_ctx_rec(opal_tsd_tracked_key_t tls_key);
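
The hunks that follow adjust these counters through OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD, whose definition is not part of this excerpt. A minimal sketch of what it presumably expands to, assuming it lives in the common UCX header and is gated on OPAL_ENABLE_DEBUG so that release builds pay no cost:

    /* Sketch only -- not taken from this diff. Debug builds atomically adjust
     * the counter; non-debug builds compile the call away entirely. */
    #if OPAL_ENABLE_DEBUG
    #    define OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(var, inc) \
             opal_atomic_add_fetch_64(&(var), (inc))
    #else
    #    define OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(var, inc)
    #endif
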
@@ -102,6 +104,7 @@ static void _winfo_destructor(opal_common_ucx_winfo_t *winfo)
     for (i = 0; i < winfo->comm_size; i++) {
         if (NULL != winfo->endpoints[i]) {
             ucp_ep_destroy(winfo->endpoints[i]);
+            OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(opal_common_ucx_ep_counts, -1);
         }
         assert(winfo->inflight_ops[i] == 0);
     }
@@ -326,9 +329,26 @@ static opal_common_ucx_winfo_t *_wpool_get_winfo(opal_common_ucx_wpool_t *wpool,
     return winfo;
 }
 
+/* Remove the winfo from active workers and add it to idle workers */
 static void _wpool_put_winfo(opal_common_ucx_wpool_t *wpool, opal_common_ucx_winfo_t *winfo)
 {
     opal_mutex_lock(&wpool->mutex);
+    if (winfo->comm_size != 0) {
+        size_t i;
+        if (opal_common_ucx_thread_enabled) {
+            for (i = 0; i < winfo->comm_size; i++) {
+                if (NULL != winfo->endpoints[i]) {
+                    ucp_ep_destroy(winfo->endpoints[i]);
+                    OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(opal_common_ucx_ep_counts, -1);
+                }
+                assert(winfo->inflight_ops[i] == 0);
+            }
+        }
+        free(winfo->endpoints);
+        free(winfo->inflight_ops);
+    }
+    winfo->endpoints = NULL;
+    winfo->comm_size = 0;
     opal_list_remove_item(&wpool->active_workers, &winfo->super);
     opal_list_prepend(&wpool->idle_workers, &winfo->super);
     opal_mutex_unlock(&wpool->mutex);
@@ -632,6 +652,7 @@ static int _tlocal_ctx_connect(_ctx_record_t *ctx_rec, int target)
     memset(&ep_params, 0, sizeof(ucp_ep_params_t));
     ep_params.field_mask = UCP_EP_PARAM_FIELD_REMOTE_ADDRESS;
 
+    assert(winfo->endpoints[target] == NULL);
     opal_mutex_lock(&winfo->mutex);
     displ = gctx->recv_worker_displs[target];
     ep_params.address = (ucp_address_t *) &(gctx->recv_worker_addrs[displ]);
@@ -641,7 +662,9 @@ static int _tlocal_ctx_connect(_ctx_record_t *ctx_rec, int target)
         opal_mutex_unlock(&winfo->mutex);
         return OPAL_ERROR;
     }
+    OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(opal_common_ucx_ep_counts, 1);
     opal_mutex_unlock(&winfo->mutex);
+    assert(winfo->endpoints[target] != NULL);
     return OPAL_SUCCESS;
 }
 
@@ -662,6 +685,7 @@ static void _tlocal_mem_rec_cleanup(_mem_record_t *mem_rec)
     for (i = 0; i < mem_rec->gmem->ctx->comm_size; i++) {
         if (mem_rec->rkeys[i]) {
             ucp_rkey_destroy(mem_rec->rkeys[i]);
+            OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(opal_common_ucx_unpacked_rkey_counts, -1);
         }
     }
     opal_mutex_unlock(&mem_rec->winfo->mutex);
@@ -701,6 +725,7 @@ static int _tlocal_mem_create_rkey(_mem_record_t *mem_rec, ucp_ep_h ep, int targ
 
     opal_mutex_lock(&mem_rec->winfo->mutex);
     status = ucp_ep_rkey_unpack(ep, &gmem->mem_addrs[displ], &mem_rec->rkeys[target]);
+    OPAL_COMMON_UCX_DEBUG_ATOMIC_ADD(opal_common_ucx_unpacked_rkey_counts, 1);
    opal_mutex_unlock(&mem_rec->winfo->mutex);
     if (status != UCS_OK) {
         MCA_COMMON_UCX_VERBOSE(1, "ucp_ep_rkey_unpack failed: %d", status);
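
Neither counter is read anywhere in the hunks shown above. A hypothetical balance check at worker-pool teardown, assuming a debug build (the function name below is illustrative and not part of the patch):

    #if OPAL_ENABLE_DEBUG
    /* Hypothetical leak check: every endpoint created in _tlocal_ctx_connect()
     * and every rkey unpacked in _tlocal_mem_create_rkey() should have been
     * destroyed by the time the worker pool is torn down. */
    static void _wpool_debug_check_balance(void)
    {
        assert(0 == opal_common_ucx_ep_counts);
        assert(0 == opal_common_ucx_unpacked_rkey_counts);
    }
    #endif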