@@ -176,6 +176,7 @@ static int spml_ucx_init(void)
176
176
}
177
177
178
178
OBJ_CONSTRUCT (& (mca_spml_ucx .ctx_list ), opal_list_t );
179
+ OBJ_CONSTRUCT (& (mca_spml_ucx .idle_ctx_list ), opal_list_t );
179
180
SHMEM_MUTEX_INIT (mca_spml_ucx .internal_mutex );
180
181
181
182
wkr_params .field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE ;
@@ -224,42 +225,81 @@ mca_spml_ucx_component_init(int* priority,
224
225
return & mca_spml_ucx .super ;
225
226
}
226
227
228
+ static void _ctx_cleanup (mca_spml_ucx_ctx_list_item_t * ctx_item )
229
+ {
230
+ int i , j , nprocs = oshmem_num_procs ();
231
+ opal_common_ucx_del_proc_t * del_procs ;
232
+
233
+ del_procs = malloc (sizeof (* del_procs ) * nprocs );
234
+
235
+ for (i = 0 ; i < nprocs ; ++ i ) {
236
+ for (j = 0 ; j < MCA_MEMHEAP_SEG_COUNT ; j ++ ) {
237
+ if (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey != NULL ) {
238
+ ucp_rkey_destroy (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey );
239
+ }
240
+ }
241
+
242
+ del_procs [i ].ep = ctx_item -> ctx .ucp_peers [i ].ucp_conn ;
243
+ del_procs [i ].vpid = i ;
244
+ ctx_item -> ctx .ucp_peers [i ].ucp_conn = NULL ;
245
+ }
246
+
247
+ opal_common_ucx_del_procs_nofence (del_procs , nprocs , oshmem_my_proc_id (),
248
+ mca_spml_ucx .num_disconnect ,
249
+ ctx_item -> ctx .ucp_worker );
250
+ free (del_procs );
251
+ free (ctx_item -> ctx .ucp_peers );
252
+ }
253
+
227
254
static int mca_spml_ucx_component_fini (void )
228
255
{
229
256
mca_spml_ucx_ctx_list_item_t * ctx_item , * next ;
230
- size_t i , j , nprocs = oshmem_num_procs ();
257
+ int fenced = 0 ;
258
+ int ret = OSHMEM_SUCCESS ;
231
259
232
260
opal_progress_unregister (spml_ucx_progress );
233
261
234
262
if (!mca_spml_ucx .enabled )
235
263
return OSHMEM_SUCCESS ; /* never selected.. return success.. */
236
264
237
265
/* delete context objects from list */
238
- OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
266
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
239
267
mca_spml_ucx_ctx_list_item_t ) {
240
- opal_list_remove_item (& (mca_spml_ucx .ctx_list ), & ctx_item -> super );
268
+ _ctx_cleanup (ctx_item );
269
+ }
241
270
242
- opal_common_ucx_del_proc_t * del_procs ;
243
- del_procs = malloc (sizeof (* del_procs ) * nprocs );
271
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
272
+ mca_spml_ucx_ctx_list_item_t ) {
273
+ _ctx_cleanup (ctx_item );
274
+ }
244
275
245
- for (i = 0 ; i < nprocs ; ++ i ) {
246
- for (j = 0 ; j < MCA_MEMHEAP_SEG_COUNT ; j ++ ) {
247
- if (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey != NULL ) {
248
- ucp_rkey_destroy (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey );
249
- }
250
- }
276
+ ret = opal_common_ucx_mca_pmix_fence_nb (& fenced );
277
+ if (OPAL_SUCCESS != ret ) {
278
+ return ret ;
279
+ }
251
280
252
- del_procs [i ].ep = ctx_item -> ctx .ucp_peers [i ].ucp_conn ;
253
- del_procs [i ].vpid = i ;
254
- ctx_item -> ctx .ucp_peers [i ].ucp_conn = NULL ;
281
+ while (!fenced ) {
282
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
283
+ mca_spml_ucx_ctx_list_item_t ) {
284
+ ucp_worker_progress (ctx_item -> ctx .ucp_worker );
255
285
}
286
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
287
+ mca_spml_ucx_ctx_list_item_t ) {
288
+ ucp_worker_progress (ctx_item -> ctx .ucp_worker );
289
+ }
290
+ ucp_worker_progress (mca_spml_ucx_ctx_default .ucp_worker );
291
+ }
256
292
257
- opal_common_ucx_del_procs (del_procs , nprocs , oshmem_my_proc_id (),
258
- mca_spml_ucx .num_disconnect ,
259
- ctx_item -> ctx .ucp_worker );
260
- free (del_procs );
261
- free (ctx_item -> ctx .ucp_peers );
262
-
293
+ /* delete all workers */
294
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
295
+ mca_spml_ucx_ctx_list_item_t ) {
296
+ opal_list_remove_item (& (mca_spml_ucx .idle_ctx_list ), & ctx_item -> super );
297
+ ucp_worker_destroy (ctx_item -> ctx .ucp_worker );
298
+ OBJ_RELEASE (ctx_item );
299
+ }
300
+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
301
+ mca_spml_ucx_ctx_list_item_t ) {
302
+ opal_list_remove_item (& (mca_spml_ucx .ctx_list ), & ctx_item -> super );
263
303
ucp_worker_destroy (ctx_item -> ctx .ucp_worker );
264
304
OBJ_RELEASE (ctx_item );
265
305
}
@@ -271,6 +311,7 @@ static int mca_spml_ucx_component_fini(void)
271
311
mca_spml_ucx .enabled = false; /* not anymore */
272
312
273
313
OBJ_DESTRUCT (& (mca_spml_ucx .ctx_list ));
314
+ OBJ_DESTRUCT (& (mca_spml_ucx .idle_ctx_list ));
274
315
SHMEM_MUTEX_DESTROY (mca_spml_ucx .internal_mutex );
275
316
276
317
if (mca_spml_ucx .ucp_context ) {
0 commit comments