Skip to content

Commit 295dc76

Browse files
committed
smcuda: fix edge case when using --enable-mca-dso
Turns out singleton and single-process-per-node runs hit an edge case - a segfault in MPI_Finalize - without this patch. Related to #11627. Signed-off-by: Howard Pritchard <howardp@lanl.gov>
1 parent da6d715 commit 295dc76

File tree

3 files changed

+21
-12
lines changed

3 files changed

+21
-12
lines changed

opal/mca/btl/smcuda/btl_smcuda_accelerator.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ static int accelerator_event_max = 400;
3737
static int accelerator_event_ipc_most = 0;
3838
static bool smcuda_accelerator_initialized = false;
3939

40-
static void mca_btl_smcuda_accelerator_fini(void);
40+
void mca_btl_smcuda_accelerator_fini(void);
4141

4242
int mca_btl_smcuda_accelerator_init(void)
4343
{
@@ -83,14 +83,6 @@ int mca_btl_smcuda_accelerator_init(void)
8383
goto cleanup_and_error;
8484
}
8585

86-
/*
87-
* add smcuda acclerator fini code to opal's list of cleanup functions.
88-
* Cleanups are called before all the MCA frameworks are closed, so by
89-
* adding this function to the callback list, we avoid issues with ordering
90-
* of the closing of the BTL framework with the accelerator framework, etc. etc.
91-
*/
92-
opal_finalize_register_cleanup(mca_btl_smcuda_accelerator_fini);
93-
9486
smcuda_accelerator_initialized = true;
9587

9688
cleanup_and_error:
@@ -115,7 +107,7 @@ int mca_btl_smcuda_accelerator_init(void)
115107
return rc;
116108
}
117109

118-
static void mca_btl_smcuda_accelerator_fini(void)
110+
void mca_btl_smcuda_accelerator_fini(void)
119111
{
120112
int i;
121113

opal/mca/btl/smcuda/btl_smcuda_accelerator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,6 @@ OPAL_DECLSPEC int mca_btl_smcuda_accelerator_init(void);
2222
OPAL_DECLSPEC int mca_btl_smcuda_progress_one_ipc_event(struct mca_btl_base_descriptor_t **frag);
2323
OPAL_DECLSPEC int mca_btl_smcuda_memcpy(void *dst, void *src, size_t amount, char *msg,
2424
struct mca_btl_base_descriptor_t *frag);
25+
OPAL_DECLSPEC void mca_btl_smcuda_accelerator_fini(void);
2526

2627
#endif /* MCA_BTL_SMCUDA_ACCELERATOR_H */

opal/mca/btl/smcuda/btl_smcuda_component.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,6 @@ static int mca_btl_smcuda_component_open(void)
265265
OBJ_CONSTRUCT(&mca_btl_smcuda_component.sm_frags_user, opal_free_list_t);
266266
OBJ_CONSTRUCT(&mca_btl_smcuda_component.pending_send_fl, opal_free_list_t);
267267

268-
opal_finalize_register_cleanup(mca_btl_smcuda_component_fini);
269-
270268
return OPAL_SUCCESS;
271269
}
272270

@@ -283,6 +281,8 @@ static void mca_btl_smcuda_component_fini(void)
283281
{
284282
int rc;
285283

284+
mca_btl_smcuda_accelerator_fini();
285+
286286
OBJ_DESTRUCT(&mca_btl_smcuda_component.sm_lock);
287287
/**
288288
* We don't have to destroy the fragment lists. They are allocated
@@ -892,6 +892,22 @@ mca_btl_smcuda_component_init(int *num_btls, bool enable_progress_threads, bool
892892
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control;
893893
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL;
894894

895+
/*
896+
* add smcuda component fini code to opal's list of cleanup functions.
897+
* Cleanups are called before all the MCA frameworks are closed, so by
898+
* of the closing of the BTL framework with the accelerator framework, etc. etc.
899+
* We add it here in the btl_init routine as it's possible under
900+
* certain scenarios that one of the steps above in this routine will fail,
901+
* resulting in a NULL return value and causing the btl component selector to close
902+
* the btl. This can also happen in normal operation, for instance for singleton
903+
* where the smcuda is closed during mpi initialization. We don't want
904+
* to add a cleanup callback if no btls were returned.
905+
*/
906+
907+
if (NULL != btls) {
908+
opal_finalize_register_cleanup(mca_btl_smcuda_component_fini);
909+
}
910+
895911
return btls;
896912
}
897913

0 commit comments

Comments
 (0)