Skip to content

Commit dc972f0

Browse files
committed
Fix the PML monitoring.
The monitoring PML hides its existence from the OMPI infrastructure by removing itself from the list of loaded PML components, remaining hidden until MPI_Finalize. Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent 668aa15 commit dc972f0

File tree

2 files changed

+70
-55
lines changed

2 files changed

+70
-55
lines changed

ompi/mca/pml/monitoring/pml_monitoring_component.c

Lines changed: 60 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -95,50 +95,6 @@ static int mca_pml_monitoring_component_open(void)
9595
return OMPI_SUCCESS;
9696
}
9797

98-
static int mca_pml_monitoring_component_close(void)
99-
{
100-
if( !mca_common_monitoring_enabled ) return OMPI_SUCCESS;
101-
102-
/**
103-
* If this component is already active, then we are currently monitoring
104-
* the execution and this call to close is the one from MPI_Finalize.
105-
* Clean up and release the extra reference on ourselves.
106-
*/
107-
if( mca_pml_monitoring_active ) { /* Already active, turn off */
108-
pml_selected_component.pmlm_version.mca_close_component();
109-
mca_base_component_repository_release((mca_base_component_t*)&mca_pml_monitoring_component);
110-
mca_pml_monitoring_active = 0;
111-
return OMPI_SUCCESS;
112-
}
113-
114-
/**
115-
* We are supposed to monitor the execution. Save the winner PML component and
116-
* module, and swap it with ourselves. Increase our refcount so that we are
117-
* not dlclose.
118-
*/
119-
if( OPAL_SUCCESS != mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
120-
mca_pml_monitoring_component.pmlm_version.mca_component_name) ) {
121-
return OMPI_ERROR;
122-
}
123-
124-
/* Save a copy of the selected PML */
125-
pml_selected_component = mca_pml_base_selected_component;
126-
pml_selected_module = mca_pml;
127-
/* Install our interception layer */
128-
mca_pml_base_selected_component = mca_pml_monitoring_component;
129-
mca_pml = mca_pml_monitoring_module;
130-
/* Restore some of the original values: progress, flags, tags and context id */
131-
mca_pml.pml_progress = pml_selected_module.pml_progress;
132-
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
133-
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
134-
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
135-
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
136-
137-
mca_pml_monitoring_active = 1;
138-
139-
return OMPI_SUCCESS;
140-
}
141-
14298
static mca_pml_base_module_t*
14399
mca_pml_monitoring_component_init(int* priority,
144100
bool enable_progress_threads,
@@ -154,19 +110,72 @@ mca_pml_monitoring_component_init(int* priority,
154110

155111
static int mca_pml_monitoring_component_finish(void)
156112
{
157-
if( mca_common_monitoring_enabled && mca_pml_monitoring_active ) {
158-
/* Free internal data structure */
159-
mca_common_monitoring_finalize();
113+
if( !mca_common_monitoring_enabled )
114+
return OMPI_SUCCESS;
115+
if( !mca_pml_monitoring_active ) {
116+
/* The monitoring component priority is always low to guarantee that the component
117+
* is never selected. Thus, the first time component_finish is called it is right
118+
* after the selection of the best PML was done, and the perfect moment to intercept
119+
* it. At this point we remove ourselves from ompi_pml_base_framework.framework_components
120+
* so that the component never gets closed and unloaded and its VARs are safe for
121+
* the rest of the execution.
122+
*/
123+
mca_pml_base_component_t *component = NULL;
124+
mca_base_component_list_item_t *cli = NULL;
125+
OPAL_LIST_FOREACH(cli, &ompi_pml_base_framework.framework_components, mca_base_component_list_item_t) {
126+
component = (mca_pml_base_component_t *) cli->cli_component;
127+
128+
if( component == &mca_pml_monitoring_component ) {
129+
opal_list_remove_item(&ompi_pml_base_framework.framework_components, (opal_list_item_t*)cli);
130+
OBJ_RELEASE(cli);
131+
break;
132+
}
133+
}
134+
/**
135+
* We are supposed to monitor the execution. Save the winner PML component and
136+
* module, and swap it with ourselves. Increase our refcount so that we are
137+
* not dlclose.
138+
*/
139+
/* Save a copy of the selected PML */
140+
pml_selected_component = mca_pml_base_selected_component;
141+
pml_selected_module = mca_pml;
142+
/* Install our interception layer */
143+
mca_pml_base_selected_component = mca_pml_monitoring_component;
144+
mca_pml = mca_pml_monitoring_module;
145+
146+
/* Restore some of the original values: progress, flags, tags and context id */
147+
mca_pml.pml_progress = pml_selected_module.pml_progress;
148+
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
149+
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
150+
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
151+
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
152+
153+
mca_pml_monitoring_active = 1;
154+
} else {
155+
/**
156+
* This is the second call to component_finalize, and the component is actively
157+
* intercepting the calls to the best PML. Time to stop and cleanly finalize ourself.
158+
*/
159+
160160
/* Restore the original PML */
161161
mca_pml_base_selected_component = pml_selected_component;
162162
mca_pml = pml_selected_module;
163163
/* Redirect the close call to the original PML */
164164
pml_selected_component.pmlm_finalize();
165+
166+
/* Free internal data structure */
167+
mca_common_monitoring_finalize();
168+
165169
/**
166-
* We should never release the last ref on the current
167-
* component or face forever punishement.
170+
* We are in the component code itself, we need to prevent the dlloader from
171+
* removing the code. This will result in minimal memory leaks, but it is the only
172+
* way to remove most of the references to the component (including the *vars).
168173
*/
169-
/* mca_base_component_repository_release(&mca_common_monitoring_component.pmlm_version); */
174+
mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
175+
mca_pml_monitoring_component.pmlm_version.mca_component_name);
176+
/* Release all memory and be gone. */
177+
mca_base_component_close((mca_base_component_t*)&mca_pml_monitoring_component,
178+
ompi_pml_base_framework.framework_output);
170179
}
171180
return OMPI_SUCCESS;
172181
}
@@ -188,7 +197,7 @@ mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
188197
.mca_component_name = "monitoring", /* MCA component name */
189198
MCA_MONITORING_MAKE_VERSION,
190199
.mca_open_component = mca_pml_monitoring_component_open, /* component open */
191-
.mca_close_component = mca_pml_monitoring_component_close, /* component close */
200+
.mca_close_component = NULL, /* component close */
192201
.mca_register_component_params = mca_pml_monitoring_component_register
193202
},
194203
.pmlm_data = {

opal/mca/base/mca_base_components_close.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,16 @@ void mca_base_component_close (const mca_base_component_t *component, int output
5050
{
5151
/* Close */
5252
if (NULL != component->mca_close_component) {
53-
component->mca_close_component();
54-
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
55-
"mca: base: close: component %s closed",
56-
component->mca_component_name);
53+
if( OPAL_SUCCESS == component->mca_close_component() ) {
54+
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
55+
"mca: base: close: component %s closed",
56+
component->mca_component_name);
57+
} else {
58+
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
59+
"mca: base: close: component %s refused to close [drop it]",
60+
component->mca_component_name);
61+
return;
62+
}
5763
}
5864

5965
mca_base_component_unload (component, output_id);

0 commit comments

Comments
 (0)