Skip to content

Commit c3abedb

Browse files
authored
Merge pull request #5759 from bosilca/fix/monitoring
Fix/monitoring
2 parents 8db5aaa + dc972f0 commit c3abedb

File tree

3 files changed

+78
-63
lines changed

3 files changed

+78
-63
lines changed

ompi/mca/pml/base/pml_base_select.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,14 @@ int mca_pml_base_select(bool enable_progress_threads,
193193
modex_reqd = true;
194194
}
195195

196+
/* Save the winner */
197+
198+
mca_pml_base_selected_component = *best_component;
199+
mca_pml = *best_module;
200+
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
201+
"select: component %s selected",
202+
mca_pml_base_selected_component.pmlm_version.mca_component_name );
203+
196204
/* Finalize all non-selected components */
197205

198206
for (item = opal_list_remove_first(&opened);
@@ -239,14 +247,6 @@ int mca_pml_base_select(bool enable_progress_threads,
239247
}
240248
#endif
241249

242-
/* Save the winner */
243-
244-
mca_pml_base_selected_component = *best_component;
245-
mca_pml = *best_module;
246-
opal_output_verbose( 10, ompi_pml_base_framework.framework_output,
247-
"select: component %s selected",
248-
mca_pml_base_selected_component.pmlm_version.mca_component_name );
249-
250250
/* This base function closes, unloads, and removes from the
251251
available list all unselected components. The available list will
252252
contain only the selected component. */

ompi/mca/pml/monitoring/pml_monitoring_component.c

Lines changed: 60 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -95,50 +95,6 @@ static int mca_pml_monitoring_component_open(void)
9595
return OMPI_SUCCESS;
9696
}
9797

98-
static int mca_pml_monitoring_component_close(void)
99-
{
100-
if( !mca_common_monitoring_enabled ) return OMPI_SUCCESS;
101-
102-
/**
103-
* If this component is already active, then we are currently monitoring
104-
* the execution and this call to close is the one from MPI_Finalize.
105-
* Clean up and release the extra reference on ourselves.
106-
*/
107-
if( mca_pml_monitoring_active ) { /* Already active, turn off */
108-
pml_selected_component.pmlm_version.mca_close_component();
109-
mca_base_component_repository_release((mca_base_component_t*)&mca_pml_monitoring_component);
110-
mca_pml_monitoring_active = 0;
111-
return OMPI_SUCCESS;
112-
}
113-
114-
/**
115-
* We are supposed to monitor the execution. Save the winner PML component and
116-
* module, and swap it with ourselves. Increase our refcount so that we are
117-
* not dlclose.
118-
*/
119-
if( OPAL_SUCCESS != mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
120-
mca_pml_monitoring_component.pmlm_version.mca_component_name) ) {
121-
return OMPI_ERROR;
122-
}
123-
124-
/* Save a copy of the selected PML */
125-
pml_selected_component = mca_pml_base_selected_component;
126-
pml_selected_module = mca_pml;
127-
/* Install our interception layer */
128-
mca_pml_base_selected_component = mca_pml_monitoring_component;
129-
mca_pml = mca_pml_monitoring_module;
130-
/* Restore some of the original values: progress, flags, tags and context id */
131-
mca_pml.pml_progress = pml_selected_module.pml_progress;
132-
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
133-
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
134-
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
135-
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
136-
137-
mca_pml_monitoring_active = 1;
138-
139-
return OMPI_SUCCESS;
140-
}
141-
14298
static mca_pml_base_module_t*
14399
mca_pml_monitoring_component_init(int* priority,
144100
bool enable_progress_threads,
@@ -154,19 +110,72 @@ mca_pml_monitoring_component_init(int* priority,
154110

155111
static int mca_pml_monitoring_component_finish(void)
156112
{
157-
if( mca_common_monitoring_enabled && mca_pml_monitoring_active ) {
158-
/* Free internal data structure */
159-
mca_common_monitoring_finalize();
113+
if( !mca_common_monitoring_enabled )
114+
return OMPI_SUCCESS;
115+
if( !mca_pml_monitoring_active ) {
116+
/* The monitoring component priority is always low to guarantee that the component
117+
* is never selected. Thus, the first time component_finish is called it is right
118+
* after the selection of the best PML was done, and the perfect moment to intercept
119+
* it. At this point we remove ourselves from ompi_pml_base_framework.framework_components
120+
* so that the component never gets closed and unloaded and its VARs are safe for
121+
* the rest of the execution.
122+
*/
123+
mca_pml_base_component_t *component = NULL;
124+
mca_base_component_list_item_t *cli = NULL;
125+
OPAL_LIST_FOREACH(cli, &ompi_pml_base_framework.framework_components, mca_base_component_list_item_t) {
126+
component = (mca_pml_base_component_t *) cli->cli_component;
127+
128+
if( component == &mca_pml_monitoring_component ) {
129+
opal_list_remove_item(&ompi_pml_base_framework.framework_components, (opal_list_item_t*)cli);
130+
OBJ_RELEASE(cli);
131+
break;
132+
}
133+
}
134+
/**
135+
* We are supposed to monitor the execution. Save the winner PML component and
136+
* module, and swap it with ourselves. Increase our refcount so that we are
137+
* not dlclosed.
138+
*/
139+
/* Save a copy of the selected PML */
140+
pml_selected_component = mca_pml_base_selected_component;
141+
pml_selected_module = mca_pml;
142+
/* Install our interception layer */
143+
mca_pml_base_selected_component = mca_pml_monitoring_component;
144+
mca_pml = mca_pml_monitoring_module;
145+
146+
/* Restore some of the original values: progress, flags, tags and context id */
147+
mca_pml.pml_progress = pml_selected_module.pml_progress;
148+
mca_pml.pml_max_contextid = pml_selected_module.pml_max_contextid;
149+
mca_pml.pml_max_tag = pml_selected_module.pml_max_tag;
150+
/* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
151+
mca_pml.pml_flags = pml_selected_module.pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD;
152+
153+
mca_pml_monitoring_active = 1;
154+
} else {
155+
/**
156+
* This is the second call to component_finalize, and the component is actively
157+
* intercepting the calls to the best PML. Time to stop and cleanly finalize ourself.
158+
*/
159+
160160
/* Restore the original PML */
161161
mca_pml_base_selected_component = pml_selected_component;
162162
mca_pml = pml_selected_module;
163163
/* Redirect the close call to the original PML */
164164
pml_selected_component.pmlm_finalize();
165+
166+
/* Free internal data structure */
167+
mca_common_monitoring_finalize();
168+
165169
/**
166-
* We should never release the last ref on the current
167-
* component or face forever punishment.
170+
* We are in the component code itself, so we need to prevent the dlloader from
171+
* removing the code. This will result in minimal memory leaks, but it is the only
172+
* way to remove most of the references to the component (including the *vars).
168173
*/
169-
/* mca_base_component_repository_release(&mca_common_monitoring_component.pmlm_version); */
174+
mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
175+
mca_pml_monitoring_component.pmlm_version.mca_component_name);
176+
/* Release all memory and be gone. */
177+
mca_base_component_close((mca_base_component_t*)&mca_pml_monitoring_component,
178+
ompi_pml_base_framework.framework_output);
170179
}
171180
return OMPI_SUCCESS;
172181
}
@@ -188,7 +197,7 @@ mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
188197
.mca_component_name = "monitoring", /* MCA component name */
189198
MCA_MONITORING_MAKE_VERSION,
190199
.mca_open_component = mca_pml_monitoring_component_open, /* component open */
191-
.mca_close_component = mca_pml_monitoring_component_close, /* component close */
200+
.mca_close_component = NULL, /* component close */
192201
.mca_register_component_params = mca_pml_monitoring_component_register
193202
},
194203
.pmlm_data = {

opal/mca/base/mca_base_components_close.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,16 @@ void mca_base_component_close (const mca_base_component_t *component, int output
5050
{
5151
/* Close */
5252
if (NULL != component->mca_close_component) {
53-
component->mca_close_component();
54-
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
55-
"mca: base: close: component %s closed",
56-
component->mca_component_name);
53+
if( OPAL_SUCCESS == component->mca_close_component() ) {
54+
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
55+
"mca: base: close: component %s closed",
56+
component->mca_component_name);
57+
} else {
58+
opal_output_verbose (MCA_BASE_VERBOSE_COMPONENT, output_id,
59+
"mca: base: close: component %s refused to close [drop it]",
60+
component->mca_component_name);
61+
return;
62+
}
5763
}
5864

5965
mca_base_component_unload (component, output_id);

0 commit comments

Comments
 (0)