@@ -95,50 +95,6 @@ static int mca_pml_monitoring_component_open(void)
95
95
return OMPI_SUCCESS ;
96
96
}
97
97
98
- static int mca_pml_monitoring_component_close (void )
99
- {
100
- if ( !mca_common_monitoring_enabled ) return OMPI_SUCCESS ;
101
-
102
- /**
103
- * If this component is already active, then we are currently monitoring
104
- * the execution and this call to close if the one from MPI_Finalize.
105
- * Clean up and release the extra reference on ourselves.
106
- */
107
- if ( mca_pml_monitoring_active ) { /* Already active, turn off */
108
- pml_selected_component .pmlm_version .mca_close_component ();
109
- mca_base_component_repository_release ((mca_base_component_t * )& mca_pml_monitoring_component );
110
- mca_pml_monitoring_active = 0 ;
111
- return OMPI_SUCCESS ;
112
- }
113
-
114
- /**
115
- * We are supposed to monitor the execution. Save the winner PML component and
116
- * module, and swap it with ourselves. Increase our refcount so that we are
117
- * not dlclose.
118
- */
119
- if ( OPAL_SUCCESS != mca_base_component_repository_retain_component (mca_pml_monitoring_component .pmlm_version .mca_type_name ,
120
- mca_pml_monitoring_component .pmlm_version .mca_component_name ) ) {
121
- return OMPI_ERROR ;
122
- }
123
-
124
- /* Save a copy of the selected PML */
125
- pml_selected_component = mca_pml_base_selected_component ;
126
- pml_selected_module = mca_pml ;
127
- /* Install our interception layer */
128
- mca_pml_base_selected_component = mca_pml_monitoring_component ;
129
- mca_pml = mca_pml_monitoring_module ;
130
- /* Restore some of the original values: progress, flags, tags and context id */
131
- mca_pml .pml_progress = pml_selected_module .pml_progress ;
132
- mca_pml .pml_max_contextid = pml_selected_module .pml_max_contextid ;
133
- mca_pml .pml_max_tag = pml_selected_module .pml_max_tag ;
134
- /* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
135
- mca_pml .pml_flags = pml_selected_module .pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD ;
136
-
137
- mca_pml_monitoring_active = 1 ;
138
-
139
- return OMPI_SUCCESS ;
140
- }
141
-
142
98
static mca_pml_base_module_t *
143
99
mca_pml_monitoring_component_init (int * priority ,
144
100
bool enable_progress_threads ,
@@ -154,19 +110,72 @@ mca_pml_monitoring_component_init(int* priority,
154
110
155
111
static int mca_pml_monitoring_component_finish (void )
156
112
{
157
- if ( mca_common_monitoring_enabled && mca_pml_monitoring_active ) {
158
- /* Free internal data structure */
159
- mca_common_monitoring_finalize ();
113
+ if ( !mca_common_monitoring_enabled )
114
+ return OMPI_SUCCESS ;
115
+ if ( !mca_pml_monitoring_active ) {
116
+ /* The monitoring component priority is always low to guarantee that the component
117
+ * is never selected. Thus, the first time component_finish is called it is right
118
+ * after the selection of the best PML was done, and the perfect moment to intercept
119
+ * it. At this point we remove ourselves from ompi_pml_base_framework.framework_components
120
+ * so that the component never gets closed and unloaded and its VARs are safe for
121
+ * the rest of the execution.
122
+ */
123
+ mca_pml_base_component_t * component = NULL ;
124
+ mca_base_component_list_item_t * cli = NULL ;
125
+ OPAL_LIST_FOREACH (cli , & ompi_pml_base_framework .framework_components , mca_base_component_list_item_t ) {
126
+ component = (mca_pml_base_component_t * ) cli -> cli_component ;
127
+
128
+ if ( component == & mca_pml_monitoring_component ) {
129
+ opal_list_remove_item (& ompi_pml_base_framework .framework_components , (opal_list_item_t * )cli );
130
+ OBJ_RELEASE (cli );
131
+ break ;
132
+ }
133
+ }
134
+ /**
135
+ * We are supposed to monitor the execution. Save the winner PML component and
136
+ * module, and swap it with ourselves. No extra reference is taken here: the
137
+ * component is retained later, on the second call, just before it is closed.
138
+ */
139
+ /* Save a copy of the selected PML */
140
+ pml_selected_component = mca_pml_base_selected_component ;
141
+ pml_selected_module = mca_pml ;
142
+ /* Install our interception layer */
143
+ mca_pml_base_selected_component = mca_pml_monitoring_component ;
144
+ mca_pml = mca_pml_monitoring_module ;
145
+
146
+ /* Restore some of the original values: progress, flags, tags and context id */
147
+ mca_pml .pml_progress = pml_selected_module .pml_progress ;
148
+ mca_pml .pml_max_contextid = pml_selected_module .pml_max_contextid ;
149
+ mca_pml .pml_max_tag = pml_selected_module .pml_max_tag ;
150
+ /* Add MCA_PML_BASE_FLAG_REQUIRE_WORLD flag to ensure the hashtable is properly initialized */
151
+ mca_pml .pml_flags = pml_selected_module .pml_flags | MCA_PML_BASE_FLAG_REQUIRE_WORLD ;
152
+
153
+ mca_pml_monitoring_active = 1 ;
154
+ } else {
155
+ /**
156
+ * This is the second call to component_finish, and the component is actively
157
+ * intercepting the calls to the best PML. Time to stop and cleanly finalize ourselves.
158
+ */
159
+
160
160
/* Restore the original PML */
161
161
mca_pml_base_selected_component = pml_selected_component ;
162
162
mca_pml = pml_selected_module ;
163
163
/* Redirect the close call to the original PML */
164
164
pml_selected_component .pmlm_finalize ();
165
+
166
+ /* Free internal data structure */
167
+ mca_common_monitoring_finalize ();
168
+
165
169
/**
166
- * We should never release the last ref on the current
167
- * component or face forever punishement.
170
+ * We are in the component code itself, we need to prevent the dlloader from
171
+ * removing the code. This will result in minimal memory leaks, but it is the only
172
+ * way to remove most of the references to the component (including the *vars).
168
173
*/
169
- /* mca_base_component_repository_release(&mca_common_monitoring_component.pmlm_version); */
174
+ mca_base_component_repository_retain_component (mca_pml_monitoring_component .pmlm_version .mca_type_name ,
175
+ mca_pml_monitoring_component .pmlm_version .mca_component_name );
176
+ /* Release all memory and be gone. */
177
+ mca_base_component_close ((mca_base_component_t * )& mca_pml_monitoring_component ,
178
+ ompi_pml_base_framework .framework_output );
170
179
}
171
180
return OMPI_SUCCESS ;
172
181
}
@@ -188,7 +197,7 @@ mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
188
197
.mca_component_name = "monitoring" , /* MCA component name */
189
198
MCA_MONITORING_MAKE_VERSION ,
190
199
.mca_open_component = mca_pml_monitoring_component_open , /* component open */
191
- .mca_close_component = mca_pml_monitoring_component_close , /* component close */
200
+ .mca_close_component = NULL , /* component close */
192
201
.mca_register_component_params = mca_pml_monitoring_component_register
193
202
},
194
203
.pmlm_data = {
0 commit comments