@@ -130,7 +130,11 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
     orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
     orte_topology_t *t;
     orte_node_t *node;
-    int i;
+    int i, rc;
+    uint8_t u8;
+    opal_buffer_t buf;
+    orte_grpcomm_signature_t *sig;
+    orte_daemon_cmd_flag_t command = ORTE_DAEMON_PASS_NODE_INFO_CMD;

     ORTE_ACQUIRE_OBJECT(caddy);

@@ -177,6 +181,78 @@ void orte_plm_base_daemons_reported(int fd, short args, void *cbdata)
     /* ensure we update the routing plan */
     orte_routed.update_routing_plan(NULL);

+    /* prep the buffer */
+    OBJ_CONSTRUCT(&buf, opal_buffer_t);
+    /* load the command */
+    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &command, 1, ORTE_DAEMON_CMD))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&buf);
+        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+
+
+    /* if we did not execute a tree-spawn, then the daemons do
+     * not currently have a nidmap for the job - in that case,
+     * send one to them */
+    if (!orte_nidmap_communicated) {
+        u8 = 1;
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_DESTRUCT(&buf);
+            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            OBJ_RELEASE(caddy);
+            return;
+        }
+        if (OPAL_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, &buf))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_DESTRUCT(&buf);
+            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            OBJ_RELEASE(caddy);
+            return;
+        }
+        orte_nidmap_communicated = true;
+    } else {
+        u8 = 0;
+        if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &u8, 1, OPAL_UINT8))) {
+            ORTE_ERROR_LOG(rc);
+            OBJ_DESTRUCT(&buf);
+            ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+            OBJ_RELEASE(caddy);
+            return;
+        }
+    }
+
+    /* we always send the topologies and the #slots on each node. Note
+     * that we cannot send the #slots until after the above step since,
+     * for unmanaged allocations, we might have just determined it! */
+    if (OPAL_SUCCESS != (rc = orte_util_pass_node_info(&buf))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_DESTRUCT(&buf);
+        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+
+    /* goes to all daemons */
+    sig = OBJ_NEW(orte_grpcomm_signature_t);
+    sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t));
+    sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid;
+    sig->signature[0].vpid = ORTE_VPID_WILDCARD;
+    sig->sz = 1;
+    if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, &buf))) {
+        ORTE_ERROR_LOG(rc);
+        OBJ_RELEASE(sig);
+        OBJ_DESTRUCT(&buf);
+        ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
+        OBJ_RELEASE(caddy);
+        return;
+    }
+    OBJ_DESTRUCT(&buf);
+    /* maintain accounting */
+    OBJ_RELEASE(sig);
+
     /* progress the job */
     caddy->jdata->state = ORTE_JOB_STATE_DAEMONS_REPORTED;
     ORTE_ACTIVATE_JOB_STATE(caddy->jdata, ORTE_JOB_STATE_VM_READY);
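For context on how this payload would be consumed: each daemon unpacks the buffer in the same order it is packed above - the daemon command, then the uint8 flag saying whether a nidmap follows, then the topology/#slots data. The sketch below only illustrates that ordering; the function name is hypothetical and the decode routines are omitted, so it is not the actual ORTE daemon command handler.

/* hypothetical sketch - mirrors the pack order used in the hunk above;
 * not the real ORTE daemon code, assumes the usual ORTE/OPAL headers */
static void sketch_handle_pass_node_info(opal_buffer_t *buffer)
{
    int rc;
    int32_t n = 1;
    uint8_t flag;

    /* the ORTE_DAEMON_CMD itself is assumed to have already been
     * unpacked by the daemon's command dispatcher */

    /* unpack the "nidmap included" flag packed above */
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &n, OPAL_UINT8))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    if (1 == flag) {
        /* a nidmap was included (non-tree-spawn case) - decode it here
         * before reading anything else (decode call omitted) */
    }
    /* finally, decode the topology and #slots data appended by
     * orte_util_pass_node_info() (decode call omitted) */
}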