@@ -832,121 +832,162 @@ static void _toolconn(int sd, short args, void *cbdata)
     orte_job_t *jdata;
     orte_app_context_t *app;
     orte_proc_t *proc;
-    orte_node_t *node;
-    orte_process_name_t tool;
-    int rc;
+    orte_node_t *node, *nptr;
+    char *hostname = NULL;
+    orte_process_name_t tool = {ORTE_JOBID_INVALID, ORTE_VPID_INVALID};
+    int rc, i;
     opal_value_t *val;
-    bool flag;
+    bool flag = false, flag_given = false;
 
     ORTE_ACQUIRE_OBJECT(cd);
 
     opal_output_verbose(2, orte_pmix_server_globals.output,
                         "%s TOOL CONNECTION PROCESSING",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
 
-    /* if we are the HNP, we can directly assign the jobid */
-    if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_MASTER) {
-        jdata = OBJ_NEW(orte_job_t);
-        rc = orte_plm_base_create_jobid(jdata);
-        if (ORTE_SUCCESS != rc) {
-            tool.jobid = ORTE_JOBID_INVALID;
+    /* check for directives */
+    if (NULL != cd->info) {
+        OPAL_LIST_FOREACH(val, cd->info, opal_value_t) {
+            if (0 == strcmp(val->key, OPAL_PMIX_EVENT_SILENT_TERMINATION)) {
+                if (OPAL_UNDEF == val->type || val->data.flag) {
+                    flag = true;
+                    flag_given = true;
+                }
+            } else if (0 == strcmp(val->key, OPAL_PMIX_NSPACE)) {
+                tool.jobid = val->data.name.jobid;
+            } else if (0 == strcmp(val->key, OPAL_PMIX_RANK)) {
+                tool.vpid = val->data.name.vpid;
+            } else if (0 == strcmp(val->key, OPAL_PMIX_HOSTNAME)) {
+                hostname = strdup(val->data.string);
+            }
+        }
+    }
+
+    /* if we are not the HNP or master, and the tool doesn't
+     * already have a name (i.e., we didn't spawn it), then
+     * there is nothing we can currently do.
+     * Eventually, when we switch to nspace instead of an
+     * integer jobid, we'll just locally assign this value */
+    if (ORTE_JOBID_INVALID == tool.jobid ||
+        ORTE_VPID_INVALID == tool.vpid) {
+        /* if we are the HNP, we can directly assign the jobid */
+        if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_MASTER) {
+            jdata = OBJ_NEW(orte_job_t);
+            rc = orte_plm_base_create_jobid(jdata);
+            if (ORTE_SUCCESS != rc) {
+                OBJ_RELEASE(jdata);
+                if (NULL != cd->toolcbfunc) {
+                    cd->toolcbfunc(ORTE_ERROR, tool, cd->cbdata);
+                }
+                OBJ_RELEASE(cd);
+                return;
+            }
+            tool.jobid = jdata->jobid;
             tool.vpid = 0;
+        } else {
+            /* we currently do not support connections to non-HNP/master
+             * daemons from tools that were not spawned by a daemon */
             if (NULL != cd->toolcbfunc) {
-                cd->toolcbfunc(rc, tool, cd->cbdata);
+                cd->toolcbfunc(ORTE_ERR_NOT_SUPPORTED, tool, cd->cbdata);
             }
             OBJ_RELEASE(cd);
             return;
         }
-        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
-        /* setup some required job-level fields in case this
-         * tool calls spawn, or uses some other functions that
-         * need them */
-        /* must create a map for it (even though it has no
-         * info in it) so that the job info will be picked
-         * up in subsequent pidmaps or other daemons won't
-         * know how to route
-         */
-        jdata->map = OBJ_NEW(orte_job_map_t);
-
-        /* setup an app_context for the singleton */
-        app = OBJ_NEW(orte_app_context_t);
-        app->app = strdup("tool");
-        app->num_procs = 1;
-        opal_pointer_array_add(jdata->apps, app);
-        jdata->num_apps = 1;
-
-        /* setup a proc object for the singleton - since we
-         * -must- be the HNP, and therefore we stored our
-         * node on the global node pool, and since the singleton
-         * -must- be on the same node as us, indicate that
-         */
-        proc = OBJ_NEW(orte_proc_t);
-        proc->name.jobid = jdata->jobid;
-        proc->name.vpid = 0;
-        proc->parent = ORTE_PROC_MY_NAME->vpid;
-        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
-        proc->state = ORTE_PROC_STATE_RUNNING;
-        proc->app_idx = 0;
-        /* obviously, it is on my node */
+    } else {
+        jdata = OBJ_NEW(orte_job_t);
+        jdata->jobid = tool.jobid;
+    }
+
+    opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
+    /* setup some required job-level fields in case this
+     * tool calls spawn, or uses some other functions that
+     * need them */
+    /* must create a map for it (even though it has no
+     * info in it) so that the job info will be picked
+     * up in subsequent pidmaps or other daemons won't
+     * know how to route
+     */
+    jdata->map = OBJ_NEW(orte_job_map_t);
+
+    /* setup an app_context for the singleton */
+    app = OBJ_NEW(orte_app_context_t);
+    app->app = strdup("tool");
+    app->num_procs = 1;
+    opal_pointer_array_add(jdata->apps, app);
+    jdata->num_apps = 1;
+
+    /* setup a proc object for the singleton - since we
+     * -must- be the HNP, and therefore we stored our
+     * node on the global node pool, and since the singleton
+     * -must- be on the same node as us, indicate that
+     */
+    proc = OBJ_NEW(orte_proc_t);
+    proc->name.jobid = jdata->jobid;
+    proc->name.vpid = tool.vpid;
+    proc->parent = ORTE_PROC_MY_NAME->vpid;
+    ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
+    ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_TOOL);
+    proc->state = ORTE_PROC_STATE_RUNNING;
+    /* set the trivial */
+    proc->local_rank = 0;
+    proc->node_rank = 0;
+    proc->app_rank = 0;
+    proc->app_idx = 0;
+    if (NULL == hostname) {
+        /* it is on my node */
         node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
-        proc->node = node;
-        OBJ_RETAIN(node);  /* keep accounting straight */
-        opal_pointer_array_add(jdata->procs, proc);
-        jdata->num_procs = 1;
-        /* add the node to the job map */
-        OBJ_RETAIN(node);
-        opal_pointer_array_add(jdata->map->nodes, node);
-        jdata->map->num_nodes++;
-        /* and it obviously is on the node - note that
-         * we do _not_ increment the #procs on the node
-         * as the tool doesn't count against the slot
-         * allocation */
-        OBJ_RETAIN(proc);
-        opal_pointer_array_add(node->procs, proc);
-        /* set the trivial */
-        proc->local_rank = 0;
-        proc->node_rank = 0;
-        proc->app_rank = 0;
-        proc->state = ORTE_PROC_STATE_RUNNING;
-        proc->app_idx = 0;
         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
-
-        /* check for directives */
-        if (NULL != cd->info) {
-            OPAL_LIST_FOREACH(val, cd->info, opal_value_t) {
-                if (0 == strcmp(val->key, OPAL_PMIX_EVENT_SILENT_TERMINATION)) {
-                    if (OPAL_UNDEF == val->type || val->data.flag) {
-                        flag = true;
-                        orte_set_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION,
-                                           ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
-                    }
-                }
+    } else {
+        /* we need to locate it */
+        node = NULL;
+        for (i = 0; i < orte_node_pool->size; i++) {
+            if (NULL == (nptr = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
+                continue;
             }
+            if (0 == strcmp(hostname, nptr->name)) {
+                node = nptr;
+                break;
+            }
+        }
+        if (NULL == node) {
+            /* not in our allocation - which is still okay */
+            node = OBJ_NEW(orte_node_t);
+            node->name = strdup(hostname);
+            ORTE_FLAG_SET(node, ORTE_NODE_NON_USABLE);
+            opal_pointer_array_add(orte_node_pool, node);
         }
+    }
+    proc->node = node;
+    OBJ_RETAIN(node);  /* keep accounting straight */
+    opal_pointer_array_add(jdata->procs, proc);
+    jdata->num_procs = 1;
+    /* add the node to the job map */
+    OBJ_RETAIN(node);
+    opal_pointer_array_add(jdata->map->nodes, node);
+    jdata->map->num_nodes++;
+    /* and it obviously is on the node - note that
+     * we do _not_ increment the #procs on the node
+     * as the tool doesn't count against the slot
+     * allocation */
+    OBJ_RETAIN(proc);
+    opal_pointer_array_add(node->procs, proc);
+    /* if they indicated a preference for termination, set it */
+    if (flag_given) {
+        orte_set_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION,
+                           ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
+    } else {
+        /* we default to silence */
         flag = true;
         orte_set_attribute(&jdata->attributes, ORTE_JOB_SILENT_TERMINATION,
                            ORTE_ATTR_GLOBAL, &flag, OPAL_BOOL);
-
-        /* pass back the assigned jobid */
-        tool.jobid = jdata->jobid;
-        tool.vpid = 0;
-        if (NULL != cd->toolcbfunc) {
-            cd->toolcbfunc(rc, tool, cd->cbdata);
-        }
-        OBJ_RELEASE(cd);
-        return;
     }
 
-    /* otherwise, we have to send the request to the HNP.
-     * Eventually, when we switch to nspace instead of an
-     * integer jobid, we'll just locally assign this value */
-    tool.jobid = ORTE_JOBID_INVALID;
-    tool.vpid = ORTE_VPID_INVALID;
     if (NULL != cd->toolcbfunc) {
         cd->toolcbfunc(ORTE_ERR_NOT_SUPPORTED, tool, cd->cbdata);
     }
     OBJ_RELEASE(cd);
 }
+
 void pmix_tool_connected_fn(opal_list_t *info,
                             opal_pmix_tool_connection_cbfunc_t cbfunc,
                             void *cbdata)
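
For reference, the directive list consumed by the new block at the top of _toolconn is a plain opal_list_t of opal_value_t items. Below is a minimal caller sketch; the function and callback names (example_tool_reg, my_toolconn_complete), the hostname value, and the exact header paths are assumptions for illustration, not part of this commit. Only the OPAL_PMIX_* keys and the opal_value_t usage mirror the patched code.

/* Hypothetical caller sketch - not part of this commit.
 * Header paths are approximate for this source tree. */
#include <stdbool.h>
#include <string.h>
#include "opal/class/opal_list.h"
#include "opal/dss/dss_types.h"          /* opal_value_t */
#include "opal/mca/pmix/pmix_types.h"    /* OPAL_PMIX_* keys */
#include "orte/orted/pmix/pmix_server.h" /* pmix_tool_connected_fn (assumed location) */

/* completion callback, matching the way cd->toolcbfunc is invoked above */
static void my_toolconn_complete(int status, orte_process_name_t tool, void *cbdata)
{
    /* the tool's assigned (or caller-supplied) name arrives here */
    (void)status; (void)tool; (void)cbdata;
}

static void example_tool_reg(void)
{
    opal_list_t *info = OBJ_NEW(opal_list_t);
    opal_value_t *kv;

    /* name the host the tool runs on, so _toolconn can find
     * (or create) the matching orte_node_t */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_HOSTNAME);
    kv->type = OPAL_STRING;
    kv->data.string = strdup("node01");   /* hypothetical host */
    opal_list_append(info, &kv->super);

    /* explicitly request silent termination; the patched code
     * defaults to silence when the key is absent anyway */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_EVENT_SILENT_TERMINATION);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(info, &kv->super);

    pmix_tool_connected_fn(info, my_toolconn_complete, NULL);
}

A caller that already knows the tool's name would instead supply OPAL_PMIX_NSPACE and OPAL_PMIX_RANK, steering _toolconn into the new else branch that reuses the given jobid rather than creating one.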
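Once opal_hash_table_set_value_uint32() registers the new jdata above, any later code path can recover the tool's job object by jobid. A sketch of the matching lookup, assuming the standard OPAL hash-table getter:

/* Sketch only - assumes the standard OPAL hash-table getter */
orte_job_t *jptr = NULL;
if (OPAL_SUCCESS == opal_hash_table_get_value_uint32(orte_job_data,
                                                     tool.jobid,
                                                     (void**)&jptr)) {
    /* jptr now points at the job object created in _toolconn */
}

This registration is what lets the rest of ORTE treat the connected tool like any other job for routing and spawn bookkeeping.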