42
42
43
43
#include "orte/mca/errmgr/errmgr.h"
44
44
#include "orte/mca/rmaps/base/base.h"
45
+ #include "orte/mca/rml/base/rml_contact.h"
45
46
#include "orte/mca/state/state.h"
46
47
#include "orte/util/name_fns.h"
47
48
#include "orte/util/show_help.h"
@@ -539,7 +540,14 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
539
540
int rc , cnt ;
540
541
opal_pmix_pdata_t * pdat ;
541
542
orte_job_t * jdata ;
542
- opal_buffer_t buf ;
543
+ orte_node_t * node ;
544
+ orte_proc_t * proc ;
545
+ opal_buffer_t buf , bucket ;
546
+ opal_byte_object_t * bo ;
547
+ orte_process_name_t dmn , pname ;
548
+ char * uri ;
549
+ opal_value_t val ;
550
+ opal_list_t nodes ;
543
551
544
552
ORTE_ACQUIRE_OBJECT (cd );
545
553
@@ -556,6 +564,7 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
556
564
pdat = (opal_pmix_pdata_t * )opal_list_get_first (data );
557
565
if (OPAL_BYTE_OBJECT != pdat -> value .type ) {
558
566
rc = ORTE_ERR_BAD_PARAM ;
567
+ ORTE_ERROR_LOG (rc );
559
568
goto release ;
560
569
}
561
570
/* the data will consist of a packed buffer with the job data in it */
@@ -565,15 +574,107 @@ static void _cnlk(int status, opal_list_t *data, void *cbdata)
565
574
pdat -> value .data .bo .size = 0 ;
566
575
cnt = 1 ;
567
576
if (OPAL_SUCCESS != (rc = opal_dss .unpack (& buf , & jdata , & cnt , ORTE_JOB ))) {
577
+ ORTE_ERROR_LOG (rc );
578
+ OBJ_DESTRUCT (& buf );
579
+ goto release ;
580
+ }
581
+
582
+ /* unpack the byte object containing the daemon URIs */
583
+ cnt = 1 ;
584
+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
585
+ ORTE_ERROR_LOG (rc );
568
586
OBJ_DESTRUCT (& buf );
569
587
goto release ;
570
588
}
589
+ /* load it into a buffer */
590
+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
591
+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
592
+ bo -> bytes = NULL ;
593
+ free (bo );
594
+ /* prep a list to save the nodes */
595
+ OBJ_CONSTRUCT (& nodes , opal_list_t );
596
+ /* unpack and store the URIs */
597
+ cnt = 1 ;
598
+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & uri , & cnt , OPAL_STRING ))) {
599
+ rc = orte_rml_base_parse_uris (uri , & dmn , NULL );
600
+ if (ORTE_SUCCESS != rc ) {
601
+ OBJ_DESTRUCT (& buf );
602
+ OBJ_DESTRUCT (& bucket );
603
+ goto release ;
604
+ }
605
+ /* save a node object for this daemon */
606
+ node = OBJ_NEW (orte_node_t );
607
+ node -> daemon = OBJ_NEW (orte_proc_t );
608
+ memcpy (& node -> daemon -> name , & dmn , sizeof (orte_process_name_t ));
609
+ opal_list_append (& nodes , & node -> super );
610
+ /* register the URI */
611
+ OBJ_CONSTRUCT (& val , opal_value_t );
612
+ val .key = OPAL_PMIX_PROC_URI ;
613
+ val .type = OPAL_STRING ;
614
+ val .data .string = uri ;
615
+ if (OPAL_SUCCESS != (rc = opal_pmix .store_local (& dmn , & val ))) {
616
+ ORTE_ERROR_LOG (rc );
617
+ val .key = NULL ;
618
+ val .data .string = NULL ;
619
+ OBJ_DESTRUCT (& val );
620
+ OBJ_DESTRUCT (& buf );
621
+ OBJ_DESTRUCT (& bucket );
622
+ goto release ;
623
+ }
624
+ val .key = NULL ;
625
+ val .data .string = NULL ;
626
+ OBJ_DESTRUCT (& val );
627
+ cnt = 1 ;
628
+ }
629
+ OBJ_DESTRUCT (& bucket );
630
+
631
+ /* unpack the proc-to-daemon map */
632
+ cnt = 1 ;
633
+ if (ORTE_SUCCESS != (rc = opal_dss .unpack (& buf , & bo , & cnt , OPAL_BYTE_OBJECT ))) {
634
+ ORTE_ERROR_LOG (rc );
635
+ OBJ_DESTRUCT (& buf );
636
+ goto release ;
637
+ }
638
+ /* load it into a buffer */
639
+ OBJ_CONSTRUCT (& bucket , opal_buffer_t );
640
+ opal_dss .load (& bucket , bo -> bytes , bo -> size );
641
+ bo -> bytes = NULL ;
642
+ free (bo );
643
+ /* unpack and store the map */
644
+ cnt = 1 ;
645
+ while (OPAL_SUCCESS == (rc = opal_dss .unpack (& bucket , & pname , & cnt , ORTE_NAME ))) {
646
+ /* get the name of the daemon hosting it */
647
+ if (OPAL_SUCCESS != (rc = opal_dss .unpack (& bucket , & dmn , & cnt , ORTE_NAME ))) {
648
+ OBJ_DESTRUCT (& buf );
649
+ OBJ_DESTRUCT (& bucket );
650
+ goto release ;
651
+ }
652
+ /* create the proc object */
653
+ proc = OBJ_NEW (orte_proc_t );
654
+ memcpy (& proc -> name , & pname , sizeof (orte_process_name_t ));
655
+ opal_pointer_array_set_item (jdata -> procs , pname .vpid , proc );
656
+ /* find the daemon */
657
+ OPAL_LIST_FOREACH (node , & nodes , orte_node_t ) {
658
+ if (node -> daemon -> name .vpid == dmn .vpid ) {
659
+ OBJ_RETAIN (node );
660
+ proc -> node = node ;
661
+ break ;
662
+ }
663
+ }
664
+ }
665
+ OBJ_DESTRUCT (& bucket );
666
+ OPAL_LIST_DESTRUCT (& nodes );
571
667
OBJ_DESTRUCT (& buf );
668
+
669
+ /* register the nspace */
572
670
if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
671
+ ORTE_ERROR_LOG (rc );
573
672
OBJ_RELEASE (jdata );
574
673
goto release ;
575
674
}
576
- OBJ_RELEASE (jdata ); // no reason to keep this around
675
+
676
+ /* save the job object so we don't endlessly cycle */
677
+ opal_hash_table_set_value_uint32 (orte_job_data , jdata -> jobid , jdata );
577
678
578
679
/* restart the cnct processor */
579
680
ORTE_PMIX_OPERATION (cd -> procs , cd -> info , _cnct , cd -> cbfunc , cd -> cbdata );
@@ -619,6 +720,7 @@ static void _cnct(int sd, short args, void *cbdata)
619
720
* out about it, and all we can do is return an error */
620
721
if (orte_pmix_server_globals .server .jobid == ORTE_PROC_MY_HNP -> jobid &&
621
722
orte_pmix_server_globals .server .vpid == ORTE_PROC_MY_HNP -> vpid ) {
723
+ ORTE_ERROR_LOG (ORTE_ERR_NOT_SUPPORTED );
622
724
rc = ORTE_ERR_NOT_SUPPORTED ;
623
725
goto release ;
624
726
}
@@ -634,6 +736,7 @@ static void _cnct(int sd, short args, void *cbdata)
634
736
kv -> data .uint32 = geteuid ();
635
737
opal_list_append (cd -> info , & kv -> super );
636
738
if (ORTE_SUCCESS != (rc = pmix_server_lookup_fn (& nm -> name , keys , cd -> info , _cnlk , cd ))) {
739
+ ORTE_ERROR_LOG (rc );
637
740
opal_argv_free (keys );
638
741
goto release ;
639
742
}
@@ -647,6 +750,7 @@ static void _cnct(int sd, short args, void *cbdata)
647
750
if (!orte_get_attribute (& jdata -> attributes , ORTE_JOB_NSPACE_REGISTERED , NULL , OPAL_BOOL )) {
648
751
/* it hasn't been registered yet, so register it now */
649
752
if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace (jdata , true))) {
753
+ ORTE_ERROR_LOG (rc );
650
754
goto release ;
651
755
}
652
756
}
0 commit comments