
Commit 8bb3e3d

coll/han: disqualify hierarchical scatterv for heterogeneous communicators
Hierarchical *v collective algorithms may not work for heterogeneous communicators with different endianness, integer representation, etc., and thus require knowledge of the global communicator's homogeneity to disqualify the module. The hierarchical scatterv algorithm requires that every process have the same architecture as the Root due to the use of MPI_BYTE on node leaders. Heterogeneous communicators would need additional logic to correctly pack and unpack the data, at a cost in memory usage and performance.

Signed-off-by: Wenduo Wang <wenduwan@amazon.com>
1 parent 73001e5 commit 8bb3e3d
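To make the failure mode concrete, here is a minimal, MPI-free C sketch (not part of this commit) of why relaying packed bytes cannot survive an architecture mismatch: the same 32-bit integer has a different byte layout on little- and big-endian machines, and once the data has been reduced to MPI_BYTE the type information needed to convert it is gone.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* The same value, viewed as raw bytes, differs between architectures:
     * little-endian prints "01 00 00 00", big-endian prints "00 00 00 01".
     * A node leader that relays these bytes verbatim (MPI_BYTE) cannot repair
     * the ordering because the original type signature is no longer known. */
    int32_t value = 1;
    unsigned char bytes[sizeof(value)];
    memcpy(bytes, &value, sizeof(value));

    for (size_t i = 0; i < sizeof(value); ++i) {
        printf("%02x ", bytes[i]);
    }
    printf("\n");
    return 0;
}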

File tree
3 files changed: 31 additions, 7 deletions


ompi/mca/coll/han/coll_han.h

Lines changed: 1 addition & 0 deletions
@@ -331,6 +331,7 @@ typedef struct mca_coll_han_module_t {
     int *cached_topo;
     bool is_mapbycore;
     bool are_ppn_imbalanced;
+    bool is_heterogeneous;
 
     /* To be able to fallback when the cases are not supported */
     struct mca_coll_han_collectives_fallback_s fallback;

ompi/mca/coll/han/coll_han_scatterv.c

Lines changed: 14 additions & 0 deletions
@@ -55,6 +55,12 @@
  * to send the data in the correct order even if the process are NOT mapped by core.
  * 2. In the send buffer, other than the root's node, data destined to the same node are continuous
  *    - it is ok if data to different nodes has gap.
+ *
+ * Limitation:
+ * The node leader acts as a broker between the Root and node followers, but it cannot match the
+ * exact type signature of the followers; instead it forwards the intermediate data from Root in its
+ * packed form of MPI_BYTE type. This works for Gatherv but NOT for Scatterv provided that the Root
+ * has a different architecture, e.g. endianness, integer representation, etc.
  */
 int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int *displs,
                                 struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
@@ -94,6 +100,14 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
         return han_module->previous_scatterv(sbuf, scounts, displs, sdtype, rbuf, rcount, rdtype,
                                              root, comm, han_module->previous_scatterv_module);
     }
+    if (han_module->is_heterogeneous) {
+        OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
+                             "han cannot handle scatterv with this communicator (heterogeneous). Fall "
+                             "back on another component\n"));
+        HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatterv);
+        return han_module->previous_scatterv(sbuf, scounts, displs, sdtype, rbuf, rcount, rdtype,
+                                             root, comm, han_module->previous_scatterv_module);
+    }
 
     w_rank = ompi_comm_rank(comm);
     w_size = ompi_comm_size(comm);
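For illustration, the hypothetical sketch below (not HAN code; relay_to_followers and its parameters are invented for this example) shows the node-leader broker pattern that the limitation comment above refers to: the leader relays the root's packed bytes to its local followers as MPI_BYTE, which is only safe when both sides share the same data representation.

#include <mpi.h>

/* Hypothetical broker loop on a node leader: "packed" holds the root's data for this
 * node; byte_counts/byte_displs are in bytes.  Because the payload travels as MPI_BYTE,
 * MPI cannot perform any representation conversion for the followers; a
 * heterogeneous-safe variant would have to keep the original datatype (e.g. sdtype)
 * at extra pack/unpack cost. */
static void relay_to_followers(const char *packed, const int *byte_counts,
                               const int *byte_displs, int node_size, MPI_Comm node_comm)
{
    for (int peer = 1; peer < node_size; ++peer) {
        MPI_Send(packed + byte_displs[peer], byte_counts[peer], MPI_BYTE,
                 peer, 0, node_comm);
    }
}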

ompi/mca/coll/han/coll_han_topo.c

Lines changed: 16 additions & 7 deletions
@@ -92,12 +92,19 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
     }
     assert(up_comm != NULL && low_comm != NULL);
 
+    int up_rank = ompi_comm_rank(up_comm);
     int low_rank = ompi_comm_rank(low_comm);
     int low_size = ompi_comm_size(low_comm);
 
+    ompi_proc_t *up_proc = NULL;
+
     int *topo = (int *)malloc(sizeof(int) * size * num_topo_level);
-    int is_imbalanced = 1;
-    int ranks_non_consecutive = 0;
+    int is_imbalanced = 1, ranks_non_consecutive = 0, is_heterogeneous = 0;
+
+    if (0 != up_rank) {
+        up_proc = ompi_comm_peer_lookup(up_comm, 0);
+        is_heterogeneous = up_proc->super.proc_convertor->remoteArch != opal_local_arch;
+    }
 
     /* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */
     if (0 == low_rank) {
@@ -116,15 +123,16 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
         }
     }
 
-    int reduce_vals[] = {ranks_non_consecutive, low_size, -low_size};
+    int reduce_vals[] = {ranks_non_consecutive, low_size, -low_size, is_heterogeneous};
 
-    up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 3,
+    up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4,
                                     MPI_INT, MPI_MAX, up_comm,
                                     up_comm->c_coll->coll_allreduce_module);
 
     /* is the distribution of processes balanced per node? */
     is_imbalanced = (reduce_vals[1] == -reduce_vals[2]) ? 0 : 1;
     ranks_non_consecutive = reduce_vals[0];
+    is_heterogeneous = reduce_vals[3];
 
     if ( ranks_non_consecutive && !is_imbalanced ) {
         /* kick off up_comm allgather to collect non-consecutive rank information at node leaders */
@@ -136,12 +144,13 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
     }
 
 
-    /* broadcast balanced and consecutive properties from node leaders to remaining ranks */
-    int bcast_vals[] = {is_imbalanced, ranks_non_consecutive};
-    low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0,
+    /* broadcast balanced, consecutive and homogeneity properties from node leaders to remaining ranks */
+    int bcast_vals[] = {is_imbalanced, ranks_non_consecutive, is_heterogeneous};
+    low_comm->c_coll->coll_bcast(bcast_vals, 3, MPI_INT, 0,
                                  low_comm, low_comm->c_coll->coll_bcast_module);
     is_imbalanced = bcast_vals[0];
     ranks_non_consecutive = bcast_vals[1];
+    han_module->is_heterogeneous = bcast_vals[2];
 
     /* error out if the rank distribution is not balanced */
     if (is_imbalanced) {
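As a rough analogue of the detection logic above, the self-contained sketch below uses only public MPI calls (the internal convertor/arch fields are replaced by a simple endianness probe, so this is illustrative rather than HAN's actual implementation): each rank compares its byte order against rank 0's, and a MAX-allreduce turns a single mismatch into a communicator-wide disqualification flag.

#include <mpi.h>
#include <stdint.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);

    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Stand-in for opal_local_arch: probe the local byte order. */
    uint32_t probe = 1;
    int little_endian = (*(unsigned char *) &probe == 1);

    /* Compare against rank 0's byte order (MPI converts MPI_INT correctly even
     * on a heterogeneous communicator, unlike MPI_BYTE payloads). */
    int root_little_endian = little_endian;
    MPI_Bcast(&root_little_endian, 1, MPI_INT, 0, MPI_COMM_WORLD);
    int is_heterogeneous = (little_endian != root_little_endian);

    /* One mismatching rank is enough to disqualify the whole communicator. */
    MPI_Allreduce(MPI_IN_PLACE, &is_heterogeneous, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);

    if (0 == rank) {
        printf("communicator is %s\n", is_heterogeneous ? "heterogeneous" : "homogeneous");
    }

    MPI_Finalize();
    return 0;
}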
