Merge pull request #9422 from WiltonLoch/master

bosilca · web-flow · commit 4828663537e9 · 2021-09-28T17:28:32.000-04:00
Adding a new algorithm for allgather and allgatherv
diff --git a/ompi/mca/coll/base/coll_base_allgather.c b/ompi/mca/coll/base/coll_base_allgather.c
@@ -23,6 +23,8 @@
 
 #include "ompi_config.h"
 
+#include "math.h"
+
 #include "mpi.h"
 #include "opal/util/bit_ops.h"
 #include "ompi/constants.h"
@@ -338,7 +340,145 @@ ompi_coll_base_allgather_intra_recursivedoubling(const void *sbuf, int scount,
     return err;
 }
 
+/*
+ * ompi_coll_base_allgather_intra_sparbit
+ *
+ * Function:     allgather using O(log(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgather
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description: Proposal of an allgather algorithm similar to Bruck but with inverted distances
+ *              and non-decreasing exchanged data sizes. Described in "Sparbit: a new
+ *              logarithmic-cost and data locality-aware MPI Allgather algorithm".
+ *
+ * Memory requirements:  
+ *              Additional memory for N requests. 
+ *
+ * Example on 6 nodes, with l representing the highest power of two smaller than N, in this case l =
+ * 4 (more details can be found on the paper):
+ *  Initial state
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
+ *   Step 0: Each process sends its own block to process r + l and receives another from r - l.
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [0]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [1]
+ *         [2]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [3]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [4]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [5]    [ ]    [5]
+ *   Step 1: Each process sends its own block to process r + l/2 and receives another from r - l/2.
+ *   The block received on the previous step is ignored to avoid a future double-write.  
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [0]    [ ]    [0]    [ ]
+ *         [ ]    [1]    [ ]    [1]    [ ]    [1]
+ *         [2]    [ ]    [2]    [ ]    [2]    [ ]
+ *         [ ]    [3]    [ ]    [3]    [ ]    [3]
+ *         [4]    [ ]    [4]    [ ]    [4]    [ ]
+ *         [ ]    [5]    [ ]    [5]    [ ]    [5]
+ *   Step 1: Each process sends all the data it has (3 blocks) to process r + l/4 and similarly
+ *   receives all the data from process r - l/4. 
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]
+ */
 
+int ompi_coll_base_allgather_intra_sparbit(const void *sbuf, int scount,
+                                                  struct ompi_datatype_t *sdtype,
+                                                  void* rbuf, int rcount,
+                                                  struct ompi_datatype_t *rdtype,
+                                                  struct ompi_communicator_t *comm,
+                                                  mca_coll_base_module_t *module)
+{
+    /* ################# VARIABLE DECLARATION, BUFFER CREATION AND PREPARATION FOR THE ALGORITHM ######################## */
+
+    /* list of variable declaration */
+    int rank = 0, comm_size = 0, comm_log = 0, exclusion = 0, data_expected = 1, transfer_count = 0;
+    int sendto, recvfrom, send_disp, recv_disp;
+    uint32_t last_ignore, ignore_steps, distance = 1;
+
+    int err = 0;
+    int line = -1;
+
+    ptrdiff_t rlb, rext;
+
+    char *tmpsend = NULL, *tmprecv = NULL;
+
+    MPI_Request *requests = NULL;
+
+    /* algorithm choice information printing */
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, 
+                 "coll:base:allgather_intra_sparbit rank %d", rank));
+
+    comm_size = ompi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+
+    err = ompi_datatype_get_extent(rdtype, &rlb, &rext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* if the MPI_IN_PLACE condition is not set, copy the send buffer to the receive buffer to perform the sends (all the data is extracted and forwarded from the recv buffer)*/
+    /* tmprecv and tmpsend are used as abstract pointers to simplify send and receive buffer choice */
+    tmprecv = (char *) rbuf;
+    if(MPI_IN_PLACE != sbuf){
+        tmpsend = (char *) sbuf; 
+        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv + (ptrdiff_t) rank * rcount * rext, rcount, rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
+    }
+    tmpsend = tmprecv;
+
+    requests = (MPI_Request *) malloc(comm_size * sizeof(MPI_Request));
+    
+    /* ################# ALGORITHM LOGIC ######################## */
+
+    /* calculate log2 of the total process count */
+    comm_log = ceil(log(comm_size)/log(2));
+    distance <<= comm_log - 1;
+
+    last_ignore = __builtin_ctz(comm_size);
+    ignore_steps = (~((uint32_t) comm_size >> last_ignore) | 1) << last_ignore;
+
+    /* perform the parallel binomial tree distribution steps */
+    for (int i = 0; i < comm_log; ++i) {
+       sendto = (rank + distance) % comm_size;  
+       recvfrom = (rank - distance + comm_size) % comm_size;  
+       exclusion = (distance & ignore_steps) == distance;
+
+       for (transfer_count = 0; transfer_count < data_expected - exclusion; transfer_count++) {
+           send_disp = (rank - 2 * transfer_count * distance + comm_size) % comm_size;
+           recv_disp = (rank - (2 * transfer_count + 1) * distance + comm_size) % comm_size;
+
+           /* Since each process sends several non-contiguos blocks of data, each block sent (and therefore each send and recv call) needs a different tag. */
+           /* As base OpenMPI only provides one tag for allgather, we are forced to use a tag space from other components in the send and recv calls */
+           MCA_PML_CALL(isend(tmpsend + (ptrdiff_t) send_disp * scount * rext, scount, rdtype, sendto, MCA_COLL_BASE_TAG_HCOLL_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + transfer_count));
+           MCA_PML_CALL(irecv(tmprecv + (ptrdiff_t) recv_disp * rcount * rext, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_HCOLL_BASE - recv_disp, comm, requests + data_expected - exclusion + transfer_count));
+       }
+       ompi_request_wait_all(transfer_count * 2, requests, MPI_STATUSES_IGNORE);
+
+       distance >>= 1; 
+       /* calculates the data expected for the next step, based on the current number of blocks and eventual exclusions */
+       data_expected = (data_expected << 1) - exclusion;
+       exclusion = 0;
+    }
+    
+    free(requests);
+
+    return OMPI_SUCCESS;
+
+err_hndl:
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    (void)line;  // silence compiler warning
+    return err;
+}
 
 /*
  * ompi_coll_base_allgather_intra_ring
diff --git a/ompi/mca/coll/base/coll_base_allgatherv.c b/ompi/mca/coll/base/coll_base_allgatherv.c
@@ -25,6 +25,8 @@
 
 #include "ompi_config.h"
 
+#include "math.h"
+
 #include "mpi.h"
 #include "ompi/constants.h"
 #include "ompi/datatype/ompi_datatype.h"
@@ -202,6 +204,154 @@ int ompi_coll_base_allgatherv_intra_bruck(const void *sbuf, int scount,
     return err;
 }
 
+/*
+ * ompi_coll_base_allgather_intra_sparbit
+ *
+ * Function:     allgather using O(log(N)) steps.
+ * Accepts:      Same arguments as MPI_Allgather
+ * Returns:      MPI_SUCCESS or error code
+ *
+ * Description: Proposal of an allgather algorithm similar to Bruck but with inverted distances
+ *              and non-decreasing exchanged data sizes. Described in "Sparbit: a new
+ *              logarithmic-cost and data locality-aware MPI Allgather algorithm".
+ *
+ * Memory requirements:  
+ *              Additional memory for N requests. 
+ *
+ * Example on 6 nodes, with l representing the highest power of two smaller than N, in this case l =
+ * 4 (more details can be found on the paper):
+ *  Initial state
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [ ]    [ ]    [5]
+ *   Step 0: Each process sends its own block to process r + l and receives another from r - l.
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [ ]    [ ]    [0]    [ ]
+ *         [ ]    [1]    [ ]    [ ]    [ ]    [1]
+ *         [2]    [ ]    [2]    [ ]    [ ]    [ ]
+ *         [ ]    [3]    [ ]    [3]    [ ]    [ ]
+ *         [ ]    [ ]    [4]    [ ]    [4]    [ ]
+ *         [ ]    [ ]    [ ]    [5]    [ ]    [5]
+ *   Step 1: Each process sends its own block to process r + l/2 and receives another from r - l/2.
+ *   The block received on the previous step is ignored to avoid a future double-write.  
+ *    #     0      1      2      3      4      5
+ *         [0]    [ ]    [0]    [ ]    [0]    [ ]
+ *         [ ]    [1]    [ ]    [1]    [ ]    [1]
+ *         [2]    [ ]    [2]    [ ]    [2]    [ ]
+ *         [ ]    [3]    [ ]    [3]    [ ]    [3]
+ *         [4]    [ ]    [4]    [ ]    [4]    [ ]
+ *         [ ]    [5]    [ ]    [5]    [ ]    [5]
+ *   Step 1: Each process sends all the data it has (3 blocks) to process r + l/4 and similarly
+ *   receives all the data from process r - l/4. 
+ *    #     0      1      2      3      4      5
+ *         [0]    [0]    [0]    [0]    [0]    [0]
+ *         [1]    [1]    [1]    [1]    [1]    [1]
+ *         [2]    [2]    [2]    [2]    [2]    [2]
+ *         [3]    [3]    [3]    [3]    [3]    [3]
+ *         [4]    [4]    [4]    [4]    [4]    [4]
+ *         [5]    [5]    [5]    [5]    [5]    [5]
+ */
+
+int ompi_coll_base_allgatherv_intra_sparbit(const void *sbuf, int scount,
+                                           struct ompi_datatype_t *sdtype,
+                                           void* rbuf, const int *rcounts,
+                                           const int *rdispls,
+                                           struct ompi_datatype_t *rdtype,
+                                           struct ompi_communicator_t *comm,
+                                           mca_coll_base_module_t *module)
+{
+    /* ################# VARIABLE DECLARATION, BUFFER CREATION AND PREPARATION FOR THE ALGORITHM ######################## */
+
+    /* list of variable declaration */
+    int rank = 0, comm_size = 0, comm_log = 0, exclusion = 0; 
+    int data_expected = 1, transfer_count = 0, step_requests = 0;
+    int sendto, recvfrom, send_disp, recv_disp;
+    uint32_t last_ignore, ignore_steps, distance = 1;
+
+    int err = 0;
+    int line = -1;
+
+    ptrdiff_t rlb, rext;
+
+    char *tmpsend = NULL, *tmprecv = NULL;
+
+    MPI_Request *requests = NULL;
+
+    /* printf("utilizando o allgatherv novo!!\n"); */
+
+    /* algorithm choice information printing */
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output, 
+                 "coll:sparbit:allgather_sync_intra rank %d", rank));
+
+    comm_size = ompi_comm_size(comm);
+    rank = ompi_comm_rank(comm);
+
+    err = ompi_datatype_get_extent(rdtype, &rlb, &rext);
+    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+    /* if the MPI_IN_PLACE condition is not set, copy the send buffer to the receive buffer to perform the sends (all the data is extracted and forwarded from the recv buffer)*/
+    /* tmprecv and tmpsend are used as abstract pointers to simplify send and receive buffer choice */
+    tmprecv = (char *) rbuf;
+    if(MPI_IN_PLACE != sbuf){
+        tmpsend = (char *) sbuf; 
+        err = ompi_datatype_sndrcv(tmpsend, scount, sdtype, tmprecv + (ptrdiff_t) rdispls[rank] * rext, scount, rdtype);
+        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
+    }
+    tmpsend = tmprecv;
+
+    requests = (MPI_Request *) malloc(comm_size * sizeof(MPI_Request));
+    
+    /* ################# ALGORITHM LOGIC ######################## */
+
+    /* calculate log2 of the total process count */
+    comm_log = ceil(log(comm_size)/log(2));
+    distance <<= comm_log - 1;
+
+    last_ignore = __builtin_ctz(comm_size);
+    ignore_steps = (~((uint32_t) comm_size >> last_ignore) | 1) << last_ignore;
+
+    /* perform the parallel binomial tree distribution steps */
+    for (int i = 0; i < comm_log; ++i) {
+       sendto = (rank + distance) % comm_size;  
+       recvfrom = (rank - distance + comm_size) % comm_size;  
+       exclusion = (distance & ignore_steps) == distance;
+
+       for (transfer_count = 0; transfer_count < data_expected - exclusion; transfer_count++) {
+           send_disp = (rank - 2 * transfer_count * distance + comm_size) % comm_size;
+           recv_disp = (rank - (2 * transfer_count + 1) * distance + comm_size) % comm_size;
+
+           /* Since each process sends several non-contiguos blocks of data to the same destination,
+            * each block sent (and therefore each send and recv call) needs a different tag. */
+           /* As base OpenMPI only provides one tag for allgather, we are forced to use a tag space
+            * from other components in the send and recv calls */
+           if(rcounts[send_disp] > 0)
+               MCA_PML_CALL(isend(tmpsend + (ptrdiff_t) rdispls[send_disp] * rext, rcounts[send_disp], rdtype, sendto, MCA_COLL_BASE_TAG_HCOLL_BASE - send_disp, MCA_PML_BASE_SEND_STANDARD, comm, requests + step_requests++));
+           if(rcounts[recv_disp] > 0)
+               MCA_PML_CALL(irecv(tmprecv + (ptrdiff_t) rdispls[recv_disp] * rext, rcounts[recv_disp], rdtype, recvfrom, MCA_COLL_BASE_TAG_HCOLL_BASE - recv_disp, comm, requests + step_requests++));
+       }
+       ompi_request_wait_all(step_requests, requests, MPI_STATUSES_IGNORE);
+
+       distance >>= 1; 
+       /* calculates the data expected for the next step, based on the current number of blocks and eventual exclusions */
+       data_expected = (data_expected << 1) - exclusion;
+       exclusion = step_requests = 0;
+    }
+    
+    free(requests);
+
+    return OMPI_SUCCESS;
+
+err_hndl:
+    OPAL_OUTPUT((ompi_coll_base_framework.framework_output,  "%s:%4d\tError occurred %d, rank %2d",
+                 __FILE__, line, err, rank));
+    (void)line;  // silence compiler warning
+    return err;
+
+}
 
 /*
  * ompi_coll_base_allgatherv_intra_ring
diff --git a/ompi/mca/coll/base/coll_base_functions.h b/ompi/mca/coll/base/coll_base_functions.h
@@ -189,13 +189,15 @@ BEGIN_C_DECLS
 /* All Gather */
 int ompi_coll_base_allgather_intra_bruck(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_recursivedoubling(ALLGATHER_ARGS);
+int ompi_coll_base_allgather_intra_sparbit(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_ring(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_neighborexchange(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_basic_linear(ALLGATHER_ARGS);
 int ompi_coll_base_allgather_intra_two_procs(ALLGATHER_ARGS);
 
 /* All GatherV */
 int ompi_coll_base_allgatherv_intra_bruck(ALLGATHERV_ARGS);
+int ompi_coll_base_allgatherv_intra_sparbit(ALLGATHERV_ARGS);
 int ompi_coll_base_allgatherv_intra_ring(ALLGATHERV_ARGS);
 int ompi_coll_base_allgatherv_intra_neighborexchange(ALLGATHERV_ARGS);
 int ompi_coll_base_allgatherv_intra_basic_default(ALLGATHERV_ARGS);
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgather_decision.c
@@ -40,6 +40,7 @@ static const mca_base_var_enum_value_t allgather_algorithms[] = {
     {4, "ring"},
     {5, "neighbor"},
     {6, "two_proc"},
+    {7, "sparbit"},
     {0, NULL}
 };
 
@@ -78,7 +79,7 @@ ompi_coll_tuned_allgather_intra_check_forced_init(coll_tuned_force_algorithm_mca
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allgather_algorithm",
-                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only. "
+                                        "Which allgather algorithm is used. Can be locked down to choice of: 0 ignore, 1 basic linear, 2 bruck, 3 recursive doubling, 4 ring, 5 neighbor exchange, 6: two proc only, 7: sparbit. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -163,6 +164,10 @@ int ompi_coll_tuned_allgather_intra_do_this(const void *sbuf, int scount,
         return ompi_coll_base_allgather_intra_two_procs(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         comm, module);
+    case (7):
+        return ompi_coll_base_allgather_intra_sparbit(sbuf, scount, sdtype,
+                                                        rbuf, rcount, rdtype,
+                                                        comm, module);
     } /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,
                  "coll:tuned:allgather_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",
diff --git a/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c b/ompi/mca/coll/tuned/coll_tuned_allgatherv_decision.c
@@ -39,6 +39,7 @@ static const mca_base_var_enum_value_t allgatherv_algorithms[] = {
     {3, "ring"},
     {4, "neighbor"},
     {5, "two_proc"},
+    {6, "sparbit"},
     {0, NULL}
 };
 
@@ -77,7 +78,7 @@ ompi_coll_tuned_allgatherv_intra_check_forced_init(coll_tuned_force_algorithm_mc
     mca_param_indices->algorithm_param_index =
         mca_base_component_var_register(&mca_coll_tuned_component.super.collm_version,
                                         "allgatherv_algorithm",
-                                        "Which allgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only. "
+                                        "Which allgatherv algorithm is used. Can be locked down to choice of: 0 ignore, 1 default (allgathervv + bcast), 2 bruck, 3 ring, 4 neighbor exchange, 5: two proc only, 6: sparbit. "
                                         "Only relevant if coll_tuned_use_dynamic_rules is true.",
                                         MCA_BASE_VAR_TYPE_INT, new_enum, 0, MCA_BASE_VAR_FLAG_SETTABLE,
                                         OPAL_INFO_LVL_5,
@@ -160,6 +161,10 @@ int ompi_coll_tuned_allgatherv_intra_do_this(const void *sbuf, int scount,
         return ompi_coll_base_allgatherv_intra_two_procs(sbuf, scount, sdtype,
                                                          rbuf, rcounts, rdispls, rdtype,
                                                          comm, module);
+    case (6):
+        return ompi_coll_base_allgatherv_intra_sparbit(sbuf, scount, sdtype,
+                                                         rbuf, rcounts, rdispls, rdtype,
+                                                         comm, module);
     } /* switch */
     OPAL_OUTPUT((ompi_coll_tuned_stream,
                  "coll:tuned:allgatherv_intra_do_this attempt to select algorithm %d when only 0-%d is valid?",