open-mpi
diff --git a/‎LICENSE
Lines changed: 41 additions & 0 deletions b/‎LICENSE
Lines changed: 41 additions & 0 deletions
diff --git a/‎ompi/datatype/ompi_datatype.h
Lines changed: 90 additions & 0 deletions b/‎ompi/datatype/ompi_datatype.h
Lines changed: 90 additions & 0 deletions
diff --git a/‎ompi/mpi/c/pack.c
Lines changed: 21 additions & 2 deletions b/‎ompi/mpi/c/pack.c
Lines changed: 21 additions & 2 deletions
diff --git a/‎ompi/mpi/c/unpack.c
Lines changed: 21 additions & 2 deletions b/‎ompi/mpi/c/unpack.c
Lines changed: 21 additions & 2 deletions
diff --git a/‎opal/datatype/Makefile.am
Lines changed: 2 additions & 0 deletions b/‎opal/datatype/Makefile.am
Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,8 @@ Copyright (c) 2017-2020 Amazon.com, Inc. or its affiliates.  All Rights
 Copyright (c) 2018      DataDirect Networks. All rights reserved.
 Copyright (c) 2018-2020 Triad National Security, LLC. All rights reserved.
 Copyright (c) 2020      Google, LLC. All rights reserved.
+Copyright (c) 2002      University of Chicago
+Copyright (c) 2001      Argonne National Laboratory
 
 $COPYRIGHT$
 
@@ -99,3 +101,42 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----------------[Copyright from inclusion of MPICH code]----------------
+
+The following is a notice of limited availability of the code, and disclaimer
+which must be included in the prologue of the code and in all source listings
+of the code.
+
+Copyright Notice
+ + 2002 University of Chicago
+
+Permission is hereby granted to use, reproduce, prepare derivative works, and
+to redistribute to others.  This software was authored by:
+
+Mathematics and Computer Science Division
+Argonne National Laboratory, Argonne IL 60439
+
+(and)
+
+Department of Computer Science
+University of Illinois at Urbana-Champaign
+
+
+			      GOVERNMENT LICENSE
+
+Portions of this material resulted from work developed under a U.S.
+Government Contract and are subject to the following license: the Government
+is granted for itself and others acting on its behalf a paid-up, nonexclusive,
+irrevocable worldwide license in this computer software to reproduce, prepare
+derivative works, and perform publicly and display publicly.
+
+				  DISCLAIMER
+
+This computer code material was prepared, in part, as an account of work
+sponsored by an agency of the United States Government.  Neither the United
+States, nor the University of Chicago, nor any of their employees, makes any
+warranty express or implied, or assumes any legal liability or responsibility
+for the accuracy, completeness, or usefulness of any information, apparatus,
+product, or process disclosed, or represents that its use would not infringe
+privately owned rights.
@@ -10,6 +10,7 @@
  * Copyright (c) 2015-2020 Research Organization for Information Science
  *                         and Technology (RIST).  All rights reserved.
  * Copyright (c) 2018      FUJITSU LIMITED.  All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -418,5 +419,94 @@ OMPI_DECLSPEC int ompi_datatype_pack_external_size( const char datarep[], int in
         }                                                               \
     }
 
+/*
+ * Sometimes it's faster to operate on a (count,datatype) pair if it's
+ * converted to (1,larger_datatype).  This comes up in pack/unpack if
+ * the datatype is [int4b,empty4b] for example.  With that datatype the
+ * (count,datatype) path has to loop over the count processing each
+ * occurrence of the datatype, but a larger type created via
+ * MPI_Type_contiguous(count,datatype,...) will have a single description
+ * entry describing the whole vector and go through pack/unpack much
+ * faster.
+ *
+ * These functions convert an incoming (count,dt) if the performance
+ * is potentially better.
+ *
+ * Note this function is only likely to be useful if the (count,datatype)
+ * describes a simple evenly spaced vector that will boil down to a
+ * single description element, but I don't think it's cheap to traverse
+ * the incoming datatype to check if that will be the case.  Eg I'm not
+ * sure it would be cheap enough to check that
+ *   [int,int,space,int,int,space]  is going to convert nicely, vs
+ *   [int,int,space,int,space]      which isn't.
+ * So the only checks performed are that the (count,datatype) isn't
+ * contiguous, and that the count is large enough to justify the
+ * overhead of making a new datatype.
+ */
+typedef struct {                 /* (count, datatype) pair filled in by ompi_datatype_consolidate_create() */
+    MPI_Datatype dt;             /* datatype to use: the caller's own, or a newly created contiguous type */
+    MPI_Count count;             /* count to pair with dt: the caller's own, or 1 when a new type was created */
+    int new_type_was_created;    /* nonzero if dt was created here; ompi_datatype_consolidate_free() then destroys it */
+} ompi_datatype_consolidate_t;
+
+/*
+ * Fill in *dtmod with the (count, datatype) pair that pack/unpack should
+ * actually use for the caller's (count, dtype).
+ *
+ *   count      number of dtype elements the caller wants processed
+ *   dtype      the caller's datatype (never modified or freed here)
+ *   dtmod      output; always left in a state that is safe to hand to
+ *              ompi_datatype_consolidate_free(), even when an error is
+ *              returned
+ *   threshold  counts below this value are passed through untouched
+ *
+ * On success dtmod holds either the original (count, dtype), or
+ * (1, new committed contiguous type) with new_type_was_created set.
+ * Returns OMPI_SUCCESS, or the error code of the first failing datatype
+ * call; on error dtmod still holds the original (count, dtype) and no
+ * new datatype is left allocated.
+ */
+static inline int
+ompi_datatype_consolidate_create(
+    MPI_Count count, MPI_Datatype dtype, ompi_datatype_consolidate_t *dtmod,
+    int threshold)
+{
+    int rc;
+    size_t dtsize;
+    MPI_Aint lb, extent;
+    MPI_Datatype consolidated;
+
+    /* default (do nothing) unless we decide otherwise below */
+    dtmod->dt = dtype;
+    dtmod->count = count;
+    dtmod->new_type_was_created = 0;
+
+    /* too few elements to pay for building a new datatype */
+    if (count < threshold) {
+        return OMPI_SUCCESS;
+    }
+
+    /*
+     * NOTE(review): ompi_datatype_create_contiguous() presumably takes an
+     * int count at this revision -- confirm.  If the count does not survive
+     * a round trip through int, skip the optimization rather than build a
+     * type with a truncated count.
+     */
+    if ((MPI_Count)(int)count != count) {
+        return OMPI_SUCCESS;
+    }
+
+    opal_datatype_type_size( &dtype->super, &dtsize );
+    rc = ompi_datatype_get_extent( dtype, &lb, &extent );
+    if (rc != OMPI_SUCCESS) { return rc; }
+
+    if ((dtype->super.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) &&
+        (MPI_Aint)dtsize == extent)
+    {
+        /* contig, no performance advantage to making a new type */
+        return OMPI_SUCCESS;
+    }
+
+    /*
+     * Build into a local so dtmod->dt keeps pointing at the caller's type
+     * until the new type is fully created and committed.
+     */
+    rc = ompi_datatype_create_contiguous( (int)count, dtype, &consolidated );
+    if (rc != OMPI_SUCCESS) { return rc; }
+
+    rc = ompi_datatype_commit( &consolidated );
+    if (rc != OMPI_SUCCESS) {
+        /* don't leak the half-built type; report the commit failure */
+        ompi_datatype_destroy( &consolidated );
+        return rc;
+    }
+
+    dtmod->dt = consolidated;
+    dtmod->count = 1;
+    dtmod->new_type_was_created = 1;
+    return OMPI_SUCCESS;
+}
+/*
+ * Undo ompi_datatype_consolidate_create(): destroy dtmod->dt if (and only
+ * if) it was created there.  Returns OMPI_SUCCESS when there was nothing
+ * to release, otherwise whatever ompi_datatype_destroy() returned.
+ */
+static inline int
+ompi_datatype_consolidate_free(ompi_datatype_consolidate_t *dtmod)
+{
+    int status;
+
+    if (!dtmod->new_type_was_created) {
+        return OMPI_SUCCESS;
+    }
+
+    status = ompi_datatype_destroy( &dtmod->dt );
+    /* a second call is a caller bug, but clearing the flag makes it harmless */
+    dtmod->new_type_was_created = 0;
+    return status;
+}
+/*
+ *  Threshold handed to ompi_datatype_consolidate_create() by MPI_Pack and
+ *  MPI_Unpack: a (count,datatype) pair is only considered for conversion
+ *  to (1,larger_datatype) when count is at least this value.
+ *
+ *  The magic number below came from empirical testing on a couple of
+ *  local PPC machines using [int,space] as the datatype.  There's some
+ *  overhead in constructing a new datatype, so for a short list of
+ *  elements just walking the sequence of description elements is better
+ *  than creating a potentially shorter list and hoping the vector-walking
+ *  of the new elements is faster.  This could maybe be tuned dynamically
+ *  but it doesn't really seem worth it.
+ *
+ *  Only two machines were tested: the crossover points for pack and unpack
+ *  were 80 and 62 on one machine, and 250 and 220 on the other.  Hence the
+ *  choice of 250 for both, on the assumption that it is unlikely to waste
+ *  much overhead on datatype creation in most cases.
+ */
+#define OMPI_DATATYPE_CONSOLIDATE_THRESHOLD 250
+
 END_C_DECLS
 #endif  /* OMPI_DATATYPE_H_HAS_BEEN_INCLUDED */
@@ -15,6 +15,7 @@
  *                         reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -77,10 +78,25 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector, that path is reasonably optimized in pack/unpack. On
+     * the other hand, if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(incount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
     /* the resulting convertor will be set to the position ZERO */
-    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor, &(datatype->super),
-                                              incount, (void *) inbuf, 0, &local_convertor );
+    opal_convertor_copy_and_prepare_for_send( ompi_mpi_local_convertor,
+                                              &(dtmod.dt->super), dtmod.count,
+                                              (void *) inbuf, 0, &local_convertor );
 
     /* Check for truncation */
     opal_convertor_get_packed_size( &local_convertor, &size );
@@ -100,6 +116,9 @@ int MPI_Pack(const void *inbuf, int incount, MPI_Datatype datatype,
     *position += size;
     OBJ_DESTRUCT( &local_convertor );
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     /* All done.  Note that the convertor returns 1 upon success, not
 
@@ -12,6 +12,7 @@
  * Copyright (c) 2006-2013 Cisco Systems, Inc.  All rights reserved.
  * Copyright (c) 2015-2018 Research Organization for Information Science
  *                         and Technology (RIST). All rights reserved.
+ * Copyright (c) 2021      IBM Corporation. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -79,12 +80,27 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
 
     OPAL_CR_ENTER_LIBRARY();
 
+    /*
+     * If a datatype's description contains a single element that describes
+     * a large vector, that path is reasonably optimized in pack/unpack. On
+     * the other hand, if the count and datatype combined describe the same
+     * vector, that gets processed one element at a time.
+     *
+     * So at the top level we morph the call if the count and datatype look
+     * like a good vector.
+     */
+    ompi_datatype_consolidate_t dtmod;
+    rc = ompi_datatype_consolidate_create(outcount, datatype, &dtmod,
+        OMPI_DATATYPE_CONSOLIDATE_THRESHOLD);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     if( insize > 0 ) {
         int ret;
         OBJ_CONSTRUCT( &local_convertor, opal_convertor_t );
         /* the resulting convertor will be set the the position ZERO */
-        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor, &(datatype->super),
-                                                  outcount, outbuf, 0, &local_convertor );
+        opal_convertor_copy_and_prepare_for_recv( ompi_mpi_local_convertor,
+                                                  &(dtmod.dt->super), dtmod.count,
+                                                  outbuf, 0, &local_convertor );
 
         /* Check for truncation */
         opal_convertor_get_packed_size( &local_convertor, &size );
@@ -110,6 +126,9 @@ int MPI_Unpack(const void *inbuf, int insize, int *position,
         }
     }
 
+    rc = ompi_datatype_consolidate_free(&dtmod);
+    OMPI_ERRHANDLER_CHECK(rc, comm, rc, FUNC_NAME);
+
     OPAL_CR_EXIT_LIBRARY();
 
     OMPI_ERRHANDLER_RETURN(rc, comm, MPI_ERR_UNKNOWN, FUNC_NAME);
 
@@ -17,6 +17,7 @@
 # Copyright (c) 2011-2013 NVIDIA Corporation.  All rights reserved.
 # Copyright (c) 2018      Research Organization for Information Science
 #                         and Technology (RIST). All rights reserved.
+# Copyright (c) 2021      IBM Corporation. All rights reserved.
 # $COPYRIGHT$
 #
 # Additional copyrights may follow
@@ -32,6 +33,7 @@ headers = \
         opal_datatype_internal.h \
         opal_datatype_copy.h \
         opal_datatype_memcpy.h \
+        opal_datatype_pack_unpack_predefined.h \
         opal_datatype_pack.h \
         opal_datatype_prototypes.h \
         opal_datatype_unpack.h