Skip to content

Commit 14b3c70

Browse files
dong0321bosilca
authored andcommitted
Add supports for MPI_OP using AVX512, AVX2 and MMX
Add logic to handle different architectural capabilities Detect the compiler flags necessary to build specialized versions of the MPI_OP. Once the different flavors (AVX512, AVX2, AVX) are built, detect at runtime which is the best match with the current processor capabilities. Add validation checks for loadu 256 and 512 bits. Add validation tests for MPI_Op. Signed-off-by: Jeff Squyres <jsquyres@cisco.com> Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp> Signed-off-by: dongzhong <zhongdong0321@hotmail.com> Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
1 parent a26e494 commit 14b3c70

File tree

12 files changed

+3543
-23
lines changed

12 files changed

+3543
-23
lines changed

config/opal_config_asm.m4

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ dnl
22
dnl Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
dnl University Research and Technology
44
dnl Corporation. All rights reserved.
5-
dnl Copyright (c) 2004-2018 The University of Tennessee and The University
5+
dnl Copyright (c) 2004-2020 The University of Tennessee and The University
66
dnl of Tennessee Research Foundation. All rights
77
dnl reserved.
88
dnl Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
@@ -1255,7 +1255,7 @@ AC_DEFUN([OPAL_CONFIG_ASM],[
12551255
12561256
# Check for RDTSCP support
12571257
result=0
1258-
AS_IF([test "$opal_cv_asm_arch" = "OPAL_X86_64" || test "$opal_cv_asm_arch" = "OPAL_IA32"],
1258+
AS_IF([test "$opal_cv_asm_arch" = "X86_64" || test "$opal_cv_asm_arch" = "IA32"],
12591259
[AC_MSG_CHECKING([for RDTSCP assembly support])
12601260
AC_LANG_PUSH([C])
12611261
AC_TRY_RUN([[

ompi/mca/op/avx/Makefile.am

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#
2+
# Copyright (c) 2019-2020 The University of Tennessee and The University
3+
# of Tennessee Research Foundation. All rights
4+
# reserved.
5+
# Copyright (c) 2020 Research Organization for Information Science
6+
# and Technology (RIST). All rights reserved.
7+
# $COPYRIGHT$
8+
#
9+
# Additional copyrights may follow
10+
#
11+
# $HEADER$
12+
#
13+
14+
# This component provide support for the Advanced Vector Extensions (AVX)
15+
# available in recent versions of x86 processors.
16+
#
17+
# See https://github.com/open-mpi/ompi/wiki/devel-CreateComponent
18+
# for more details on how to make Open MPI components.
19+
20+
# First, list all .h and .c sources. It is necessary to list all .h
21+
# files so that they will be picked up in the distribution tarball.
22+
23+
sources = op_avx_component.c op_avx.h
24+
sources_extended = op_avx_functions.c
25+
26+
# Open MPI components can be compiled two ways:
27+
#
28+
# 1. As a standalone dynamic shared object (DSO), sometimes called a
29+
# dynamically loadable library (DLL).
30+
#
31+
# 2. As a static library that is slurped up into the upper-level
32+
# libmpi library (regardless of whether libmpi is a static or dynamic
33+
# library). This is called a "Libtool convenience library".
34+
#
35+
# The component needs to create an output library in this top-level
36+
# component directory, and named either mca_<type>_<name>.la (for DSO
37+
# builds) or libmca_<type>_<name>.la (for static builds). The OMPI
38+
# build system will have set the
39+
# MCA_BUILD_ompi_<framework>_<component>_DSO AM_CONDITIONAL to indicate
40+
# which way this component should be built.
41+
42+
# We need to support all processors from early AVX to full AVX512 support, based on
43+
# a decision made at runtime. So, we generate all combinations of capabilities, and
44+
# we will select the most suitable (based on the processor flags) during the
45+
# component initialization.
46+
specialized_op_libs =
47+
if MCA_BUILD_ompi_op_has_avx_support
48+
specialized_op_libs += liblocal_ops_avx.la
49+
liblocal_ops_avx_la_SOURCES = $(sources_extended)
50+
liblocal_ops_avx_la_CFLAGS = @MCA_BUILD_OP_AVX_FLAGS@
51+
liblocal_ops_avx_la_CPPFLAGS = -DGENERATE_AVX_CODE
52+
if MCA_BUILD_ompi_op_has_sse3_support
53+
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE3_CODE
54+
endif
55+
if MCA_BUILD_ompi_op_has_sse41_support
56+
liblocal_ops_avx_la_CPPFLAGS += -DGENERATE_SSE41_CODE
57+
endif
58+
endif
59+
if MCA_BUILD_ompi_op_has_avx2_support
60+
specialized_op_libs += liblocal_ops_avx2.la
61+
liblocal_ops_avx2_la_SOURCES = $(sources_extended)
62+
liblocal_ops_avx2_la_CFLAGS = @MCA_BUILD_OP_AVX2_FLAGS@
63+
liblocal_ops_avx2_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE
64+
endif
65+
if MCA_BUILD_ompi_op_has_avx512_support
66+
specialized_op_libs += liblocal_ops_avx512.la
67+
liblocal_ops_avx512_la_SOURCES = $(sources_extended)
68+
liblocal_ops_avx512_la_CFLAGS = @MCA_BUILD_OP_AVX512_FLAGS@
69+
liblocal_ops_avx512_la_CPPFLAGS = -DGENERATE_SSE3_CODE -DGENERATE_SSE41_CODE -DGENERATE_AVX_CODE -DGENERATE_AVX2_CODE -DGENERATE_AVX512_CODE
70+
endif
71+
72+
component_noinst = $(specialized_op_libs)
73+
if MCA_BUILD_ompi_op_avx_DSO
74+
component_install = mca_op_avx.la
75+
else
76+
component_install =
77+
component_noinst += libmca_op_avx.la
78+
endif
79+
80+
# Specific information for DSO builds.
81+
#
82+
# The DSO should install itself in $(ompilibdir) (by default,
83+
# $prefix/lib/openmpi).
84+
85+
mcacomponentdir = $(ompilibdir)
86+
mcacomponent_LTLIBRARIES = $(component_install)
87+
mca_op_avx_la_SOURCES = $(sources)
88+
mca_op_avx_la_LIBADD = $(specialized_op_libs)
89+
mca_op_avx_la_LDFLAGS = -module -avoid-version
90+
91+
92+
# Specific information for static builds.
93+
#
94+
# Note that we *must* "noinst"; the upper-layer Makefile.am's will
95+
# slurp in the resulting .la library into libmpi.
96+
97+
noinst_LTLIBRARIES = $(component_noinst)
98+
libmca_op_avx_la_SOURCES = $(sources)
99+
libmca_op_avx_la_LIBADD = $(specialized_op_libs)
100+
libmca_op_avx_la_LDFLAGS = -module -avoid-version
101+

ompi/mca/op/avx/configure.m4

Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
# -*- shell-script -*-
2+
#
3+
# Copyright (c) 2019-2020 The University of Tennessee and The University
4+
# of Tennessee Research Foundation. All rights
5+
# reserved.
6+
# Copyright (c) 2020 Cisco Systems, Inc. All rights reserved.
7+
#
8+
# $COPYRIGHT$
9+
#
10+
# Additional copyrights may follow
11+
#
12+
# $HEADER$
13+
#
14+
15+
# MCA_ompi_op_avx_CONFIG([action-if-can-compile],
16+
# [action-if-cant-compile])
17+
# ------------------------------------------------
18+
# We can always build, unless we were explicitly disabled.
19+
AC_DEFUN([MCA_ompi_op_avx_CONFIG],[
20+
AC_CONFIG_FILES([ompi/mca/op/avx/Makefile])
21+
22+
MCA_BUILD_OP_AVX_FLAGS=""
23+
MCA_BUILD_OP_AVX2_FLAGS=""
24+
MCA_BUILD_OP_AVX512_FLAGS=""
25+
op_sse3_support=0
26+
op_sse41_support=0
27+
op_avx_support=0
28+
op_avx2_support=0
29+
op_avx512_support=0
30+
OPAL_VAR_SCOPE_PUSH([op_avx_cflags_save])
31+
32+
AS_IF([test "$opal_cv_asm_arch" = "X86_64"],
33+
[AC_LANG_PUSH([C])
34+
35+
#
36+
# Check for AVX512 support
37+
#
38+
AC_MSG_CHECKING([for AVX512 support (no additional flags)])
39+
AC_LINK_IFELSE(
40+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
41+
[[
42+
__m512 vA, vB;
43+
_mm512_add_ps(vA, vB)
44+
]])],
45+
[op_avx512_support=1
46+
AC_MSG_RESULT([yes])],
47+
[AC_MSG_RESULT([no])])
48+
49+
AS_IF([test $op_avx512_support -eq 0],
50+
[AC_MSG_CHECKING([for AVX512 support (with -march=skylake-avx512)])
51+
op_avx_cflags_save="$CFLAGS"
52+
CFLAGS="$CFLAGS -march=skylake-avx512"
53+
AC_LINK_IFELSE(
54+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
55+
[[
56+
__m512 vA, vB;
57+
_mm512_add_ps(vA, vB)
58+
]])],
59+
[op_avx512_support=1
60+
MCA_BUILD_OP_AVX512_FLAGS="-march=skylake-avx512"
61+
AC_MSG_RESULT([yes])],
62+
[AC_MSG_RESULT([no])])
63+
CFLAGS="$op_avx_cflags_save"
64+
])
65+
#
66+
# Some combination of gcc and older as would not correctly build the code generated by
67+
# _mm256_loadu_si256. Screen them out.
68+
#
69+
AS_IF([test $op_avx512_support -eq 1],
70+
[AC_MSG_CHECKING([if _mm512_loadu_si512 generates code that can be compiled])
71+
op_avx_cflags_save="$CFLAGS"
72+
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX512_FLAGS"
73+
AC_LINK_IFELSE(
74+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
75+
[[
76+
int A[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
77+
__m512i vA = _mm512_loadu_si512((__m512i*)&(A[1]))
78+
]])],
79+
[AC_MSG_RESULT([yes])],
80+
[op_avx512_support=0
81+
MCA_BUILD_OP_AVX512_FLAGS=""
82+
AC_MSG_RESULT([no])])
83+
CFLAGS="$op_avx_cflags_save"
84+
])
85+
#
86+
# Check support for AVX2
87+
#
88+
AC_MSG_CHECKING([for AVX2 support (no additional flags)])
89+
AC_LINK_IFELSE(
90+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
91+
[[
92+
__m256 vA, vB;
93+
_mm256_add_ps(vA, vB)
94+
]])],
95+
[op_avx2_support=1
96+
AC_MSG_RESULT([yes])],
97+
[AC_MSG_RESULT([no])])
98+
AS_IF([test $op_avx2_support -eq 0],
99+
[AC_MSG_CHECKING([for AVX2 support (with -mavx2)])
100+
op_avx_cflags_save="$CFLAGS"
101+
CFLAGS="$CFLAGS -mavx2"
102+
AC_LINK_IFELSE(
103+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
104+
[[
105+
__m256 vA, vB;
106+
_mm256_add_ps(vA, vB)
107+
]])],
108+
[op_avx2_support=1
109+
MCA_BUILD_OP_AVX2_FLAGS="-mavx2"
110+
AC_MSG_RESULT([yes])],
111+
[AC_MSG_RESULT([no])])
112+
CFLAGS="$op_avx_cflags_save"
113+
])
114+
#
115+
# Some combination of gcc and older as would not correctly build the code generated by
116+
# _mm256_loadu_si256. Screen them out.
117+
#
118+
AS_IF([test $op_avx2_support -eq 1],
119+
[AC_MSG_CHECKING([if _mm256_loadu_si256 generates code that can be compiled])
120+
op_avx_cflags_save="$CFLAGS"
121+
CFLAGS="$CFLAGS_WITHOUT_OPTFLAGS -O0 $MCA_BUILD_OP_AVX2_FLAGS"
122+
AC_LINK_IFELSE(
123+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
124+
[[
125+
int A[8] = {0, 1, 2, 3, 4, 5, 6, 7};
126+
__m256i vA = _mm256_loadu_si256((__m256i*)&A)
127+
]])],
128+
[AC_MSG_RESULT([yes])],
129+
[op_avx2_support=0
130+
MCA_BUILD_OP_AVX2_FLAGS=""
131+
AC_MSG_RESULT([no])])
132+
CFLAGS="$op_avx_cflags_save"
133+
])
134+
#
135+
# What about early AVX support. The rest of the logic is slightly different as
136+
# we need to include some of the SSE4.1 and SSE3 instructions. So, we first check
137+
# if we can compile AVX code without a flag, then we validate that we have support
138+
# for the SSE4.1 and SSE3 instructions we need. If not, we check for the usage of
139+
# the AVX flag, and then recheck if we have support for the SSE4.1 and SSE3
140+
# instructions.
141+
#
142+
AC_MSG_CHECKING([for AVX support (no additional flags)])
143+
AC_LINK_IFELSE(
144+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
145+
[[
146+
__m128 vA, vB;
147+
_mm_add_ps(vA, vB)
148+
]])],
149+
[op_avx_support=1
150+
AC_MSG_RESULT([yes])],
151+
[AC_MSG_RESULT([no])])
152+
#
153+
# Check for SSE4.1 support
154+
#
155+
AS_IF([test $op_avx_support -eq 1],
156+
[AC_MSG_CHECKING([for SSE4.1 support])
157+
AC_LINK_IFELSE(
158+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
159+
[[
160+
__m128i vA, vB;
161+
(void)_mm_max_epi8(vA, vB)
162+
]])],
163+
[op_sse41_support=1
164+
AC_MSG_RESULT([yes])],
165+
[AC_MSG_RESULT([no])])
166+
])
167+
#
168+
# Check for SSE3 support
169+
#
170+
AS_IF([test $op_avx_support -eq 1],
171+
[AC_MSG_CHECKING([for SSE3 support])
172+
AC_LINK_IFELSE(
173+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
174+
[[
175+
int A[4] = {0, 1, 2, 3};
176+
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
177+
]])],
178+
[op_sse3_support=1
179+
AC_MSG_RESULT([yes])],
180+
[AC_MSG_RESULT([no])])
181+
])
182+
# Second pass, do we need to add the AVX flag ?
183+
AS_IF([test $op_avx_support -eq 0 || test $op_sse41_support -eq 0 || test $op_sse3_support -eq 0],
184+
[AC_MSG_CHECKING([for AVX support (with -mavx)])
185+
op_avx_cflags_save="$CFLAGS"
186+
CFLAGS="$CFLAGS -mavx"
187+
AC_LINK_IFELSE(
188+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
189+
[[
190+
__m128 vA, vB;
191+
_mm_add_ps(vA, vB)
192+
]])],
193+
[op_avx_support=1
194+
MCA_BUILD_OP_AVX_FLAGS="-mavx"
195+
op_sse41_support=0
196+
op_sse3_support=0
197+
AC_MSG_RESULT([yes])],
198+
[AC_MSG_RESULT([no])])
199+
200+
AS_IF([test $op_sse41_support -eq 0],
201+
[AC_MSG_CHECKING([for SSE4.1 support])
202+
AC_LINK_IFELSE(
203+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
204+
[[
205+
__m128i vA, vB;
206+
(void)_mm_max_epi8(vA, vB)
207+
]])],
208+
[op_sse41_support=1
209+
AC_MSG_RESULT([yes])],
210+
[AC_MSG_RESULT([no])])
211+
])
212+
AS_IF([test $op_sse3_support -eq 0],
213+
[AC_MSG_CHECKING([for SSE3 support])
214+
AC_LINK_IFELSE(
215+
[AC_LANG_PROGRAM([[#include <immintrin.h>]],
216+
[[
217+
int A[4] = {0, 1, 2, 3};
218+
__m128i vA = _mm_lddqu_si128((__m128i*)&A)
219+
]])],
220+
[op_sse3_support=1
221+
AC_MSG_RESULT([yes])],
222+
[AC_MSG_RESULT([no])])
223+
])
224+
CFLAGS="$op_avx_cflags_save"
225+
])
226+
227+
AC_LANG_POP([C])
228+
])
229+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX512],
230+
[$op_avx512_support],
231+
[AVX512 supported in the current build])
232+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX2],
233+
[$op_avx2_support],
234+
[AVX2 supported in the current build])
235+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_AVX],
236+
[$op_avx_support],
237+
[AVX supported in the current build])
238+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE41],
239+
[$op_sse41_support],
240+
[SSE4.1 supported in the current build])
241+
AC_DEFINE_UNQUOTED([OMPI_MCA_OP_HAVE_SSE3],
242+
[$op_sse3_support],
243+
[SSE3 supported in the current build])
244+
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx512_support],
245+
[test "$op_avx512_support" == "1"])
246+
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx2_support],
247+
[test "$op_avx2_support" == "1"])
248+
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_avx_support],
249+
[test "$op_avx_support" == "1"])
250+
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse41_support],
251+
[test "$op_sse41_support" == "1"])
252+
AM_CONDITIONAL([MCA_BUILD_ompi_op_has_sse3_support],
253+
[test "$op_sse3_support" == "1"])
254+
AC_SUBST(MCA_BUILD_OP_AVX512_FLAGS)
255+
AC_SUBST(MCA_BUILD_OP_AVX2_FLAGS)
256+
AC_SUBST(MCA_BUILD_OP_AVX_FLAGS)
257+
258+
OPAL_VAR_SCOPE_POP
259+
# Enable this component iff we have at least the most basic form of support
260+
# for vectorial ISA
261+
AS_IF([test $op_avx_support -eq 1 || test $op_avx2_support -eq 1 || test $op_avx512_support -eq 1],
262+
[$1],
263+
[$2])
264+
265+
])dnl

0 commit comments

Comments
 (0)