diff --git a/.gitignore b/.gitignore index af0a88ef513..169f5e8d9ad 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,9 @@ build # Visual studio code .vscode -### Generated header file +### Generated header files include/config.hpp +dpcpp/base/config.hpp ### C++ # Prerequisites diff --git a/CMakeLists.txt b/CMakeLists.txt index 761f10f420d..80464201e54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,9 +95,20 @@ else() option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) endif() option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) +set(GINKGO_DPCPP_AOT_TARGETS "" CACHE STRING + "Uses DPC++ AOT with set targets (e.g., spir64_gen;spir64_x86_64). The default is '' (use JIT).") +set(GINKGO_DPCPP_SUBGROUPS "8;16;32" CACHE STRING + "The subgroup sizes to generate DPC++ kernels for. The default is '8;16;32'.") +set(GINKGO_DPCPP_WORKGROUPS "256;512" CACHE STRING + "The max workgroup sizes to generate DPC++ kernels for. The default is '256;512'.") option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) +# Third party libraries +option(GINKGO_DOWNLOAD_GFLAGS "Force Ginkgo to download gflags." OFF) +option(GINKGO_DOWNLOAD_GTEST "Force Ginkgo to download GTest." OFF) +option(GINKGO_DOWNLOAD_HWLOC "Force Ginkgo to download HWLOC." OFF) +option(GINKGO_DOWNLOAD_RAPIDJSON "Force Ginkgo to download RapidJSON." 
OFF) set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined") @@ -186,7 +197,6 @@ if(GINKGO_BUILD_HWLOC) set(GINKGO_HAVE_HWLOC 1) else() set(GINKGO_HAVE_HWLOC 0) - message(STATUS "HWLOC is being forcibly switched off") endif() if(GINKGO_BUILD_MPI) @@ -223,14 +233,14 @@ endif() # Try to find the third party packages before using our subdirectories include(cmake/package_helpers.cmake) if(GINKGO_BUILD_TESTS) - find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest + ginkgo_find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest endif() if(GINKGO_BUILD_BENCHMARKS) - find_package(gflags 2.2.2 QUIET) - find_package(RapidJSON 1.1.0 QUIET) + ginkgo_find_package(gflags 2.2.2 QUIET) + ginkgo_find_package(RapidJSON 1.1.0 QUIET) endif() if(GINKGO_BUILD_HWLOC) - find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC + ginkgo_find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC endif() add_subdirectory(third_party) # Third-party tools and libraries diff --git a/cmake/Modules/FindHWLOC.cmake b/cmake/Modules/FindHWLOC.cmake index fe01a40b2f4..91d61499265 100644 --- a/cmake/Modules/FindHWLOC.cmake +++ b/cmake/Modules/FindHWLOC.cmake @@ -39,13 +39,13 @@ include(hwloc_helpers) find_path(HWLOC_INCLUDE_DIRS NAMES "hwloc.h" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES include DOC "Find the hwloc.h main header" ) find_library(HWLOC_LIBRARIES "hwloc" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES lib lib64 DOC "Find the hwloc library" ) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 28685fdf232..1f8181912df 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -41,7 +41,11 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") 
target_compile_options(${test_target_name} PRIVATE "${GINKGO_COMPILER_FLAGS}") - target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + target_compile_options(${test_target_name} PRIVATE -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + else() + target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + endif() ginkgo_set_test_target_properties(${test_name} ${test_target_name}) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 479b889aeaf..976e1927c6b 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -200,6 +200,10 @@ ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC") ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GFLAGS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GTEST") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_HWLOC") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_RAPIDJSON") _minimal( " diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake index e1d196ad553..2abead229e2 100644 --- a/cmake/package_helpers.cmake +++ b/cmake/package_helpers.cmake @@ -57,3 +57,15 @@ function(ginkgo_download_file url filename hash_type hash) message(FATAL_ERROR "Download of ${filename} failed.") endif() endfunction(ginkgo_download_file) + +# Find package with a protection. 
To force the download of library foo instead of searching for it, set the CMake +# variable GINKGO_DOWNLOAD_FOO=ON +# +# \param name The name of the package +# \param ARGN Extra arguments to give to find_package +macro(ginkgo_find_package name) + string(TOUPPER "${name}" GKO_TPL_PKG_NAME) + if(NOT GINKGO_DOWNLOAD_${GKO_TPL_PKG_NAME}) + find_package(${name} ${ARGN}) + endif() +endmacro() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 04f33b4a718..c0757c8226c 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -10,6 +10,15 @@ set(UNIFIED_SOURCES matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp + matrix/dense_kernels_2.cpp + matrix/dense_kernels_3.cpp + matrix/dense_kernels_4.cpp + matrix/dense_kernels_5.cpp + matrix/dense_kernels_6.cpp + matrix/dense_compute_norm2.cpp + matrix/dense_compute_norm1.cpp + matrix/dense_cmnpr.cpp + matrix/dense_css.cpp matrix/ell_kernels.cpp matrix/hybrid_kernels.cpp matrix/sellp_kernels.cpp diff --git a/common/unified/matrix/dense_cmnpr.cpp b/common/unified/matrix/dense_cmnpr.cpp new file mode 100644 index 00000000000..aa81556beef --- /dev/null +++ b/common/unified/matrix/dense_cmnpr.cpp @@ -0,0 +1,80 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense* source, + size_type& result) +{ + Array partial{exec, source->get_size()[0] + 1}; + count_nonzeros_per_row(exec, source, partial.get_data()); + run_kernel_reduction( + exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, + GKO_KERNEL_REDUCE_MAX(size_type), + partial.get_data() + source->get_size()[0], source->get_size()[0], + partial); + result = exec->copy_val_to_host(partial.get_const_data() + + source->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/config.hpp b/common/unified/matrix/dense_compute_norm1.cpp similarity index 64% rename from dpcpp/base/config.hpp rename to common/unified/matrix/dense_compute_norm1.cpp index 43465b61c8a..762260bb6b6 100644 --- a/dpcpp/base/config.hpp +++ b/common/unified/matrix/dense_compute_norm1.cpp @@ -30,51 +30,45 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_DPCPP_BASE_CONFIG_HPP_ -#define GKO_DPCPP_BASE_CONFIG_HPP_ +#include "core/matrix/dense_kernels.hpp" +#include #include -#include -namespace gko { -namespace kernels { -namespace dpcpp { - - -struct config { - /** - * The type containing a bitmask over all lanes of a warp. - */ - using lane_mask_type = uint64; - - /** - * The number of threads within a Dpcpp subgroup. - */ - static constexpr uint32 warp_size = 16; - - /** - * The bitmask of the entire warp. 
- */ - static constexpr auto full_lane_mask = ~zero(); +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" - /** - * The minimal amount of warps that need to be scheduled for each block - * to maximize GPU occupancy. - */ - static constexpr uint32 min_warps_per_block = 4; - /** - * The default maximal number of threads allowed in DPCPP group - */ - static constexpr uint32 max_block_size = 256; -}; - - -} // namespace dpcpp +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm1(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko - - -#endif // GKO_DPCPP_BASE_CONFIG_HPP_ diff --git a/common/unified/matrix/dense_compute_norm2.cpp b/common/unified/matrix/dense_compute_norm2.cpp new file mode 100644 index 00000000000..890980ad247 --- /dev/null +++ b/common/unified/matrix/dense_compute_norm2.cpp @@ -0,0 +1,76 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_css.cpp b/common/unified/matrix/dense_css.cpp new file mode 100644 index 00000000000..71a7d8ab405 --- /dev/null +++ b/common/unified/matrix/dense_css.cpp @@ -0,0 +1,92 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_slice_sets(std::shared_ptr exec, + const matrix::Dense* source, + size_type slice_size, size_type stride_factor, + size_type* slice_sets, size_type* slice_lengths) +{ + const auto num_rows = source->get_size()[0]; + Array row_nnz{exec, num_rows}; + count_nonzeros_per_row(exec, source, row_nnz.get_data()); + const auto num_slices = + static_cast(ceildiv(num_rows, slice_size)); + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, + auto stride_factor, auto num_rows) { + const auto row = slice * slice_size + local_row; + return row < num_rows ? 
static_cast( + ceildiv(row_nnz[row], stride_factor) * + stride_factor) + : size_type{}; + }, + GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, + gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, + num_rows); + exec->copy(num_slices, slice_lengths, slice_sets); + components::prefix_sum(exec, slice_sets, num_slices + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index d5037879eda..4e4babe4e9d 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -104,509 +104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); -template -void scale(std::shared_ptr exec, - const matrix::Dense* alpha, matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); - - -template -void inv_scale(std::shared_ptr exec, - const matrix::Dense* alpha, - matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - 
GKO_DECLARE_DENSE_INV_SCALE_KERNEL); - - -template -void add_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); - - -template -void sub_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) += alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); - - -template -void sub_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - 
exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) -= alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return x(i, j) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return conj(x(i, j)) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, - result->get_values(), x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - -template -void compute_norm1(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, - GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), - x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); - - -template -void 
compute_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type& result) -{ - Array partial{exec, source->get_size()[0] + 1}; - count_nonzeros_per_row(exec, source, partial.get_data()); - run_kernel_reduction( - exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, - GKO_KERNEL_REDUCE_MAX(size_type), - partial.get_data() + source->get_size()[0], source->get_size()[0], - partial); - result = exec->copy_val_to_host(partial.get_const_data() + - source->get_size()[0]); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void compute_slice_sets(std::shared_ptr exec, - const matrix::Dense* source, - size_type slice_size, size_type stride_factor, - size_type* slice_sets, size_type* slice_lengths) -{ - const auto num_rows = source->get_size()[0]; - Array row_nnz{exec, num_rows}; - count_nonzeros_per_row(exec, source, row_nnz.get_data()); - const auto num_slices = - static_cast(ceildiv(num_rows, slice_size)); - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, - auto stride_factor, auto num_rows) { - const auto row = slice * slice_size + local_row; - return row < num_rows ? static_cast( - ceildiv(row_nnz[row], stride_factor) * - stride_factor) - : size_type{}; - }, - GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, - gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, - num_rows); - exec->copy(num_slices, slice_lengths, slice_sets); - components::prefix_sum(exec, slice_sets, num_slices + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); - - -template -void count_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* mtx, - IndexType* result) -{ - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto mtx) { - return is_nonzero(mtx(i, j)) ? 
1 : 0; - }, - GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); - - -template -void symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(perm[row], perm[col]); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); - - -template -void inv_symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); - - -template -void row_gather(std::shared_ptr exec, - const Array* row_idxs, - const matrix::Dense* orig, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = orig(rows[row], col); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); - - -template -void advanced_row_gather(std::shared_ptr exec, - const matrix::Dense* alpha, - const Array* row_idxs, - const matrix::Dense* orig, - const matrix::Dense* beta, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, - 
auto beta, auto gathered) { - using type = device_type>; - gathered(row, col) = - static_cast(alpha[0] * orig(rows[row], col)) + - static_cast(beta[0]) * - static_cast(gathered(row, col)); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, - alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); - - -template -void column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(row, perm[col]); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); - - -template -void inverse_row_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], col) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, row_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); - - -template -void inverse_column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Diagonal* diag) -{ - 
run_kernel( - exec, - [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, - diag->get_size()[0], orig, diag->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); - - -template -void inplace_absolute_dense(std::shared_ptr exec, - matrix::Dense* source) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source) { - source(row, col) = abs(source(row, col)); - }, - source->get_size(), source); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void outplace_absolute_dense(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = abs(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void make_complex(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = source(row, col); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); - - -template -void get_real(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = real(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); - - -template -void get_imag(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = imag(source(row, col)); - }, - source->get_size(), source, result); -} - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); - - } // namespace dense } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/dense_kernels_2.cpp b/common/unified/matrix/dense_kernels_2.cpp new file mode 100644 index 00000000000..b3179c6137f --- /dev/null +++ b/common/unified/matrix/dense_kernels_2.cpp @@ -0,0 +1,198 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void scale(std::shared_ptr exec, + const matrix::Dense* alpha, matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); + + +template +void inv_scale(std::shared_ptr exec, + const matrix::Dense* alpha, + matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_INV_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } 
else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); + + +template +void sub_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) += alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void sub_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) -= alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git 
a/common/unified/matrix/dense_kernels_3.cpp b/common/unified/matrix/dense_kernels_3.cpp new file mode 100644 index 00000000000..f4a580a7109 --- /dev/null +++ b/common/unified/matrix/dense_kernels_3.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_4.cpp b/common/unified/matrix/dense_kernels_4.cpp new file mode 100644 index 00000000000..1f9b4287f68 --- /dev/null +++ b/common/unified/matrix/dense_kernels_4.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_5.cpp b/common/unified/matrix/dense_kernels_5.cpp new file mode 100644 index 00000000000..65bb8dd91e4 --- /dev/null +++ b/common/unified/matrix/dense_kernels_5.cpp @@ -0,0 +1,114 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense* mtx, + IndexType* result) +{ + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto mtx) { + return is_nonzero(mtx(i, j)) ? 
1 : 0; + }, + GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); + + +template +void symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(perm[row], perm[col]); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); + + +template +void inv_symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(perm[row], perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_6.cpp b/common/unified/matrix/dense_kernels_6.cpp new file mode 100644 index 00000000000..3629bd930fb --- /dev/null +++ b/common/unified/matrix/dense_kernels_6.cpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void row_gather(std::shared_ptr exec, + const Array* row_idxs, + const matrix::Dense* orig, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { + gathered(row, col) = orig(rows[row], col); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); + + +template +void advanced_row_gather(std::shared_ptr exec, + const matrix::Dense* alpha, + const Array* row_idxs, + const matrix::Dense* orig, + const matrix::Dense* beta, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, + auto beta, auto gathered) { + using type = device_type>; + gathered(row, col) = + static_cast(alpha[0] * orig(rows[row], col)) + + static_cast(beta[0]) * + static_cast(gathered(row, col)); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, + alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(row, perm[col]); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* row_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, 
auto permuted) { + permuted(perm[row], col) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + + +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Diagonal* diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, + diag->get_size()[0], orig, diag->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); + + +template +void inplace_absolute_dense(std::shared_ptr exec, + matrix::Dense* source) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source) { + source(row, col) = abs(source(row, col)); + }, + source->get_size(), source); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void outplace_absolute_dense(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = abs(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void make_complex(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = 
source(row, col); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); + + +template +void get_real(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = real(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); + + +template +void get_imag(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = imag(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp index 7701d66111f..5a22870374d 100644 --- a/core/base/block_sizes.hpp +++ b/core/base/block_sizes.hpp @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_BLOCK_SIZES_HPP_ +#include + + #include -#include namespace gko { @@ -51,9 +53,10 @@ namespace fixedblock { * A compile-time list of block sizes for which dedicated fixed-block matrix * and corresponding preconditioner kernels should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; #else -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #endif diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index a93fc9013b9..33ea7166697 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -39,39 +39,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, float, __VA_ARGS__); \ - template _macro(float, double, double, __VA_ARGS__); \ - template _macro(double, float, float, __VA_ARGS__); \ - template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) 
\ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) 
\ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -81,21 +81,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #endif diff --git a/core/base/types.hpp b/core/base/types.hpp index 49c4fb99e15..2e2c4c99dce 100644 --- a/core/base/types.hpp +++ b/core/base/types.hpp @@ -143,6 +143,7 @@ class ConfigSet { public: static constexpr unsigned num_groups = sizeof...(num_bits); static constexpr std::array bits{num_bits...}; + using can_encode = std::true_type; /** * Decodes the `position` information from encoded diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index aa3d352e5a6..bac66826406 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -63,27 +63,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * const accessor, or not. */ #define GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE_HELPER(_macro, _const) \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ + acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ template _macro( \ float, \ GKO_UNPACK( \ @@ -98,16 +98,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template _macro(float, \ GKO_UNPACK(acc::range>)); \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ - acc::range, \ - _const std::complex>>)); \ - template _macro( \ + acc::range, _const std::complex>>))); \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ acc::range, \ - _const std::complex>>)); \ + _const std::complex>>))); \ template _macro( \ std::complex, \ GKO_UNPACK( \ diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index a78ac391d2e..39efa24b67d 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -48,24 +48,24 @@ namespace syn { #define GKO_ENABLE_IMPLEMENTATION_SELECTION(_name, _callable) \ template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ ::gko::syn::type_list, InferredArgs...) \ GKO_KERNEL_NOT_FOUND; \ \ template \ inline void _name( \ - ::gko::syn::value_list, Predicate is_eligible, \ - ::gko::syn::value_list int_args, \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence int_args, \ ::gko::syn::type_list type_args, InferredArgs... args) \ { \ if (is_eligible(K)) { \ _callable( \ - ::gko::syn::value_list(), \ + std::integer_sequence(), \ std::forward(args)...); \ } else { \ - _name(::gko::syn::value_list(), is_eligible, \ + _name(std::integer_sequence(), is_eligible, \ int_args, type_args, std::forward(args)...); \ } \ } \ @@ -73,36 +73,35 @@ namespace syn { "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ - template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) 
\ - GKO_KERNEL_NOT_FOUND; \ - \ - template \ - inline void _name( \ - ::gko::syn::value_list, \ - Predicate is_eligible, \ - ::gko::syn::value_list bool_args, \ - ::gko::syn::value_list int_args, \ - ::gko::syn::value_list size_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ - { \ - if (is_eligible(K)) { \ - _callable( \ - std::forward(args)...); \ - } else { \ - _name(::gko::syn::value_list(), \ - is_eligible, bool_args, int_args, size_args, type_args, \ - std::forward(args)...); \ - } \ +#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ + template \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ + std::integer_sequence, \ + std::integer_sequence, \ + ::gko::syn::type_list, InferredArgs...) \ + GKO_KERNEL_NOT_FOUND; \ + \ + template \ + inline void _name( \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence bool_args, \ + std::integer_sequence int_args, \ + std::integer_sequence size_args, \ + ::gko::syn::type_list type_args, InferredArgs... 
args) \ + { \ + if (is_eligible(K)) { \ + _callable( \ + std::forward(args)...); \ + } else { \ + _name(std::integer_sequence(), is_eligible, \ + bool_args, int_args, size_args, type_args, \ + std::forward(args)...); \ + } \ } diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index 2bb711e7a2e..07641c4ad6c 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(multigrid) add_subdirectory(preconditioner) add_subdirectory(reorder) add_subdirectory(solver) +add_subdirectory(synthesizer) add_subdirectory(stop) add_subdirectory(utils) diff --git a/core/test/synthesizer/CMakeLists.txt b/core/test/synthesizer/CMakeLists.txt new file mode 100644 index 00000000000..c2d6d722e4a --- /dev/null +++ b/core/test/synthesizer/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(containers) diff --git a/core/test/synthesizer/containers.cpp b/core/test/synthesizer/containers.cpp new file mode 100644 index 00000000000..9975dc03e82 --- /dev/null +++ b/core/test/synthesizer/containers.cpp @@ -0,0 +1,280 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + +namespace { + + +struct IntegerSequenceExtensions : public ::testing::Test { + using range1 = gko::syn::as_list>; + using range1_exp = std::integer_sequence; + using range2 = gko::syn::as_list>; + using range2_exp = std::integer_sequence; + + using test = gko::syn::value_list; + using sorted_asc_dups = + std::integer_sequence; + using sorted_asc_nodups = + std::integer_sequence; + using sorted_desc_dups = + std::integer_sequence; + using sorted_desc_nodups = + std::integer_sequence; + + using empty = std::integer_sequence; + +protected: + IntegerSequenceExtensions() {} +}; + +using ::testing::StaticAssertTypeEq; + + +TEST_F(IntegerSequenceExtensions, CanCreateRanges) +{ + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanConcatenate) +{ + using test_case = + gko::syn::concatenate>; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanGetArrayAndValue) +{ + auto expected_array = std::array{0, 7614, 453, 16, 9, 16, 0, 0, -4}; + using front = gko::syn::front; + + ASSERT_EQ(gko::syn::as_array(test{}), expected_array); + 
ASSERT_EQ(gko::syn::as_array(front{})[0], 0); + ASSERT_EQ(gko::syn::as_value(front{}), 0); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMax) +{ + using test_case = gko::syn::max; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMin) +{ + using test_case = gko::syn::min; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMedian) +{ + using test_case = gko::syn::median; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessFront) +{ + using test_case = gko::syn::front; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessBack) +{ + using test_case = gko::syn::back; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanUseAtIndex) +{ + using idx1 = gko::syn::at_index<1, test>; + using exp_idx1 = std::integer_sequence; + using idx2 = gko::syn::at_index<2, test>; + using exp_idx2 = std::integer_sequence; + using idx4 = gko::syn::at_index<4, test>; + using exp_idx4 = std::integer_sequence; + using idx6 = gko::syn::at_index<6, test>; + using exp_idx6 = std::integer_sequence; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + 
+TEST_F(IntegerSequenceExtensions, EmptyTests) +{ + using test1 = gko::syn::at_index<72, empty>; + using test2 = gko::syn::back; + using test3 = gko::syn::front; + using test4 = gko::syn::median; + using test5 = gko::syn::min; + using test6 = gko::syn::sort; + using test7 = gko::syn::sort; + using test8 = gko::syn::sort_keep; + using test9 = gko::syn::sort_keep; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + +struct int_encoder { + using can_encode = std::true_type; + + static constexpr int encode() { return 1; } + + template + static constexpr int encode(int v1, Rest&&... rest) + { + return v1 * encode(std::forward(rest)...); + } +}; + + +TEST_F(IntegerSequenceExtensions, CanMergeEmptyList) +{ + StaticAssertTypeEq, empty>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeOneList) +{ + StaticAssertTypeEq, test>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeTwoLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using expected = std::integer_sequence; + + using res = gko::syn::merge; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeListsOtherOrder) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + 
StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + +} // namespace diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index db7040b07f5..58bf5e3580a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -357,7 +357,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -382,7 +382,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -433,7 +433,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -464,8 +464,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -480,7 +480,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... 
args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -493,8 +493,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index 4318cd1e1d9..43e2c66526f 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 2f65affb628..f1d0689bc0f 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -81,7 +81,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -137,7 +137,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void compute_factor(syn::value_list, +void 
compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -180,7 +180,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -202,7 +203,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..a8f60fd2bc2 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -73,7 +73,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -81,7 +81,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -196,7 +196,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..85d8c03211e 100644 --- 
a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -69,7 +69,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -79,7 +79,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -153,8 +153,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu index f1a6b7518ed..f77d3d77119 100644 --- a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -172,8 +172,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git 
a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu index fb1248158f9..c4a13c3915f 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -133,8 +133,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 2550aa2ceb3..c4f3f76ad98 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -88,13 +88,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -105,7 +105,7 @@ namespace { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -213,7 +213,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -429,7 +429,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -459,7 +459,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -487,8 +488,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -519,7 +520,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ 
-658,7 +659,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -836,7 +837,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(), c_tmp_col_idxs_array.get_const_data(), c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs, @@ -865,7 +866,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..957faca5d6e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -102,7 +102,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -113,7 +113,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -249,8 +249,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -283,8 +283,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..eede67c3211 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -261,7 +261,7 @@ namespace { template -void transpose_blocks_impl(syn::value_list, +void transpose_blocks_impl(std::integer_sequence, matrix::Fbcsr* const mat) { constexpr int subwarp_size = config::warp_size; @@ -310,7 +310,7 @@ void transpose(const std::shared_ptr exec, select_transpose_blocks( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), trans); + std::integer_sequence(), syn::type_list<>(), trans); } else { GKO_NOT_IMPLEMENTED; } diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu 
index 3917c54bf43..34d4ebb529d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -75,7 +75,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu index 2632e9c7180..95048463d1d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index d6be53c1835..19bd21385f4 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ 
b/cuda/preconditioner/jacobi_common.hpp.in @@ -30,12 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #include "cuda/base/config.hpp" + namespace gko { namespace kernels { namespace cuda { @@ -55,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu index 240b4f4b5fd..b232a50536d 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -119,7 +119,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu index 01ce054322e..50b3f9d0968 100644 --- a/cuda/preconditioner/jacobi_generate_kernel.cu +++ b/cuda/preconditioner/jacobi_generate_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -85,9 +85,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return 
max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), + syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index c88eab57738..105cddf7bfc 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -153,7 +153,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -202,7 +202,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -226,7 +226,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu index 2d21f9d43ac..3e4cb263e18 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, 
size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -110,7 +110,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu index 2b96cfc3321..e6ef05611f0 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu @@ -54,7 +54,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -82,7 +82,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index ac7cf2d981e..5500ecd3a7f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -11,6 +11,14 @@ set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) +# Reverse the subgroup list, replace ; to , from the user strings +string(REPLACE ";" "," 
GINKGO_DPCPP_SUBGROUPS_FORMATTED "${GINKGO_DPCPP_SUBGROUPS}") +string(REPLACE ";" "," GINKGO_DPCPP_WORKGROUPS_FORMATTED "${GINKGO_DPCPP_WORKGROUPS}") +# Generate the DPC++ Configuration file +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp.in + ${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp @ONLY) + + add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -51,25 +59,36 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) + ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") +target_compile_features(ginkgo_dpcpp PUBLIC cxx_std_17) + # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) -target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) +target_include_directories(ginkgo_dpcpp PRIVATE $) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -# When building ginkgo as a static library, we need to use dpcpp and per_kernel -# link option when the program uses a dpcpp related function. 
-if (BUILD_SHARED_LIBS) - target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -else () - target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) + +if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + string(REPLACE ";" "," GINKGO_DPCPP_AOT_TARGETS_FORMATTED "${GINKGO_DPCPP_AOT_TARGETS}") + target_compile_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) +else() + # When building ginkgo as a static library, we need to use dpcpp and per_source + # link option when the program uses a dpcpp related function. + if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_source) + else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_source) + endif() endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in new file mode 100644 index 00000000000..15b56383b93 --- /dev/null +++ b/dpcpp/base/config.hpp.in @@ -0,0 +1,117 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_CONFIG_HPP_IN_ +#define GKO_DPCPP_BASE_CONFIG_HPP_IN_ + + +#include +#include +#include + + +#include "core/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// CMake generated subgroup list. We sort by ascending order. +// clang-format off +using subgroup_list_t = + syn::sort>; +using subgroup_list_desc_t = syn::sort; +constexpr auto subgroup_list = subgroup_list_t{}; +// clang-format on + +// CMake generated workgroup list. We sort by ascending order. +// clang-format off +using workgroup_list_t = + syn::sort>; +using workgroup_list_desc_t = syn::sort; +constexpr auto workgroup_list = workgroup_list_t{}; +// clang-format on + +using empty_list_t = std::integer_sequence; + +struct config { + /** + * The type containing a bitmask over all lanes of a warp. + */ + using lane_mask_type = uint64; + + /** + * The number of threads within a Dpcpp subgroup. 
+ */ + static constexpr uint32 warp_size = + syn::as_value(syn::median{}); + + /** + * The bitmask of the entire warp. + */ + static constexpr auto full_lane_mask = ~zero(); + + /** + * The minimal amount of warps that need to be scheduled for each block + * to maximize GPU occupancy. + */ + static constexpr uint32 min_warps_per_block = 4; + + /** + * The default maximal number of threads allowed in DPCPP group + */ + static constexpr uint32 max_block_size = + syn::as_value(syn::max{}); +}; + + +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::merge{}; +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); + +using BlockCfg = ConfigSet<11>; +constexpr auto block_cfg_list = + syn::merge{}; +constexpr auto block_cfg_array = syn::as_array(block_cfg_list); + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_CONFIG_HPP_IN_ diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 2aed12c606d..18bdc5ed077 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -118,7 +118,8 @@ void DpcppExecutor::raw_free(void* ptr) const noexcept #endif // GKO_VERBOSE_LEVEL >= 1 // OpenCL error code use 0 for CL_SUCCESS and negative number for others // error. if the error is not from OpenCL, it will return CL_SUCCESS. - int err_code = err.get_cl_code(); + // TODO: check whether error + int err_code = err.code().value(); // if return CL_SUCCESS, exit 1 as DPCPP error. if (err_code == 0) { err_code = 1; diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index d8081543313..65d9a0f1fd1 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -81,7 +81,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param kernel_ the kernel name */ #define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ + template \ inline void name_(dim3 grid, dim3 block, gko::size_type, \ sycl::queue* queue, InferredArgs... 
args) \ { \ @@ -115,19 +115,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * available to decode<0> for blocksize and decode<1> for * subgroup_size by cfg_ */ -#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ - template \ - void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - gko::size_type dynamic_shared_memory, sycl::queue* queue, \ - InferredArgs... args) \ - { \ - callable_( \ - list_, \ - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, \ - ::gko::syn::value_list(), ::gko::syn::value_list(), \ - ::gko::syn::value_list(), \ - ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ - queue, std::forward(args)...); \ +#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ + template \ + void name_(int desired_cfg, dim3 grid, dim3 block, \ + gko::size_type dynamic_shared_memory, sycl::queue* queue, \ + InferredArgs... args) \ + { \ + callable_( \ + list_, [&desired_cfg](int cfg) { return cfg == desired_cfg; }, \ + std::integer_sequence(), std::integer_sequence(), \ + std::integer_sequence(), \ + ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ + queue, std::forward(args)...); \ } // __WG_BOUND__ gives the cuda-like launch bound in cuda ordering @@ -178,7 +177,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * @return the first valid config */ template -std::uint32_t get_first_cfg(const IterArr& arr, Validate verify) +int get_first_cfg(const IterArr& arr, Validate verify) { for (auto& cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 0df00f58c6e..dbdf34de726 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -37,10 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -53,15 +53,7 @@ namespace kernels { namespace dpcpp { -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(); - - -template void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, @@ -73,16 +65,17 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -110,7 +103,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } -template void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, @@ -122,16 +115,17 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + 
subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -161,7 +155,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -201,7 +195,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -254,16 +248,14 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -276,16 +268,14 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... 
args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -293,10 +283,10 @@ void run_kernel_reduction(std::shared_ptr exec, namespace { -template -void generic_kernel_row_reduction_2d(syn::value_list, +void generic_kernel_row_reduction_2d(std::integer_sequence, std::shared_ptr exec, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, @@ -349,8 +339,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, generic_kernel_row_reduction_2d); -template void generic_kernel_col_reduction_2d_small( sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, @@ -363,13 +353,14 @@ void generic_kernel_col_reduction_2d_small( constexpr auto subgroups_per_workgroup = wg_size / sg_size; // stores the subwarp_size partial sums from each warp, grouped by warp constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(shared_storage), cgh); const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { - auto 
block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); const auto ssg_id = thread::get_subwarp_id_flat(id); const auto local_sg_id = id.get_local_id(2) / sg_size; @@ -424,7 +415,7 @@ void generic_kernel_col_reduction_2d_small( } -template void generic_kernel_col_reduction_2d_blocked( @@ -436,9 +427,9 @@ void generic_kernel_col_reduction_2d_blocked( constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto range = sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(wg_size), cgh); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { const auto sg_id = thread::get_subwarp_id_flat(id); @@ -449,7 +440,8 @@ void generic_kernel_col_reduction_2d_blocked( const auto sg_rank = subgroup.thread_rank(); const auto col = sg_rank + static_cast(id.get_group(1)) * sg_size; - auto block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); auto partial = identity; // accumulate within a thread if (col < cols) { @@ -493,10 +485,10 @@ void generic_kernel_reduction_finalize_2d( } -template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, std::shared_ptr exec, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, @@ -537,7 +529,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, run_generic_col_reduction_small); -template void run_kernel_row_reduction_stage1(std::shared_ptr exec, @@ -549,8 +541,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - 
syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -562,8 +554,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, Array partial{exec, static_cast(col_blocks * rows)}; generic_kernel_row_reduction_2d( - syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, + std::integer_sequence{}, exec, rows, cols, col_blocks, + fn, op, [](auto v) { return v; }, identity, partial.get_data(), 1, args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( @@ -578,8 +570,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), syn::type_list<>(), exec, rows, cols, - 1, fn, op, finalize, identity, result, + std::integer_sequence(), syn::type_list<>(), exec, rows, + cols, 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); } } @@ -588,7 +580,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, run_kernel_row_reduction_stage1); -template void run_kernel_col_reduction_stage1(std::shared_ptr exec, @@ -600,8 +592,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -614,8 +606,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), 
syn::type_list<>(), exec, max_blocks, - fn, op, finalize, identity, result, size, args...); + std::integer_sequence(), syn::type_list<>(), exec, + max_blocks, fn, op, finalize, identity, result, size, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -661,16 +653,14 @@ void run_kernel_row_reduction(std::shared_ptr exec, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_row_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, result_stride, size, map_to_device(args)...); } @@ -684,16 +674,14 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... 
args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_col_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index ea7dd341ecf..56fe9864612 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -129,11 +129,11 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well. 
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, ValueType* __restrict__ block_sum, sycl::nd_item<3> item_ct1, - UninitializedArray& prefix_helper) + ValueType* __restrict__ prefix_helper) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto element_id = item_ct1.get_local_id(2); @@ -182,22 +182,22 @@ void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, + sycl::accessor - prefix_helper_acc_ct1(cgh); + prefix_helper_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - *prefix_helper_acc_ct1.get_pointer()); + static_cast( + prefix_helper_acc_ct1.get_pointer())); }); }); } @@ -217,7 +217,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
*/ -template +template void finalize_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, const ValueType* __restrict__ block_sum, @@ -234,7 +234,7 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index 695fb2f2898..5e89b61877f 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" +#include + + #include @@ -50,12 +53,6 @@ namespace dpcpp { namespace components { -using BlockCfg = ConfigSet<11>; - -constexpr auto block_cfg_list = - ::gko::syn::value_list(); - GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, block_cfg_list) @@ -73,11 +70,9 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); - constexpr auto block_cfg_array = as_array(block_cfg_list); - const std::uint32_t cfg = - get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { - return validate(queue, BlockCfg::decode<0>(cfg), 16); - }); + const int cfg = get_first_cfg(block_cfg_array, [&queue](int cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); const auto wg_size = BlockCfg::decode<0>(cfg); auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index b1eb86a1992..dde112fba9d 100644 --- a/dpcpp/components/reduction.dp.hpp +++ 
b/dpcpp/components/reduction.dp.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ +#include #include @@ -47,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" @@ -62,13 +62,7 @@ namespace dpcpp { constexpr int default_block_size = 256; -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + /** * @internal @@ -202,14 +196,13 @@ void reduce_array(size_type size, const ValueType* __restrict__ source, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template -void reduce_add_array( - size_type size, const ValueType* __restrict__ source, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& block_sum) +template +void reduce_add_array(size_type size, const ValueType* __restrict__ source, + ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ block_sum) { reduce_array(cfg)>( - size, source, static_cast(block_sum), item_ct1, + size, source, block_sum, item_ct1, [](const ValueType& x, const ValueType& y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { @@ -217,23 +210,24 @@ void reduce_add_array( } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type size, const ValueType* source, ValueType* result) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor(cfg)>, - 0, sycl::access::mode::read_write, + sycl::accessor - block_sum_acc_ct1(cgh); + block_sum_acc_ct1(sycl::range<1>(KCFG_1D::decode<0>(cfg)), cgh); cgh.parallel_for( 
sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( KCFG_1D::decode<1>(cfg))]] { - reduce_add_array(size, source, result, item_ct1, - *block_sum_acc_ct1.get_pointer()); + reduce_add_array( + size, source, result, item_ct1, + static_cast(block_sum_acc_ct1.get_pointer())); }); }); } @@ -242,7 +236,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, reduce_add_array); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, - kcfg_1d_list); + kcfg_1d_array); /** @@ -263,12 +257,10 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results = Array(exec); ValueType answer = zero(); auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const int cfg = get_first_cfg(kcfg_1d_array, [&queue](int cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); diff --git a/dpcpp/get_info.cmake b/dpcpp/get_info.cmake index 36918a3a8c6..f595db6c190 100644 --- a/dpcpp/get_info.cmake +++ b/dpcpp/get_info.cmake @@ -2,6 +2,9 @@ ginkgo_print_module_header(${detailed_log} "DPCPP") ginkgo_print_module_footer(${detailed_log} "DPCPP variables:") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_FLAGS") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SINGLE_MODE") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_AOT_TARGETS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SUBGROUPS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_WORKGROUPS") ginkgo_print_module_footer(${detailed_log} "DPCPP environment variables:") ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_TYPE") ginkgo_print_env_variable(${detailed_log} "SYCL_BE") diff --git 
a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index fa23cda9d1d..a7e8ee2bd58 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -56,7 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" @@ -89,9 +89,9 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; -using classical_kernels = syn::value_list; +using classical_kernels = subgroup_list_desc_t; namespace kernel { @@ -324,8 +324,8 @@ void merge_path_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, Alpha_op alpha_op, sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const IndexType cache_lines = ceildivT(nwarps, spmv_block_size); const IndexType tid = item_ct1.get_local_id(2); @@ -351,9 +351,7 @@ void merge_path_reduce(const IndexType nwarps, tmp_val[item_ct1.get_local_id(2)] = value; tmp_ind[item_ct1.get_local_id(2)] = row; group::this_thread_block(item_ct1).sync(); - bool last = - block_segment_scan_reverse(static_cast(tmp_ind), - static_cast(tmp_val), item_ct1); + bool last = block_segment_scan_reverse(tmp_ind, tmp_val, item_ct1); group::this_thread_block(item_ct1).sync(); if (last) { c[row * c_stride] += alpha_op(tmp_val[item_ct1.get_local_id(2)]); @@ -538,9 +536,8 @@ void 
abstract_reduce(const IndexType nwarps, const ValueType* __restrict__ last_val, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { merge_path_reduce( nwarps, last_val, last_row, c, c_stride, [](ValueType& x) { return x; }, @@ -554,20 +551,19 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } @@ -579,9 +575,8 @@ void abstract_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, const ValueType* __restrict__ alpha, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const auto alpha_val = alpha[0]; merge_path_reduce( @@ -598,26 +593,25 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + 
sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, alpha, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, alpha, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } -template void device_classical_spmv(const size_type num_rows, const ValueType* __restrict__ val, @@ -654,7 +648,7 @@ void device_classical_spmv(const size_type num_rows, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -667,7 +661,7 @@ void abstract_classical_spmv( [](const ValueType& x, const ValueType& y) { return x; }, item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -689,7 +683,7 @@ void abstract_classical_spmv(dim3 grid, dim3 block, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ alpha, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -708,7 +702,7 @@ void abstract_classical_spmv( item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -719,12 +713,14 @@ void abstract_classical_spmv(dim3 grid, dim3 block, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - 
num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, + c, c_stride, item_ct1); + }); }); } @@ -899,7 +895,9 @@ void row_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -946,7 +944,9 @@ void inv_row_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -994,7 +994,9 @@ void inv_symm_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_symm_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -1006,7 +1008,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1076,7 +1078,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void 
classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1148,7 +1150,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "classical") { IndexType max_length_per_row = 0; using Tcsr = matrix::Csr; @@ -1167,7 +1169,7 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { if (!is_complex()) { @@ -1284,8 +1286,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -1294,8 +1295,7 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else { GKO_NOT_IMPLEMENTED; } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 97c01e0fac5..3dd7964c4ab 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/dense_kernels.hpp" +#include + + #include #include @@ -49,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/base/onemkl_bindings.hpp" @@ -70,27 +72,18 @@ namespace dpcpp { namespace dense { -// Disable the 64 subgroup. CPU supports 64 now, but conj_transpose will -// lead CL_OUT_OF_RESOURCES. TODO: investigate this issue. -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto subgroup_list = syn::value_list(); -constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr int default_block_size = 256; namespace kernel { -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& space) + ValueType* __restrict__ space) { auto local_x = item_ct1.get_local_id(2); auto local_y = item_ct1.get_local_id(1); @@ -108,35 +101,34 @@ void transpose(const size_type nrows, const size_type ncols, } } -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, [](ValueType val) { return val; }, item_ct1, space); } -template +template void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, const size_type in_stride, ValueType* out, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - 
sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - transpose(nrows, ncols, in, in_stride, out, out_stride, - item_ct1, *space_acc_ct1.get_pointer()); + transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } @@ -145,19 +137,18 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(transpose, transpose); GKO_ENABLE_DEFAULT_CONFIG_CALL(transpose_call, transpose, subgroup_list); -template +template void conj_transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, [](ValueType val) { return conj(val); }, item_ct1, space); } -template +template void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, @@ -165,17 +156,16 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - conj_transpose(nrows, ncols, in, in_stride, out, - out_stride, item_ct1, - *space_acc_ct1.get_pointer()); + conj_transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } @@ -539,10 +529,9 @@ void 
transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::transpose_call(cfg, grid, block, 0, queue, size[0], size[1], @@ -561,10 +550,9 @@ void conj_transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::conj_transpose_call(cfg, grid, block, 0, queue, size[0], size[1], diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..23847bdf9b8 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -52,7 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/atomic.dp.hpp" @@ -91,20 +91,26 @@ constexpr int num_threads_per_core = 4; constexpr double ratio = 1e-2; -/** - * max_thread_per_worker is the max number of thread per worker. 
The - * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> - */ -constexpr int max_thread_per_worker = 32; - - /** * A compile-time list of sub-warp sizes for which the spmv kernels should be * compiled. * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = + syn::concatenate, subgroup_list_t>; + + +/** + * max_thread_per_worker is the max number of thread per worker. The + * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> + */ +constexpr auto max_thread_per_worker = + syn::as_value(syn::max{}); + + +constexpr auto min_thread_per_worker = + syn::as_value(syn::min{}); namespace kernel { @@ -114,14 +120,14 @@ namespace { template -void spmv_kernel( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv_kernel(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, + const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, Closure op, + sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); @@ -175,8 +181,9 @@ void spmv_kernel( if (runnable && idx_in_worker == 0) { const auto c_ind = x * c_stride + column_id; if (atomic) { - atomic_add(&(c[c_ind]), - op(storage[item_ct1.get_local_id(2)], c[c_ind])); + atomic_add( + &(c[c_ind]), + op(storage[item_ct1.get_local_id(2)], c[c_ind])); } else { c[c_ind] = op(storage[item_ct1.get_local_id(2)], c[c_ind]); } @@ -187,14 
+194,12 @@ void spmv_kernel( template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, @@ -213,34 +218,33 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - spmv( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - item_ct1, *storage_acc_ct1.get_pointer()); - }); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + spmv( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, item_ct1, + static_cast( + storage_acc_ct1.get_pointer())); + }); }); } template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - 
const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, + OutputValueType* __restrict__ c, const size_type c_stride, + sycl::nd_item<3> item_ct1, OutputValueType* __restrict__ storage) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -280,18 +284,19 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, - *storage_acc_ct1.get_pointer()); + static_cast( + storage_acc_ct1.get_pointer())); }); }); } @@ -306,7 +311,7 @@ namespace { template -void abstract_spmv(syn::value_list, +void abstract_spmv(std::integer_sequence, std::shared_ptr exec, int num_worker_per_row, const matrix::Ell* a, @@ -325,7 +330,7 @@ void abstract_spmv(syn::value_list, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - constexpr int num_thread_per_worker = + constexpr auto num_thread_per_worker = (info == 0) ? 
max_thread_per_worker : info; constexpr bool atomic = (info == 0); const dim3 block_size(default_block_size / num_thread_per_worker, @@ -372,7 +377,7 @@ std::array compute_thread_worker_and_atomicity( std::shared_ptr exec, const matrix::Ell* a) { - int num_thread_per_worker = 8; + int num_thread_per_worker = min_thread_per_worker; int atomic = 0; int num_worker_per_row = 1; @@ -391,8 +396,8 @@ std::array compute_thread_worker_and_atomicity( // decided according to the number of worker allowed on GPU. if (static_cast(ell_ncols) / nrows > ratio) { while (num_thread_per_worker < max_thread_per_worker && - (num_thread_per_worker << 1) <= ell_ncols) { - num_thread_per_worker <<= 1; + (num_thread_per_worker * 2) <= ell_ncols) { + num_thread_per_worker *= 2; } if (num_thread_per_worker == max_thread_per_worker) { num_worker_per_row = @@ -436,8 +441,7 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -470,8 +474,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c, alpha, beta); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c, + alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dc94bbb92ff..ac64a232fbc 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -215,14 +215,13 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multinorm2_kernel( - size_type num_rows, size_type num_cols, - const ValueType* 
__restrict__ next_krylov_basis, - size_type stride_next_krylov, remove_complex* __restrict__ norms, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -236,7 +235,7 @@ void multinorm2_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { for (size_type i = start_row + tidy; i < end_row; @@ -270,20 +269,21 @@ void multinorm2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multinorm2_kernel(num_rows, num_cols, next_krylov_basis, - stride_next_krylov, norms, stop_status, - item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + multinorm2_kernel( + num_rows, 
num_cols, next_krylov_basis, stride_next_krylov, + norms, stop_status, item_ct1, + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -295,9 +295,7 @@ void multinorminf_without_stop_kernel( const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, remove_complex* __restrict__ norms, size_type stride_norms, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -311,7 +309,7 @@ void multinorminf_without_stop_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_max = zero(); if (col_idx < num_cols) { for (size_type i = start_row + tidy; i < end_row; @@ -347,11 +345,11 @@ void multinorminf_without_stop_kernel( size_type stride_norms) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -360,7 +358,8 @@ void multinorminf_without_stop_kernel( multinorminf_without_stop_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms, stride_norms, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -368,17 +367,14 @@ void multinorminf_without_stop_kernel( // ONLY computes the inf-norm (into norms2) 
when compute_inf is true template -void multinorm2_inf_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, - remove_complex* __restrict__ norms1, - remove_complex* __restrict__ norms2, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_inf_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms1, + remove_complex* __restrict__ norms2, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -392,9 +388,9 @@ void multinorm2_inf_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper_add = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper_add = reduction_helper_array; rc_vtype* __restrict__ reduction_helper_max = - static_cast((*reduction_helper_array)) + + static_cast(reduction_helper_array) + default_dot_dim * (default_dot_dim + 1); rc_vtype local_res = zero(); rc_vtype local_max = zero(); @@ -449,12 +445,12 @@ void multinorm2_inf_kernel( remove_complex* norms2, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + (1 + compute_inf) * default_dot_dim * (default_dot_dim + 1), + cgh); 
cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -463,20 +459,23 @@ void multinorm2_inf_kernel( multinorm2_inf_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms1, norms2, stop_status, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } template -void multidot_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, const Accessor3d krylov_bases, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + const Accessor3d krylov_bases, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* reduction_helper_array) { /* * In general in this kernel: @@ -543,10 +542,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(dot_dim * dot_dim), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -555,7 +554,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -567,7 +567,7 @@ void singledot_kernel( size_type stride_next_krylov, const Accessor3d 
krylov_bases, ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { /* * In general in this kernel: @@ -624,10 +624,9 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -637,7 +636,8 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 2fe0d63bcbf..3f4c22eea94 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -140,14 +140,15 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - const ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray* - reduction_helper_array) +void multidot_kernel(size_type k, size_type num_rows, size_type num_cols, + const ValueType* __restrict__ krylov_bases, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type 
stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -160,7 +161,7 @@ void multidot_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType* __restrict__ reduction_helper = (*reduction_helper_array); + ValueType* __restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { @@ -197,11 +198,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -211,9 +211,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, k, num_rows, num_cols, krylov_bases, next_krylov_basis, stride_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - (UninitializedArray*) - reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -267,13 +266,15 @@ void update_next_krylov_kernel( // Must be called with at least `num_cols` blocks, each with `block_size` // threads. `block_size` must be a power of 2. 
template -void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType* __restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void update_hessenberg_2_kernel(size_type iter, size_type num_rows, + size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto col_idx = item_ct1.get_group(2); @@ -314,10 +315,9 @@ void update_hessenberg_2_kernel( const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -327,7 +327,8 @@ void update_hessenberg_2_kernel( iter, num_rows, num_cols, next_krylov_basis, stride_next_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 3ec0db6d469..f37a9bcd091 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -111,7 +111,7 @@ template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* __restrict__ values, size_type stride, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { const 
auto tidx = thread::get_thread_id_flat(item_ct1); @@ -168,10 +168,9 @@ void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* values, size_type stride) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -179,7 +178,8 @@ void orthonormalize_subspace_vectors_kernel( config::warp_size)]] { orthonormalize_subspace_vectors_kernel( num_rows, num_cols, values, stride, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -329,13 +329,13 @@ void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, template -void multidot_kernel( - size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, - const ValueType* __restrict__ g_k, size_type g_k_stride, - ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& - reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type nrhs, + const ValueType* __restrict__ p_i, + const ValueType* __restrict__ g_k, size_type g_k_stride, + ValueType* __restrict__ alpha, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -379,19 +379,20 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, const stopping_status* stop_status) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( 
sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multidot_kernel(num_rows, nrhs, p_i, g_k, g_k_stride, alpha, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + multidot_kernel( + num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, + item_ct1, + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 533f8f363e4..903a5637197 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/cooperative_groups.dp.hpp" +#include #include #include @@ -51,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "core/test/utils/assertions.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" @@ -62,9 +62,9 @@ namespace { using namespace gko::kernels::dpcpp; using KCFG_1D = gko::ConfigSet<11, 7>; constexpr auto default_config_list = - ::gko::syn::value_list(); + std::integer_sequence(); class CooperativeGroups : public testing::TestWithParam { @@ -116,7 +116,7 @@ class CooperativeGroups : public testing::TestWithParam { // kernel implementation -template +template void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -152,16 +152,16 @@ void cg_shuffle_host(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(cg_shuffle_config, cg_shuffle_host) // the call -void cg_shuffle_config_call(std::uint32_t desired_cfg, dim3 grid, dim3 block, +void cg_shuffle_config_call(int desired_cfg, dim3 grid, dim3 block, gko::size_type dynamic_shared_memory, sycl::queue* 
queue, bool* s) { cg_shuffle_config( default_config_list, // validate - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), + [&desired_cfg](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, queue, s); } @@ -171,7 +171,7 @@ TEST_P(CooperativeGroups, Shuffle) } -template +template void cg_all(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -185,7 +185,7 @@ void cg_all(bool* s, sycl::nd_item<3> item_ct1) group.all(item_ct1.get_local_id(2) < 13) == sg_size < 13; } -template +template inline void cg_all(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -206,7 +206,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_all_call, cg_all, default_config_list) TEST_P(CooperativeGroups, All) { test_all_subgroup(cg_all_call); } -template +template void cg_any(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -219,7 +219,7 @@ void cg_any(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = !group.any(false); } -template +template inline void cg_any(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -240,7 +240,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_any_call, cg_any, default_config_list) TEST_P(CooperativeGroups, Any) { test_all_subgroup(cg_any_call); } -template +template void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -254,7 +254,7 @@ void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = group.ballot(item_ct1.get_local_id(2) < 4) == 0xf; } -template +template inline void cg_ballot(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... 
args) { diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index e93f811bfb1..01e19c6c1f0 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -362,7 +362,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -387,7 +387,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -441,7 +441,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -472,8 +472,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -488,7 +488,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... 
args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -501,8 +501,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index 47cc332b70e..4b36d854593 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 05d08e598a7..18929cad361 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -84,7 +84,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -140,7 +140,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void 
compute_factor(syn::value_list, +void compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -184,7 +184,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -206,7 +207,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..8de4288787c 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -76,7 +76,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -84,7 +84,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -201,7 +201,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp 
b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..a4843ee89ff 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -72,7 +72,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -82,7 +82,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -156,8 +156,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp index f8234cc6a04..02325d508bd 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -175,8 +175,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), 
syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index e063fd81d0d..562e83efd7e 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -138,8 +138,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index eafbd367c0e..fa4890f6ad2 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -90,13 +90,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -106,7 +106,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -226,7 +226,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -307,7 +307,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -356,7 +356,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -404,8 +405,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -453,7 +454,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, 
c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ -542,7 +543,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -667,7 +668,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs, c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs, d_vals, c); @@ -697,7 +698,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..cd55dfc266f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -105,7 +105,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -116,7 +116,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -253,8 +253,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -287,8 +287,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index 87042e35332..443ca644e00 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -78,7 +78,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -117,7 +117,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const 
precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp index 6412fb8ca6b..82483e0fcac 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 19f7d15bcef..8b79d841355 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -30,8 +30,10 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ +#include + + #include -#include #include "hip/base/config.hip.hpp" @@ -56,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 44ca906769f..313f9c4b9f1 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -120,7 +120,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index a5c8ee94fa6..54a63b47d1e 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -71,7 +71,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -101,9 +101,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), 
+ syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index d8c365fa12d..76db71d9c08 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -163,7 +163,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -217,7 +217,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -241,7 +241,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index c4a63007798..fcf2ca2140c 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const 
preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index 9690cd73121..bca05ce445c 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -68,7 +68,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -96,7 +96,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 73976c24089..0ab3f54a79a 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include namespace gko { diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 009c2e182cd..26854ad3cba 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -419,6 +419,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_ADAPT_SINGLE(_macro) \ + template <> \ + _macro GKO_NOT_IMPLEMENTED +#else +#define GKO_ADAPT_SINGLE(_macro) template _macro +#endif + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -427,16 +435,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ template _macro(float); \ - template <> \ - _macro(double) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(float); \ - template _macro(double) -#endif + GKO_ADAPT_SINGLE(_macro(double)) /** @@ -447,18 +448,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ - template <> \ - _macro(std::complex) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ template _macro(std::complex); \ - template _macro(std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex)) /** @@ -471,26 +464,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and scalar type, respectively. 
*/ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, float); \ - template <> \ - _macro(std::complex, double) GKO_NOT_IMPLEMENTED; -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double) -#endif + +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(float, float); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + template _macro(std::complex, float); \ + GKO_ADAPT_SINGLE(_macro(std::complex, double)) /** @@ -515,21 +496,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, int32); \ - template <> \ - _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(float, int64); \ - template <> \ - _macro(double, int64) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ template _macro(float, int32); \ - template _macro(double, int32); \ + GKO_ADAPT_SINGLE(_macro(double, int32)); \ template _macro(float, int64); \ - template _macro(double, int64) -#endif + GKO_ADAPT_SINGLE(_macro(double, int64)) /** @@ -540,46 +511,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ template _macro(std::complex, int32); \ - template <> \ - _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_SINGLE(_macro(std::complex, int32)); \ template _macro(std::complex, int64); \ - template <> \ - _macro(std::complex, int64) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int64); \ - template _macro(std::complex, int64) -#endif - - -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED - - -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - 
GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#else + GKO_ADAPT_SINGLE(_macro(std::complex, int64)) /** @@ -591,11 +528,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + GKO_ADAPT_SINGLE(_macro(float, double)); \ + GKO_ADAPT_SINGLE(_macro(double, float)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** @@ -610,10 +547,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ template _macro(float, float); \ - template _macro(double, double); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index fcb152da761..c6075ea3021 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -35,7 +35,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include +#include #include +#include namespace gko { @@ -46,7 +49,6 @@ namespace gko { */ namespace syn { - /** * value_list records several values with the same type in template. * @@ -54,7 +56,7 @@ namespace syn { * @tparam Values the values in the list */ template -struct value_list {}; +using value_list = std::integer_sequence; /** @@ -83,22 +85,36 @@ namespace detail { /** * concatenate_impl base type * - * @tparam List1 the first List - * @tparam List2 the second List + * @tparam Lists a list of std::integer_sequence */ -template +template struct concatenate_impl; /** - * concatenate_impl specializes for two value_list with the same value type. + * concatenate_impl specialization for a single std::integer_sequence + * + * @tparam T the value type of the std::integer_sequence + * @tparam Values the values of the list + */ +template +struct concatenate_impl> { + using type = std::integer_sequence; +}; + +/** + * concatenate_impl specialization for multiple std::integer_sequence with the + * same value type. * - * @tparam T the value type of two value_list - * @tparam Values the values of the first list - * @tparam Values the values of the second list + * @tparam T the value type of two std::integer_sequence + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list + * @tparam Tail the lists which have not been concatenated yet */ -template -struct concatenate_impl, value_list> { - using type = value_list; +template +struct concatenate_impl, + std::integer_sequence, Tail...> { + using type = typename concatenate_impl< + std::integer_sequence, Tail...>::type; }; @@ -106,13 +122,13 @@ struct concatenate_impl, value_list> { /** - * concatenate combines two value_list into one value_list. 
+ * concatenate an arbitrary number of std::integer_sequence with the same base + * type into one * - * @tparam List1 the first list - * @tparam List2 the second list + * @tparam Lists a list of std::integer_sequence */ -template -using concatenate = typename detail::concatenate_impl::type; +template +using concatenate = typename detail::concatenate_impl::type; namespace detail { @@ -127,14 +143,14 @@ template struct as_list_impl; /** - * as_list_impl specializes for the value_list + * as_list_impl specializes for the std::integer_sequence * - * @tparam T the value_list type - * @tparam Values the values of value_list + * @tparam T the std::integer_sequence type + * @tparam Values the values of std::integer_sequence */ template -struct as_list_impl> { - using type = value_list; +struct as_list_impl> { + using type = std::integer_sequence; }; /** @@ -158,7 +174,7 @@ struct as_list_impl> { template struct as_list_impl, std::enable_if_t<(Start < End)>> { using type = concatenate< - value_list, + std::integer_sequence, typename as_list_impl>::type>; }; @@ -171,7 +187,7 @@ struct as_list_impl, std::enable_if_t<(Start < End)>> { */ template struct as_list_impl, std::enable_if_t<(Start >= End)>> { - using type = value_list; + using type = std::integer_sequence; }; @@ -189,22 +205,582 @@ using as_list = typename detail::as_list_impl::type; /** - * as_array returns the array from value_list. It will be helpful if using - * for in runtime on the array. + * as_array returns the array from std::integer_sequence. It will be helpful + * if using for in runtime on the array. 
* - * @tparam T the type of value_list - * @tparam Value the values of value_list + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence * - * @param value_list the input value_list + * @param vl the input std::integer_sequence * - * @return std::array the std::array contains the values of value_list + * @return std::array the std::array contains the values of vl */ template -constexpr std::array as_array(value_list vl) +constexpr auto as_array(std::integer_sequence vl) { + std::ignore = vl; return std::array{Value...}; } +/** + * as_value returns the (first) value contained within an + * std::integer_sequence. The empty case is made to fail on purpose. + * + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence + * + * @param vl the input std::integer_sequence + * + * @return the first value within vl + */ +template +constexpr auto as_value(std::integer_sequence vl) +{ + static_assert(sizeof...(Value) > 0, + "Do not call as_value on an empty set!"); + return as_array(vl)[0]; +} + + +namespace detail { + + +/** + * This is the base type of a helper for sorting. It partitions the values + * within an std::integer_sequence into three parts based on a pivot: values + * lower than required, values equal, values above the requirement. + * + * The results will depend on the ascending or descending order, which reverses + * above and lower. 
+ * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using parts = partition_impl<16, idxs>; // the pivot is 16 + * // Then effectively: + * // parts::lower = std::integer_sequence + * // parts::equal = std::integer_sequence + * // parts::above = std::integer_sequence + * ``` + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct partition_impl; + +/** + * The default case for partitioning an empty std::integer_sequence. All parts + * are the empty set. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + */ +template +struct partition_impl> { + using lower = std::integer_sequence; + using equal = std::integer_sequence; + using above = std::integer_sequence; +}; + +/** + * The recursive case for partitioning an std::integer_sequence. The value v1 is + * put in the matching part. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam v1 the current value being processed + * @tparam Values the values left to process + */ +template +struct partition_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using recurse = + partition_impl>; + + using lower = concatenate< + std::conditional_t<(ascending ? v1 < i : v1 > i), this_elt, empty>, + typename recurse::lower>; + using equal = concatenate, + typename recurse::equal>; + using above = concatenate< + std::conditional_t<(ascending ? v1 > i : v1 < i), this_elt, empty>, + typename recurse::above>; +}; + + +/** + * This is the base type of the sorting structure. It sorts the values within + * an std::integer_sequence by using partition_t as a helper. 
+ * + * The sorting will depend on the ascending or descending order and whether + * duplicates are kept or not. + * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using asc_dups = typename sort_impl::type; + * using asc_nodups = typename sort_impl::type; + * using desc_dups = typename sort_impl::type; + * // Then effectively: + * // asc_dups = std::integer_sequence + * // asc_nodups = std::integer_sequence + * // desc_dups = std::integer_sequence + * ``` + * + * @tparam ascending whether to sort in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct sort_impl; + +/** + * The default case for sorting an empty std::integer_sequence. The result is an + * empty set. + * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + */ +template +struct sort_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for sorting an std::integer_sequence. The value v1 becomes + * the partition_impl pivot. The obtained groups are concatenated in order. + * Duplicates are removed by only populating `this_elt` instead of `this_elt + + * parts::equal`. 
+ * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam v1 the current value and pivot for partition_impl + * @tparam Values the values to sort + */ +template +struct sort_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using parts = + partition_impl>; + using sorted_inf = + typename sort_impl::type; + using sorted_eq = + concatenate>; + using sorted_up = + typename sort_impl::type; + + using type = concatenate; +}; + + +} // namespace detail + +/** + * This is a helper interface for sorting an std::integer_sequence. It always + * removes duplicate values. + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort = typename detail::sort_impl::type; + +/** + * This is sorting variant which keeps duplicates + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort_keep = + typename detail::sort_impl::type; + + +namespace detail { + + +/** + * This is the base type of the accessing an element of an std::integer_sequence + * at a given index. 
+ * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using num1 = typename at_index_impl<0, idxs>::type; + * using num2 = typename at_index_impl<2, idxs>::type; + * using num5 = typename at_index_impl<5, idxs>::type; + * // Then effectively: + * // num1 = std::integer_sequence + * // num2 = std::integer_sequence + * // num5 = std::integer_sequence + * ``` + * + * @tparam idx the index to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct at_index_impl; + +/** + * The default case for accessing an element of an empty std::integer_sequence + * at a given index. The result is an empty set. + * + * @tparam idx the index to find + * @tparam T the type of the values + */ +template +struct at_index_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for accessing an element of an std::integer_sequence at a + * given index. Idx is counted down until 0, where v1 is the requested element. + * + * @tparam idx the distance to the index to find + * @tparam T the type of the values + * @tparam v1 the value being processed + * @tparam Values the values + */ +template +struct at_index_impl> { + using recurse = + typename at_index_impl>::type; + using type = + std::conditional_t<(idx <= 0), std::integer_sequence, recurse>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing an std::integer_sequence at a given + * index. + * + * @see detail::at_index_impl + * + * @tparam idx the index of the element to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +using at_index = typename detail::at_index_impl::type; + + +namespace detail { + + +/** + * Access the element at the back of an std::integer_sequence. This is the base + * type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl; + +/** + * Access the element at the back of an std::integer_sequence. 
This is the + * specialization for std::integer_sequence. We simply reuse at_index. We need + * to unpack the std::integer_sequence in order to have the proper size of the + * parameter pack `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl> { + using type = + at_index>; +}; + + +/** + * Access the median element of an std::integer_sequence. This is the base type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl; + +/** + * Access the median element of an std::integer_sequence. This is the + * specialization for std::integer_sequence. We simply reuse at_index of the + * middle element after calling sort. We need to unpack the + * std::integer_sequence in order to have the proper size of the parameter pack + * `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl> { + using type = + at_index>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing the front of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using front = at_index<0, T, Values...>; + +/** + * This is a helper interface for accessing the back of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using back = typename detail::back_impl::type; + +/** + * This is a helper interface for accessing the median element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using median = typename detail::median_impl>::type; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. 
+ * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using min = front>; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using max = front>; + + +namespace detail { + +/** + * This is the base type of a helper to merge_impl. It merges two lists of + * std::integer_sequence by applying EncodingType::encode() on every pair of + * values. In this case, the first list is a single element. This helps + * logically split the double recursion needed for merging two general lists. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_one_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_one_impl; + +/** + * This is the base case for merge_one_impl's recursion. The second list is + * empty. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list + */ +template +struct merge_one_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the recursive case for merge_one_impl. We merge and encode v1 + * with every subsequent v2 from the second std::integer_sequence. 
+ * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam T v2 the value from the second list being merged + * @tparam Values the values not yet processed from the second list + */ +template +struct merge_one_impl, + std::integer_sequence> { + static_assert( + std::is_same::value, + "EncodingType must have encoding functionality."); + + using v1_as_seq = std::integer_sequence; + using values_as_seq = std::integer_sequence; + using recurse = merge_one_impl; + using type = + concatenate, + typename recurse::type>; +}; + + +/** + * This is the base type of merge_impl. It merges two + * std::integer_sequence and calls EncodingType::encode() on every pair of + * values. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @see detail::merge_one_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_impl; + +/** + * This is the base case for merge_impl's recursion. The first list has been + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the first recursive case for merge_impl. In this case, the second + * list is empty. We only encode v1 one after the other. 
+ * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using empty = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using processed_v1 = std::integer_sequence; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + +/** + * This is the recursive case for merge_impl with a non empty Values2 list. We + * call merge_one_impl for every v1 and element of Values2 until Values1 is + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using val2_as_seq = std::integer_sequence; + using process_v1 = merge_one_impl; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + + +} // namespace detail + + +/** + * This is a helper interface for merging two lists of std::integer_sequence + * using EncodingType as an encoder. It only merges two lists at once. + * EncodingType must look like ConfigSet or the following: + * + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * ``` + * + * @see detail::merge_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. 
+ * @tparam Lists the lists to merge and encode + */ +template +using merge = typename detail::merge_impl::type; + } // namespace syn } // namespace gko diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index eb89aa211bd..c4da5bfa5fc 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -66,7 +66,7 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, template -void run_kernel_sized_impl(syn::value_list, +void run_kernel_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, dim<2> size, MappedKernelArgs... args) @@ -124,8 +124,8 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, select_run_kernel_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, size, - args...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + size, args...); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d4a489a258f..97b2d8dd0c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -87,7 +87,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, template -void run_kernel_reduction_sized_impl(syn::value_list, +void run_kernel_reduction_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -183,8 +183,8 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } @@ -287,7 +287,7 @@ template void run_kernel_col_reduction_sized_impl( - syn::value_list, + std::integer_sequence, std::shared_ptr exec, KernelFunction 
fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) @@ -395,8 +395,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, select_run_kernel_col_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index ce647f79935..c8823fc896a 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -375,7 +375,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -420,7 +420,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 9b093a97b68..8a318084568 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -430,7 +430,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -476,7 +476,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt index a47e9a84877..970ed78689c 100644 --- a/third_party/hwloc/CMakeLists.txt +++ b/third_party/hwloc/CMakeLists.txt @@ -5,8 +5,6 @@ ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/ "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi" ) -add_library(hwloc SHARED IMPORTED GLOBAL) -add_dependencies(hwloc hwloc_external ) file(MAKE_DIRECTORY ${TPL_HWLOC_PATH}/lib/) file(GLOB HWLOC_LIBS "${TPL_HWLOC_PATH}/build/hwloc/.libs/libhwloc.so*") configure_file("${TPL_HWLOC_PATH}/build/include/hwloc/autogen/config.h" "${TPL_HWLOC_PATH}/src/include/hwloc/autogen/config.h" COPYONLY) @@ -14,6 +12,9 @@ foreach(lib ${HWLOC_LIBS}) get_filename_component(lib_name ${lib} NAME) configure_file("${lib}" "${TPL_HWLOC_PATH}/lib/${lib_name}" COPYONLY) endforeach() + +add_library(hwloc SHARED IMPORTED GLOBAL) +add_dependencies(hwloc hwloc_external) set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/lib/libhwloc.so" CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/src/include" CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES})