From 5616a4be416fefb6c7d1f209de08462b06fec704 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 14 Dec 2021 13:44:56 +0100 Subject: [PATCH 01/14] Initial messy version. --- CMakeLists.txt | 1 + dpcpp/CMakeLists.txt | 6 +- dpcpp/base/kernel_launch_reduction.dp.hpp | 179 +++++++++++++--------- dpcpp/components/prefix_sum.dp.hpp | 44 +++--- dpcpp/components/reduction.dp.hpp | 10 +- dpcpp/matrix/dense_kernels.dp.cpp | 13 +- dpcpp/test/matrix/dense_kernels.cpp | 2 +- 7 files changed, 148 insertions(+), 107 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 761f10f420d..6a854db9f02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ include(cmake/build_type_helpers.cmake) include(cmake/build_helpers.cmake) include(cmake/install_helpers.cmake) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcmodel=large") if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") endif() diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index ac7cf2d981e..41245f4cc7b 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -60,16 +60,16 @@ target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) -target_compile_options(ginkgo_dpcpp PRIVATE $) target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) +target_compile_options(ginkgo_dpcpp PRIVATE $) target_include_directories(ginkgo_dpcpp PRIVATE $) target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) # When building ginkgo as a static library, we need to use dpcpp and per_kernel # link option when the program uses a dpcpp related function. if (BUILD_SHARED_LIBS) - target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_source) else () - target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_source) endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 0df00f58c6e..52d3ef669e5 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -55,10 +55,12 @@ namespace dpcpp { using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(); + syn::value_list(); template exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); - }); - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }) + .wait(); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }) + .wait(); } else { - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d(cgh, static_cast(size), - num_workgroups, fn, op, finalize, - identity, result, args...); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + finalize, identity, result, args...); + }) + .wait(); } } @@ -222,24 +230,30 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( - cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); - }); - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }) + .wait(); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }) + .wait(); } else { - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, - fn, op, finalize, identity, result, - args...); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, finalize, identity, + result, args...); + }) + .wait(); } } @@ -520,16 +534,21 @@ void run_generic_col_reduction_small(syn::value_list, } else { Array tmp_storage{exec, static_cast(row_blocks * cols)}; - queue->submit([&](sycl::handler& cgh) { - generic_kernel_col_reduction_2d_small( - cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, - identity, tmp_storage.get_data(), args...); - }); - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, + [](auto v) { return v; }, identity, tmp_storage.get_data(), + args...); + }) + .wait(); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }) + .wait(); } } @@ -549,29 +568,31 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + // syn::value_list(16, sg_size), + // std::min(32, sg_size), sg_size>; + syn::value_list; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto resources = exec->get_num_computing_units() * sg_size * oversubscription; auto queue = exec->get_queue(); - if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, - static_cast(col_blocks * rows)}; - generic_kernel_row_reduction_2d( - syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, - args...); - queue->submit([&](sycl::handler& cgh) { + // if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), 1, args...); + queue->wait(); + queue + ->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, rows, col_blocks, op, finalize, identity, partial.get_const_data(), static_cast(result_stride), result); - }); - } else { + }) + .wait(); + /* } else { select_generic_kernel_row_reduction_2d( subsubgroup_sizes(), [cols](int compiled_ssg_size) { @@ -581,7 +602,7 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, rows, cols, 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); - } + } */ } GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, @@ -600,14 +621,16 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + // syn::value_list(16, sg_size), + // std::min(32, sg_size), sg_size>; + syn::value_list; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto max_blocks = exec->get_num_computing_units() * sg_size * oversubscription / wg_size; if (cols <= sg_size) { + std::cout << "Running col reduction small\n"; select_generic_col_reduction_small( subsubgroup_sizes(), [cols](int compiled_ssg_size) { @@ -623,25 +646,31 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, col_blocks); auto queue = exec->get_queue(); if (row_blocks <= 1) { + std::cout << "Running 2d blocked\n"; queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, result, args...); }); } else { + std::cout << "Running 2d blocked/finalize\n"; Array tmp_storage{ exec, static_cast(row_blocks * cols)}; - queue->submit([&](sycl::handler& cgh) { - generic_kernel_col_reduction_2d_blocked( - cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, identity, tmp_storage.get_data(), - args...); - }); - queue->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); - }); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, identity, + tmp_storage.get_data(), args...); + }) + .wait(); + queue + ->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }) + .wait(); } } } diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index ea7dd341ecf..6e6e7062fb3 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -187,19 +187,21 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) { - queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - prefix_helper_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - start_prefix_sum( - num_elements, elements, block_sum, item_ct1, - *prefix_helper_acc_ct1.get_pointer()); - }); - }); + queue + ->submit([&](sycl::handler& cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + *prefix_helper_acc_ct1.get_pointer()); + }); + }) + .wait(); } @@ -239,13 +241,15 @@ void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) { - queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - finalize_prefix_sum( - num_elements, elements, block_sum, item_ct1); - }); - }); + queue + ->submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum(num_elements, elements, + block_sum, item_ct1); + }); + }) + .wait(); } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index b1eb86a1992..dd5a68dd956 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -64,10 +64,12 @@ namespace dpcpp { constexpr int default_block_size = 256; using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); + syn::value_list(); constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); /** diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 97c01e0fac5..02bb8b5f513 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -74,10 +74,15 @@ namespace dense { // lead CL_OUT_OF_RESOURCES. TODO: investigate this issue. using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto subgroup_list = syn::value_list(); + syn::value_list(); +// constexpr auto subgroup_list = syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr int default_block_size = 256; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index 56ef8b5532e..a48d404f089 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -199,7 +199,7 @@ TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) { - set_up_vector_data(20); + set_up_vector_data(2); x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); From 7c1df8991ab635a3b6758c758d26c96e504a512d Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Tue, 14 Dec 2021 16:36:07 +0100 Subject: [PATCH 02/14] Add better DPC++ config generation AOT support + Now the DPC++ configuration of workgroups/subgroups is generated from CMake using a `config.hpp.in` file. + AOT support is added thanks to a new `GINKGO_DPCPP_AOT_TARGETS` variable --- CMakeLists.txt | 7 +- cmake/build_helpers.cmake | 43 +++++ cmake/create_test.cmake | 6 +- dpcpp/CMakeLists.txt | 56 +++++- dpcpp/base/{config.hpp => config.hpp.in} | 20 +- dpcpp/base/kernel_launch_reduction.dp.hpp | 207 +++++++++------------ dpcpp/components/prefix_sum.dp.hpp | 44 ++--- dpcpp/components/prefix_sum_kernels.dp.cpp | 7 +- dpcpp/components/reduction.dp.hpp | 14 +- dpcpp/matrix/dense_kernels.dp.cpp | 14 -- dpcpp/test/matrix/dense_kernels.cpp | 2 +- 11 files changed, 230 insertions(+), 190 deletions(-) rename dpcpp/base/{config.hpp => config.hpp.in} (79%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a854db9f02..74cc7c3d1c2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,6 @@ include(cmake/build_type_helpers.cmake) include(cmake/build_helpers.cmake) include(cmake/install_helpers.cmake) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcmodel=large") if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") endif() @@ -96,6 +95,12 @@ else() option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) endif() option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) +set(GINKGO_DPCPP_AOT_TARGETS "" CACHE STRING + "Uses DPC++ AOT with set targets (e.g., spir64_gen;spir64_x86_64). The default is '' (use JIT).") +set(GINKGO_DPCPP_SUBGROUPS "8;16;32" CACHE STRING + "The subgroup sizes to generate DPC++ kernels for. The default is '8;16;32'.") +set(GINKGO_DPCPP_WORKGROUPS "256;512" CACHE STRING + "The max workgroup sizes to generate DPC++ kernels for. The default is '256;512'.") option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 4b89cf451c4..6f5ba8be21a 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -152,3 +152,46 @@ function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION) file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp) file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver) endfunction() + +# From https://stackoverflow.com/questions/61106418/sorting-a-list-of-numbers-numerically-in-cmake +# After CMake 3.18, this can be removed +function(insertionSortRecursive sort_list) + # Get list length + list(LENGTH sort_list len) + + # Base case, return. + if(${len} LESS_EQUAL 1) + return() + endif() + + math(EXPR len_minus_one "${len} - 1") + # Recursively sort the sublist of size (len - 1). + insertionSortRecursive("${sort_list}" ${len_minus_one}) + set(_sort_list ${sort_list}) + + # Get the last element in the sublist. + list(GET _sort_list ${len_minus_one} last_element) + # Define a counter for the sublist we will operate on. + math(EXPR sublist_counter "${len} - 2") + # Get the element at the counter index. + list(GET _sort_list ${sublist_counter} counter_element) + + # Loop to move those elements greater than last_element up one position. + while((${sublist_counter} GREATER_EQUAL 0) AND (${counter_element} GREATER ${last_element})) + # Move elem at counter index up one position in list. + math(EXPR counter_plus_one "${sublist_counter} + 1") + list(REMOVE_AT _sort_list ${counter_plus_one}) + list(INSERT _sort_list ${counter_plus_one} ${counter_element}) + # Decrement the sublist counter. + math(EXPR sublist_counter "${sublist_counter} - 1") + # Get the element at the new counter value. + list(GET _sort_list ${sublist_counter} counter_element) + endwhile() + + # Place the last element at the correct position. + math(EXPR counter_plus_one "${sublist_counter} + 1") + list(REMOVE_AT _sort_list ${counter_plus_one}) + list(INSERT _sort_list ${counter_plus_one} ${last_element}) + # Send the modified list back up to parent scope. + set(sort_list ${_sort_list} PARENT_SCOPE) +endfunction() diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 28685fdf232..1f8181912df 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -41,7 +41,11 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(${test_target_name} PRIVATE "${GINKGO_COMPILER_FLAGS}") - target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + target_compile_options(${test_target_name} PRIVATE -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + else() + target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + endif() ginkgo_set_test_target_properties(${test_name} ${test_target_name}) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 41245f4cc7b..86efb1b9e86 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -51,25 +51,67 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) +## Parse the subgroup and workgroup lists +# 1. Sort from smallest element to the biggest +insertionSortRecursive(GINKGO_DPCPP_SUBGROUPS) +insertionSortRecursive(GINKGO_DPCPP_WORKGROUPS) +# 2. Extract the two biggest elements +list(GET GINKGO_DPCPP_SUBGROUPS -1 GINKGO_DPCPP_BIGGEST_SUBGROUP) +list(GET GINKGO_DPCPP_WORKGROUPS -1 GINKGO_DPCPP_BIGGEST_WORKGROUP) +# 3. Combine the two lists, also format the workgroup list +set(GINKGO_DPCPP_1D_CFG "") +set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "") +foreach(workgroup ${GINKGO_DPCPP_WORKGROUPS}) + foreach(subgroup ${GINKGO_DPCPP_SUBGROUPS}) + if (GINKGO_DPCPP_1D_CFG STREQUAL "") + set(GINKGO_DPCPP_1D_CFG "KCFG_1D::encode(${workgroup}, ${subgroup})") + else() + set(GINKGO_DPCPP_1D_CFG "${GINKGO_DPCPP_1D_CFG}, KCFG_1D::encode(${workgroup}, ${subgroup})") + endif() + endforeach() + if (GINKGO_DPCPP_WORKGROUPS_FORMATTED STREQUAL "") + set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "BlockCfg::encode(${workgroup})") + else() + set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "BlockCfg::encode(${workgroup}), ${GINKGO_DPCPP_WORKGROUPS_FORMATTED}, ") + endif() +endforeach() +# 4. Reverse the subgroup list, replace ; to , +list(REVERSE GINKGO_DPCPP_SUBGROUPS) +string(REPLACE ";" "," GINKGO_DPCPP_SUBGROUPS_FORMATTED "${GINKGO_DPCPP_SUBGROUPS}") + +## Now we can generate the DPC++ Configure file +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp.in + ${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp @ONLY) + + ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") +target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) + # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) -target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_compile_options(ginkgo_dpcpp PRIVATE $) target_include_directories(ginkgo_dpcpp PRIVATE $) +target_include_directories(ginkgo_dpcpp PRIVATE $) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -# When building ginkgo as a static library, we need to use dpcpp and per_kernel -# link option when the program uses a dpcpp related function. -if (BUILD_SHARED_LIBS) - target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_source) -else () - target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_source) + +if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + string(REPLACE ";" "," GINKGO_DPCPP_AOT_TARGETS_FORMATTED "${GINKGO_DPCPP_AOT_TARGETS}") + target_compile_options(ginkgo_dpcpp PRIVATE -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) +else() + # When building ginkgo as a static library, we need to use dpcpp and per_source + # link option when the program uses a dpcpp related function. + if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_source) + else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_source) + endif() endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) diff --git a/dpcpp/base/config.hpp b/dpcpp/base/config.hpp.in similarity index 79% rename from dpcpp/base/config.hpp rename to dpcpp/base/config.hpp.in index 43465b61c8a..3ac90f3dd62 100644 --- a/dpcpp/base/config.hpp +++ b/dpcpp/base/config.hpp.in @@ -36,6 +36,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include + + +#include "core/base/types.hpp" namespace gko { @@ -52,7 +56,7 @@ struct config { /** * The number of threads within a Dpcpp subgroup. */ - static constexpr uint32 warp_size = 16; + static constexpr uint32 warp_size = @GINKGO_DPCPP_BIGGEST_SUBGROUP@; /** * The bitmask of the entire warp. @@ -68,10 +72,22 @@ struct config { /** * The default maximal number of threads allowed in DPCPP group */ - static constexpr uint32 max_block_size = 256; + static constexpr uint32 max_block_size = @GINKGO_DPCPP_BIGGEST_WORKGROUP@; }; +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::value_list(); +constexpr auto subgroup_list = syn::value_list(); +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); + + +using BlockCfg = ConfigSet<11>; +constexpr auto block_cfg_list = + ::gko::syn::value_list(); + + } // namespace dpcpp } // namespace kernels } // namespace gko diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 52d3ef669e5..2ebc724ab05 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -53,16 +53,6 @@ namespace kernels { namespace dpcpp { -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(); - - template @@ -181,30 +171,24 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(size), num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); - }) - .wait(); - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(size), num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); } else { - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(size), num_workgroups, fn, op, - finalize, identity, result, args...); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d(cgh, static_cast(size), + num_workgroups, fn, op, finalize, + identity, result, args...); + }); } } @@ -230,30 +214,24 @@ void run_kernel_reduction_impl(std::shared_ptr exec, auto queue = exec->get_queue(); if (num_workgroups > 1) { Array partial{exec, static_cast(num_workgroups)}; - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( - cgh, rows, cols, num_workgroups, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), - args...); - }) - .wait(); - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_1d( - cgh, static_cast(num_workgroups), 1, - [](auto i, auto v) { return v[i]; }, op, finalize, identity, - result, partial.get_const_data()); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d( + cgh, rows, cols, num_workgroups, fn, op, + [](auto v) { return v; }, identity, partial.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_1d( + cgh, static_cast(num_workgroups), 1, + [](auto i, auto v) { return v[i]; }, op, finalize, identity, + result, partial.get_const_data()); + }); } else { - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_2d( - cgh, rows, cols, num_workgroups, fn, op, finalize, identity, - result, args...); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_2d(cgh, rows, cols, num_workgroups, + fn, op, finalize, identity, result, + args...); + }); } } @@ -268,14 +246,13 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + const auto desired_cfg = + get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); @@ -290,14 +267,13 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + const auto desired_cfg = + get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); @@ -534,21 +510,16 @@ void run_generic_col_reduction_small(syn::value_list, } else { Array tmp_storage{exec, static_cast(row_blocks * cols)}; - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_col_reduction_2d_small( - cgh, rows, cols, row_blocks, fn, op, - [](auto v) { return v; }, identity, tmp_storage.get_data(), - args...); - }) - .wait(); - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_small( + cgh, rows, cols, row_blocks, fn, op, [](auto v) { return v; }, + identity, tmp_storage.get_data(), args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); } } @@ -568,31 +539,29 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - // syn::value_list(16, sg_size), - // std::min(32, sg_size), sg_size>; - syn::value_list; + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto resources = exec->get_num_computing_units() * sg_size * oversubscription; auto queue = exec->get_queue(); - // if (rows * cols > resources && rows < cols) { - const auto col_blocks = ceildiv(rows * cols, resources); - Array partial{exec, static_cast(col_blocks * rows)}; - generic_kernel_row_reduction_2d( - syn::value_list{}, exec, rows, cols, col_blocks, fn, op, - [](auto v) { return v; }, identity, partial.get_data(), 1, args...); - queue->wait(); - queue - ->submit([&](sycl::handler& cgh) { + if (rows * cols > resources && rows < cols) { + const auto col_blocks = ceildiv(rows * cols, resources); + Array partial{exec, + static_cast(col_blocks * rows)}; + generic_kernel_row_reduction_2d( + syn::value_list{}, exec, rows, cols, col_blocks, fn, + op, [](auto v) { return v; }, identity, partial.get_data(), 1, + args...); + queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( cgh, rows, col_blocks, op, finalize, identity, partial.get_const_data(), static_cast(result_stride), result); - }) - .wait(); - /* } else { + }); + } else { select_generic_kernel_row_reduction_2d( subsubgroup_sizes(), [cols](int compiled_ssg_size) { @@ -602,7 +571,7 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, syn::value_list(), syn::type_list<>(), exec, rows, cols, 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); - } */ + } } GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, @@ -621,16 +590,14 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - // syn::value_list(16, sg_size), - // std::min(32, sg_size), sg_size>; - syn::value_list; + syn::value_list(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); const auto max_blocks = exec->get_num_computing_units() * sg_size * oversubscription / wg_size; if (cols <= sg_size) { - std::cout << "Running col reduction small\n"; select_generic_col_reduction_small( subsubgroup_sizes(), [cols](int compiled_ssg_size) { @@ -646,31 +613,25 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, col_blocks); auto queue = exec->get_queue(); if (row_blocks <= 1) { - std::cout << "Running 2d blocked\n"; queue->submit([&](sycl::handler& cgh) { generic_kernel_col_reduction_2d_blocked( cgh, rows, cols, 1, col_blocks, fn, op, finalize, identity, result, args...); }); } else { - std::cout << "Running 2d blocked/finalize\n"; Array tmp_storage{ exec, static_cast(row_blocks * cols)}; - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_col_reduction_2d_blocked( - cgh, rows, cols, row_blocks, col_blocks, fn, op, - [](auto v) { return v; }, identity, - tmp_storage.get_data(), args...); - }) - .wait(); - queue - ->submit([&](sycl::handler& cgh) { - generic_kernel_reduction_finalize_2d( - cgh, cols, row_blocks, op, finalize, identity, - tmp_storage.get_const_data(), 1, result); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_col_reduction_2d_blocked( + cgh, rows, cols, row_blocks, col_blocks, fn, op, + [](auto v) { return v; }, identity, tmp_storage.get_data(), + args...); + }); + queue->submit([&](sycl::handler& cgh) { + generic_kernel_reduction_finalize_2d( + cgh, cols, row_blocks, op, finalize, identity, + tmp_storage.get_const_data(), 1, result); + }); } } } @@ -690,14 +651,13 @@ void run_kernel_row_reduction(std::shared_ptr exec, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + const auto desired_cfg = + get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); select_kernel_row_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, finalize, identity, result, result_stride, size, @@ -713,14 +673,13 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { + const auto desired_cfg = + get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); select_kernel_col_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, syn::value_list(), syn::value_list(), syn::value_list(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index 6e6e7062fb3..ea7dd341ecf 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -187,21 +187,19 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) { - queue - ->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, - sycl::access::target::local> - prefix_helper_acc_ct1(cgh); - - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - start_prefix_sum( - num_elements, elements, block_sum, item_ct1, - *prefix_helper_acc_ct1.get_pointer()); - }); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + sycl::accessor, 0, + sycl::access::mode::read_write, + sycl::access::target::local> + prefix_helper_acc_ct1(cgh); + + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + start_prefix_sum( + num_elements, elements, block_sum, item_ct1, + *prefix_helper_acc_ct1.get_pointer()); + }); + }); } @@ -241,15 +239,13 @@ void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) { - queue - ->submit([&](sycl::handler& cgh) { - cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - finalize_prefix_sum(num_elements, elements, - block_sum, item_ct1); - }); - }) - .wait(); + queue->submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + finalize_prefix_sum( + num_elements, elements, block_sum, item_ct1); + }); + }); } diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index 695fb2f2898..166b9482b49 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" +#include "dpcpp/base/config.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" @@ -50,12 +51,6 @@ namespace dpcpp { namespace components { -using BlockCfg = ConfigSet<11>; - -constexpr auto block_cfg_list = - ::gko::syn::value_list(); - GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, block_cfg_list) diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index dd5a68dd956..e6d5aca5f5b 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -62,15 +62,7 @@ namespace dpcpp { constexpr int default_block_size = 256; -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + /** * @internal @@ -219,7 +211,9 @@ void reduce_add_array( } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type size, const ValueType* source, ValueType* result) diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 02bb8b5f513..8c46f15fd40 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -70,20 +70,6 @@ namespace dpcpp { namespace dense { -// Disable the 64 subgroup. CPU supports 64 now, but conj_transpose will -// lead CL_OUT_OF_RESOURCES. TODO: investigate this issue. -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -// constexpr auto subgroup_list = syn::value_list(); -constexpr auto subgroup_list = syn::value_list(); -constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr int default_block_size = 256; diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp index a48d404f089..56ef8b5532e 100644 --- a/dpcpp/test/matrix/dense_kernels.cpp +++ b/dpcpp/test/matrix/dense_kernels.cpp @@ -199,7 +199,7 @@ TEST_F(Dense, SingleVectorDpcppComputeDotIsEquivalentToRef) TEST_F(Dense, MultipleVectorDpcppComputeDotIsEquivalentToRef) { - set_up_vector_data(2); + set_up_vector_data(20); x->compute_dot(y.get(), expected.get()); dx->compute_dot(dy.get(), dresult.get()); From 6d56693424275235002f27b8005bd22cb81979f4 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Wed, 16 Feb 2022 16:46:09 +0100 Subject: [PATCH 03/14] Phase out value_list for std::integer_sequence --- core/base/block_sizes.hpp | 5 +- core/synthesizer/implementation_selection.hpp | 30 ++++++------ cuda/base/kernel_launch_reduction.cuh | 48 +++++++++---------- cuda/factorization/par_ic_kernels.cu | 2 +- cuda/factorization/par_ict_kernels.cu | 11 +++-- .../par_ilut_approx_filter_kernel.cu | 6 +-- cuda/factorization/par_ilut_filter_kernel.cu | 8 ++-- cuda/factorization/par_ilut_spgeam_kernel.cu | 8 ++-- cuda/factorization/par_ilut_sweep_kernel.cu | 8 ++-- cuda/matrix/csr_kernels.cu | 27 ++++++----- cuda/matrix/ell_kernels.cu | 12 ++--- cuda/matrix/fbcsr_kernels.cu | 4 +- .../jacobi_advanced_apply_instantiate.inc.cu | 4 +- .../jacobi_advanced_apply_kernel.cu | 4 +- cuda/preconditioner/jacobi_common.hpp.in | 2 +- .../jacobi_generate_instantiate.inc.cu | 4 +- cuda/preconditioner/jacobi_generate_kernel.cu | 8 ++-- cuda/preconditioner/jacobi_kernels.cu | 6 +-- .../jacobi_simple_apply_instantiate.inc.cu | 4 +- .../jacobi_simple_apply_kernel.cu | 4 +- dpcpp/base/config.hpp.in | 6 +-- dpcpp/base/helper.hpp | 4 +- dpcpp/base/kernel_launch_reduction.dp.hpp | 40 ++++++++-------- dpcpp/matrix/csr_kernels.dp.cpp | 21 ++++---- dpcpp/matrix/ell_kernels.dp.cpp | 12 ++--- .../test/components/cooperative_groups.dp.cpp | 10 ++-- hip/base/kernel_launch_reduction.hip.hpp | 48 +++++++++---------- hip/factorization/par_ic_kernels.hip.cpp | 2 +- hip/factorization/par_ict_kernels.hip.cpp | 11 +++-- .../par_ilut_approx_filter_kernel.hip.cpp | 6 +-- .../par_ilut_filter_kernel.hip.cpp | 8 ++-- .../par_ilut_spgeam_kernel.hip.cpp | 8 ++-- .../par_ilut_sweep_kernel.hip.cpp | 8 ++-- hip/matrix/csr_kernels.hip.cpp | 27 ++++++----- hip/matrix/ell_kernels.hip.cpp | 12 ++--- ...obi_advanced_apply_instantiate.inc.hip.cpp | 4 +- .../jacobi_advanced_apply_kernel.hip.cpp | 4 +- hip/preconditioner/jacobi_common.hip.hpp.in | 2 +- .../jacobi_generate_instantiate.inc.hip.cpp | 4 +- .../jacobi_generate_kernel.hip.cpp | 8 ++-- hip/preconditioner/jacobi_kernels.hip.cpp | 6 +-- ...acobi_simple_apply_instantiate.inc.hip.cpp | 4 +- .../jacobi_simple_apply_kernel.hip.cpp | 4 +- .../ginkgo/core/synthesizer/containers.hpp | 46 ++++++++++-------- omp/base/kernel_launch.hpp | 6 +-- omp/base/kernel_launch_reduction.hpp | 12 ++--- omp/matrix/fbcsr_kernels.cpp | 4 +- reference/matrix/fbcsr_kernels.cpp | 4 +- 48 files changed, 273 insertions(+), 263 deletions(-) diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp index 7701d66111f..e41b3cc00e3 100644 --- a/core/base/block_sizes.hpp +++ b/core/base/block_sizes.hpp @@ -51,9 +51,10 @@ namespace fixedblock { * A compile-time list of block sizes for which dedicated fixed-block matrix * and corresponding preconditioner kernels should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; #else -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #endif diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index a78ac391d2e..47bbcf3be72 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -48,24 +48,24 @@ namespace syn { #define GKO_ENABLE_IMPLEMENTATION_SELECTION(_name, _callable) \ template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ ::gko::syn::type_list, InferredArgs...) \ GKO_KERNEL_NOT_FOUND; \ \ template \ inline void _name( \ - ::gko::syn::value_list, Predicate is_eligible, \ - ::gko::syn::value_list int_args, \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence int_args, \ ::gko::syn::type_list type_args, InferredArgs... args) \ { \ if (is_eligible(K)) { \ _callable( \ - ::gko::syn::value_list(), \ + std::integer_sequence(), \ std::forward(args)...); \ } else { \ - _name(::gko::syn::value_list(), is_eligible, \ + _name(std::integer_sequence(), is_eligible, \ int_args, type_args, std::forward(args)...); \ } \ } \ @@ -77,10 +77,10 @@ namespace syn { template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ + std::integer_sequence, \ + std::integer_sequence, \ ::gko::syn::type_list, InferredArgs...) \ GKO_KERNEL_NOT_FOUND; \ \ @@ -88,18 +88,18 @@ namespace syn { bool... BoolArgs, int... IntArgs, gko::size_type... SizeTArgs, \ typename... TArgs, typename... InferredArgs> \ inline void _name( \ - ::gko::syn::value_list, \ + std::integer_sequence, \ Predicate is_eligible, \ - ::gko::syn::value_list bool_args, \ - ::gko::syn::value_list int_args, \ - ::gko::syn::value_list size_args, \ + std::integer_sequence bool_args, \ + std::integer_sequence int_args, \ + std::integer_sequence size_args, \ ::gko::syn::type_list type_args, InferredArgs... args) \ { \ if (is_eligible(K)) { \ _callable( \ std::forward(args)...); \ } else { \ - _name(::gko::syn::value_list(), \ + _name(std::integer_sequence(), \ is_eligible, bool_args, int_args, size_args, type_args, \ std::forward(args)...); \ } \ diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index db7040b07f5..8b6e98fab86 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -201,10 +201,10 @@ void run_kernel_reduction(std::shared_ptr exec, template __global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; @@ -233,10 +233,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; constexpr auto warps_per_block = default_block_size / warp_size; @@ -296,10 +296,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; __shared__ UninitializedArray block_partial; @@ -335,10 +335,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) +__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize, + ValueType identity, const ValueType* input, int64 result_stride, + ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { @@ -357,7 +357,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -382,7 +382,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -433,7 +433,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -464,8 +464,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -480,7 +480,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -493,8 +493,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index 4318cd1e1d9..43e2c66526f 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 2f65affb628..f1d0689bc0f 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -81,7 +81,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -137,7 +137,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void compute_factor(syn::value_list, +void compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -180,7 +180,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -202,7 +203,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..a8f60fd2bc2 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -73,7 +73,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -81,7 +81,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -196,7 +196,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..85d8c03211e 100644 --- a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -69,7 +69,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -79,7 +79,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -153,8 +153,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu index f1a6b7518ed..f77d3d77119 100644 --- a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -172,8 +172,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu index fb1248158f9..c4a13c3915f 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -133,8 +133,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 2550aa2ceb3..c4f3f76ad98 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -88,13 +88,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -105,7 +105,7 @@ namespace { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -213,7 +213,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -429,7 +429,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -459,7 +459,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -487,8 +488,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -519,7 +520,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ -658,7 +659,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -836,7 +837,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(), c_tmp_col_idxs_array.get_const_data(), c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs, @@ -865,7 +866,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..957faca5d6e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -102,7 +102,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -113,7 +113,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -249,8 +249,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -283,8 +283,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..eede67c3211 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -261,7 +261,7 @@ namespace { template -void transpose_blocks_impl(syn::value_list, +void transpose_blocks_impl(std::integer_sequence, matrix::Fbcsr* const mat) { constexpr int subwarp_size = config::warp_size; @@ -310,7 +310,7 @@ void transpose(const std::shared_ptr exec, select_transpose_blocks( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), trans); + std::integer_sequence(), syn::type_list<>(), trans); } else { GKO_NOT_IMPLEMENTED; } diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu index 3917c54bf43..34d4ebb529d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -75,7 +75,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu index 2632e9c7180..95048463d1d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index d6be53c1835..4cdf9d6cefd 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -55,7 +55,7 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu index 240b4f4b5fd..b232a50536d 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -119,7 +119,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu index 01ce054322e..50b3f9d0968 100644 --- a/cuda/preconditioner/jacobi_generate_kernel.cu +++ b/cuda/preconditioner/jacobi_generate_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -85,9 +85,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), + syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index c88eab57738..105cddf7bfc 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -153,7 +153,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -202,7 +202,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -226,7 +226,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu index 2d21f9d43ac..3e4cb263e18 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -110,7 +110,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu index 2b96cfc3321..e6ef05611f0 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu @@ -54,7 +54,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -82,7 +82,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in index 3ac90f3dd62..429cf1278df 100644 --- a/dpcpp/base/config.hpp.in +++ b/dpcpp/base/config.hpp.in @@ -78,14 +78,14 @@ struct config { using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto subgroup_list = syn::value_list(); + std::integer_sequence(); +constexpr auto subgroup_list = std::integer_sequence(); constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); using BlockCfg = ConfigSet<11>; constexpr auto block_cfg_list = - ::gko::syn::value_list(); + std::integer_sequence(); } // namespace dpcpp diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index d8081543313..af76aa41fa4 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -124,8 +124,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. callable_( \ list_, \ [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, \ - ::gko::syn::value_list(), ::gko::syn::value_list(), \ - ::gko::syn::value_list(), \ + std::integer_sequence(), std::integer_sequence(), \ + std::integer_sequence(), \ ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ queue, std::forward(args)...); \ } diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 2ebc724ab05..77d00da2a08 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -253,8 +253,8 @@ void run_kernel_reduction(std::shared_ptr exec, }); select_run_kernel_reduction( kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -274,8 +274,8 @@ void run_kernel_reduction(std::shared_ptr exec, }); select_run_kernel_reduction( kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -286,7 +286,7 @@ namespace { template -void generic_kernel_row_reduction_2d(syn::value_list, +void generic_kernel_row_reduction_2d(std::integer_sequence, std::shared_ptr exec, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, @@ -486,7 +486,7 @@ void generic_kernel_reduction_finalize_2d( template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, std::shared_ptr exec, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, @@ -539,8 +539,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -552,8 +552,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, Array partial{exec, static_cast(col_blocks * rows)}; generic_kernel_row_reduction_2d( - syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, + std::integer_sequence{}, exec, rows, cols, col_blocks, + fn, op, [](auto v) { return v; }, identity, partial.get_data(), 1, args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( @@ -568,8 +568,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), syn::type_list<>(), exec, rows, cols, - 1, fn, op, finalize, identity, result, + std::integer_sequence(), syn::type_list<>(), exec, rows, + cols, 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); } } @@ -590,8 +590,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -604,8 +604,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), syn::type_list<>(), exec, max_blocks, - fn, op, finalize, identity, result, size, args...); + std::integer_sequence(), syn::type_list<>(), exec, + max_blocks, fn, op, finalize, identity, result, size, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -658,8 +658,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, }); select_kernel_row_reduction_stage1( kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, result_stride, size, map_to_device(args)...); } @@ -680,8 +680,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, }); select_kernel_col_reduction_stage1( kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index fa23cda9d1d..eaf401b0472 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -89,9 +89,10 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; -using classical_kernels = syn::value_list; +using classical_kernels = + std::integer_sequence; namespace kernel { @@ -1006,7 +1007,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1076,7 +1077,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1148,7 +1149,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "classical") { IndexType max_length_per_row = 0; using Tcsr = matrix::Csr; @@ -1167,7 +1168,7 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { if (!is_complex()) { @@ -1284,8 +1285,8 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -1294,8 +1295,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { GKO_NOT_IMPLEMENTED; } diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..679ce5f2b31 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -104,7 +104,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; namespace kernel { @@ -306,7 +306,7 @@ namespace { template -void abstract_spmv(syn::value_list, +void abstract_spmv(std::integer_sequence, std::shared_ptr exec, int num_worker_per_row, const matrix::Ell* a, @@ -436,8 +436,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c); + std::integer_sequence(), syn::type_list<>(), exec, + num_worker_per_row, a, b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -470,8 +470,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c, alpha, beta); + std::integer_sequence(), syn::type_list<>(), exec, + num_worker_per_row, a, b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 533f8f363e4..ce6becd8896 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -62,9 +62,9 @@ namespace { using namespace gko::kernels::dpcpp; using KCFG_1D = gko::ConfigSet<11, 7>; constexpr auto default_config_list = - ::gko::syn::value_list(); + std::integer_sequence(); class CooperativeGroups : public testing::TestWithParam { @@ -160,8 +160,8 @@ void cg_shuffle_config_call(std::uint32_t desired_cfg, dim3 grid, dim3 block, default_config_list, // validate [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, queue, s); } diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index e93f811bfb1..8eac7625c34 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -206,10 +206,10 @@ void run_kernel_reduction(std::shared_ptr exec, template __global__ - __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, - ReductionOp op, FinalizeOp finalize, ValueType identity, - ValueType* result, int64 result_stride, KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; @@ -238,10 +238,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; constexpr auto warps_per_block = default_block_size / warp_size; @@ -301,10 +301,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) +__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; __shared__ UninitializedArray block_partial; @@ -340,10 +340,10 @@ __global__ template __global__ - __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, - FinalizeOp finalize, ValueType identity, const ValueType* input, - int64 result_stride, ValueType* result) +__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize, + ValueType identity, const ValueType* input, int64 result_stride, + ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { @@ -362,7 +362,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -387,7 +387,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -441,7 +441,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -472,8 +472,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -488,7 +488,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -501,8 +501,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index 47cc332b70e..4b36d854593 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 05d08e598a7..18929cad361 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -84,7 +84,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -140,7 +140,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void compute_factor(syn::value_list, +void compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -184,7 +184,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -206,7 +207,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..8de4288787c 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -76,7 +76,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -84,7 +84,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -201,7 +201,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..a4843ee89ff 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -72,7 +72,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -82,7 +82,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -156,8 +156,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp index f8234cc6a04..02325d508bd 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -175,8 +175,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index e063fd81d0d..562e83efd7e 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -138,8 +138,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index eafbd367c0e..fa4890f6ad2 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -90,13 +90,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -106,7 +106,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -226,7 +226,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -307,7 +307,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -356,7 +356,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -404,8 +405,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -453,7 +454,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ -542,7 +543,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -667,7 +668,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs, c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs, d_vals, c); @@ -697,7 +698,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..cd55dfc266f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -105,7 +105,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -116,7 +116,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -253,8 +253,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -287,8 +287,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index 87042e35332..443ca644e00 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -78,7 +78,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -117,7 +117,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp index 6412fb8ca6b..82483e0fcac 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 19f7d15bcef..ddfa1ee2962 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -56,7 +56,7 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 44ca906769f..313f9c4b9f1 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -120,7 +120,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index a5c8ee94fa6..54a63b47d1e 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -71,7 +71,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -101,9 +101,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), + syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index d8c365fa12d..76db71d9c08 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -163,7 +163,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -217,7 +217,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -241,7 +241,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index c4a63007798..fcf2ca2140c 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index 9690cd73121..bca05ce445c 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -68,7 +68,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -96,7 +96,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index fcb152da761..cb857aa36e3 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -46,7 +47,6 @@ namespace gko { */ namespace syn { - /** * value_list records several values with the same type in template. * @@ -54,7 +54,7 @@ namespace syn { * @tparam Values the values in the list */ template -struct value_list {}; +using value_list = std::integer_sequence; /** @@ -90,15 +90,17 @@ template struct concatenate_impl; /** - * concatenate_impl specializes for two value_list with the same value type. + * concatenate_impl specializes for two std::integer_sequence with the same + * value type. * - * @tparam T the value type of two value_list + * @tparam T the value type of two std::integer_sequence * @tparam Values the values of the first list * @tparam Values the values of the second list */ template -struct concatenate_impl, value_list> { - using type = value_list; +struct concatenate_impl, + std::integer_sequence> { + using type = std::integer_sequence; }; @@ -106,7 +108,7 @@ struct concatenate_impl, value_list> { /** - * concatenate combines two value_list into one value_list. + * concatenate combines two std::integer_sequence into one. * * @tparam List1 the first list * @tparam List2 the second list @@ -127,14 +129,14 @@ template struct as_list_impl; /** - * as_list_impl specializes for the value_list + * as_list_impl specializes for the std::integer_sequence * - * @tparam T the value_list type - * @tparam Values the values of value_list + * @tparam T the std::integer_sequence type + * @tparam Values the values of std::integer_sequence */ template -struct as_list_impl> { - using type = value_list; +struct as_list_impl> { + using type = std::integer_sequence; }; /** @@ -158,7 +160,7 @@ struct as_list_impl> { template struct as_list_impl, std::enable_if_t<(Start < End)>> { using type = concatenate< - value_list, + std::integer_sequence, typename as_list_impl>::type>; }; @@ -171,7 +173,7 @@ struct as_list_impl, std::enable_if_t<(Start < End)>> { */ template struct as_list_impl, std::enable_if_t<(Start >= End)>> { - using type = value_list; + using type = std::integer_sequence; }; @@ -189,19 +191,21 @@ using as_list = typename detail::as_list_impl::type; /** - * as_array returns the array from value_list. It will be helpful if using - * for in runtime on the array. + * as_array returns the array from std::integer_sequence. It will be helpful + * if using for in runtime on the array. * - * @tparam T the type of value_list - * @tparam Value the values of value_list + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence * - * @param value_list the input value_list + * @param vl the input std::integer_sequence * - * @return std::array the std::array contains the values of value_list + * @return std::array the std::array contains the values of vl */ template -constexpr std::array as_array(value_list vl) +constexpr std::array as_array( + std::integer_sequence vl) { + std::ignore = vl; return std::array{Value...}; } diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index eb89aa211bd..c4da5bfa5fc 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -66,7 +66,7 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, template -void run_kernel_sized_impl(syn::value_list, +void run_kernel_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, dim<2> size, MappedKernelArgs... args) @@ -124,8 +124,8 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, select_run_kernel_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, size, - args...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + size, args...); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d4a489a258f..97b2d8dd0c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -87,7 +87,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, template -void run_kernel_reduction_sized_impl(syn::value_list, +void run_kernel_reduction_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -183,8 +183,8 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } @@ -287,7 +287,7 @@ template void run_kernel_col_reduction_sized_impl( - syn::value_list, + std::integer_sequence, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) @@ -395,8 +395,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, select_run_kernel_col_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index ce647f79935..c8823fc896a 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -375,7 +375,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -420,7 +420,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 9b093a97b68..8a318084568 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -430,7 +430,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -476,7 +476,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( From 22cfa4a70054a750758859f39d8e36c7047b0256 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 18 Feb 2022 01:15:56 +0100 Subject: [PATCH 04/14] Add new manipulations on std::integer_sequences + Sorting, with/without keeping duplicates, in ascending or descending order + Accessing functionality + min, max, median + Converting to a plain value --- core/base/block_sizes.hpp | 4 +- core/test/CMakeLists.txt | 1 + core/test/synthesizer/CMakeLists.txt | 1 + core/test/synthesizer/containers.cpp | 211 +++++++++ cuda/preconditioner/jacobi_common.hpp.in | 5 +- dpcpp/base/kernel_launch_reduction.dp.hpp | 8 +- dpcpp/components/prefix_sum_kernels.dp.cpp | 2 +- dpcpp/components/reduction.dp.hpp | 2 +- hip/preconditioner/jacobi_common.hip.hpp.in | 5 +- include/ginkgo/core/base/executor.hpp | 1 - .../ginkgo/core/synthesizer/containers.hpp | 423 +++++++++++++++++- 11 files changed, 636 insertions(+), 27 deletions(-) create mode 100644 core/test/synthesizer/CMakeLists.txt create mode 100644 core/test/synthesizer/containers.cpp diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp index e41b3cc00e3..5a22870374d 100644 --- a/core/base/block_sizes.hpp +++ b/core/base/block_sizes.hpp @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_BLOCK_SIZES_HPP_ +#include + + #include -#include namespace gko { diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index 2bb711e7a2e..07641c4ad6c 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(multigrid) add_subdirectory(preconditioner) add_subdirectory(reorder) add_subdirectory(solver) +add_subdirectory(synthesizer) add_subdirectory(stop) add_subdirectory(utils) diff --git a/core/test/synthesizer/CMakeLists.txt b/core/test/synthesizer/CMakeLists.txt new file mode 100644 index 00000000000..c2d6d722e4a --- /dev/null +++ b/core/test/synthesizer/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(containers) diff --git a/core/test/synthesizer/containers.cpp b/core/test/synthesizer/containers.cpp new file mode 100644 index 00000000000..7bedd7f1a5e --- /dev/null +++ b/core/test/synthesizer/containers.cpp @@ -0,0 +1,211 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + +namespace { + + +struct IntegerSequenceExtensions : public ::testing::Test { + using range1 = gko::syn::as_list>; + using range1_exp = std::integer_sequence; + using range2 = gko::syn::as_list>; + using range2_exp = std::integer_sequence; + + using test = gko::syn::value_list; + using sorted_asc_dups = + std::integer_sequence; + using sorted_asc_nodups = + std::integer_sequence; + using sorted_desc_dups = + std::integer_sequence; + using sorted_desc_nodups = + std::integer_sequence; + + using empty = std::integer_sequence; + +protected: + IntegerSequenceExtensions() {} +}; + +using ::testing::StaticAssertTypeEq; + + +TEST_F(IntegerSequenceExtensions, CanCreateRanges) +{ + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanConcatenate) +{ + using test_case = + gko::syn::concatenate>; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanGetArrayAndValue) +{ + auto expected_array = std::array{0, 7614, 453, 16, 9, 16, 0, 0, -4}; + using front = gko::syn::front; + + ASSERT_EQ(gko::syn::as_array(test{}), expected_array); + ASSERT_EQ(gko::syn::as_array(front{})[0], 0); + ASSERT_EQ(gko::syn::as_value(front{}), 0); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMax) +{ + using test_case = gko::syn::max; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMin) +{ + using test_case = gko::syn::min; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMedian) +{ + using test_case = gko::syn::median; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessFront) +{ + using test_case = gko::syn::front; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessBack) +{ + using test_case = gko::syn::back; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanUseAtIndex) +{ + using idx1 = gko::syn::at_index<1, test>; + using exp_idx1 = std::integer_sequence; + using idx2 = gko::syn::at_index<2, test>; + using exp_idx2 = std::integer_sequence; + using idx4 = gko::syn::at_index<4, test>; + using exp_idx4 = std::integer_sequence; + using idx6 = gko::syn::at_index<6, test>; + using exp_idx6 = std::integer_sequence; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, EmptyTests) +{ + using test1 = gko::syn::at_index<72, empty>; + using test2 = gko::syn::back; + using test3 = gko::syn::front; + using test4 = gko::syn::median; + using test5 = gko::syn::min; + using test6 = gko::syn::sort; + using test7 = gko::syn::sort; + using test8 = gko::syn::sort_keep; + using test9 = gko::syn::sort_keep; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +} // namespace diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index 4cdf9d6cefd..6a637a572cf 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -30,12 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #include "cuda/base/config.hpp" + namespace gko { namespace kernels { namespace cuda { diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 77d00da2a08..33dc21b2056 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -247,7 +247,7 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { const auto desired_cfg = - get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { + get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -268,7 +268,7 @@ void run_kernel_reduction(std::shared_ptr exec, KernelArgs&&... args) { const auto desired_cfg = - get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { + get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -652,7 +652,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { const auto desired_cfg = - get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { + get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); @@ -674,7 +674,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... args) { const auto desired_cfg = - get_first_cfg(as_array(kcfg_1d_list), [&](std::uint32_t cfg) { + get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), KCFG_1D::decode<1>(cfg)); }); diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index 166b9482b49..0e83b0b379a 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -68,7 +68,7 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); - constexpr auto block_cfg_array = as_array(block_cfg_list); + constexpr auto block_cfg_array = syn::as_array(block_cfg_list); const std::uint32_t cfg = get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { return validate(queue, BlockCfg::decode<0>(cfg), 16); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e6d5aca5f5b..3d4f30347a2 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -259,7 +259,7 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results = Array(exec); ValueType answer = zero(); auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); const std::uint32_t cfg = get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { return validate(queue, KCFG_1D::decode<0>(cfg), diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index ddfa1ee2962..76947dfd5d7 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -30,8 +30,11 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ + +#include + + #include -#include #include "hip/base/config.hip.hpp" diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 73976c24089..0ab3f54a79a 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include -#include namespace gko { diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index cb857aa36e3..f4c5eafbb9f 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -35,8 +35,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include #include +#include namespace gko { @@ -83,24 +85,36 @@ namespace detail { /** * concatenate_impl base type * - * @tparam List1 the first List - * @tparam List2 the second List + * @tparam Lists a list of std::integer_sequence */ -template +template struct concatenate_impl; /** - * concatenate_impl specializes for two std::integer_sequence with the same - * value type. + * concatenate_impl specialization for a single std::integer_sequence + * + * @tparam T the value type of the std::integer_sequence + * @tparam Values the values of the list + */ +template +struct concatenate_impl> { + using type = std::integer_sequence; +}; + +/** + * concatenate_impl specialization for multiple std::integer_sequence with the + * same value type. * * @tparam T the value type of two std::integer_sequence - * @tparam Values the values of the first list - * @tparam Values the values of the second list + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list + * @tparam Tail the lists which have not been concatenated yet */ -template +template struct concatenate_impl, - std::integer_sequence> { - using type = std::integer_sequence; + std::integer_sequence, Tail...> { + using type = typename concatenate_impl< + std::integer_sequence, Tail...>::type; }; @@ -108,13 +122,13 @@ struct concatenate_impl, /** - * concatenate combines two std::integer_sequence into one. + * concatenate an arbitrary number of std::integer_sequence with the same base + * type into one * - * @tparam List1 the first list - * @tparam List2 the second list + * @tparam Lists a list of std::integer_sequence */ -template -using concatenate = typename detail::concatenate_impl::type; +template +using concatenate = typename detail::concatenate_impl::type; namespace detail { @@ -202,13 +216,388 @@ using as_list = typename detail::as_list_impl::type; * @return std::array the std::array contains the values of vl */ template -constexpr std::array as_array( - std::integer_sequence vl) +constexpr auto as_array(std::integer_sequence vl) { std::ignore = vl; return std::array{Value...}; } +/** + * as_value returns the (first) value contained within an + * std::integer_sequence. The empty case is made to fail on purpose. + * + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence + * + * @param vl the input std::integer_sequence + * + * @return the first value within vl + */ +template +constexpr auto as_value(std::integer_sequence vl) +{ + static_assert(sizeof...(Value) > 0, + "Do not call as_value on an empty set!"); + return as_array(vl)[0]; +} + + +namespace detail { + + +/** + * This is the base type of a helper for sorting. It partitions the values + * within an std::integer_sequence into three parts based on a pivot: values + * lower than required, values equal, values above the requirement. + * + * The results will depend on the ascending or descending order, which reverses + * above and lower. + * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using parts = partition_impl<16, idxs>; // the pivot is 16 + * // Then effectively: + * // parts::lower = std::integer_sequence + * // parts::equal = std::integer_sequence + * // parts::above = std::integer_sequence + * ``` + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct partition_impl; + +/** + * The default case for partitioning an empty std::integer_sequence. All parts + * are the empty set. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + */ +template +struct partition_impl> { + using lower = std::integer_sequence; + using equal = std::integer_sequence; + using above = std::integer_sequence; +}; + +/** + * The recursive case for partitioning an std::integer_sequence. The value v1 is + * put in the matching part. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam v1 the current value being processed + * @tparam Values the values left to process + */ +template +struct partition_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using recurse = + partition_impl>; + + using lower = concatenate< + std::conditional_t<(ascending ? v1 < i : v1 > i), this_elt, empty>, + typename recurse::lower>; + using equal = concatenate, + typename recurse::equal>; + using above = concatenate< + std::conditional_t<(ascending ? v1 > i : v1 < i), this_elt, empty>, + typename recurse::above>; +}; + + +/** + * This is the base type of the sorting structure. It sorts the values within + * an std::integer_sequence by using partition_t as a helper. + * + * The sorting will depend on the ascending or descending order and whether + * duplicates are kept or not. + * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using asc_dups = typename sort_impl::type; + * using asc_nodups = typename sort_impl::type; + * using desc_dups = typename sort_impl::type; + * // Then effectively: + * // asc_dups = std::integer_sequence + * // asc_nodups = std::integer_sequence + * // desc_dups = std::integer_sequence + * ``` + * + * @tparam ascending whether to sort in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct sort_impl; + +/** + * The default case for sorting an empty std::integer_sequence. The result is an + * empty set. + * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + */ +template +struct sort_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for sorting an std::integer_sequence. The value v1 becomes + * the partition_impl pivot. The obtained groups are concatenated in order. + * Duplicates are removed by only populating `this_elt` instead of `this_elt + + * parts::equal`. + * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam v1 the current value and pivot for partition_impl + * @tparam Values the values to sort + */ +template +struct sort_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using parts = + partition_impl>; + using sorted_inf = + typename sort_impl::type; + using sorted_eq = + concatenate>; + using sorted_up = + typename sort_impl::type; + + using type = concatenate; +}; + + +} // namespace detail + +/** + * This is a helper interface for sorting an std::integer_sequence. It always + * removes duplicate values. + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort = typename detail::sort_impl::type; + +/** + * This is sorting variant which keeps duplicates + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort_keep = + typename detail::sort_impl::type; + + +namespace detail { + + +/** + * This is the base type of the accessing an element of an std::integer_sequence + * at a given index. + * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using num1 = typename at_index_impl<0, idxs>::type; + * using num2 = typename at_index_impl<2, idxs>::type; + * using num5 = typename at_index_impl<5, idxs>::type; + * // Then effectively: + * // num1 = std::integer_sequence + * // num2 = std::integer_sequence + * // num5 = std::integer_sequence + * ``` + * + * @tparam idx the index to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct at_index_impl; + +/** + * The default case for accessing an element of an empty std::integer_sequence + * at a given index. The result is an empty set. + * + * @tparam idx the index to find + * @tparam T the type of the values + */ +template +struct at_index_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for accessing an element of an std::integer_sequence at a + * given index. Idx is counted down until 0, where v1 is the requested element. + * + * @tparam idx the distance to the index to find + * @tparam T the type of the values + * @tparam v1 the value being processed + * @tparam Values the values + */ +template +struct at_index_impl> { + using recurse = + typename at_index_impl>::type; + using type = + std::conditional_t<(idx <= 0), std::integer_sequence, recurse>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing an std::integer_sequence at a given + * index. + * + * @see detail::at_index_impl + * + * @tparam idx the index of the element to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +using at_index = typename detail::at_index_impl::type; + + +namespace detail { + + +/** + * Access the element at the back of an std::integer_sequence. This is the base + * type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl; + +/** + * Access the element at the back of an std::integer_sequence. This is the + * specialization for std::integer_sequence. We simply reuse at_index. We need + * to unpack the std::integer_sequence in order to have the proper size of the + * parameter pack `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl> { + using type = + at_index>; +}; + + +/** + * Access the median element of an std::integer_sequence. This is the base type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl; + +/** + * Access the median element of an std::integer_sequence. This is the + * specialization for std::integer_sequence. We simply reuse at_index of the + * middle element after calling sort. We need to unpack the + * std::integer_sequence in order to have the proper size of the parameter pack + * `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl> { + using type = + at_index>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing the front of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using front = at_index<0, T, Values...>; + +/** + * This is a helper interface for accessing the back of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using back = typename detail::back_impl::type; + +/** + * This is a helper interface for accessing the median element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using median = typename detail::median_impl>::type; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using min = front>; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using max = front>; + } // namespace syn } // namespace gko From 6facc498e8030979d979937a5235a34b3f8a43e5 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 18 Feb 2022 01:18:24 +0100 Subject: [PATCH 05/14] piggyback: Allow force download third party libs --- CMakeLists.txt | 13 +++++++++---- cmake/get_info.cmake | 4 ++++ cmake/package_helpers.cmake | 12 ++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74cc7c3d1c2..1cf20070487 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -104,6 +104,11 @@ set(GINKGO_DPCPP_WORKGROUPS "256;512" CACHE STRING option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) +# Third party libraries +option(GINKGO_DOWNLOAD_GFLAGS "Force Ginkgo to download gflags." OFF) +option(GINKGO_DOWNLOAD_GTEST "Force Ginkgo to download GTest." OFF) +option(GINKGO_DOWNLOAD_HWLOC "Force Ginkgo to download HWLOC." OFF) +option(GINKGO_DOWNLOAD_RAPIDJSON "Force Ginkgo to download RapidJSON." OFF) set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined") @@ -229,14 +234,14 @@ endif() # Try to find the third party packages before using our subdirectories include(cmake/package_helpers.cmake) if(GINKGO_BUILD_TESTS) - find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest + ginkgo_find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest endif() if(GINKGO_BUILD_BENCHMARKS) - find_package(gflags 2.2.2 QUIET) - find_package(RapidJSON 1.1.0 QUIET) + ginkgo_find_package(gflags 2.2.2 QUIET) + ginkgo_find_package(RapidJSON 1.1.0 QUIET) endif() if(GINKGO_BUILD_HWLOC) - find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC + ginkgo_find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC endif() add_subdirectory(third_party) # Third-party tools and libraries diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 479b889aeaf..976e1927c6b 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -200,6 +200,10 @@ ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC") ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GFLAGS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GTEST") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_HWLOC") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_RAPIDJSON") _minimal( " diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake index e1d196ad553..2abead229e2 100644 --- a/cmake/package_helpers.cmake +++ b/cmake/package_helpers.cmake @@ -57,3 +57,15 @@ function(ginkgo_download_file url filename hash_type hash) message(FATAL_ERROR "Download of ${filename} failed.") endif() endfunction(ginkgo_download_file) + +# Find package with a protection. To not find library foo, set the CMake +# variable GINKGO_DOWNLOAD_FOO=ON +# +# \param hash The name of the package +# \param ARGN Extra arguments to give to find_package +macro(ginkgo_find_package name) + string(TOUPPER "${name}" GKO_TPL_PKG_NAME) + if(NOT GINKGO_DOWNLOAD_${GKO_TPL_PKG_NAME}) + find_package(${name} ${ARGN}) + endif() +endmacro() From 5fbf5f4ace5bb68b9d8e4eba602275c86900e759 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 18 Feb 2022 11:31:34 +0100 Subject: [PATCH 06/14] Piggyback: minor hwloc and gitignore changes --- .gitignore | 3 ++- CMakeLists.txt | 1 - cmake/Modules/FindHWLOC.cmake | 4 ++-- third_party/hwloc/CMakeLists.txt | 5 +++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index af0a88ef513..169f5e8d9ad 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,9 @@ build # Visual studio code .vscode -### Generated header file +### Generated header files include/config.hpp +dpcpp/base/config.hpp ### C++ # Prerequisites diff --git a/CMakeLists.txt b/CMakeLists.txt index 1cf20070487..80464201e54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,7 +197,6 @@ if(GINKGO_BUILD_HWLOC) set(GINKGO_HAVE_HWLOC 1) else() set(GINKGO_HAVE_HWLOC 0) - message(STATUS "HWLOC is being forcibly switched off") endif() if(GINKGO_BUILD_MPI) diff --git a/cmake/Modules/FindHWLOC.cmake b/cmake/Modules/FindHWLOC.cmake index fe01a40b2f4..91d61499265 100644 --- a/cmake/Modules/FindHWLOC.cmake +++ b/cmake/Modules/FindHWLOC.cmake @@ -39,13 +39,13 @@ include(hwloc_helpers) find_path(HWLOC_INCLUDE_DIRS NAMES "hwloc.h" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES include DOC "Find the hwloc.h main header" ) find_library(HWLOC_LIBRARIES "hwloc" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES lib lib64 DOC "Find the hwloc library" ) diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt index a47e9a84877..970ed78689c 100644 --- a/third_party/hwloc/CMakeLists.txt +++ b/third_party/hwloc/CMakeLists.txt @@ -5,8 +5,6 @@ ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/ "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi" ) -add_library(hwloc SHARED IMPORTED GLOBAL) -add_dependencies(hwloc hwloc_external ) file(MAKE_DIRECTORY ${TPL_HWLOC_PATH}/lib/) file(GLOB HWLOC_LIBS "${TPL_HWLOC_PATH}/build/hwloc/.libs/libhwloc.so*") configure_file("${TPL_HWLOC_PATH}/build/include/hwloc/autogen/config.h" "${TPL_HWLOC_PATH}/src/include/hwloc/autogen/config.h" COPYONLY) @@ -14,6 +12,9 @@ foreach(lib ${HWLOC_LIBS}) get_filename_component(lib_name ${lib} NAME) configure_file("${lib}" "${TPL_HWLOC_PATH}/lib/${lib_name}" COPYONLY) endforeach() + +add_library(hwloc SHARED IMPORTED GLOBAL) +add_dependencies(hwloc hwloc_external) set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/lib/libhwloc.so" CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/src/include" CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES}) From 0755886838ccf3ac1da616120863ba29ef4804e9 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 18 Feb 2022 15:49:01 +0100 Subject: [PATCH 07/14] User specified config manipulation at compile time + Also add a syn::merge function which allows merging lists thanks to an ::encode() function. --- cmake/build_helpers.cmake | 43 ---- core/base/types.hpp | 1 + core/test/synthesizer/containers.cpp | 69 +++++++ dpcpp/CMakeLists.txt | 40 +--- dpcpp/base/config.hpp.in | 29 ++- dpcpp/get_info.cmake | 3 + .../ginkgo/core/synthesizer/containers.hpp | 183 ++++++++++++++++++ 7 files changed, 288 insertions(+), 80 deletions(-) diff --git a/cmake/build_helpers.cmake b/cmake/build_helpers.cmake index 6f5ba8be21a..4b89cf451c4 100644 --- a/cmake/build_helpers.cmake +++ b/cmake/build_helpers.cmake @@ -152,46 +152,3 @@ function(ginkgo_extract_dpcpp_version DPCPP_COMPILER GINKGO_DPCPP_VERSION) file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver.cpp) file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/extract_dpcpp_ver) endfunction() - -# From https://stackoverflow.com/questions/61106418/sorting-a-list-of-numbers-numerically-in-cmake -# After CMake 3.18, this can be removed -function(insertionSortRecursive sort_list) - # Get list length - list(LENGTH sort_list len) - - # Base case, return. - if(${len} LESS_EQUAL 1) - return() - endif() - - math(EXPR len_minus_one "${len} - 1") - # Recursively sort the sublist of size (len - 1). - insertionSortRecursive("${sort_list}" ${len_minus_one}) - set(_sort_list ${sort_list}) - - # Get the last element in the sublist. - list(GET _sort_list ${len_minus_one} last_element) - # Define a counter for the sublist we will operate on. - math(EXPR sublist_counter "${len} - 2") - # Get the element at the counter index. - list(GET _sort_list ${sublist_counter} counter_element) - - # Loop to move those elements greater than last_element up one position. - while((${sublist_counter} GREATER_EQUAL 0) AND (${counter_element} GREATER ${last_element})) - # Move elem at counter index up one position in list. - math(EXPR counter_plus_one "${sublist_counter} + 1") - list(REMOVE_AT _sort_list ${counter_plus_one}) - list(INSERT _sort_list ${counter_plus_one} ${counter_element}) - # Decrement the sublist counter. - math(EXPR sublist_counter "${sublist_counter} - 1") - # Get the element at the new counter value. - list(GET _sort_list ${sublist_counter} counter_element) - endwhile() - - # Place the last element at the correct position. - math(EXPR counter_plus_one "${sublist_counter} + 1") - list(REMOVE_AT _sort_list ${counter_plus_one}) - list(INSERT _sort_list ${counter_plus_one} ${last_element}) - # Send the modified list back up to parent scope. - set(sort_list ${_sort_list} PARENT_SCOPE) -endfunction() diff --git a/core/base/types.hpp b/core/base/types.hpp index 49c4fb99e15..2e2c4c99dce 100644 --- a/core/base/types.hpp +++ b/core/base/types.hpp @@ -143,6 +143,7 @@ class ConfigSet { public: static constexpr unsigned num_groups = sizeof...(num_bits); static constexpr std::array bits{num_bits...}; + using can_encode = std::true_type; /** * Decodes the `position` information from encoded diff --git a/core/test/synthesizer/containers.cpp b/core/test/synthesizer/containers.cpp index 7bedd7f1a5e..9975dc03e82 100644 --- a/core/test/synthesizer/containers.cpp +++ b/core/test/synthesizer/containers.cpp @@ -207,5 +207,74 @@ TEST_F(IntegerSequenceExtensions, EmptyTests) StaticAssertTypeEq(); } +struct int_encoder { + using can_encode = std::true_type; + + static constexpr int encode() { return 1; } + + template + static constexpr int encode(int v1, Rest&&... rest) + { + return v1 * encode(std::forward(rest)...); + } +}; + + +TEST_F(IntegerSequenceExtensions, CanMergeEmptyList) +{ + StaticAssertTypeEq, empty>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeOneList) +{ + StaticAssertTypeEq, test>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeTwoLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using expected = std::integer_sequence; + + using res = gko::syn::merge; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeListsOtherOrder) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} } // namespace diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 86efb1b9e86..ada8bdda36f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -11,6 +11,14 @@ set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) +# Reverse the subgroup list, replace ; to , from the user strings +string(REPLACE ";" "," GINKGO_DPCPP_SUBGROUPS_FORMATTED "${GINKGO_DPCPP_SUBGROUPS}") +string(REPLACE ";" "," GINKGO_DPCPP_WORKGROUPS_FORMATTED "${GINKGO_DPCPP_WORKGROUPS}") +# Generate the DPC++ Configuration file +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp.in + ${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp @ONLY) + + add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -51,38 +59,6 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) -## Parse the subgroup and workgroup lists -# 1. Sort from smallest element to the biggest -insertionSortRecursive(GINKGO_DPCPP_SUBGROUPS) -insertionSortRecursive(GINKGO_DPCPP_WORKGROUPS) -# 2. Extract the two biggest elements -list(GET GINKGO_DPCPP_SUBGROUPS -1 GINKGO_DPCPP_BIGGEST_SUBGROUP) -list(GET GINKGO_DPCPP_WORKGROUPS -1 GINKGO_DPCPP_BIGGEST_WORKGROUP) -# 3. Combine the two lists, also format the workgroup list -set(GINKGO_DPCPP_1D_CFG "") -set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "") -foreach(workgroup ${GINKGO_DPCPP_WORKGROUPS}) - foreach(subgroup ${GINKGO_DPCPP_SUBGROUPS}) - if (GINKGO_DPCPP_1D_CFG STREQUAL "") - set(GINKGO_DPCPP_1D_CFG "KCFG_1D::encode(${workgroup}, ${subgroup})") - else() - set(GINKGO_DPCPP_1D_CFG "${GINKGO_DPCPP_1D_CFG}, KCFG_1D::encode(${workgroup}, ${subgroup})") - endif() - endforeach() - if (GINKGO_DPCPP_WORKGROUPS_FORMATTED STREQUAL "") - set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "BlockCfg::encode(${workgroup})") - else() - set(GINKGO_DPCPP_WORKGROUPS_FORMATTED "BlockCfg::encode(${workgroup}), ${GINKGO_DPCPP_WORKGROUPS_FORMATTED}, ") - endif() -endforeach() -# 4. Reverse the subgroup list, replace ; to , -list(REVERSE GINKGO_DPCPP_SUBGROUPS) -string(REPLACE ";" "," GINKGO_DPCPP_SUBGROUPS_FORMATTED "${GINKGO_DPCPP_SUBGROUPS}") - -## Now we can generate the DPC++ Configure file -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp.in - ${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp @ONLY) - ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in index 429cf1278df..49e63bf1877 100644 --- a/dpcpp/base/config.hpp.in +++ b/dpcpp/base/config.hpp.in @@ -47,6 +47,24 @@ namespace kernels { namespace dpcpp { +// CMake generated subgroup list. We sort by ascending order. +// clang-format off +using subgroup_list_t = + syn::sort>; +constexpr auto subgroup_list = subgroup_list_t{}; +// clang-format on + +// CMake generated workgroup list. We sort by ascending order. +// clang-format off +using workgroup_list_t = + syn::sort>; +constexpr auto workgroup_list = workgroup_list_t{}; +// clang-format on + +using empty_list_t = std::integer_sequence; + struct config { /** * The type containing a bitmask over all lanes of a warp. @@ -56,7 +74,8 @@ struct config { /** * The number of threads within a Dpcpp subgroup. */ - static constexpr uint32 warp_size = @GINKGO_DPCPP_BIGGEST_SUBGROUP@; + static constexpr uint32 warp_size = + syn::as_value(syn::median{}); /** * The bitmask of the entire warp. @@ -72,20 +91,20 @@ struct config { /** * The default maximal number of threads allowed in DPCPP group */ - static constexpr uint32 max_block_size = @GINKGO_DPCPP_BIGGEST_WORKGROUP@; + static constexpr uint32 max_block_size = + syn::as_value(syn::max{}); }; using KCFG_1D = ConfigSet<11, 7>; constexpr auto kcfg_1d_list = - std::integer_sequence(); -constexpr auto subgroup_list = std::integer_sequence(); + syn::merge{}; constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); using BlockCfg = ConfigSet<11>; constexpr auto block_cfg_list = - std::integer_sequence(); + syn::merge{}; } // namespace dpcpp diff --git a/dpcpp/get_info.cmake b/dpcpp/get_info.cmake index 36918a3a8c6..f595db6c190 100644 --- a/dpcpp/get_info.cmake +++ b/dpcpp/get_info.cmake @@ -2,6 +2,9 @@ ginkgo_print_module_header(${detailed_log} "DPCPP") ginkgo_print_module_footer(${detailed_log} "DPCPP variables:") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_FLAGS") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SINGLE_MODE") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_AOT_TARGETS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SUBGROUPS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_WORKGROUPS") ginkgo_print_module_footer(${detailed_log} "DPCPP environment variables:") ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_TYPE") ginkgo_print_env_variable(${detailed_log} "SYCL_BE") diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index f4c5eafbb9f..c6075ea3021 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -599,6 +599,189 @@ template using max = front>; +namespace detail { + +/** + * This is the base type of a helper to merge_impl. It merges two lists of + * std::integer_sequence by applying EncodingType::encode() on every pair of + * values. In this case, the first list is a single element. This helps + * logically split the double recursion needed for merging two general lists. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_one_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_one_impl; + +/** + * This is the base case for merge_one_impl's recursion. The second list is + * empty. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list + */ +template +struct merge_one_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the recursive case for merge_one_impl. We merge and encode v1 + * with every subsequent v2 from the second std::integer_sequence. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam T v2 the value from the second list being merged + * @tparam Values the values not yet processed from the second list + */ +template +struct merge_one_impl, + std::integer_sequence> { + static_assert( + std::is_same::value, + "EncodingType must have encoding functionality."); + + using v1_as_seq = std::integer_sequence; + using values_as_seq = std::integer_sequence; + using recurse = merge_one_impl; + using type = + concatenate, + typename recurse::type>; +}; + + +/** + * This is the base type of merge_impl. It merges two + * std::integer_sequence and calls EncodingType::encode() on every pair of + * values. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @see detail::merge_one_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_impl; + +/** + * This is the base case for merge_impl's recursion. The first list has been + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the first recursive case for merge_impl. In this case, the second + * list is empty. We only encode v1 one after the other. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using empty = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using processed_v1 = std::integer_sequence; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + +/** + * This is the recursive case for merge_impl with a non empty Values2 list. We + * call merge_one_impl for every v1 and element of Values2 until Values1 is + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using val2_as_seq = std::integer_sequence; + using process_v1 = merge_one_impl; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + + +} // namespace detail + + +/** + * This is a helper interface for merging two lists of std::integer_sequence + * using EncodingType as an encoder. It only merges two lists at once. + * EncodingType must look like ConfigSet or the following: + * + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * ``` + * + * @see detail::merge_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +using merge = typename detail::merge_impl::type; + + } // namespace syn } // namespace gko From 7c8aa34b2f591a7678c99e251a69bcfedc65e300 Mon Sep 17 00:00:00 2001 From: Terry Cojean Date: Fri, 18 Feb 2022 16:04:28 +0100 Subject: [PATCH 08/14] Fix CSR and ELL. Unify dpcpp config to int --- core/synthesizer/implementation_selection.hpp | 59 ++++--- dpcpp/CMakeLists.txt | 2 +- dpcpp/base/config.hpp.in | 10 +- dpcpp/base/helper.hpp | 29 ++-- dpcpp/base/kernel_launch_reduction.dp.hpp | 70 ++++---- dpcpp/components/prefix_sum.dp.hpp | 8 +- dpcpp/components/prefix_sum_kernels.dp.cpp | 8 +- dpcpp/components/reduction.dp.hpp | 17 +- dpcpp/matrix/csr_kernels.dp.cpp | 49 +++--- dpcpp/matrix/dense_kernels.dp.cpp | 24 ++- dpcpp/matrix/ell_kernels.dp.cpp | 150 +++++++++--------- .../test/components/cooperative_groups.dp.cpp | 24 +-- 12 files changed, 225 insertions(+), 225 deletions(-) diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index 47bbcf3be72..39efa24b67d 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -73,36 +73,35 @@ namespace syn { "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ - template \ - inline void _name(std::integer_sequence, Predicate, \ - std::integer_sequence, \ - std::integer_sequence, \ - std::integer_sequence, \ - ::gko::syn::type_list, InferredArgs...) \ - GKO_KERNEL_NOT_FOUND; \ - \ - template \ - inline void _name( \ - std::integer_sequence, \ - Predicate is_eligible, \ - std::integer_sequence bool_args, \ - std::integer_sequence int_args, \ - std::integer_sequence size_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ - { \ - if (is_eligible(K)) { \ - _callable( \ - std::forward(args)...); \ - } else { \ - _name(std::integer_sequence(), \ - is_eligible, bool_args, int_args, size_args, type_args, \ - std::forward(args)...); \ - } \ +#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ + template \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ + std::integer_sequence, \ + std::integer_sequence, \ + ::gko::syn::type_list, InferredArgs...) \ + GKO_KERNEL_NOT_FOUND; \ + \ + template \ + inline void _name( \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence bool_args, \ + std::integer_sequence int_args, \ + std::integer_sequence size_args, \ + ::gko::syn::type_list type_args, InferredArgs... args) \ + { \ + if (is_eligible(K)) { \ + _callable( \ + std::forward(args)...); \ + } else { \ + _name(std::integer_sequence(), is_eligible, \ + bool_args, int_args, size_args, type_args, \ + std::forward(args)...); \ + } \ } diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index ada8bdda36f..a119147a25d 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -66,7 +66,7 @@ target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMP set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") -target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) +target_compile_features(ginkgo_dpcpp PUBLIC cxx_std_17) # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in index 49e63bf1877..3f53bcac2af 100644 --- a/dpcpp/base/config.hpp.in +++ b/dpcpp/base/config.hpp.in @@ -51,7 +51,8 @@ namespace dpcpp { // clang-format off using subgroup_list_t = syn::sort>; + std::integer_sequence>; +using subgroup_list_desc_t = syn::sort; constexpr auto subgroup_list = subgroup_list_t{}; // clang-format on @@ -59,11 +60,12 @@ constexpr auto subgroup_list = subgroup_list_t{}; // clang-format off using workgroup_list_t = syn::sort>; + std::integer_sequence>; +using workgroup_list_desc_t = syn::sort; constexpr auto workgroup_list = workgroup_list_t{}; // clang-format on -using empty_list_t = std::integer_sequence; +using empty_list_t = std::integer_sequence; struct config { /** @@ -101,10 +103,10 @@ constexpr auto kcfg_1d_list = syn::merge{}; constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); - using BlockCfg = ConfigSet<11>; constexpr auto block_cfg_list = syn::merge{}; +constexpr auto block_cfg_array = syn::as_array(block_cfg_list); } // namespace dpcpp diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index af76aa41fa4..65d9a0f1fd1 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -81,7 +81,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param kernel_ the kernel name */ #define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ + template \ inline void name_(dim3 grid, dim3 block, gko::size_type, \ sycl::queue* queue, InferredArgs... args) \ { \ @@ -115,19 +115,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * available to decode<0> for blocksize and decode<1> for * subgroup_size by cfg_ */ -#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ - template \ - void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - gko::size_type dynamic_shared_memory, sycl::queue* queue, \ - InferredArgs... args) \ - { \ - callable_( \ - list_, \ - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, \ - std::integer_sequence(), std::integer_sequence(), \ - std::integer_sequence(), \ - ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ - queue, std::forward(args)...); \ +#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ + template \ + void name_(int desired_cfg, dim3 grid, dim3 block, \ + gko::size_type dynamic_shared_memory, sycl::queue* queue, \ + InferredArgs... args) \ + { \ + callable_( \ + list_, [&desired_cfg](int cfg) { return cfg == desired_cfg; }, \ + std::integer_sequence(), std::integer_sequence(), \ + std::integer_sequence(), \ + ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ + queue, std::forward(args)...); \ } // __WG_BOUND__ gives the cuda-like launch bound in cuda ordering @@ -178,7 +177,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * @return the first valid config */ template -std::uint32_t get_first_cfg(const IterArr& arr, Validate verify) +int get_first_cfg(const IterArr& arr, Validate verify) { for (auto& cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 33dc21b2056..c4d14cd97f2 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -53,7 +53,7 @@ namespace kernels { namespace dpcpp { -template void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, @@ -102,7 +102,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } -template void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, @@ -153,7 +153,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -193,7 +193,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -246,13 +246,12 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = - get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, std::integer_sequence(), std::integer_sequence(), std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); @@ -267,13 +266,12 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... args) { - const auto desired_cfg = - get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, std::integer_sequence(), std::integer_sequence(), std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); @@ -283,8 +281,8 @@ void run_kernel_reduction(std::shared_ptr exec, namespace { -template void generic_kernel_row_reduction_2d(std::integer_sequence, std::shared_ptr exec, @@ -339,8 +337,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, generic_kernel_row_reduction_2d); -template void generic_kernel_col_reduction_2d_small( sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, @@ -414,7 +412,7 @@ void generic_kernel_col_reduction_2d_small( } -template void generic_kernel_col_reduction_2d_blocked( @@ -483,8 +481,8 @@ void generic_kernel_reduction_finalize_2d( } -template void run_generic_col_reduction_small(std::integer_sequence, std::shared_ptr exec, @@ -527,7 +525,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, run_generic_col_reduction_small); -template void run_kernel_row_reduction_stage1(std::shared_ptr exec, @@ -578,7 +576,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, run_kernel_row_reduction_stage1); -template void run_kernel_col_reduction_stage1(std::shared_ptr exec, @@ -651,13 +649,12 @@ void run_kernel_row_reduction(std::shared_ptr exec, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = - get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_row_reduction_stage1( - kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, std::integer_sequence(), std::integer_sequence(), std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, result_stride, size, @@ -673,13 +670,12 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = - get_first_cfg(syn::as_array(kcfg_1d_list), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_col_reduction_stage1( - kcfg_1d_list, [&](std::uint32_t cfg) { return cfg == desired_cfg; }, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, std::integer_sequence(), std::integer_sequence(), std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index ea7dd341ecf..b55c6a22fb8 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -129,7 +129,7 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well. */ -template +template void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, ValueType* __restrict__ block_sum, sycl::nd_item<3> item_ct1, @@ -182,7 +182,7 @@ void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) @@ -217,7 +217,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. */ -template +template void finalize_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, const ValueType* __restrict__ block_sum, @@ -234,7 +234,7 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index 0e83b0b379a..ae55fd1edb8 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -68,11 +68,9 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); - constexpr auto block_cfg_array = syn::as_array(block_cfg_list); - const std::uint32_t cfg = - get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { - return validate(queue, BlockCfg::decode<0>(cfg), 16); - }); + const int cfg = get_first_cfg(block_cfg_array, [&queue](int cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); const auto wg_size = BlockCfg::decode<0>(cfg); auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 3d4f30347a2..7f912fa1309 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -196,7 +196,7 @@ void reduce_array(size_type size, const ValueType* __restrict__ source, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template +template void reduce_add_array( size_type size, const ValueType* __restrict__ source, ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, @@ -211,8 +211,7 @@ void reduce_add_array( } } -template void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type size, @@ -238,7 +237,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, reduce_add_array); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, - kcfg_1d_list); + kcfg_1d_array); /** @@ -259,12 +258,10 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results = Array(exec); ValueType answer = zero(); auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const int cfg = get_first_cfg(kcfg_1d_array, [&queue](int cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index eaf401b0472..97e5a9d5757 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -91,8 +91,7 @@ constexpr int classical_overweight = 32; */ using compiled_kernels = std::integer_sequence; -using classical_kernels = - std::integer_sequence; +using classical_kernels = subgroup_list_desc_t; namespace kernel { @@ -618,7 +617,7 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, } -template void device_classical_spmv(const size_type num_rows, const ValueType* __restrict__ val, @@ -655,7 +654,7 @@ void device_classical_spmv(const size_type num_rows, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -668,7 +667,7 @@ void abstract_classical_spmv( [](const ValueType& x, const ValueType& y) { return x; }, item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -690,7 +689,7 @@ void abstract_classical_spmv(dim3 grid, dim3 block, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ alpha, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -709,7 +708,7 @@ void abstract_classical_spmv( item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -720,12 +719,14 @@ void abstract_classical_spmv(dim3 grid, dim3 block, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, + c, c_stride, item_ct1); + }); }); } @@ -900,7 +901,9 @@ void row_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -947,7 +950,9 @@ void inv_row_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -995,7 +1000,9 @@ void inv_symm_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_symm_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -1149,7 +1156,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "classical") { IndexType max_length_per_row = 0; using Tcsr = matrix::Csr; @@ -1168,7 +1175,7 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { if (!is_complex()) { @@ -1285,8 +1292,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, a, b, c, - alpha, beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -1295,8 +1301,7 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, a, b, c, - alpha, beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else { GKO_NOT_IMPLEMENTED; } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 8c46f15fd40..bb67b3bf865 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -76,7 +76,7 @@ constexpr int default_block_size = 256; namespace kernel { -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, @@ -99,7 +99,7 @@ void transpose(const size_type nrows, const size_type ncols, } } -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, @@ -111,7 +111,7 @@ void transpose(const size_type nrows, const size_type ncols, [](ValueType val) { return val; }, item_ct1, space); } -template +template void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, const size_type in_stride, ValueType* out, @@ -136,7 +136,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(transpose, transpose); GKO_ENABLE_DEFAULT_CONFIG_CALL(transpose_call, transpose, subgroup_list); -template +template void conj_transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, @@ -148,7 +148,7 @@ void conj_transpose(const size_type nrows, const size_type ncols, [](ValueType val) { return conj(val); }, item_ct1, space); } -template +template void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, @@ -530,10 +530,9 @@ void transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::transpose_call(cfg, grid, block, 0, queue, size[0], size[1], @@ -552,10 +551,9 @@ void conj_transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::conj_transpose_call(cfg, grid, block, 0, queue, size[0], size[1], diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 679ce5f2b31..53812e30c59 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -91,27 +91,33 @@ constexpr int num_threads_per_core = 4; constexpr double ratio = 1e-2; -/** - * max_thread_per_worker is the max number of thread per worker. The - * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> - */ -constexpr int max_thread_per_worker = 32; - - /** * A compile-time list of sub-warp sizes for which the spmv kernels should be * compiled. * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = std::integer_sequence; +using compiled_kernels = + syn::concatenate, subgroup_list_t>; + + +/** + * max_thread_per_worker is the max number of thread per worker. The + * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> + */ +constexpr auto max_thread_per_worker = + syn::as_value(syn::max{}); + + +constexpr auto min_thread_per_worker = + syn::as_value(syn::min{}); namespace kernel { namespace { -template void spmv_kernel( @@ -120,13 +126,13 @@ void spmv_kernel( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) + UninitializedArray& + storage) { const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); - if (num_thread_per_worker == 1) { - // Specialize the num_thread_per_worker = 1. It doesn't need the shared + if (subgroup_size == 1) { + // Specialize the subgroup_size = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { auto temp = zero(); @@ -147,7 +153,7 @@ void spmv_kernel( const auto idx_in_worker = item_ct1.get_local_id(1); const auto x = tidx % num_rows; const auto worker_id = tidx / num_rows; - const auto step_size = num_worker_per_row * num_thread_per_worker; + const auto step_size = num_worker_per_row * subgroup_size; if (runnable && idx_in_worker == 0) { storage[item_ct1.get_local_id(2)] = 0; @@ -156,8 +162,7 @@ void spmv_kernel( item_ct1.barrier(sycl::access::fence_space::local_space); auto temp = zero(); if (runnable) { - for (size_type idx = - worker_id * num_thread_per_worker + idx_in_worker; + for (size_type idx = worker_id * subgroup_size + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { const auto ind = x + idx * stride; const auto col_idx = col[ind]; @@ -175,8 +180,9 @@ void spmv_kernel( if (runnable && idx_in_worker == 0) { const auto c_ind = x * c_stride + column_id; if (atomic) { - atomic_add(&(c[c_ind]), - op(storage[item_ct1.get_local_id(2)], c[c_ind])); + atomic_add( + &(c[c_ind]), + op(storage[item_ct1.get_local_id(2)], c[c_ind])); } else { c[c_ind] = op(storage[item_ct1.get_local_id(2)], c[c_ind]); } @@ -185,25 +191,24 @@ void spmv_kernel( } -template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + UninitializedArray& storage) { - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [](const OutputValueType& x, const OutputValueType& y) { return x; }, item_ct1, storage); } -template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -213,34 +218,36 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> + sycl::accessor, + 0, sycl::access_mode::read_write, + sycl::access::target::local> storage_acc_ct1(cgh); - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - spmv( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - item_ct1, *storage_acc_ct1.get_pointer()); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + spmv( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, item_ct1, + *storage_acc_ct1.get_pointer()); + }); }); } -template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, + OutputValueType* __restrict__ c, const size_type c_stride, + sycl::nd_item<3> item_ct1, + UninitializedArray& storage) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -250,7 +257,7 @@ void spmv( // operation. The beta * c needs to be done before calling this kernel. // Then, this kernel only adds alpha * a * b when it uses atomic // operation. - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [&alpha_val](const OutputValueType& x, const OutputValueType& y) { @@ -258,7 +265,7 @@ void spmv( }, item_ct1, storage); } else { - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [&alpha_val, &beta_val](const OutputValueType& x, @@ -269,7 +276,7 @@ void spmv( } } -template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -280,15 +287,17 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> + sycl::accessor, + 0, sycl::access_mode::read_write, + sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - spmv( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, *storage_acc_ct1.get_pointer()); @@ -325,11 +334,9 @@ void abstract_spmv(std::integer_sequence, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - constexpr int num_thread_per_worker = - (info == 0) ? max_thread_per_worker : info; + constexpr auto subgroup_size = (info == 0) ? config::warp_size : info; constexpr bool atomic = (info == 0); - const dim3 block_size(default_block_size / num_thread_per_worker, - num_thread_per_worker, 1); + const dim3 block_size(default_block_size / subgroup_size, subgroup_size, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); @@ -346,7 +353,7 @@ void abstract_spmv(std::integer_sequence, {static_cast(b->get_stride())}}); if (alpha == nullptr && beta == nullptr) { - kernel::spmv( + kernel::spmv( grid_size, block_size, 0, exec->get_queue(), nrows, num_worker_per_row, a_vals, a->get_const_col_idxs(), stride, num_stored_elements_per_row, b_vals, c->get_values(), @@ -354,7 +361,7 @@ void abstract_spmv(std::integer_sequence, } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); - kernel::spmv( + kernel::spmv( grid_size, block_size, 0, exec->get_queue(), nrows, num_worker_per_row, alpha_val, a_vals, a->get_const_col_idxs(), stride, num_stored_elements_per_row, b_vals, @@ -372,7 +379,7 @@ std::array compute_thread_worker_and_atomicity( std::shared_ptr exec, const matrix::Ell* a) { - int num_thread_per_worker = 8; + int num_thread_per_worker = min_thread_per_worker; int atomic = 0; int num_worker_per_row = 1; @@ -391,8 +398,8 @@ std::array compute_thread_worker_and_atomicity( // decided according to the number of worker allowed on GPU. if (static_cast(ell_ncols) / nrows > ratio) { while (num_thread_per_worker < max_thread_per_worker && - (num_thread_per_worker << 1) <= ell_ncols) { - num_thread_per_worker <<= 1; + (num_thread_per_worker * 2) <= ell_ncols) { + num_thread_per_worker *= 2; } if (num_thread_per_worker == max_thread_per_worker) { num_worker_per_row = @@ -436,8 +443,7 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, - num_worker_per_row, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -470,8 +476,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - std::integer_sequence(), syn::type_list<>(), exec, - num_worker_per_row, a, b, c, alpha, beta); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c, + alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index ce6becd8896..1fbc3f7034f 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -62,9 +62,9 @@ namespace { using namespace gko::kernels::dpcpp; using KCFG_1D = gko::ConfigSet<11, 7>; constexpr auto default_config_list = - std::integer_sequence(); + std::integer_sequence(); class CooperativeGroups : public testing::TestWithParam { @@ -116,7 +116,7 @@ class CooperativeGroups : public testing::TestWithParam { // kernel implementation -template +template void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -152,14 +152,14 @@ void cg_shuffle_host(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(cg_shuffle_config, cg_shuffle_host) // the call -void cg_shuffle_config_call(std::uint32_t desired_cfg, dim3 grid, dim3 block, +void cg_shuffle_config_call(int desired_cfg, dim3 grid, dim3 block, gko::size_type dynamic_shared_memory, sycl::queue* queue, bool* s) { cg_shuffle_config( default_config_list, // validate - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, + [&desired_cfg](int cfg) { return cfg == desired_cfg; }, std::integer_sequence(), std::integer_sequence(), std::integer_sequence(), ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, queue, s); @@ -171,7 +171,7 @@ TEST_P(CooperativeGroups, Shuffle) } -template +template void cg_all(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -185,7 +185,7 @@ void cg_all(bool* s, sycl::nd_item<3> item_ct1) group.all(item_ct1.get_local_id(2) < 13) == sg_size < 13; } -template +template inline void cg_all(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -206,7 +206,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_all_call, cg_all, default_config_list) TEST_P(CooperativeGroups, All) { test_all_subgroup(cg_all_call); } -template +template void cg_any(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -219,7 +219,7 @@ void cg_any(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = !group.any(false); } -template +template inline void cg_any(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -240,7 +240,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_any_call, cg_any, default_config_list) TEST_P(CooperativeGroups, Any) { test_all_subgroup(cg_any_call); } -template +template void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -254,7 +254,7 @@ void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = group.ballot(item_ct1.get_local_id(2) < 4) == 0xf; } -template +template inline void cg_ballot(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { From 3c7668818eca0f8e4146f03bc0fa662cb02d60f4 Mon Sep 17 00:00:00 2001 From: ginkgo-bot Date: Fri, 18 Feb 2022 19:32:28 +0000 Subject: [PATCH 09/14] Format files Co-authored-by: Terry Cojean --- cuda/base/kernel_launch_reduction.cuh | 32 +++++++++---------- cuda/preconditioner/jacobi_common.hpp.in | 3 +- dpcpp/base/config.hpp.in | 6 ++-- dpcpp/base/kernel_launch_reduction.dp.hpp | 2 +- dpcpp/components/prefix_sum_kernels.dp.cpp | 4 ++- dpcpp/components/reduction.dp.hpp | 2 +- dpcpp/matrix/csr_kernels.dp.cpp | 2 +- dpcpp/matrix/dense_kernels.dp.cpp | 4 ++- dpcpp/matrix/ell_kernels.dp.cpp | 2 +- .../test/components/cooperative_groups.dp.cpp | 2 +- hip/base/kernel_launch_reduction.hip.hpp | 32 +++++++++---------- hip/preconditioner/jacobi_common.hip.hpp.in | 4 +-- 12 files changed, 50 insertions(+), 45 deletions(-) diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index 8b6e98fab86..58bf5e3580a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -201,10 +201,10 @@ void run_kernel_reduction(std::shared_ptr exec, template __global__ -__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - int64 result_stride, KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; @@ -233,10 +233,10 @@ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( template __global__ -__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; constexpr auto warps_per_block = default_block_size / warp_size; @@ -296,10 +296,10 @@ __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small template __global__ -__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; __shared__ UninitializedArray block_partial; @@ -335,10 +335,10 @@ __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_block template __global__ -__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize, - ValueType identity, const ValueType* input, int64 result_stride, - ValueType* result) + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index 6a637a572cf..19bd21385f4 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ b/cuda/preconditioner/jacobi_common.hpp.in @@ -58,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = std::integer_sequence; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in index 3f53bcac2af..15b56383b93 100644 --- a/dpcpp/base/config.hpp.in +++ b/dpcpp/base/config.hpp.in @@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_DPCPP_BASE_CONFIG_HPP_ -#define GKO_DPCPP_BASE_CONFIG_HPP_ +#ifndef GKO_DPCPP_BASE_CONFIG_HPP_IN_ +#define GKO_DPCPP_BASE_CONFIG_HPP_IN_ #include @@ -114,4 +114,4 @@ constexpr auto block_cfg_array = syn::as_array(block_cfg_list); } // namespace gko -#endif // GKO_DPCPP_BASE_CONFIG_HPP_ +#endif // GKO_DPCPP_BASE_CONFIG_HPP_IN_ diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index c4d14cd97f2..d844ba407a1 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -37,10 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index ae55fd1edb8..5e89b61877f 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" +#include + + #include @@ -40,7 +43,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/prefix_sum.dp.hpp" diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index 7f912fa1309..e41362fe103 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ +#include #include @@ -47,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 97e5a9d5757..091c1c25579 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -56,7 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index bb67b3bf865..a458f797d62 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/dense_kernels.hpp" +#include + + #include #include @@ -49,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/base/onemkl_bindings.hpp" diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 53812e30c59..785dbf59d81 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -52,7 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/atomic.dp.hpp" diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 1fbc3f7034f..903a5637197 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/cooperative_groups.dp.hpp" +#include #include #include @@ -51,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "core/test/utils/assertions.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index 8eac7625c34..01e19c6c1f0 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -206,10 +206,10 @@ void run_kernel_reduction(std::shared_ptr exec, template __global__ -__launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( - int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - int64 result_stride, KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( + int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, + ReductionOp op, FinalizeOp finalize, ValueType identity, + ValueType* result, int64 result_stride, KernelArgs... args) { const auto idx = thread::get_subwarp_id_flat(); const auto row = idx % rows; @@ -238,10 +238,10 @@ __launch_bounds__(default_block_size) void generic_kernel_row_reduction_2d( template __global__ -__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; constexpr auto warps_per_block = default_block_size / warp_size; @@ -301,10 +301,10 @@ __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_small template __global__ -__launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( - int64 rows, int64 cols, KernelFunction fn, ReductionOp op, - FinalizeOp finalize, ValueType identity, ValueType* result, - KernelArgs... args) + __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_blocked( + int64 rows, int64 cols, KernelFunction fn, ReductionOp op, + FinalizeOp finalize, ValueType identity, ValueType* result, + KernelArgs... args) { constexpr auto warp_size = config::warp_size; __shared__ UninitializedArray block_partial; @@ -340,10 +340,10 @@ __launch_bounds__(default_block_size) void generic_kernel_col_reduction_2d_block template __global__ -__launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( - int64 num_results, int64 num_blocks, ReductionOp op, FinalizeOp finalize, - ValueType identity, const ValueType* input, int64 result_stride, - ValueType* result) + __launch_bounds__(default_block_size) void generic_kernel_reduction_finalize_2d( + int64 num_results, int64 num_blocks, ReductionOp op, + FinalizeOp finalize, ValueType identity, const ValueType* input, + int64 result_stride, ValueType* result) { const auto idx = thread::get_thread_id_flat(); if (idx >= num_results) { diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 76947dfd5d7..8b79d841355 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ - #include @@ -59,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = std::integer_sequence; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) From f7bc0e60dadc36e49a1cd8d2f96139abeb9a9d83 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Tue, 22 Feb 2022 19:01:34 +0100 Subject: [PATCH 10/14] revert changes subgroup and num_thread_per_worker --- dpcpp/matrix/ell_kernels.dp.cpp | 110 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 785dbf59d81..17261b40175 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -117,7 +117,7 @@ namespace kernel { namespace { -template void spmv_kernel( @@ -126,13 +126,13 @@ void spmv_kernel( const size_type stride, const size_type num_stored_elements_per_row, acc::range b, OutputValueType* __restrict__ c, const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& - storage) + UninitializedArray& storage) { const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); - if (subgroup_size == 1) { - // Specialize the subgroup_size = 1. It doesn't need the shared + if (num_thread_per_worker == 1) { + // Specialize the num_thread_per_worker = 1. It doesn't need the shared // memory, __syncthreads, and atomic_add if (tidx < num_rows) { auto temp = zero(); @@ -153,7 +153,7 @@ void spmv_kernel( const auto idx_in_worker = item_ct1.get_local_id(1); const auto x = tidx % num_rows; const auto worker_id = tidx / num_rows; - const auto step_size = num_worker_per_row * subgroup_size; + const auto step_size = num_worker_per_row * num_thread_per_worker; if (runnable && idx_in_worker == 0) { storage[item_ct1.get_local_id(2)] = 0; @@ -162,7 +162,8 @@ void spmv_kernel( item_ct1.barrier(sycl::access::fence_space::local_space); auto temp = zero(); if (runnable) { - for (size_type idx = worker_id * subgroup_size + idx_in_worker; + for (size_type idx = + worker_id * num_thread_per_worker + idx_in_worker; idx < num_stored_elements_per_row; idx += step_size) { const auto ind = x + idx * stride; const auto col_idx = col[ind]; @@ -191,24 +192,25 @@ void spmv_kernel( } -template -void spmv(const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + UninitializedArray& storage) { - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [](const OutputValueType& x, const OutputValueType& y) { return x; }, item_ct1, storage); } -template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -218,36 +220,34 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, - sycl::access::target::local> + sycl::accessor< + UninitializedArray, + 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); - cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - spmv( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, item_ct1, - *storage_acc_ct1.get_pointer()); - }); + cgh.parallel_for(sycl_nd_range(grid, block), + [=](sycl::nd_item<3> item_ct1) { + spmv( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, + item_ct1, *storage_acc_ct1.get_pointer()); + }); }); } -template -void spmv(const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, - OutputValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv( + const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + UninitializedArray& storage) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -257,7 +257,7 @@ void spmv(const size_type num_rows, const int num_worker_per_row, // operation. The beta * c needs to be done before calling this kernel. // Then, this kernel only adds alpha * a * b when it uses atomic // operation. - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [&alpha_val](const OutputValueType& x, const OutputValueType& y) { @@ -265,7 +265,7 @@ void spmv(const size_type num_rows, const int num_worker_per_row, }, item_ct1, storage); } else { - spmv_kernel( + spmv_kernel( num_rows, num_worker_per_row, val, col, stride, num_stored_elements_per_row, b, c, c_stride, [&alpha_val, &beta_val](const OutputValueType& x, @@ -276,7 +276,7 @@ void spmv(const size_type num_rows, const int num_worker_per_row, } } -template void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -287,17 +287,15 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, - sycl::access::target::local> + sycl::accessor< + UninitializedArray, + 0, sycl::access_mode::read_write, sycl::access::target::local> storage_acc_ct1(cgh); cgh.parallel_for( - sycl_nd_range(grid, block), [= - ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( - subgroup_size)]] { - spmv( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, *storage_acc_ct1.get_pointer()); @@ -334,9 +332,11 @@ void abstract_spmv(std::integer_sequence, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - constexpr auto subgroup_size = (info == 0) ? config::warp_size : info; + constexpr auto num_thread_per_worker = + (info == 0) ? max_thread_per_worker : info; constexpr bool atomic = (info == 0); - const dim3 block_size(default_block_size / subgroup_size, subgroup_size, 1); + const dim3 block_size(default_block_size / num_thread_per_worker, + num_thread_per_worker, 1); const dim3 grid_size(ceildiv(nrows * num_worker_per_row, block_size.x), b->get_size()[1], 1); @@ -353,7 +353,7 @@ void abstract_spmv(std::integer_sequence, {static_cast(b->get_stride())}}); if (alpha == nullptr && beta == nullptr) { - kernel::spmv( + kernel::spmv( grid_size, block_size, 0, exec->get_queue(), nrows, num_worker_per_row, a_vals, a->get_const_col_idxs(), stride, num_stored_elements_per_row, b_vals, c->get_values(), @@ -361,7 +361,7 @@ void abstract_spmv(std::integer_sequence, } else if (alpha != nullptr && beta != nullptr) { const auto alpha_val = gko::acc::range( std::array{1}, alpha->get_const_values()); - kernel::spmv( + kernel::spmv( grid_size, block_size, 0, exec->get_queue(), nrows, num_worker_per_row, alpha_val, a_vals, a->get_const_col_idxs(), stride, num_stored_elements_per_row, b_vals, From 8e029637028617cf785610fc8be692b55abf8595 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 23 Feb 2022 11:34:54 +0100 Subject: [PATCH 11/14] use native type for shared memory dpcpp seems to be unhappy with atomic on UninitializedArray --- dpcpp/base/kernel_launch_reduction.dp.hpp | 36 +++--- dpcpp/components/prefix_sum.dp.hpp | 10 +- dpcpp/components/reduction.dp.hpp | 19 ++-- dpcpp/matrix/csr_kernels.dp.cpp | 52 ++++----- dpcpp/matrix/dense_kernels.dp.cpp | 29 +++-- dpcpp/matrix/ell_kernels.dp.cpp | 86 +++++++-------- dpcpp/solver/cb_gmres_kernels.dp.cpp | 128 +++++++++++----------- dpcpp/solver/gmres_kernels.dp.cpp | 55 +++++----- dpcpp/solver/idr_kernels.dp.cpp | 39 +++---- 9 files changed, 224 insertions(+), 230 deletions(-) diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index d844ba407a1..dbdf34de726 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -65,16 +65,17 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -114,16 +115,17 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -351,13 +353,14 @@ void generic_kernel_col_reduction_2d_small( constexpr auto subgroups_per_workgroup = wg_size / sg_size; // stores the subwarp_size partial sums from each warp, grouped by warp constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(shared_storage), cgh); const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { - auto block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); const auto ssg_id = thread::get_subwarp_id_flat(id); const auto local_sg_id = id.get_local_id(2) / sg_size; @@ -424,9 +427,9 @@ void generic_kernel_col_reduction_2d_blocked( constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto range = sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(wg_size), cgh); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { const auto sg_id = thread::get_subwarp_id_flat(id); @@ -437,7 +440,8 @@ void generic_kernel_col_reduction_2d_blocked( const auto sg_rank = subgroup.thread_rank(); const auto col = sg_rank + static_cast(id.get_group(1)) * sg_size; - auto block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); auto partial = identity; // accumulate within a thread if (col < cols) { diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index b55c6a22fb8..56fe9864612 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -133,7 +133,7 @@ template void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, ValueType* __restrict__ block_sum, sycl::nd_item<3> item_ct1, - UninitializedArray& prefix_helper) + ValueType* __restrict__ prefix_helper) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto element_id = item_ct1.get_local_id(2); @@ -188,16 +188,16 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* elements, ValueType* block_sum) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, + sycl::accessor - prefix_helper_acc_ct1(cgh); + prefix_helper_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - *prefix_helper_acc_ct1.get_pointer()); + static_cast( + prefix_helper_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index e41362fe103..dde112fba9d 100644 --- a/dpcpp/components/reduction.dp.hpp +++ b/dpcpp/components/reduction.dp.hpp @@ -197,13 +197,12 @@ void reduce_array(size_type size, const ValueType* __restrict__ source, * an array larger than `block_size`. */ template -void reduce_add_array( - size_type size, const ValueType* __restrict__ source, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& block_sum) +void reduce_add_array(size_type size, const ValueType* __restrict__ source, + ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ block_sum) { reduce_array(cfg)>( - size, source, static_cast(block_sum), item_ct1, + size, source, block_sum, item_ct1, [](const ValueType& x, const ValueType& y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { @@ -218,17 +217,17 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, const ValueType* source, ValueType* result) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor(cfg)>, - 0, sycl::access::mode::read_write, + sycl::accessor - block_sum_acc_ct1(cgh); + block_sum_acc_ct1(sycl::range<1>(KCFG_1D::decode<0>(cfg)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( KCFG_1D::decode<1>(cfg))]] { - reduce_add_array(size, source, result, item_ct1, - *block_sum_acc_ct1.get_pointer()); + reduce_add_array( + size, source, result, item_ct1, + static_cast(block_sum_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index 091c1c25579..a7e8ee2bd58 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -324,8 +324,8 @@ void merge_path_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, Alpha_op alpha_op, sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const IndexType cache_lines = ceildivT(nwarps, spmv_block_size); const IndexType tid = item_ct1.get_local_id(2); @@ -351,9 +351,7 @@ void merge_path_reduce(const IndexType nwarps, tmp_val[item_ct1.get_local_id(2)] = value; tmp_ind[item_ct1.get_local_id(2)] = row; group::this_thread_block(item_ct1).sync(); - bool last = - block_segment_scan_reverse(static_cast(tmp_ind), - static_cast(tmp_val), item_ct1); + bool last = block_segment_scan_reverse(tmp_ind, tmp_val, item_ct1); group::this_thread_block(item_ct1).sync(); if (last) { c[row * c_stride] += alpha_op(tmp_val[item_ct1.get_local_id(2)]); @@ -538,9 +536,8 @@ void abstract_reduce(const IndexType nwarps, const ValueType* __restrict__ last_val, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { merge_path_reduce( nwarps, last_val, last_row, c, c_stride, [](ValueType& x) { return x; }, @@ -554,20 +551,19 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } @@ -579,9 +575,8 @@ void abstract_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, const ValueType* __restrict__ alpha, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const auto alpha_val = alpha[0]; merge_path_reduce( @@ -598,20 +593,19 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, alpha, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, alpha, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index a458f797d62..3dd7964c4ab 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -83,7 +83,7 @@ void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& space) + ValueType* __restrict__ space) { auto local_x = item_ct1.get_local_id(2); auto local_y = item_ct1.get_local_id(1); @@ -105,8 +105,7 @@ template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, @@ -120,16 +119,16 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - transpose(nrows, ncols, in, in_stride, out, out_stride, - item_ct1, *space_acc_ct1.get_pointer()); + transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } @@ -142,8 +141,7 @@ template void conj_transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, @@ -158,17 +156,16 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - conj_transpose(nrows, ncols, in, in_stride, out, - out_stride, item_ct1, - *space_acc_ct1.get_pointer()); + conj_transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index 17261b40175..23847bdf9b8 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -120,14 +120,14 @@ namespace { template -void spmv_kernel( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv_kernel(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, + const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, Closure op, + sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); @@ -194,14 +194,12 @@ void spmv_kernel( template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, @@ -220,34 +218,33 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - spmv( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - item_ct1, *storage_acc_ct1.get_pointer()); - }); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + spmv( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, item_ct1, + static_cast( + storage_acc_ct1.get_pointer())); + }); }); } template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, + OutputValueType* __restrict__ c, const size_type c_stride, + sycl::nd_item<3> item_ct1, OutputValueType* __restrict__ storage) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -287,18 +284,19 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, - *storage_acc_ct1.get_pointer()); + static_cast( + storage_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dc94bbb92ff..ac64a232fbc 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -215,14 +215,13 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multinorm2_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, remove_complex* __restrict__ norms, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -236,7 +235,7 @@ void multinorm2_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { for (size_type i = start_row + tidy; i < end_row; @@ -270,20 +269,21 @@ void multinorm2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multinorm2_kernel(num_rows, num_cols, next_krylov_basis, - stride_next_krylov, norms, stop_status, - item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + multinorm2_kernel( + num_rows, num_cols, next_krylov_basis, stride_next_krylov, + norms, stop_status, item_ct1, + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -295,9 +295,7 @@ void multinorminf_without_stop_kernel( const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, remove_complex* __restrict__ norms, size_type stride_norms, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -311,7 +309,7 @@ void multinorminf_without_stop_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_max = zero(); if (col_idx < num_cols) { for (size_type i = start_row + tidy; i < end_row; @@ -347,11 +345,11 @@ void multinorminf_without_stop_kernel( size_type stride_norms) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -360,7 +358,8 @@ void multinorminf_without_stop_kernel( multinorminf_without_stop_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms, stride_norms, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -368,17 +367,14 @@ void multinorminf_without_stop_kernel( // ONLY computes the inf-norm (into norms2) when compute_inf is true template -void multinorm2_inf_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, - remove_complex* __restrict__ norms1, - remove_complex* __restrict__ norms2, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_inf_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms1, + remove_complex* __restrict__ norms2, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -392,9 +388,9 @@ void multinorm2_inf_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper_add = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper_add = reduction_helper_array; rc_vtype* __restrict__ reduction_helper_max = - static_cast((*reduction_helper_array)) + + static_cast(reduction_helper_array) + default_dot_dim * (default_dot_dim + 1); rc_vtype local_res = zero(); rc_vtype local_max = zero(); @@ -449,12 +445,12 @@ void multinorm2_inf_kernel( remove_complex* norms2, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + (1 + compute_inf) * default_dot_dim * (default_dot_dim + 1), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -463,20 +459,23 @@ void multinorm2_inf_kernel( multinorm2_inf_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms1, norms2, stop_status, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } template -void multidot_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, const Accessor3d krylov_bases, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + const Accessor3d krylov_bases, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* reduction_helper_array) { /* * In general in this kernel: @@ -543,10 +542,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(dot_dim * dot_dim), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -555,7 +554,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -567,7 +567,7 @@ void singledot_kernel( size_type stride_next_krylov, const Accessor3d krylov_bases, ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { /* * In general in this kernel: @@ -624,10 +624,9 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -637,7 +636,8 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 2fe0d63bcbf..3f4c22eea94 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -140,14 +140,15 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - const ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray* - reduction_helper_array) +void multidot_kernel(size_type k, size_type num_rows, size_type num_cols, + const ValueType* __restrict__ krylov_bases, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -160,7 +161,7 @@ void multidot_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType* __restrict__ reduction_helper = (*reduction_helper_array); + ValueType* __restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { @@ -197,11 +198,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -211,9 +211,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, k, num_rows, num_cols, krylov_bases, next_krylov_basis, stride_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - (UninitializedArray*) - reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -267,13 +266,15 @@ void update_next_krylov_kernel( // Must be called with at least `num_cols` blocks, each with `block_size` // threads. `block_size` must be a power of 2. template -void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType* __restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void update_hessenberg_2_kernel(size_type iter, size_type num_rows, + size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto col_idx = item_ct1.get_group(2); @@ -314,10 +315,9 @@ void update_hessenberg_2_kernel( const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -327,7 +327,8 @@ void update_hessenberg_2_kernel( iter, num_rows, num_cols, next_krylov_basis, stride_next_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 3ec0db6d469..f37a9bcd091 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -111,7 +111,7 @@ template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* __restrict__ values, size_type stride, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { const auto tidx = thread::get_thread_id_flat(item_ct1); @@ -168,10 +168,9 @@ void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* values, size_type stride) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -179,7 +178,8 @@ void orthonormalize_subspace_vectors_kernel( config::warp_size)]] { orthonormalize_subspace_vectors_kernel( num_rows, num_cols, values, stride, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -329,13 +329,13 @@ void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, template -void multidot_kernel( - size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, - const ValueType* __restrict__ g_k, size_type g_k_stride, - ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& - reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type nrhs, + const ValueType* __restrict__ p_i, + const ValueType* __restrict__ g_k, size_type g_k_stride, + ValueType* __restrict__ alpha, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -379,19 +379,20 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, const stopping_status* stop_status) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multidot_kernel(num_rows, nrhs, p_i, g_k, g_k_stride, alpha, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + multidot_kernel( + num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, + item_ct1, + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } From e2d732b404820b1edb0f9b17ce5751bf67e51afa Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 23 Feb 2022 14:25:47 +0100 Subject: [PATCH 12/14] fix deprecated --- dpcpp/base/executor.dp.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 2aed12c606d..18bdc5ed077 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -118,7 +118,8 @@ void DpcppExecutor::raw_free(void* ptr) const noexcept #endif // GKO_VERBOSE_LEVEL >= 1 // OpenCL error code use 0 for CL_SUCCESS and negative number for others // error. if the error is not from OpenCL, it will return CL_SUCCESS. - int err_code = err.get_cl_code(); + // TODO: check whether error + int err_code = err.code().value(); // if return CL_SUCCESS, exit 1 as DPCPP error. if (err_code == 0) { err_code = 1; From f32e5c44cd17c470f40a6112f3ac1c06c67e63a1 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Wed, 23 Feb 2022 17:03:32 +0100 Subject: [PATCH 13/14] fix the mixed on single mode and use adapt macro --- core/base/mixed_precision_types.hpp | 90 ++++++++++---------- core/solver/cb_gmres_kernels.hpp | 40 ++++----- include/ginkgo/core/base/types.hpp | 122 +++++++--------------------- 3 files changed, 96 insertions(+), 156 deletions(-) diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index a93fc9013b9..33ea7166697 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -39,39 +39,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, float, __VA_ARGS__); \ - template _macro(float, double, double, __VA_ARGS__); \ - template _macro(double, float, float, __VA_ARGS__); \ - template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -81,21 +81,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #endif diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index aa3d352e5a6..bac66826406 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -63,27 +63,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * const accessor, or not. */ #define GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE_HELPER(_macro, _const) \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ + acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ template _macro( \ float, \ GKO_UNPACK( \ @@ -98,16 +98,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. template _macro(float, \ GKO_UNPACK(acc::range>)); \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ - acc::range, \ - _const std::complex>>)); \ - template _macro( \ + acc::range, _const std::complex>>))); \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ acc::range, \ - _const std::complex>>)); \ + _const std::complex>>))); \ template _macro( \ std::complex, \ GKO_UNPACK( \ diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 009c2e182cd..26854ad3cba 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -419,6 +419,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_ADAPT_SINGLE(_macro) \ + template <> \ + _macro GKO_NOT_IMPLEMENTED +#else +#define GKO_ADAPT_SINGLE(_macro) template _macro +#endif + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -427,16 +435,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ template _macro(float); \ - template <> \ - _macro(double) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(float); \ - template _macro(double) -#endif + GKO_ADAPT_SINGLE(_macro(double)) /** @@ -447,18 +448,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ - template <> \ - _macro(std::complex) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ template _macro(std::complex); \ - template _macro(std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex)) /** @@ -471,26 +464,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and scalar type, respectively. */ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, float); \ - template <> \ - _macro(std::complex, double) GKO_NOT_IMPLEMENTED; -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double) -#endif + +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(float, float); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + template _macro(std::complex, float); \ + GKO_ADAPT_SINGLE(_macro(std::complex, double)) /** @@ -515,21 +496,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, int32); \ - template <> \ - _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(float, int64); \ - template <> \ - _macro(double, int64) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ template _macro(float, int32); \ - template _macro(double, int32); \ + GKO_ADAPT_SINGLE(_macro(double, int32)); \ template _macro(float, int64); \ - template _macro(double, int64) -#endif + GKO_ADAPT_SINGLE(_macro(double, int64)) /** @@ -540,46 +511,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ template _macro(std::complex, int32); \ - template <> \ - _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_SINGLE(_macro(std::complex, int32)); \ template _macro(std::complex, int64); \ - template <> \ - _macro(std::complex, int64) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int64); \ - template _macro(std::complex, int64) -#endif - - -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED - - -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#else + GKO_ADAPT_SINGLE(_macro(std::complex, int64)) /** @@ -591,11 +528,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + GKO_ADAPT_SINGLE(_macro(float, double)); \ + GKO_ADAPT_SINGLE(_macro(double, float)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** @@ -610,10 +547,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ template _macro(float, float); \ - template _macro(double, double); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** From 025fb91e65a43339e8dc8dca7f3d475973e65434 Mon Sep 17 00:00:00 2001 From: "Yuhsiang M. Tsai" Date: Thu, 24 Feb 2022 16:41:01 +0100 Subject: [PATCH 14/14] split the dense kernel --- common/CMakeLists.txt | 9 + common/unified/matrix/dense_cmnpr.cpp | 80 +++ common/unified/matrix/dense_compute_norm1.cpp | 74 +++ common/unified/matrix/dense_compute_norm2.cpp | 76 +++ common/unified/matrix/dense_css.cpp | 92 ++++ common/unified/matrix/dense_kernels.cpp | 503 ------------------ common/unified/matrix/dense_kernels_2.cpp | 198 +++++++ common/unified/matrix/dense_kernels_3.cpp | 78 +++ common/unified/matrix/dense_kernels_4.cpp | 78 +++ common/unified/matrix/dense_kernels_5.cpp | 114 ++++ common/unified/matrix/dense_kernels_6.cpp | 253 +++++++++ dpcpp/CMakeLists.txt | 3 +- 12 files changed, 1054 insertions(+), 504 deletions(-) create mode 100644 common/unified/matrix/dense_cmnpr.cpp create mode 100644 common/unified/matrix/dense_compute_norm1.cpp create mode 100644 common/unified/matrix/dense_compute_norm2.cpp create mode 100644 common/unified/matrix/dense_css.cpp create mode 100644 common/unified/matrix/dense_kernels_2.cpp create mode 100644 common/unified/matrix/dense_kernels_3.cpp create mode 100644 common/unified/matrix/dense_kernels_4.cpp create mode 100644 common/unified/matrix/dense_kernels_5.cpp create mode 100644 common/unified/matrix/dense_kernels_6.cpp diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 04f33b4a718..c0757c8226c 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -10,6 +10,15 @@ set(UNIFIED_SOURCES matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp + matrix/dense_kernels_2.cpp + matrix/dense_kernels_3.cpp + matrix/dense_kernels_4.cpp + matrix/dense_kernels_5.cpp + matrix/dense_kernels_6.cpp + matrix/dense_compute_norm2.cpp + matrix/dense_compute_norm1.cpp + matrix/dense_cmnpr.cpp + matrix/dense_css.cpp matrix/ell_kernels.cpp matrix/hybrid_kernels.cpp matrix/sellp_kernels.cpp diff --git a/common/unified/matrix/dense_cmnpr.cpp b/common/unified/matrix/dense_cmnpr.cpp new file mode 100644 index 00000000000..aa81556beef --- /dev/null +++ b/common/unified/matrix/dense_cmnpr.cpp @@ -0,0 +1,80 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense* source, + size_type& result) +{ + Array partial{exec, source->get_size()[0] + 1}; + count_nonzeros_per_row(exec, source, partial.get_data()); + run_kernel_reduction( + exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, + GKO_KERNEL_REDUCE_MAX(size_type), + partial.get_data() + source->get_size()[0], source->get_size()[0], + partial); + result = exec->copy_val_to_host(partial.get_const_data() + + source->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_compute_norm1.cpp b/common/unified/matrix/dense_compute_norm1.cpp new file mode 100644 index 00000000000..762260bb6b6 --- /dev/null +++ b/common/unified/matrix/dense_compute_norm1.cpp @@ -0,0 +1,74 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm1(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_compute_norm2.cpp b/common/unified/matrix/dense_compute_norm2.cpp new file mode 100644 index 00000000000..890980ad247 --- /dev/null +++ b/common/unified/matrix/dense_compute_norm2.cpp @@ -0,0 +1,76 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_css.cpp b/common/unified/matrix/dense_css.cpp new file mode 100644 index 00000000000..71a7d8ab405 --- /dev/null +++ b/common/unified/matrix/dense_css.cpp @@ -0,0 +1,92 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_slice_sets(std::shared_ptr exec, + const matrix::Dense* source, + size_type slice_size, size_type stride_factor, + size_type* slice_sets, size_type* slice_lengths) +{ + const auto num_rows = source->get_size()[0]; + Array row_nnz{exec, num_rows}; + count_nonzeros_per_row(exec, source, row_nnz.get_data()); + const auto num_slices = + static_cast(ceildiv(num_rows, slice_size)); + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, + auto stride_factor, auto num_rows) { + const auto row = slice * slice_size + local_row; + return row < num_rows ? static_cast( + ceildiv(row_nnz[row], stride_factor) * + stride_factor) + : size_type{}; + }, + GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, + gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, + num_rows); + exec->copy(num_slices, slice_lengths, slice_sets); + components::prefix_sum(exec, slice_sets, num_slices + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index d5037879eda..4e4babe4e9d 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -104,509 +104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); -template -void scale(std::shared_ptr exec, - const matrix::Dense* alpha, matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); - - -template -void inv_scale(std::shared_ptr exec, - const matrix::Dense* alpha, - matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_INV_SCALE_KERNEL); - - -template -void add_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); - - -template -void sub_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) += alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); - - -template -void sub_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) -= alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return x(i, j) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return conj(x(i, j)) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, - result->get_values(), x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - -template -void compute_norm1(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, - GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), - x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); - - -template -void compute_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type& result) -{ - Array partial{exec, source->get_size()[0] + 1}; - count_nonzeros_per_row(exec, source, partial.get_data()); - run_kernel_reduction( - exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, - GKO_KERNEL_REDUCE_MAX(size_type), - partial.get_data() + source->get_size()[0], source->get_size()[0], - partial); - result = exec->copy_val_to_host(partial.get_const_data() + - source->get_size()[0]); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void compute_slice_sets(std::shared_ptr exec, - const matrix::Dense* source, - size_type slice_size, size_type stride_factor, - size_type* slice_sets, size_type* slice_lengths) -{ - const auto num_rows = source->get_size()[0]; - Array row_nnz{exec, num_rows}; - count_nonzeros_per_row(exec, source, row_nnz.get_data()); - const auto num_slices = - static_cast(ceildiv(num_rows, slice_size)); - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, - auto stride_factor, auto num_rows) { - const auto row = slice * slice_size + local_row; - return row < num_rows ? static_cast( - ceildiv(row_nnz[row], stride_factor) * - stride_factor) - : size_type{}; - }, - GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, - gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, - num_rows); - exec->copy(num_slices, slice_lengths, slice_sets); - components::prefix_sum(exec, slice_sets, num_slices + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); - - -template -void count_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* mtx, - IndexType* result) -{ - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto mtx) { - return is_nonzero(mtx(i, j)) ? 1 : 0; - }, - GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); - - -template -void symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(perm[row], perm[col]); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); - - -template -void inv_symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); - - -template -void row_gather(std::shared_ptr exec, - const Array* row_idxs, - const matrix::Dense* orig, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = orig(rows[row], col); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); - - -template -void advanced_row_gather(std::shared_ptr exec, - const matrix::Dense* alpha, - const Array* row_idxs, - const matrix::Dense* orig, - const matrix::Dense* beta, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, - auto beta, auto gathered) { - using type = device_type>; - gathered(row, col) = - static_cast(alpha[0] * orig(rows[row], col)) + - static_cast(beta[0]) * - static_cast(gathered(row, col)); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, - alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); - - -template -void column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(row, perm[col]); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); - - -template -void inverse_row_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], col) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, row_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); - - -template -void inverse_column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Diagonal* diag) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, - diag->get_size()[0], orig, diag->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); - - -template -void inplace_absolute_dense(std::shared_ptr exec, - matrix::Dense* source) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source) { - source(row, col) = abs(source(row, col)); - }, - source->get_size(), source); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void outplace_absolute_dense(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = abs(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void make_complex(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = source(row, col); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); - - -template -void get_real(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = real(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); - - -template -void get_imag(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = imag(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); - - } // namespace dense } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/dense_kernels_2.cpp b/common/unified/matrix/dense_kernels_2.cpp new file mode 100644 index 00000000000..b3179c6137f --- /dev/null +++ b/common/unified/matrix/dense_kernels_2.cpp @@ -0,0 +1,198 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void scale(std::shared_ptr exec, + const matrix::Dense* alpha, matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); + + +template +void inv_scale(std::shared_ptr exec, + const matrix::Dense* alpha, + matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_INV_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); + + +template +void sub_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) += alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void sub_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) -= alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_3.cpp b/common/unified/matrix/dense_kernels_3.cpp new file mode 100644 index 00000000000..f4a580a7109 --- /dev/null +++ b/common/unified/matrix/dense_kernels_3.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_4.cpp b/common/unified/matrix/dense_kernels_4.cpp new file mode 100644 index 00000000000..1f9b4287f68 --- /dev/null +++ b/common/unified/matrix/dense_kernels_4.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_5.cpp b/common/unified/matrix/dense_kernels_5.cpp new file mode 100644 index 00000000000..65bb8dd91e4 --- /dev/null +++ b/common/unified/matrix/dense_kernels_5.cpp @@ -0,0 +1,114 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense* mtx, + IndexType* result) +{ + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto mtx) { + return is_nonzero(mtx(i, j)) ? 1 : 0; + }, + GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); + + +template +void symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(perm[row], perm[col]); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); + + +template +void inv_symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(perm[row], perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_6.cpp b/common/unified/matrix/dense_kernels_6.cpp new file mode 100644 index 00000000000..3629bd930fb --- /dev/null +++ b/common/unified/matrix/dense_kernels_6.cpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void row_gather(std::shared_ptr exec, + const Array* row_idxs, + const matrix::Dense* orig, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { + gathered(row, col) = orig(rows[row], col); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); + + +template +void advanced_row_gather(std::shared_ptr exec, + const matrix::Dense* alpha, + const Array* row_idxs, + const matrix::Dense* orig, + const matrix::Dense* beta, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, + auto beta, auto gathered) { + using type = device_type>; + gathered(row, col) = + static_cast(alpha[0] * orig(rows[row], col)) + + static_cast(beta[0]) * + static_cast(gathered(row, col)); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, + alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(row, perm[col]); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* row_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(perm[row], col) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + + +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Diagonal* diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, + diag->get_size()[0], orig, diag->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); + + +template +void inplace_absolute_dense(std::shared_ptr exec, + matrix::Dense* source) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source) { + source(row, col) = abs(source(row, col)); + }, + source->get_size(), source); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void outplace_absolute_dense(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = abs(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void make_complex(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = source(row, col); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); + + +template +void get_real(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = real(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); + + +template +void get_imag(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = imag(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index a119147a25d..5500ecd3a7f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -79,7 +79,8 @@ target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") string(REPLACE ";" "," GINKGO_DPCPP_AOT_TARGETS_FORMATTED "${GINKGO_DPCPP_AOT_TARGETS}") - target_compile_options(ginkgo_dpcpp PRIVATE -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + target_compile_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) else() # When building ginkgo as a static library, we need to use dpcpp and per_source # link option when the program uses a dpcpp related function.