diff --git a/.gitignore b/.gitignore index af0a88ef513..169f5e8d9ad 100644 --- a/.gitignore +++ b/.gitignore @@ -23,8 +23,9 @@ build # Visual studio code .vscode -### Generated header file +### Generated header files include/config.hpp +dpcpp/base/config.hpp ### C++ # Prerequisites diff --git a/CMakeLists.txt b/CMakeLists.txt index 761f10f420d..80464201e54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,9 +95,20 @@ else() option(GINKGO_BUILD_HWLOC "Build Ginkgo with HWLOC. Default is ON. If a system HWLOC is not found, then we try to build it ourselves. Switch this OFF to disable HWLOC." ON) endif() option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ backend." OFF) +set(GINKGO_DPCPP_AOT_TARGETS "" CACHE STRING + "Uses DPC++ AOT with set targets (e.g., spir64_gen;spir64_x86_64). The default is '' (use JIT).") +set(GINKGO_DPCPP_SUBGROUPS "8;16;32" CACHE STRING + "The subgroup sizes to generate DPC++ kernels for. The default is '8;16;32'.") +set(GINKGO_DPCPP_WORKGROUPS "256;512" CACHE STRING + "The max workgroup sizes to generate DPC++ kernels for. The default is '256;512'.") option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) +# Third party libraries +option(GINKGO_DOWNLOAD_GFLAGS "Force Ginkgo to download gflags." OFF) +option(GINKGO_DOWNLOAD_GTEST "Force Ginkgo to download GTest." OFF) +option(GINKGO_DOWNLOAD_HWLOC "Force Ginkgo to download HWLOC." OFF) +option(GINKGO_DOWNLOAD_RAPIDJSON "Force Ginkgo to download RapidJSON." 
OFF) set(GINKGO_CIRCULAR_DEPS_FLAGS "-Wl,--no-undefined") @@ -186,7 +197,6 @@ if(GINKGO_BUILD_HWLOC) set(GINKGO_HAVE_HWLOC 1) else() set(GINKGO_HAVE_HWLOC 0) - message(STATUS "HWLOC is being forcibly switched off") endif() if(GINKGO_BUILD_MPI) @@ -223,14 +233,14 @@ endif() # Try to find the third party packages before using our subdirectories include(cmake/package_helpers.cmake) if(GINKGO_BUILD_TESTS) - find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest + ginkgo_find_package(GTest 1.10.0) # No need for QUIET as CMake ships FindGTest endif() if(GINKGO_BUILD_BENCHMARKS) - find_package(gflags 2.2.2 QUIET) - find_package(RapidJSON 1.1.0 QUIET) + ginkgo_find_package(gflags 2.2.2 QUIET) + ginkgo_find_package(RapidJSON 1.1.0 QUIET) endif() if(GINKGO_BUILD_HWLOC) - find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC + ginkgo_find_package(HWLOC 2.1) # No need for QUIET as we ship FindHWLOC endif() add_subdirectory(third_party) # Third-party tools and libraries diff --git a/cmake/Modules/FindHWLOC.cmake b/cmake/Modules/FindHWLOC.cmake index fe01a40b2f4..91d61499265 100644 --- a/cmake/Modules/FindHWLOC.cmake +++ b/cmake/Modules/FindHWLOC.cmake @@ -39,13 +39,13 @@ include(hwloc_helpers) find_path(HWLOC_INCLUDE_DIRS NAMES "hwloc.h" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES include DOC "Find the hwloc.h main header" ) find_library(HWLOC_LIBRARIES "hwloc" - HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} + HINTS ${HWLOC_DIR} $ENV{HWLOC_DIR} ${HWLOC_ROOT} $ENV{HWLOC_ROOT} PATH_SUFFIXES lib lib64 DOC "Find the hwloc library" ) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index 28685fdf232..1f8181912df 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -41,7 +41,11 @@ function(ginkgo_create_dpcpp_test test_name) target_compile_features(${test_target_name} PUBLIC cxx_std_17) target_compile_options(${test_target_name} PRIVATE "${GINKGO_DPCPP_FLAGS}") 
target_compile_options(${test_target_name} PRIVATE "${GINKGO_COMPILER_FLAGS}") - target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + target_compile_options(${test_target_name} PRIVATE -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + else() + target_link_options(${test_target_name} PRIVATE -fsycl-device-code-split=per_kernel) + endif() ginkgo_set_test_target_properties(${test_name} ${test_target_name}) # Note: MKL_ENV is empty on linux. Maybe need to apply MKL_ENV to all test. if (MKL_ENV) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 479b889aeaf..976e1927c6b 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -200,6 +200,10 @@ ginkgo_print_variable(${detailed_log} "GINKGO_BUILD_HWLOC") ginkgo_print_variable(${detailed_log} "HWLOC_VERSION") ginkgo_print_variable(${detailed_log} "HWLOC_LIBRARIES") ginkgo_print_variable(${detailed_log} "HWLOC_INCLUDE_DIRS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GFLAGS") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_GTEST") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_HWLOC") +ginkgo_print_variable(${minimal_log} "GINKGO_DOWNLOAD_RAPIDJSON") _minimal( " diff --git a/cmake/package_helpers.cmake b/cmake/package_helpers.cmake index e1d196ad553..2abead229e2 100644 --- a/cmake/package_helpers.cmake +++ b/cmake/package_helpers.cmake @@ -57,3 +57,15 @@ function(ginkgo_download_file url filename hash_type hash) message(FATAL_ERROR "Download of ${filename} failed.") endif() endfunction(ginkgo_download_file) + +# Find package with a protection. 
To force the download of library foo instead of searching for it, set the CMake +# variable GINKGO_DOWNLOAD_FOO=ON +# +# \param name The name of the package +# \param ARGN Extra arguments to give to find_package +macro(ginkgo_find_package name) + string(TOUPPER "${name}" GKO_TPL_PKG_NAME) + if(NOT GINKGO_DOWNLOAD_${GKO_TPL_PKG_NAME}) + find_package(${name} ${ARGN}) + endif() +endmacro() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 04f33b4a718..c0757c8226c 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -10,6 +10,15 @@ set(UNIFIED_SOURCES matrix/coo_kernels.cpp matrix/csr_kernels.cpp matrix/dense_kernels.cpp + matrix/dense_kernels_2.cpp + matrix/dense_kernels_3.cpp + matrix/dense_kernels_4.cpp + matrix/dense_kernels_5.cpp + matrix/dense_kernels_6.cpp + matrix/dense_compute_norm2.cpp + matrix/dense_compute_norm1.cpp + matrix/dense_cmnpr.cpp + matrix/dense_css.cpp matrix/ell_kernels.cpp matrix/hybrid_kernels.cpp matrix/sellp_kernels.cpp diff --git a/common/unified/matrix/dense_cmnpr.cpp b/common/unified/matrix/dense_cmnpr.cpp new file mode 100644 index 00000000000..aa81556beef --- /dev/null +++ b/common/unified/matrix/dense_cmnpr.cpp @@ -0,0 +1,80 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_max_nnz_per_row(std::shared_ptr exec, + const matrix::Dense* source, + size_type& result) +{ + Array partial{exec, source->get_size()[0] + 1}; + count_nonzeros_per_row(exec, source, partial.get_data()); + run_kernel_reduction( + exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, + GKO_KERNEL_REDUCE_MAX(size_type), + partial.get_data() + source->get_size()[0], source->get_size()[0], + partial); + result = exec->copy_val_to_host(partial.get_const_data() + + source->get_size()[0]); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/dpcpp/base/config.hpp b/common/unified/matrix/dense_compute_norm1.cpp similarity index 64% rename from dpcpp/base/config.hpp rename to common/unified/matrix/dense_compute_norm1.cpp index 43465b61c8a..762260bb6b6 100644 --- a/dpcpp/base/config.hpp +++ b/common/unified/matrix/dense_compute_norm1.cpp @@ -30,51 +30,45 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_DPCPP_BASE_CONFIG_HPP_ -#define GKO_DPCPP_BASE_CONFIG_HPP_ +#include "core/matrix/dense_kernels.hpp" +#include #include -#include -namespace gko { -namespace kernels { -namespace dpcpp { - - -struct config { - /** - * The type containing a bitmask over all lanes of a warp. - */ - using lane_mask_type = uint64; - - /** - * The number of threads within a Dpcpp subgroup. - */ - static constexpr uint32 warp_size = 16; - - /** - * The bitmask of the entire warp. 
- */ - static constexpr auto full_lane_mask = ~zero(); +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" - /** - * The minimal amount of warps that need to be scheduled for each block - * to maximize GPU occupancy. - */ - static constexpr uint32 min_warps_per_block = 4; - /** - * The default maximal number of threads allowed in DPCPP group - */ - static constexpr uint32 max_block_size = 256; -}; - - -} // namespace dpcpp +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm1(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE } // namespace kernels } // namespace gko - - -#endif // GKO_DPCPP_BASE_CONFIG_HPP_ diff --git a/common/unified/matrix/dense_compute_norm2.cpp b/common/unified/matrix/dense_compute_norm2.cpp new file mode 100644 index 00000000000..890980ad247 --- /dev/null +++ b/common/unified/matrix/dense_compute_norm2.cpp @@ -0,0 +1,76 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + [] GKO_KERNEL(auto a, auto b) { return a + b; }, + [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, + result->get_values(), x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_css.cpp b/common/unified/matrix/dense_css.cpp new file mode 100644 index 00000000000..71a7d8ab405 --- /dev/null +++ b/common/unified/matrix/dense_css.cpp @@ -0,0 +1,92 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_slice_sets(std::shared_ptr exec, + const matrix::Dense* source, + size_type slice_size, size_type stride_factor, + size_type* slice_sets, size_type* slice_lengths) +{ + const auto num_rows = source->get_size()[0]; + Array row_nnz{exec, num_rows}; + count_nonzeros_per_row(exec, source, row_nnz.get_data()); + const auto num_slices = + static_cast(ceildiv(num_rows, slice_size)); + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, + auto stride_factor, auto num_rows) { + const auto row = slice * slice_size + local_row; + return row < num_rows ? 
static_cast( + ceildiv(row_nnz[row], stride_factor) * + stride_factor) + : size_type{}; + }, + GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, + gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, + num_rows); + exec->copy(num_slices, slice_lengths, slice_sets); + components::prefix_sum(exec, slice_sets, num_slices + 1); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index d5037879eda..4e4babe4e9d 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -104,509 +104,6 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); -template -void scale(std::shared_ptr exec, - const matrix::Dense* alpha, matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) *= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); - - -template -void inv_scale(std::shared_ptr exec, - const matrix::Dense* alpha, - matrix::Dense* x) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[col]; - }, - x->get_size(), alpha->get_const_values(), x); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { - x(row, col) /= alpha[0]; - }, - x->get_size(), alpha->get_const_values(), x); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - 
GKO_DECLARE_DENSE_INV_SCALE_KERNEL); - - -template -void add_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) += alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); - - -template -void sub_scaled(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Dense* x, matrix::Dense* y) -{ - if (alpha->get_size()[1] > 1) { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[col] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } else { - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { - y(row, col) -= alpha[0] * x(row, col); - }, - x->get_size(), alpha->get_const_values(), x, y); - } -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( - GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); - - -template -void add_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) += alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); - - -template -void sub_scaled_diag(std::shared_ptr exec, - const matrix::Dense* alpha, - const matrix::Diagonal* x, - matrix::Dense* y) -{ - const auto diag_values = x->get_const_values(); - run_kernel( - 
exec, - [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { - y(i, i) -= alpha[0] * diag[i]; - }, - x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); - - -template -void compute_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return x(i, j) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); - - -template -void compute_conj_dot(std::shared_ptr exec, - const matrix::Dense* x, - const matrix::Dense* y, - matrix::Dense* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x, auto y) { - return conj(x(i, j)) * y(i, j); - }, - GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), - x, y); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); - - -template -void compute_norm2(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, - [] GKO_KERNEL(auto a, auto b) { return a + b; }, - [] GKO_KERNEL(auto a) { return sqrt(a); }, remove_complex{}, - result->get_values(), x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); - -template -void compute_norm1(std::shared_ptr exec, - const matrix::Dense* x, - matrix::Dense>* result) -{ - run_kernel_col_reduction( - exec, [] GKO_KERNEL(auto i, auto j, auto x) { return abs(x(i, j)); }, - GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), - x->get_size(), x); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); - - -template -void 
compute_max_nnz_per_row(std::shared_ptr exec, - const matrix::Dense* source, - size_type& result) -{ - Array partial{exec, source->get_size()[0] + 1}; - count_nonzeros_per_row(exec, source, partial.get_data()); - run_kernel_reduction( - exec, [] GKO_KERNEL(auto i, auto partial) { return partial[i]; }, - GKO_KERNEL_REDUCE_MAX(size_type), - partial.get_data() + source->get_size()[0], source->get_size()[0], - partial); - result = exec->copy_val_to_host(partial.get_const_data() + - source->get_size()[0]); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_MAX_NNZ_PER_ROW_KERNEL); - - -template -void compute_slice_sets(std::shared_ptr exec, - const matrix::Dense* source, - size_type slice_size, size_type stride_factor, - size_type* slice_sets, size_type* slice_lengths) -{ - const auto num_rows = source->get_size()[0]; - Array row_nnz{exec, num_rows}; - count_nonzeros_per_row(exec, source, row_nnz.get_data()); - const auto num_slices = - static_cast(ceildiv(num_rows, slice_size)); - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto slice, auto local_row, auto row_nnz, auto slice_size, - auto stride_factor, auto num_rows) { - const auto row = slice * slice_size + local_row; - return row < num_rows ? static_cast( - ceildiv(row_nnz[row], stride_factor) * - stride_factor) - : size_type{}; - }, - GKO_KERNEL_REDUCE_MAX(size_type), slice_lengths, 1, - gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor, - num_rows); - exec->copy(num_slices, slice_lengths, slice_sets); - components::prefix_sum(exec, slice_sets, num_slices + 1); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COMPUTE_SLICE_SETS_KERNEL); - - -template -void count_nonzeros_per_row(std::shared_ptr exec, - const matrix::Dense* mtx, - IndexType* result) -{ - run_kernel_row_reduction( - exec, - [] GKO_KERNEL(auto i, auto j, auto mtx) { - return is_nonzero(mtx(i, j)) ? 
1 : 0; - }, - GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( - GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); - - -template -void symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(perm[row], perm[col]); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); - - -template -void inv_symm_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); - - -template -void row_gather(std::shared_ptr exec, - const Array* row_idxs, - const matrix::Dense* orig, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { - gathered(row, col) = orig(rows[row], col); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); - - -template -void advanced_row_gather(std::shared_ptr exec, - const matrix::Dense* alpha, - const Array* row_idxs, - const matrix::Dense* orig, - const matrix::Dense* beta, - matrix::Dense* row_collection) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, - 
auto beta, auto gathered) { - using type = device_type>; - gathered(row, col) = - static_cast(alpha[0] * orig(rows[row], col)) + - static_cast(beta[0]) * - static_cast(gathered(row, col)); - }, - dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, - alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), - row_collection); -} - -GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( - GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); - - -template -void column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, col) = orig(row, perm[col]); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); - - -template -void inverse_row_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* row_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(perm[row], col) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, row_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); - - -template -void inverse_column_permute(std::shared_ptr exec, - const Array* permutation_indices, - const matrix::Dense* orig, - matrix::Dense* column_permuted) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { - permuted(row, perm[col]) = orig(row, col); - }, - orig->get_size(), orig, *permutation_indices, column_permuted); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); - - -template -void extract_diagonal(std::shared_ptr exec, - const matrix::Dense* orig, - matrix::Diagonal* diag) -{ - 
run_kernel( - exec, - [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, - diag->get_size()[0], orig, diag->get_values()); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); - - -template -void inplace_absolute_dense(std::shared_ptr exec, - matrix::Dense* source) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source) { - source(row, col) = abs(source(row, col)); - }, - source->get_size(), source); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void outplace_absolute_dense(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = abs(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); - - -template -void make_complex(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = source(row, col); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); - - -template -void get_real(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = real(source(row, col)); - }, - source->get_size(), source, result); -} - -GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); - - -template -void get_imag(std::shared_ptr exec, - const matrix::Dense* source, - matrix::Dense>* result) -{ - run_kernel( - exec, - [] GKO_KERNEL(auto row, auto col, auto source, auto result) { - result(row, col) = imag(source(row, col)); - }, - source->get_size(), source, result); -} - 
-GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); - - } // namespace dense } // namespace GKO_DEVICE_NAMESPACE } // namespace kernels diff --git a/common/unified/matrix/dense_kernels_2.cpp b/common/unified/matrix/dense_kernels_2.cpp new file mode 100644 index 00000000000..b3179c6137f --- /dev/null +++ b/common/unified/matrix/dense_kernels_2.cpp @@ -0,0 +1,198 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void scale(std::shared_ptr exec, + const matrix::Dense* alpha, matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) *= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SCALE_KERNEL); + + +template +void inv_scale(std::shared_ptr exec, + const matrix::Dense* alpha, + matrix::Dense* x) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[col]; + }, + x->get_size(), alpha->get_const_values(), x); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x) { + x(row, col) /= alpha[0]; + }, + x->get_size(), alpha->get_const_values(), x); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_INV_SCALE_KERNEL); + + +template +void add_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } 
else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) += alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_ADD_SCALED_KERNEL); + + +template +void sub_scaled(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Dense* x, matrix::Dense* y) +{ + if (alpha->get_size()[1] > 1) { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[col] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto x, auto y) { + y(row, col) -= alpha[0] * x(row, col); + }, + x->get_size(), alpha->get_const_values(), x, y); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE( + GKO_DECLARE_DENSE_SUB_SCALED_KERNEL); + + +template +void add_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) += alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL); + + +template +void sub_scaled_diag(std::shared_ptr exec, + const matrix::Dense* alpha, + const matrix::Diagonal* x, + matrix::Dense* y) +{ + const auto diag_values = x->get_const_values(); + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto alpha, auto diag, auto y) { + y(i, i) -= alpha[0] * diag[i]; + }, + x->get_size()[0], alpha->get_const_values(), x->get_const_values(), y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git 
a/common/unified/matrix/dense_kernels_3.cpp b/common/unified/matrix/dense_kernels_3.cpp new file mode 100644 index 00000000000..f4a580a7109 --- /dev/null +++ b/common/unified/matrix/dense_kernels_3.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void compute_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return x(i, j) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_4.cpp b/common/unified/matrix/dense_kernels_4.cpp new file mode 100644 index 00000000000..1f9b4287f68 --- /dev/null +++ b/common/unified/matrix/dense_kernels_4.cpp @@ -0,0 +1,78 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void compute_conj_dot(std::shared_ptr exec, + const matrix::Dense* x, + const matrix::Dense* y, + matrix::Dense* result) +{ + run_kernel_col_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto x, auto y) { + return conj(x(i, j)) * y(i, j); + }, + GKO_KERNEL_REDUCE_SUM(ValueType), result->get_values(), x->get_size(), + x, y); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_5.cpp b/common/unified/matrix/dense_kernels_5.cpp new file mode 100644 index 00000000000..65bb8dd91e4 --- /dev/null +++ b/common/unified/matrix/dense_kernels_5.cpp @@ -0,0 +1,114 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. + * + * @ingroup dense + */ +namespace dense { + + +template +void count_nonzeros_per_row(std::shared_ptr exec, + const matrix::Dense* mtx, + IndexType* result) +{ + run_kernel_row_reduction( + exec, + [] GKO_KERNEL(auto i, auto j, auto mtx) { + return is_nonzero(mtx(i, j)) ? 
1 : 0; + }, + GKO_KERNEL_REDUCE_SUM(IndexType), result, 1, mtx->get_size(), mtx); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL); +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); + + +template +void symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(perm[row], perm[col]); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_SYMM_PERMUTE_KERNEL); + + +template +void inv_symm_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(perm[row], perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_SYMM_PERMUTE_KERNEL); + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/matrix/dense_kernels_6.cpp b/common/unified/matrix/dense_kernels_6.cpp new file mode 100644 index 00000000000..3629bd930fb --- /dev/null +++ b/common/unified/matrix/dense_kernels_6.cpp @@ -0,0 +1,253 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/matrix/dense_kernels.hpp" + + +#include +#include + + +#include "common/unified/base/kernel_launch.hpp" +#include "common/unified/base/kernel_launch_reduction.hpp" +#include "core/base/mixed_precision_types.hpp" +#include "core/components/prefix_sum_kernels.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +/** + * @brief The Dense matrix format namespace. 
+ * + * @ingroup dense + */ +namespace dense { + + +template +void row_gather(std::shared_ptr exec, + const Array* row_idxs, + const matrix::Dense* orig, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto rows, auto gathered) { + gathered(row, col) = orig(rows[row], col); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, orig, *row_idxs, + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ROW_GATHER_KERNEL); + + +template +void advanced_row_gather(std::shared_ptr exec, + const matrix::Dense* alpha, + const Array* row_idxs, + const matrix::Dense* orig, + const matrix::Dense* beta, + matrix::Dense* row_collection) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto alpha, auto orig, auto rows, + auto beta, auto gathered) { + using type = device_type>; + gathered(row, col) = + static_cast(alpha[0] * orig(rows[row], col)) + + static_cast(beta[0]) * + static_cast(gathered(row, col)); + }, + dim<2>{row_idxs->get_num_elems(), orig->get_size()[1]}, + alpha->get_const_values(), orig, *row_idxs, beta->get_const_values(), + row_collection); +} + +GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2( + GKO_DECLARE_DENSE_ADVANCED_ROW_GATHER_KERNEL); + + +template +void column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, col) = orig(row, perm[col]); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_COLUMN_PERMUTE_KERNEL); + + +template +void inverse_row_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* row_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, 
auto permuted) { + permuted(perm[row], col) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, row_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_ROW_PERMUTE_KERNEL); + + +template +void inverse_column_permute(std::shared_ptr exec, + const Array* permutation_indices, + const matrix::Dense* orig, + matrix::Dense* column_permuted) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto orig, auto perm, auto permuted) { + permuted(row, perm[col]) = orig(row, col); + }, + orig->get_size(), orig, *permutation_indices, column_permuted); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_DENSE_INV_COLUMN_PERMUTE_KERNEL); + + +template +void extract_diagonal(std::shared_ptr exec, + const matrix::Dense* orig, + matrix::Diagonal* diag) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto i, auto orig, auto diag) { diag[i] = orig(i, i); }, + diag->get_size()[0], orig, diag->get_values()); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_EXTRACT_DIAGONAL_KERNEL); + + +template +void inplace_absolute_dense(std::shared_ptr exec, + matrix::Dense* source) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source) { + source(row, col) = abs(source(row, col)); + }, + source->get_size(), source); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_INPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void outplace_absolute_dense(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = abs(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_OUTPLACE_ABSOLUTE_DENSE_KERNEL); + + +template +void make_complex(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = 
source(row, col); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_MAKE_COMPLEX_KERNEL); + + +template +void get_real(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = real(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_REAL_KERNEL); + + +template +void get_imag(std::shared_ptr exec, + const matrix::Dense* source, + matrix::Dense>* result) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto source, auto result) { + result(row, col) = imag(source(row, col)); + }, + source->get_size(), source, result); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_GET_IMAG_KERNEL); + + +} // namespace dense +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/core/base/block_sizes.hpp b/core/base/block_sizes.hpp index 7701d66111f..5a22870374d 100644 --- a/core/base/block_sizes.hpp +++ b/core/base/block_sizes.hpp @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_CORE_BASE_BLOCK_SIZES_HPP_ +#include + + #include -#include namespace gko { @@ -51,9 +53,10 @@ namespace fixedblock { * A compile-time list of block sizes for which dedicated fixed-block matrix * and corresponding preconditioner kernels should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; #else -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #endif diff --git a/core/base/mixed_precision_types.hpp b/core/base/mixed_precision_types.hpp index a93fc9013b9..33ea7166697 100644 --- a/core/base/mixed_precision_types.hpp +++ b/core/base/mixed_precision_types.hpp @@ -39,39 +39,39 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(float, float, double, __VA_ARGS__); \ - template _macro(float, double, float, __VA_ARGS__); \ - template _macro(float, double, double, __VA_ARGS__); \ - template _macro(double, float, float, __VA_ARGS__); \ - template _macro(double, float, double, __VA_ARGS__); \ - template _macro(double, double, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) 
\ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(float, double, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #else -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) \ - template _macro(float, float, float, __VA_ARGS__); \ - template _macro(double, double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, \ - std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE(_macro, ...) 
\ + template _macro(float, float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex, \ + std::complex, __VA_ARGS__)) #endif @@ -81,21 +81,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef GINKGO_MIXED_PRECISION -#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ - template _macro(float, float, __VA_ARGS__); \ - template _macro(float, double, __VA_ARGS__); \ - template _macro(double, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) +#define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) \ + template _macro(float, float, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(float, double, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, float, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ + template _macro(std::complex, std::complex, __VA_ARGS__); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)); \ + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #else #define GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_TYPE_2(_macro, ...) 
\ template _macro(float, float, __VA_ARGS__); \ - template _macro(double, double, __VA_ARGS__); \ + GKO_ADAPT_SINGLE(_macro(double, double, __VA_ARGS__)); \ template _macro(std::complex, std::complex, __VA_ARGS__); \ - template _macro(std::complex, std::complex, __VA_ARGS__) + GKO_ADAPT_SINGLE( \ + _macro(std::complex, std::complex, __VA_ARGS__)) #endif diff --git a/core/base/types.hpp b/core/base/types.hpp index 49c4fb99e15..2e2c4c99dce 100644 --- a/core/base/types.hpp +++ b/core/base/types.hpp @@ -143,6 +143,7 @@ class ConfigSet { public: static constexpr unsigned num_groups = sizeof...(num_bits); static constexpr std::array bits{num_bits...}; + using can_encode = std::true_type; /** * Decodes the `position` information from encoded diff --git a/core/solver/cb_gmres_kernels.hpp b/core/solver/cb_gmres_kernels.hpp index aa3d352e5a6..bac66826406 100644 --- a/core/solver/cb_gmres_kernels.hpp +++ b/core/solver/cb_gmres_kernels.hpp @@ -63,27 +63,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * const accessor, or not. */ #define GKO_INSTANTIATE_FOR_EACH_CB_GMRES_TYPE_HELPER(_macro, _const) \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro( \ + acc::range>))); \ + GKO_ADAPT_SINGLE(_macro( \ double, \ GKO_UNPACK( \ - acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ - template _macro(double, \ - GKO_UNPACK(acc::range>)); \ + acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ + GKO_ADAPT_SINGLE( \ + _macro(double, GKO_UNPACK(acc::range>))); \ template _macro( \ float, \ GKO_UNPACK( \ @@ -98,16 +98,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
template _macro(float, \ GKO_UNPACK(acc::range>)); \ - template _macro( \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ - acc::range, \ - _const std::complex>>)); \ - template _macro( \ + acc::range, _const std::complex>>))); \ + GKO_ADAPT_SINGLE(_macro( \ std::complex, \ GKO_UNPACK( \ acc::range, \ - _const std::complex>>)); \ + _const std::complex>>))); \ template _macro( \ std::complex, \ GKO_UNPACK( \ diff --git a/core/synthesizer/implementation_selection.hpp b/core/synthesizer/implementation_selection.hpp index a78ac391d2e..39efa24b67d 100644 --- a/core/synthesizer/implementation_selection.hpp +++ b/core/synthesizer/implementation_selection.hpp @@ -48,24 +48,24 @@ namespace syn { #define GKO_ENABLE_IMPLEMENTATION_SELECTION(_name, _callable) \ template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ ::gko::syn::type_list, InferredArgs...) \ GKO_KERNEL_NOT_FOUND; \ \ template \ inline void _name( \ - ::gko::syn::value_list, Predicate is_eligible, \ - ::gko::syn::value_list int_args, \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence int_args, \ ::gko::syn::type_list type_args, InferredArgs... args) \ { \ if (is_eligible(K)) { \ _callable( \ - ::gko::syn::value_list(), \ + std::integer_sequence(), \ std::forward(args)...); \ } else { \ - _name(::gko::syn::value_list(), is_eligible, \ + _name(std::integer_sequence(), is_eligible, \ int_args, type_args, std::forward(args)...); \ } \ } \ @@ -73,36 +73,35 @@ namespace syn { "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ - template \ - inline void _name(::gko::syn::value_list, Predicate, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ - ::gko::syn::value_list, \ - ::gko::syn::type_list, InferredArgs...) 
\ - GKO_KERNEL_NOT_FOUND; \ - \ - template \ - inline void _name( \ - ::gko::syn::value_list, \ - Predicate is_eligible, \ - ::gko::syn::value_list bool_args, \ - ::gko::syn::value_list int_args, \ - ::gko::syn::value_list size_args, \ - ::gko::syn::type_list type_args, InferredArgs... args) \ - { \ - if (is_eligible(K)) { \ - _callable( \ - std::forward(args)...); \ - } else { \ - _name(::gko::syn::value_list(), \ - is_eligible, bool_args, int_args, size_args, type_args, \ - std::forward(args)...); \ - } \ +#define GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(_name, _callable) \ + template \ + inline void _name(std::integer_sequence, Predicate, \ + std::integer_sequence, \ + std::integer_sequence, \ + std::integer_sequence, \ + ::gko::syn::type_list, InferredArgs...) \ + GKO_KERNEL_NOT_FOUND; \ + \ + template \ + inline void _name( \ + std::integer_sequence, Predicate is_eligible, \ + std::integer_sequence bool_args, \ + std::integer_sequence int_args, \ + std::integer_sequence size_args, \ + ::gko::syn::type_list type_args, InferredArgs... 
args) \ + { \ + if (is_eligible(K)) { \ + _callable( \ + std::forward(args)...); \ + } else { \ + _name(std::integer_sequence(), is_eligible, \ + bool_args, int_args, size_args, type_args, \ + std::forward(args)...); \ + } \ } diff --git a/core/test/CMakeLists.txt b/core/test/CMakeLists.txt index 2bb711e7a2e..07641c4ad6c 100644 --- a/core/test/CMakeLists.txt +++ b/core/test/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(multigrid) add_subdirectory(preconditioner) add_subdirectory(reorder) add_subdirectory(solver) +add_subdirectory(synthesizer) add_subdirectory(stop) add_subdirectory(utils) diff --git a/core/test/synthesizer/CMakeLists.txt b/core/test/synthesizer/CMakeLists.txt new file mode 100644 index 00000000000..c2d6d722e4a --- /dev/null +++ b/core/test/synthesizer/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(containers) diff --git a/core/test/synthesizer/containers.cpp b/core/test/synthesizer/containers.cpp new file mode 100644 index 00000000000..9975dc03e82 --- /dev/null +++ b/core/test/synthesizer/containers.cpp @@ -0,0 +1,280 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + +namespace { + + +struct IntegerSequenceExtensions : public ::testing::Test { + using range1 = gko::syn::as_list>; + using range1_exp = std::integer_sequence; + using range2 = gko::syn::as_list>; + using range2_exp = std::integer_sequence; + + using test = gko::syn::value_list; + using sorted_asc_dups = + std::integer_sequence; + using sorted_asc_nodups = + std::integer_sequence; + using sorted_desc_dups = + std::integer_sequence; + using sorted_desc_nodups = + std::integer_sequence; + + using empty = std::integer_sequence; + +protected: + IntegerSequenceExtensions() {} +}; + +using ::testing::StaticAssertTypeEq; + + +TEST_F(IntegerSequenceExtensions, CanCreateRanges) +{ + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanConcatenate) +{ + using test_case = + gko::syn::concatenate>; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanGetArrayAndValue) +{ + auto expected_array = std::array{0, 7614, 453, 16, 9, 16, 0, 0, -4}; + using front = gko::syn::front; + + ASSERT_EQ(gko::syn::as_array(test{}), expected_array); + 
ASSERT_EQ(gko::syn::as_array(front{})[0], 0); + ASSERT_EQ(gko::syn::as_value(front{}), 0); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingNoDuplicates) +{ + using sorted = gko::syn::sort; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortAscendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanSortDescendingWithDuplicates) +{ + using sorted = gko::syn::sort_keep; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMax) +{ + using test_case = gko::syn::max; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMin) +{ + using test_case = gko::syn::min; + using expected = std::integer_sequence; + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessMedian) +{ + using test_case = gko::syn::median; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessFront) +{ + using test_case = gko::syn::front; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanAccessBack) +{ + using test_case = gko::syn::back; + using expected = std::integer_sequence; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanUseAtIndex) +{ + using idx1 = gko::syn::at_index<1, test>; + using exp_idx1 = std::integer_sequence; + using idx2 = gko::syn::at_index<2, test>; + using exp_idx2 = std::integer_sequence; + using idx4 = gko::syn::at_index<4, test>; + using exp_idx4 = std::integer_sequence; + using idx6 = gko::syn::at_index<6, test>; + using exp_idx6 = std::integer_sequence; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + 
+TEST_F(IntegerSequenceExtensions, EmptyTests) +{ + using test1 = gko::syn::at_index<72, empty>; + using test2 = gko::syn::back; + using test3 = gko::syn::front; + using test4 = gko::syn::median; + using test5 = gko::syn::min; + using test6 = gko::syn::sort; + using test7 = gko::syn::sort; + using test8 = gko::syn::sort_keep; + using test9 = gko::syn::sort_keep; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + +struct int_encoder { + using can_encode = std::true_type; + + static constexpr int encode() { return 1; } + + template + static constexpr int encode(int v1, Rest&&... rest) + { + return v1 * encode(std::forward(rest)...); + } +}; + + +TEST_F(IntegerSequenceExtensions, CanMergeEmptyList) +{ + StaticAssertTypeEq, empty>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeOneList) +{ + StaticAssertTypeEq, test>(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeTwoLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using expected = std::integer_sequence; + + using res = gko::syn::merge; + + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeLists) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + + +TEST_F(IntegerSequenceExtensions, CanMergeThreeListsOtherOrder) +{ + using list1 = std::integer_sequence; + using list2 = std::integer_sequence; + using list3 = std::integer_sequence; + using expected1 = std::integer_sequence; + using expected2 = + std::integer_sequence; + + using res1 = gko::syn::merge; + using res2 = gko::syn::merge; + + 
StaticAssertTypeEq(); + StaticAssertTypeEq(); +} + +} // namespace diff --git a/cuda/base/kernel_launch_reduction.cuh b/cuda/base/kernel_launch_reduction.cuh index db7040b07f5..58bf5e3580a 100644 --- a/cuda/base/kernel_launch_reduction.cuh +++ b/cuda/base/kernel_launch_reduction.cuh @@ -357,7 +357,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -382,7 +382,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -433,7 +433,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -464,8 +464,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -480,7 +480,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... 
args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -493,8 +493,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/cuda/factorization/par_ic_kernels.cu b/cuda/factorization/par_ic_kernels.cu index 4318cd1e1d9..43e2c66526f 100644 --- a/cuda/factorization/par_ic_kernels.cu +++ b/cuda/factorization/par_ic_kernels.cu @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/cuda/factorization/par_ict_kernels.cu b/cuda/factorization/par_ict_kernels.cu index 2f65affb628..f1d0689bc0f 100644 --- a/cuda/factorization/par_ict_kernels.cu +++ b/cuda/factorization/par_ict_kernels.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -81,7 +81,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -137,7 +137,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void compute_factor(syn::value_list, +void 
compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -180,7 +180,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -202,7 +203,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_approx_filter_kernel.cu b/cuda/factorization/par_ilut_approx_filter_kernel.cu index be86588a909..a8f60fd2bc2 100644 --- a/cuda/factorization/par_ilut_approx_filter_kernel.cu +++ b/cuda/factorization/par_ilut_approx_filter_kernel.cu @@ -73,7 +73,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -81,7 +81,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -196,7 +196,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/cuda/factorization/par_ilut_filter_kernel.cu b/cuda/factorization/par_ilut_filter_kernel.cu index 6dd83c41835..85d8c03211e 100644 --- 
a/cuda/factorization/par_ilut_filter_kernel.cu +++ b/cuda/factorization/par_ilut_filter_kernel.cu @@ -69,7 +69,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -79,7 +79,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -153,8 +153,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/factorization/par_ilut_spgeam_kernel.cu b/cuda/factorization/par_ilut_spgeam_kernel.cu index f1a6b7518ed..f77d3d77119 100644 --- a/cuda/factorization/par_ilut_spgeam_kernel.cu +++ b/cuda/factorization/par_ilut_spgeam_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -172,8 +172,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git 
a/cuda/factorization/par_ilut_sweep_kernel.cu b/cuda/factorization/par_ilut_sweep_kernel.cu index fb1248158f9..c4a13c3915f 100644 --- a/cuda/factorization/par_ilut_sweep_kernel.cu +++ b/cuda/factorization/par_ilut_sweep_kernel.cu @@ -70,7 +70,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -80,7 +80,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -133,8 +133,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 2550aa2ceb3..c4f3f76ad98 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -88,13 +88,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -105,7 +105,7 @@ namespace { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -213,7 +213,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -429,7 +429,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -459,7 +459,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -487,8 +488,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool use_classical = true; if (a->get_strategy()->get_name() == "sparselib" || @@ -519,7 +520,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ 
-658,7 +659,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -836,7 +837,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(), c_tmp_col_idxs_array.get_const_data(), c_tmp_vals_array.get_const_data(), beta->get_const_values(), d_row_ptrs, @@ -865,7 +866,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu index c906bb7fd1a..957faca5d6e 100644 --- a/cuda/matrix/ell_kernels.cu +++ b/cuda/matrix/ell_kernels.cu @@ -102,7 +102,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -113,7 +113,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -249,8 +249,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -283,8 +283,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/cuda/matrix/fbcsr_kernels.cu b/cuda/matrix/fbcsr_kernels.cu index c9c4782d262..eede67c3211 100644 --- a/cuda/matrix/fbcsr_kernels.cu +++ b/cuda/matrix/fbcsr_kernels.cu @@ -261,7 +261,7 @@ namespace { template -void transpose_blocks_impl(syn::value_list, +void transpose_blocks_impl(std::integer_sequence, matrix::Fbcsr* const mat) { constexpr int subwarp_size = config::warp_size; @@ -310,7 +310,7 @@ void transpose(const std::shared_ptr exec, select_transpose_blocks( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), trans); + std::integer_sequence(), syn::type_list<>(), trans); } else { GKO_NOT_IMPLEMENTED; } diff --git a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu 
index 3917c54bf43..34d4ebb529d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_instantiate.inc.cu @@ -75,7 +75,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu index 2632e9c7180..95048463d1d 100644 --- a/cuda/preconditioner/jacobi_advanced_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_advanced_apply_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/cuda/preconditioner/jacobi_common.hpp.in b/cuda/preconditioner/jacobi_common.hpp.in index d6be53c1835..19bd21385f4 100644 --- a/cuda/preconditioner/jacobi_common.hpp.in +++ 
b/cuda/preconditioner/jacobi_common.hpp.in @@ -30,12 +30,15 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include + + #include -#include #include "cuda/base/config.hpp" + namespace gko { namespace kernels { namespace cuda { @@ -55,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu index 240b4f4b5fd..b232a50536d 100644 --- a/cuda/preconditioner/jacobi_generate_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_generate_instantiate.inc.cu @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -119,7 +119,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/cuda/preconditioner/jacobi_generate_kernel.cu b/cuda/preconditioner/jacobi_generate_kernel.cu index 01ce054322e..50b3f9d0968 100644 --- a/cuda/preconditioner/jacobi_generate_kernel.cu +++ b/cuda/preconditioner/jacobi_generate_kernel.cu @@ -55,7 +55,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -85,9 +85,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return 
max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), + syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/cuda/preconditioner/jacobi_kernels.cu b/cuda/preconditioner/jacobi_kernels.cu index c88eab57738..105cddf7bfc 100644 --- a/cuda/preconditioner/jacobi_kernels.cu +++ b/cuda/preconditioner/jacobi_kernels.cu @@ -153,7 +153,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -202,7 +202,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -226,7 +226,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu index 2d21f9d43ac..3e4cb263e18 100644 --- a/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu +++ b/cuda/preconditioner/jacobi_simple_apply_instantiate.inc.cu @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, 
size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -110,7 +110,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/cuda/preconditioner/jacobi_simple_apply_kernel.cu b/cuda/preconditioner/jacobi_simple_apply_kernel.cu index 2b96cfc3321..e6ef05611f0 100644 --- a/cuda/preconditioner/jacobi_simple_apply_kernel.cu +++ b/cuda/preconditioner/jacobi_simple_apply_kernel.cu @@ -54,7 +54,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -82,7 +82,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index ac7cf2d981e..5500ecd3a7f 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -11,6 +11,14 @@ set(GINKGO_MKL_ROOT "${MKL_ROOT}" PARENT_SCOPE) find_package(oneDPL REQUIRED HINTS "$ENV{DPL_ROOT}") set(GINKGO_DPL_ROOT "${DPL_ROOT}" PARENT_SCOPE) +# Reverse the subgroup list, replace ; to , from the user strings +string(REPLACE ";" "," 
GINKGO_DPCPP_SUBGROUPS_FORMATTED "${GINKGO_DPCPP_SUBGROUPS}") +string(REPLACE ";" "," GINKGO_DPCPP_WORKGROUPS_FORMATTED "${GINKGO_DPCPP_WORKGROUPS}") +# Generate the DPC++ Configuration file +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp.in + ${CMAKE_CURRENT_SOURCE_DIR}/base/config.hpp @ONLY) + + add_library(ginkgo_dpcpp $ "") target_sources(ginkgo_dpcpp PRIVATE @@ -51,25 +59,36 @@ target_sources(ginkgo_dpcpp ${GKO_UNIFIED_COMMON_SOURCES} ) + ginkgo_compile_features(ginkgo_dpcpp) target_compile_definitions(ginkgo_dpcpp PRIVATE GKO_COMPILING_DPCPP _ONEDPL_COMPILE_KERNEL=0) set(GINKGO_DPCPP_FLAGS ${GINKGO_DPCPP_FLAGS} PARENT_SCOPE) target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_DPCPP_FLAGS}") target_compile_options(ginkgo_dpcpp PRIVATE "${GINKGO_COMPILER_FLAGS}") +target_compile_features(ginkgo_dpcpp PUBLIC cxx_std_17) + # Note: add MKL as PRIVATE not PUBLIC (MKL example shows) to avoid propagating # find_package(MKL) everywhere when linking ginkgo (see the MKL example # https://software.intel.com/content/www/us/en/develop/documentation/onemkl-windows-developer-guide/top/getting-started/cmake-config-for-onemkl.html) target_compile_options(ginkgo_dpcpp PRIVATE $) -target_compile_features(ginkgo_dpcpp PRIVATE cxx_std_17) target_include_directories(ginkgo_dpcpp PRIVATE $) +target_include_directories(ginkgo_dpcpp PRIVATE $) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-lib=all) -# When building ginkgo as a static library, we need to use dpcpp and per_kernel -# link option when the program uses a dpcpp related function. 
-if (BUILD_SHARED_LIBS) - target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_kernel) -else () - target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_kernel) + +if (NOT GINKGO_DPCPP_AOT_TARGETS STREQUAL "") + string(REPLACE ";" "," GINKGO_DPCPP_AOT_TARGETS_FORMATTED "${GINKGO_DPCPP_AOT_TARGETS}") + target_compile_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-targets=${GINKGO_DPCPP_AOT_TARGETS_FORMATTED}) +else() + # When building ginkgo as a static library, we need to use dpcpp and per_source + # link option when the program uses a dpcpp related function. + if (BUILD_SHARED_LIBS) + target_link_options(ginkgo_dpcpp PRIVATE -fsycl-device-code-split=per_source) + else () + target_link_options(ginkgo_dpcpp PUBLIC -fsycl-device-code-split=per_source) + endif() endif() target_link_libraries(ginkgo_dpcpp PUBLIC ginkgo_device) target_link_libraries(ginkgo_dpcpp PRIVATE MKL::MKL_DPCPP oneDPL) diff --git a/dpcpp/base/config.hpp.in b/dpcpp/base/config.hpp.in new file mode 100644 index 00000000000..15b56383b93 --- /dev/null +++ b/dpcpp/base/config.hpp.in @@ -0,0 +1,117 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_DPCPP_BASE_CONFIG_HPP_IN_ +#define GKO_DPCPP_BASE_CONFIG_HPP_IN_ + + +#include +#include +#include + + +#include "core/base/types.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { + + +// CMake generated subgroup list. We sort by ascending order. +// clang-format off +using subgroup_list_t = + syn::sort>; +using subgroup_list_desc_t = syn::sort; +constexpr auto subgroup_list = subgroup_list_t{}; +// clang-format on + +// CMake generated workgroup list. We sort by ascending order. +// clang-format off +using workgroup_list_t = + syn::sort>; +using workgroup_list_desc_t = syn::sort; +constexpr auto workgroup_list = workgroup_list_t{}; +// clang-format on + +using empty_list_t = std::integer_sequence; + +struct config { + /** + * The type containing a bitmask over all lanes of a warp. + */ + using lane_mask_type = uint64; + + /** + * The number of threads within a Dpcpp subgroup. 
+ */ + static constexpr uint32 warp_size = + syn::as_value(syn::median{}); + + /** + * The bitmask of the entire warp. + */ + static constexpr auto full_lane_mask = ~zero(); + + /** + * The minimal amount of warps that need to be scheduled for each block + * to maximize GPU occupancy. + */ + static constexpr uint32 min_warps_per_block = 4; + + /** + * The default maximal number of threads allowed in DPCPP group + */ + static constexpr uint32 max_block_size = + syn::as_value(syn::max{}); +}; + + +using KCFG_1D = ConfigSet<11, 7>; +constexpr auto kcfg_1d_list = + syn::merge{}; +constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); + +using BlockCfg = ConfigSet<11>; +constexpr auto block_cfg_list = + syn::merge{}; +constexpr auto block_cfg_array = syn::as_array(block_cfg_list); + + +} // namespace dpcpp +} // namespace kernels +} // namespace gko + + +#endif // GKO_DPCPP_BASE_CONFIG_HPP_IN_ diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index 2aed12c606d..18bdc5ed077 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -118,7 +118,8 @@ void DpcppExecutor::raw_free(void* ptr) const noexcept #endif // GKO_VERBOSE_LEVEL >= 1 // OpenCL error code use 0 for CL_SUCCESS and negative number for others // error. if the error is not from OpenCL, it will return CL_SUCCESS. - int err_code = err.get_cl_code(); + // TODO: check whether error + int err_code = err.code().value(); // if return CL_SUCCESS, exit 1 as DPCPP error. if (err_code == 0) { err_code = 1; diff --git a/dpcpp/base/helper.hpp b/dpcpp/base/helper.hpp index d8081543313..65d9a0f1fd1 100644 --- a/dpcpp/base/helper.hpp +++ b/dpcpp/base/helper.hpp @@ -81,7 +81,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @param kernel_ the kernel name */ #define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \ - template \ + template \ inline void name_(dim3 grid, dim3 block, gko::size_type, \ sycl::queue* queue, InferredArgs... 
args) \ { \ @@ -115,19 +115,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * available to decode<0> for blocksize and decode<1> for * subgroup_size by cfg_ */ -#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ - template \ - void name_(std::uint32_t desired_cfg, dim3 grid, dim3 block, \ - gko::size_type dynamic_shared_memory, sycl::queue* queue, \ - InferredArgs... args) \ - { \ - callable_( \ - list_, \ - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, \ - ::gko::syn::value_list(), ::gko::syn::value_list(), \ - ::gko::syn::value_list(), \ - ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ - queue, std::forward(args)...); \ +#define GKO_ENABLE_DEFAULT_CONFIG_CALL(name_, callable_, list_) \ + template \ + void name_(int desired_cfg, dim3 grid, dim3 block, \ + gko::size_type dynamic_shared_memory, sycl::queue* queue, \ + InferredArgs... args) \ + { \ + callable_( \ + list_, [&desired_cfg](int cfg) { return cfg == desired_cfg; }, \ + std::integer_sequence(), std::integer_sequence(), \ + std::integer_sequence(), \ + ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, \ + queue, std::forward(args)...); \ } // __WG_BOUND__ gives the cuda-like launch bound in cuda ordering @@ -178,7 +177,7 @@ bool validate(sycl::queue* queue, unsigned workgroup_size, * @return the first valid config */ template -std::uint32_t get_first_cfg(const IterArr& arr, Validate verify) +int get_first_cfg(const IterArr& arr, Validate verify) { for (auto& cfg : arr) { if (verify(cfg)) { diff --git a/dpcpp/base/kernel_launch_reduction.dp.hpp b/dpcpp/base/kernel_launch_reduction.dp.hpp index 0df00f58c6e..dbdf34de726 100644 --- a/dpcpp/base/kernel_launch_reduction.dp.hpp +++ b/dpcpp/base/kernel_launch_reduction.dp.hpp @@ -37,10 +37,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/components/cooperative_groups.dp.hpp" #include "dpcpp/components/reduction.dp.hpp" @@ -53,15 +53,7 @@ namespace kernels { namespace dpcpp { -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list_simple_reduction = - syn::value_list(); - - -template void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, @@ -73,16 +65,17 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -110,7 +103,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size, } -template void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, @@ -122,16 +115,17 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); constexpr auto num_partials = wg_size / sg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - subgroup_partial_acc(cgh); + sycl::accessor + 
subgroup_partial_acc(sycl::range<1>(num_partials), cgh); const auto range = sycl_nd_range(dim3(num_workgroups), dim3(wg_size)); const auto global_size = num_workgroups * wg_size; cgh.parallel_for( range, [= ](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] { - auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0]; + ValueType* __restrict__ subgroup_partial = + static_cast(subgroup_partial_acc.get_pointer()); const auto tidx = thread::get_thread_id_flat(idx); const auto local_tidx = static_cast(tidx % wg_size); auto subgroup = @@ -161,7 +155,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -201,7 +195,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } -template void run_kernel_reduction_impl(std::shared_ptr exec, @@ -254,16 +248,14 @@ void run_kernel_reduction(std::shared_ptr exec, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -276,16 +268,14 @@ void run_kernel_reduction(std::shared_ptr exec, ValueType* result, size_type size, KernelArgs&&... 
args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_run_kernel_reduction( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } @@ -293,10 +283,10 @@ void run_kernel_reduction(std::shared_ptr exec, namespace { -template -void generic_kernel_row_reduction_2d(syn::value_list, +void generic_kernel_row_reduction_2d(std::integer_sequence, std::shared_ptr exec, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, @@ -349,8 +339,8 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_kernel_row_reduction_2d, generic_kernel_row_reduction_2d); -template void generic_kernel_col_reduction_2d_small( sycl::handler& cgh, int64 rows, int64 cols, int64 row_blocks, @@ -363,13 +353,14 @@ void generic_kernel_col_reduction_2d_small( constexpr auto subgroups_per_workgroup = wg_size / sg_size; // stores the subwarp_size partial sums from each warp, grouped by warp constexpr auto shared_storage = subgroups_per_workgroup * ssg_size; - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(shared_storage), cgh); const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size)); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { - auto 
block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); const auto ssg_id = thread::get_subwarp_id_flat(id); const auto local_sg_id = id.get_local_id(2) / sg_size; @@ -424,7 +415,7 @@ void generic_kernel_col_reduction_2d_small( } -template void generic_kernel_col_reduction_2d_blocked( @@ -436,9 +427,9 @@ void generic_kernel_col_reduction_2d_blocked( constexpr auto sg_size = KCFG_1D::decode<1>(cfg); const auto range = sycl_nd_range(dim3(row_blocks, col_blocks), dim3(wg_size)); - sycl::accessor, 0, - sycl::access_mode::read_write, sycl::access::target::local> - block_partial_acc(cgh); + sycl::accessor + block_partial_acc(sycl::range<1>(wg_size), cgh); cgh.parallel_for( range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] { const auto sg_id = thread::get_subwarp_id_flat(id); @@ -449,7 +440,8 @@ void generic_kernel_col_reduction_2d_blocked( const auto sg_rank = subgroup.thread_rank(); const auto col = sg_rank + static_cast(id.get_group(1)) * sg_size; - auto block_partial = &(*block_partial_acc.get_pointer())[0]; + ValueType* __restrict__ block_partial = + static_cast(block_partial_acc.get_pointer()); auto partial = identity; // accumulate within a thread if (col < cols) { @@ -493,10 +485,10 @@ void generic_kernel_reduction_finalize_2d( } -template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, std::shared_ptr exec, int64 max_workgroups, KernelFunction fn, ReductionOp op, FinalizeOp finalize, @@ -537,7 +529,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_generic_col_reduction_small, run_generic_col_reduction_small); -template void run_kernel_row_reduction_stage1(std::shared_ptr exec, @@ -549,8 +541,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - 
syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -562,8 +554,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, Array partial{exec, static_cast(col_blocks * rows)}; generic_kernel_row_reduction_2d( - syn::value_list{}, exec, rows, cols, col_blocks, fn, - op, [](auto v) { return v; }, identity, partial.get_data(), 1, + std::integer_sequence{}, exec, rows, cols, col_blocks, + fn, op, [](auto v) { return v; }, identity, partial.get_data(), 1, args...); queue->submit([&](sycl::handler& cgh) { generic_kernel_reduction_finalize_2d( @@ -578,8 +570,8 @@ void run_kernel_row_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), syn::type_list<>(), exec, rows, cols, - 1, fn, op, finalize, identity, result, + std::integer_sequence(), syn::type_list<>(), exec, rows, + cols, 1, fn, op, finalize, identity, result, static_cast(result_stride), args...); } } @@ -588,7 +580,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(select_kernel_row_reduction_stage1, run_kernel_row_reduction_stage1); -template void run_kernel_col_reduction_stage1(std::shared_ptr exec, @@ -600,8 +592,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, constexpr auto wg_size = KCFG_1D::decode<0>(cfg); constexpr auto sg_size = KCFG_1D::decode<1>(cfg); using subsubgroup_sizes = - syn::value_list(16, sg_size), - std::min(32, sg_size), sg_size>; + std::integer_sequence(16, sg_size), + std::min(32, sg_size), sg_size>; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -614,8 +606,8 @@ void run_kernel_col_reduction_stage1(std::shared_ptr exec, return compiled_ssg_size >= cols || compiled_ssg_size == sg_size; }, - syn::value_list(), 
syn::type_list<>(), exec, max_blocks, - fn, op, finalize, identity, result, size, args...); + std::integer_sequence(), syn::type_list<>(), exec, + max_blocks, fn, op, finalize, identity, result, size, args...); } else { const auto col_blocks = ceildiv(cols, sg_size); const auto row_blocks = ceildiv( @@ -661,16 +653,14 @@ void run_kernel_row_reduction(std::shared_ptr exec, ValueType* result, size_type result_stride, dim<2> size, KernelArgs&&... args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_row_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, result_stride, size, map_to_device(args)...); } @@ -684,16 +674,14 @@ void run_kernel_col_reduction(std::shared_ptr exec, ValueType* result, dim<2> size, KernelArgs&&... 
args) { - const auto desired_cfg = get_first_cfg( - as_array(kcfg_1d_list_simple_reduction), [&](std::uint32_t cfg) { - return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const auto desired_cfg = get_first_cfg(kcfg_1d_array, [&](int cfg) { + return validate(exec->get_queue(), KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); select_kernel_col_reduction_stage1( - kcfg_1d_list_simple_reduction, - [&](std::uint32_t cfg) { return cfg == desired_cfg; }, - syn::value_list(), syn::value_list(), - syn::value_list(), syn::type_list<>(), exec, fn, op, + kcfg_1d_list, [&](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), syn::type_list<>(), exec, fn, op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/dpcpp/components/prefix_sum.dp.hpp b/dpcpp/components/prefix_sum.dp.hpp index ea7dd341ecf..56fe9864612 100644 --- a/dpcpp/components/prefix_sum.dp.hpp +++ b/dpcpp/components/prefix_sum.dp.hpp @@ -129,11 +129,11 @@ __dpct_inline__ void subwarp_prefix_sum(ValueType element, * @note To calculate the prefix sum over an array of size bigger than * `block_size`, `finalize_prefix_sum` has to be used as well. 
*/ -template +template void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, ValueType* __restrict__ block_sum, sycl::nd_item<3> item_ct1, - UninitializedArray& prefix_helper) + ValueType* __restrict__ prefix_helper) { const auto tidx = thread::get_thread_id_flat(item_ct1); const auto element_id = item_ct1.get_local_id(2); @@ -182,22 +182,22 @@ void start_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, } } -template +template void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, ValueType* block_sum) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access::mode::read_write, + sycl::accessor - prefix_helper_acc_ct1(cgh); + prefix_helper_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for(sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { start_prefix_sum( num_elements, elements, block_sum, item_ct1, - *prefix_helper_acc_ct1.get_pointer()); + static_cast( + prefix_helper_acc_ct1.get_pointer())); }); }); } @@ -217,7 +217,7 @@ void start_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, * * @note To calculate a prefix sum, first `start_prefix_sum` has to be called. 
*/ -template +template void finalize_prefix_sum(size_type num_elements, ValueType* __restrict__ elements, const ValueType* __restrict__ block_sum, @@ -234,7 +234,7 @@ void finalize_prefix_sum(size_type num_elements, } } -template +template void finalize_prefix_sum(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_elements, ValueType* elements, const ValueType* block_sum) diff --git a/dpcpp/components/prefix_sum_kernels.dp.cpp b/dpcpp/components/prefix_sum_kernels.dp.cpp index 695fb2f2898..5e89b61877f 100644 --- a/dpcpp/components/prefix_sum_kernels.dp.cpp +++ b/dpcpp/components/prefix_sum_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" +#include + + #include @@ -50,12 +53,6 @@ namespace dpcpp { namespace components { -using BlockCfg = ConfigSet<11>; - -constexpr auto block_cfg_list = - ::gko::syn::value_list(); - GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(start_prefix_sum, start_prefix_sum) GKO_ENABLE_DEFAULT_CONFIG_CALL(start_prefix_sum_call, start_prefix_sum, block_cfg_list) @@ -73,11 +70,9 @@ void prefix_sum(std::shared_ptr exec, IndexType* counts, // prefix_sum should only be performed on a valid array if (num_entries > 0) { auto queue = exec->get_queue(); - constexpr auto block_cfg_array = as_array(block_cfg_list); - const std::uint32_t cfg = - get_first_cfg(block_cfg_array, [&queue](std::uint32_t cfg) { - return validate(queue, BlockCfg::decode<0>(cfg), 16); - }); + const int cfg = get_first_cfg(block_cfg_array, [&queue](int cfg) { + return validate(queue, BlockCfg::decode<0>(cfg), 16); + }); const auto wg_size = BlockCfg::decode<0>(cfg); auto num_blocks = ceildiv(num_entries, wg_size); Array block_sum_array(exec, num_blocks - 1); diff --git a/dpcpp/components/reduction.dp.hpp b/dpcpp/components/reduction.dp.hpp index b1eb86a1992..dde112fba9d 100644 --- a/dpcpp/components/reduction.dp.hpp +++ 
b/dpcpp/components/reduction.dp.hpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_DPCPP_COMPONENTS_REDUCTION_DP_HPP_ +#include #include @@ -47,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" @@ -62,13 +62,7 @@ namespace dpcpp { constexpr int default_block_size = 256; -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); + /** * @internal @@ -202,14 +196,13 @@ void reduce_array(size_type size, const ValueType* __restrict__ source, * `source` of any size. Has to be called a second time on `result` to reduce * an array larger than `block_size`. */ -template -void reduce_add_array( - size_type size, const ValueType* __restrict__ source, - ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, - UninitializedArray(cfg)>& block_sum) +template +void reduce_add_array(size_type size, const ValueType* __restrict__ source, + ValueType* __restrict__ result, sycl::nd_item<3> item_ct1, + ValueType* __restrict__ block_sum) { reduce_array(cfg)>( - size, source, static_cast(block_sum), item_ct1, + size, source, block_sum, item_ct1, [](const ValueType& x, const ValueType& y) { return x + y; }); if (item_ct1.get_local_id(2) == 0) { @@ -217,23 +210,24 @@ void reduce_add_array( } } -template +template void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type size, const ValueType* source, ValueType* result) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor(cfg)>, - 0, sycl::access::mode::read_write, + sycl::accessor - block_sum_acc_ct1(cgh); + block_sum_acc_ct1(sycl::range<1>(KCFG_1D::decode<0>(cfg)), cgh); cgh.parallel_for( 
sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( KCFG_1D::decode<1>(cfg))]] { - reduce_add_array(size, source, result, item_ct1, - *block_sum_acc_ct1.get_pointer()); + reduce_add_array( + size, source, result, item_ct1, + static_cast(block_sum_acc_ct1.get_pointer())); }); }); } @@ -242,7 +236,7 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(reduce_add_array_config, reduce_add_array); GKO_ENABLE_DEFAULT_CONFIG_CALL(reduce_add_array_call, reduce_add_array_config, - kcfg_1d_list); + kcfg_1d_array); /** @@ -263,12 +257,10 @@ ValueType reduce_add_array(std::shared_ptr exec, auto block_results = Array(exec); ValueType answer = zero(); auto queue = exec->get_queue(); - constexpr auto kcfg_1d_array = as_array(kcfg_1d_list); - const std::uint32_t cfg = - get_first_cfg(kcfg_1d_array, [&queue](std::uint32_t cfg) { - return validate(queue, KCFG_1D::decode<0>(cfg), - KCFG_1D::decode<1>(cfg)); - }); + const int cfg = get_first_cfg(kcfg_1d_array, [&queue](int cfg) { + return validate(queue, KCFG_1D::decode<0>(cfg), + KCFG_1D::decode<1>(cfg)); + }); const auto wg_size = KCFG_1D::decode<0>(cfg); const auto sg_size = KCFG_1D::decode<1>(cfg); diff --git a/dpcpp/get_info.cmake b/dpcpp/get_info.cmake index 36918a3a8c6..f595db6c190 100644 --- a/dpcpp/get_info.cmake +++ b/dpcpp/get_info.cmake @@ -2,6 +2,9 @@ ginkgo_print_module_header(${detailed_log} "DPCPP") ginkgo_print_module_footer(${detailed_log} "DPCPP variables:") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_FLAGS") ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SINGLE_MODE") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_AOT_TARGETS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_SUBGROUPS") +ginkgo_print_variable(${detailed_log} "GINKGO_DPCPP_WORKGROUPS") ginkgo_print_module_footer(${detailed_log} "DPCPP environment variables:") ginkgo_print_env_variable(${detailed_log} "SYCL_DEVICE_TYPE") ginkgo_print_env_variable(${detailed_log} "SYCL_BE") diff --git 
a/dpcpp/matrix/csr_kernels.dp.cpp b/dpcpp/matrix/csr_kernels.dp.cpp index fa23cda9d1d..a7e8ee2bd58 100644 --- a/dpcpp/matrix/csr_kernels.dp.cpp +++ b/dpcpp/matrix/csr_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -56,7 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/matrix/csr_builder.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/dpct.hpp" #include "dpcpp/base/helper.hpp" @@ -89,9 +89,9 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; -using classical_kernels = syn::value_list; +using classical_kernels = subgroup_list_desc_t; namespace kernel { @@ -324,8 +324,8 @@ void merge_path_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, Alpha_op alpha_op, sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const IndexType cache_lines = ceildivT(nwarps, spmv_block_size); const IndexType tid = item_ct1.get_local_id(2); @@ -351,9 +351,7 @@ void merge_path_reduce(const IndexType nwarps, tmp_val[item_ct1.get_local_id(2)] = value; tmp_ind[item_ct1.get_local_id(2)] = row; group::this_thread_block(item_ct1).sync(); - bool last = - block_segment_scan_reverse(static_cast(tmp_ind), - static_cast(tmp_val), item_ct1); + bool last = block_segment_scan_reverse(tmp_ind, tmp_val, item_ct1); group::this_thread_block(item_ct1).sync(); if (last) { c[row * c_stride] += alpha_op(tmp_val[item_ct1.get_local_id(2)]); @@ -538,9 +536,8 @@ void 
abstract_reduce(const IndexType nwarps, const ValueType* __restrict__ last_val, const IndexType* __restrict__ last_row, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { merge_path_reduce( nwarps, last_val, last_row, c, c_stride, [](ValueType& x) { return x; }, @@ -554,20 +551,19 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } @@ -579,9 +575,8 @@ void abstract_reduce(const IndexType nwarps, const IndexType* __restrict__ last_row, const ValueType* __restrict__ alpha, ValueType* __restrict__ c, const size_type c_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& tmp_ind, - UninitializedArray& tmp_val) + sycl::nd_item<3> item_ct1, IndexType* __restrict__ tmp_ind, + ValueType* __restrict__ tmp_val) { const auto alpha_val = alpha[0]; merge_path_reduce( @@ -598,26 +593,25 @@ void abstract_reduce(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + 
sycl::accessor - tmp_ind_acc_ct1(cgh); - sycl::accessor, 0, - sycl::access_mode::read_write, + tmp_ind_acc_ct1(sycl::range<1>(spmv_block_size), cgh); + sycl::accessor - tmp_val_acc_ct1(cgh); + tmp_val_acc_ct1(sycl::range<1>(spmv_block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - abstract_reduce(nwarps, last_val, last_row, alpha, c, c_stride, - item_ct1, *tmp_ind_acc_ct1.get_pointer(), - *tmp_val_acc_ct1.get_pointer()); + abstract_reduce( + nwarps, last_val, last_row, alpha, c, c_stride, item_ct1, + static_cast(tmp_ind_acc_ct1.get_pointer()), + static_cast(tmp_val_acc_ct1.get_pointer())); }); }); } -template void device_classical_spmv(const size_type num_rows, const ValueType* __restrict__ val, @@ -654,7 +648,7 @@ void device_classical_spmv(const size_type num_rows, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -667,7 +661,7 @@ void abstract_classical_spmv( [](const ValueType& x, const ValueType& y) { return x; }, item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -689,7 +683,7 @@ void abstract_classical_spmv(dim3 grid, dim3 block, } -template +template void abstract_classical_spmv( const size_type num_rows, const ValueType* __restrict__ alpha, const ValueType* __restrict__ val, const IndexType* __restrict__ col_idxs, @@ -708,7 +702,7 @@ void abstract_classical_spmv( item_ct1); } -template +template void abstract_classical_spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type num_rows, @@ -719,12 +713,14 @@ void abstract_classical_spmv(dim3 grid, dim3 block, ValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - abstract_classical_spmv( - 
num_rows, alpha, val, col_idxs, row_ptrs, b, - b_stride, beta, c, c_stride, item_ct1); - }); + cgh.parallel_for( + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { + abstract_classical_spmv( + num_rows, alpha, val, col_idxs, row_ptrs, b, b_stride, beta, + c, c_stride, item_ct1); + }); }); } @@ -899,7 +895,9 @@ void row_permute_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -946,7 +944,9 @@ void inv_row_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_row_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -994,7 +994,9 @@ void inv_symm_permute_kernel(dim3 grid, dim3 block, { queue->submit([&](sycl::handler& cgh) { cgh.parallel_for( - sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + sycl_nd_range(grid, block), [= + ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( + subgroup_size)]] { inv_symm_permute_kernel( num_rows, permutation, in_row_ptrs, in_cols, in_vals, out_row_ptrs, out_cols, out_vals, item_ct1); @@ -1006,7 +1008,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1076,7 +1078,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void 
classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -1148,7 +1150,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "classical") { IndexType max_length_per_row = 0; using Tcsr = matrix::Csr; @@ -1167,7 +1169,7 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + empty_list_t{}, syn::type_list<>(), exec, a, b, c); } else if (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse") { if (!is_complex()) { @@ -1284,8 +1286,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else if (a->get_strategy()->get_name() == "merge_path") { int items_per_thread = host_kernel::compute_items_per_thread(exec); @@ -1294,8 +1295,7 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + empty_list_t{}, syn::type_list<>(), exec, a, b, c, alpha, beta); } else { GKO_NOT_IMPLEMENTED; } diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp index 97c01e0fac5..3dd7964c4ab 100644 --- a/dpcpp/matrix/dense_kernels.dp.cpp +++ b/dpcpp/matrix/dense_kernels.dp.cpp @@ -33,6 +33,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "core/matrix/dense_kernels.hpp" +#include + + #include #include @@ -49,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/base/onemkl_bindings.hpp" @@ -70,27 +72,18 @@ namespace dpcpp { namespace dense { -// Disable the 64 subgroup. CPU supports 64 now, but conj_transpose will -// lead CL_OUT_OF_RESOURCES. TODO: investigate this issue. -using KCFG_1D = ConfigSet<11, 7>; -constexpr auto kcfg_1d_list = - syn::value_list(); -constexpr auto subgroup_list = syn::value_list(); -constexpr auto kcfg_1d_array = syn::as_array(kcfg_1d_list); constexpr int default_block_size = 256; namespace kernel { -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& space) + ValueType* __restrict__ space) { auto local_x = item_ct1.get_local_id(2); auto local_y = item_ct1.get_local_id(1); @@ -108,35 +101,34 @@ void transpose(const size_type nrows, const size_type ncols, } } -template +template void transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, [](ValueType val) { return val; }, item_ct1, space); } -template +template void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, const size_type in_stride, ValueType* out, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - 
sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - transpose(nrows, ncols, in, in_stride, out, out_stride, - item_ct1, *space_acc_ct1.get_pointer()); + transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } @@ -145,19 +137,18 @@ GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(transpose, transpose); GKO_ENABLE_DEFAULT_CONFIG_CALL(transpose_call, transpose, subgroup_list); -template +template void conj_transpose(const size_type nrows, const size_type ncols, const ValueType* __restrict__ in, const size_type in_stride, ValueType* __restrict__ out, const size_type out_stride, - sycl::nd_item<3> item_ct1, - UninitializedArray& space) + sycl::nd_item<3> item_ct1, ValueType* __restrict__ space) { transpose( nrows, ncols, in, in_stride, out, out_stride, [](ValueType val) { return conj(val); }, item_ct1, space); } -template +template void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, const size_type nrows, const size_type ncols, const ValueType* in, @@ -165,17 +156,16 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory, const size_type out_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - space_acc_ct1(cgh); + space_acc_ct1(sycl::range<1>(sg_size * (sg_size + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) { - conj_transpose(nrows, ncols, in, in_stride, out, - out_stride, item_ct1, - *space_acc_ct1.get_pointer()); + conj_transpose( + nrows, ncols, in, in_stride, out, out_stride, item_ct1, + static_cast(space_acc_ct1.get_pointer())); }); }); } @@ -539,10 +529,9 @@ void 
transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::transpose_call(cfg, grid, block, 0, queue, size[0], size[1], @@ -561,10 +550,9 @@ void conj_transpose(std::shared_ptr exec, auto size = orig->get_size(); auto sg_array = syn::as_array(subgroup_list); auto queue = exec->get_queue(); - const std::uint32_t cfg = - get_first_cfg(sg_array, [&queue](std::uint32_t cfg) { - return validate(queue, cfg * cfg, cfg); - }); + const int cfg = get_first_cfg(sg_array, [&queue](int cfg) { + return validate(queue, cfg * cfg, cfg); + }); dim3 grid(ceildiv(size[1], cfg), ceildiv(size[0], cfg)); dim3 block(cfg, cfg); kernel::conj_transpose_call(cfg, grid, block, 0, queue, size[0], size[1], diff --git a/dpcpp/matrix/ell_kernels.dp.cpp b/dpcpp/matrix/ell_kernels.dp.cpp index fbd1063bf13..23847bdf9b8 100644 --- a/dpcpp/matrix/ell_kernels.dp.cpp +++ b/dpcpp/matrix/ell_kernels.dp.cpp @@ -34,6 +34,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include #include @@ -52,7 +53,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/dense_kernels.hpp" #include "core/synthesizer/implementation_selection.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" #include "dpcpp/components/atomic.dp.hpp" @@ -91,20 +91,26 @@ constexpr int num_threads_per_core = 4; constexpr double ratio = 1e-2; -/** - * max_thread_per_worker is the max number of thread per worker. 
The - * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> - */ -constexpr int max_thread_per_worker = 32; - - /** * A compile-time list of sub-warp sizes for which the spmv kernels should be * compiled. * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. */ -using compiled_kernels = syn::value_list; +using compiled_kernels = + syn::concatenate, subgroup_list_t>; + + +/** + * max_thread_per_worker is the max number of thread per worker. The + * `compiled_kernels` must be a list <0, 1, 2, ..., max_thread_per_worker> + */ +constexpr auto max_thread_per_worker = + syn::as_value(syn::max{}); + + +constexpr auto min_thread_per_worker = + syn::as_value(syn::min{}); namespace kernel { @@ -114,14 +120,14 @@ namespace { template -void spmv_kernel( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, Closure op, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv_kernel(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, + const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, Closure op, + sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { const auto tidx = thread::get_thread_id_flat(item_ct1); const decltype(tidx) column_id = item_ct1.get_group(1); @@ -175,8 +181,9 @@ void spmv_kernel( if (runnable && idx_in_worker == 0) { const auto c_ind = x * c_stride + column_id; if (atomic) { - atomic_add(&(c[c_ind]), - op(storage[item_ct1.get_local_id(2)], c[c_ind])); + atomic_add( + &(c[c_ind]), + op(storage[item_ct1.get_local_id(2)], c[c_ind])); } else { c[c_ind] = op(storage[item_ct1.get_local_id(2)], c[c_ind]); } @@ -187,14 
+194,12 @@ void spmv_kernel( template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range val, const IndexType* __restrict__ col, - const size_type stride, const size_type num_stored_elements_per_row, - acc::range b, OutputValueType* __restrict__ c, - const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range val, const IndexType* __restrict__ col, + const size_type stride, const size_type num_stored_elements_per_row, + acc::range b, OutputValueType* __restrict__ c, + const size_type c_stride, sycl::nd_item<3> item_ct1, + OutputValueType* __restrict__ storage) { spmv_kernel( num_rows, num_worker_per_row, val, col, stride, @@ -213,34 +218,33 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); - - cgh.parallel_for(sycl_nd_range(grid, block), - [=](sycl::nd_item<3> item_ct1) { - spmv( - num_rows, num_worker_per_row, val, col, stride, - num_stored_elements_per_row, b, c, c_stride, - item_ct1, *storage_acc_ct1.get_pointer()); - }); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); + + cgh.parallel_for( + sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { + spmv( + num_rows, num_worker_per_row, val, col, stride, + num_stored_elements_per_row, b, c, c_stride, item_ct1, + static_cast( + storage_acc_ct1.get_pointer())); + }); }); } template -void spmv( - const size_type num_rows, const int num_worker_per_row, - acc::range alpha, acc::range val, - const IndexType* __restrict__ col, const size_type stride, - const size_type num_stored_elements_per_row, acc::range b, - const OutputValueType* __restrict__ beta, OutputValueType* __restrict__ c, - 
const size_type c_stride, sycl::nd_item<3> item_ct1, - UninitializedArray& storage) +void spmv(const size_type num_rows, const int num_worker_per_row, + acc::range alpha, acc::range val, + const IndexType* __restrict__ col, const size_type stride, + const size_type num_stored_elements_per_row, acc::range b, + const OutputValueType* __restrict__ beta, + OutputValueType* __restrict__ c, const size_type c_stride, + sycl::nd_item<3> item_ct1, OutputValueType* __restrict__ storage) { const OutputValueType alpha_val = alpha(0); const OutputValueType beta_val = beta[0]; @@ -280,18 +284,19 @@ void spmv(dim3 grid, dim3 block, size_type dynamic_shared_memory, OutputValueType* c, const size_type c_stride) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - 0, sycl::access_mode::read_write, sycl::access::target::local> - storage_acc_ct1(cgh); + sycl::accessor + storage_acc_ct1( + sycl::range<1>(default_block_size / num_thread_per_worker), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { spmv( num_rows, num_worker_per_row, alpha, val, col, stride, num_stored_elements_per_row, b, beta, c, c_stride, item_ct1, - *storage_acc_ct1.get_pointer()); + static_cast( + storage_acc_ct1.get_pointer())); }); }); } @@ -306,7 +311,7 @@ namespace { template -void abstract_spmv(syn::value_list, +void abstract_spmv(std::integer_sequence, std::shared_ptr exec, int num_worker_per_row, const matrix::Ell* a, @@ -325,7 +330,7 @@ void abstract_spmv(syn::value_list, const auto num_stored_elements_per_row = a->get_num_stored_elements_per_row(); - constexpr int num_thread_per_worker = + constexpr auto num_thread_per_worker = (info == 0) ? 
max_thread_per_worker : info; constexpr bool atomic = (info == 0); const dim3 block_size(default_block_size / num_thread_per_worker, @@ -372,7 +377,7 @@ std::array compute_thread_worker_and_atomicity( std::shared_ptr exec, const matrix::Ell* a) { - int num_thread_per_worker = 8; + int num_thread_per_worker = min_thread_per_worker; int atomic = 0; int num_worker_per_row = 1; @@ -391,8 +396,8 @@ std::array compute_thread_worker_and_atomicity( // decided according to the number of worker allowed on GPU. if (static_cast(ell_ncols) / nrows > ratio) { while (num_thread_per_worker < max_thread_per_worker && - (num_thread_per_worker << 1) <= ell_ncols) { - num_thread_per_worker <<= 1; + (num_thread_per_worker * 2) <= ell_ncols) { + num_thread_per_worker *= 2; } if (num_thread_per_worker == max_thread_per_worker) { num_worker_per_row = @@ -436,8 +441,7 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -470,8 +474,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, num_worker_per_row, a, - b, c, alpha, beta); + empty_list_t{}, syn::type_list<>(), exec, num_worker_per_row, a, b, c, + alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/dpcpp/solver/cb_gmres_kernels.dp.cpp b/dpcpp/solver/cb_gmres_kernels.dp.cpp index dc94bbb92ff..ac64a232fbc 100644 --- a/dpcpp/solver/cb_gmres_kernels.dp.cpp +++ b/dpcpp/solver/cb_gmres_kernels.dp.cpp @@ -215,14 +215,13 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multinorm2_kernel( - size_type num_rows, size_type num_cols, - const ValueType* 
__restrict__ next_krylov_basis, - size_type stride_next_krylov, remove_complex* __restrict__ norms, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -236,7 +235,7 @@ void multinorm2_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { for (size_type i = start_row + tidy; i < end_row; @@ -270,20 +269,21 @@ void multinorm2_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multinorm2_kernel(num_rows, num_cols, next_krylov_basis, - stride_next_krylov, norms, stop_status, - item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + multinorm2_kernel( + num_rows, 
num_cols, next_krylov_basis, stride_next_krylov, + norms, stop_status, item_ct1, + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -295,9 +295,7 @@ void multinorminf_without_stop_kernel( const ValueType* __restrict__ next_krylov_basis, size_type stride_next_krylov, remove_complex* __restrict__ norms, size_type stride_norms, sycl::nd_item<3> item_ct1, - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -311,7 +309,7 @@ void multinorminf_without_stop_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper = reduction_helper_array; rc_vtype local_max = zero(); if (col_idx < num_cols) { for (size_type i = start_row + tidy; i < end_row; @@ -347,11 +345,11 @@ void multinorminf_without_stop_kernel( size_type stride_norms) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -360,7 +358,8 @@ void multinorminf_without_stop_kernel( multinorminf_without_stop_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms, stride_norms, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -368,17 +367,14 @@ void multinorminf_without_stop_kernel( // ONLY computes the inf-norm (into norms2) 
when compute_inf is true template -void multinorm2_inf_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, - remove_complex* __restrict__ norms1, - remove_complex* __restrict__ norms2, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>* - reduction_helper_array) +void multinorm2_inf_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + remove_complex* __restrict__ norms1, + remove_complex* __restrict__ norms2, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + remove_complex* reduction_helper_array) { using rc_vtype = remove_complex; const auto tidx = item_ct1.get_local_id(2); @@ -392,9 +388,9 @@ void multinorm2_inf_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - rc_vtype* __restrict__ reduction_helper_add = (*reduction_helper_array); + rc_vtype* __restrict__ reduction_helper_add = reduction_helper_array; rc_vtype* __restrict__ reduction_helper_max = - static_cast((*reduction_helper_array)) + + static_cast(reduction_helper_array) + default_dot_dim * (default_dot_dim + 1); rc_vtype local_res = zero(); rc_vtype local_max = zero(); @@ -449,12 +445,12 @@ void multinorm2_inf_kernel( remove_complex* norms2, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor< - UninitializedArray, - (1 + compute_inf) * - default_dot_dim*(default_dot_dim + 1)>, - 0, sycl::access_mode::read_write, sycl::access::target::local> - reduction_helper_array_acc_ct1(cgh); + sycl::accessor, 1, + sycl::access_mode::read_write, + sycl::access::target::local> + reduction_helper_array_acc_ct1( + (1 + compute_inf) * default_dot_dim * (default_dot_dim + 1), + cgh); 
cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -463,20 +459,23 @@ void multinorm2_inf_kernel( multinorm2_inf_kernel( num_rows, num_cols, next_krylov_basis, stride_next_krylov, norms1, norms2, stop_status, item_ct1, - reduction_helper_array_acc_ct1.get_pointer()); + static_cast*>( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } template -void multidot_kernel( - size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, const Accessor3d krylov_bases, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + const Accessor3d krylov_bases, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* reduction_helper_array) { /* * In general in this kernel: @@ -543,10 +542,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(dot_dim * dot_dim), + cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -555,7 +554,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, num_cols, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -567,7 +567,7 @@ void singledot_kernel( size_type stride_next_krylov, const Accessor3d 
krylov_bases, ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { /* * In general in this kernel: @@ -624,10 +624,9 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -637,7 +636,8 @@ void singledot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, num_rows, next_krylov_basis, stride_next_krylov, krylov_bases, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/gmres_kernels.dp.cpp b/dpcpp/solver/gmres_kernels.dp.cpp index 2fe0d63bcbf..3f4c22eea94 100644 --- a/dpcpp/solver/gmres_kernels.dp.cpp +++ b/dpcpp/solver/gmres_kernels.dp.cpp @@ -140,14 +140,15 @@ GKO_ENABLE_DEFAULT_HOST(increase_final_iteration_numbers_kernel, template -void multidot_kernel( - size_type k, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ krylov_bases, - const ValueType* __restrict__ next_krylov_basis, size_type stride_krylov, - ValueType* __restrict__ hessenberg_iter, size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray* - reduction_helper_array) +void multidot_kernel(size_type k, size_type num_rows, size_type num_cols, + const ValueType* __restrict__ krylov_bases, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type 
stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -160,7 +161,7 @@ void multidot_kernel( // Used that way to get around dynamic initialization warning and // template error when using `reduction_helper_array` directly in `reduce` - ValueType* __restrict__ reduction_helper = (*reduction_helper_array); + ValueType* __restrict__ reduction_helper = reduction_helper_array; ValueType local_res = zero(); if (col_idx < num_cols && !stop_status[col_idx].has_stopped()) { @@ -197,11 +198,10 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -211,9 +211,8 @@ void multidot_kernel(dim3 grid, dim3 block, size_type dynamic_shared_memory, k, num_rows, num_cols, krylov_bases, next_krylov_basis, stride_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - (UninitializedArray*) - reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -267,13 +266,15 @@ void update_next_krylov_kernel( // Must be called with at least `num_cols` blocks, each with `block_size` // threads. `block_size` must be a power of 2. 
template -void update_hessenberg_2_kernel( - size_type iter, size_type num_rows, size_type num_cols, - const ValueType* __restrict__ next_krylov_basis, - size_type stride_next_krylov, ValueType* __restrict__ hessenberg_iter, - size_type stride_hessenberg, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) +void update_hessenberg_2_kernel(size_type iter, size_type num_rows, + size_type num_cols, + const ValueType* __restrict__ next_krylov_basis, + size_type stride_next_krylov, + ValueType* __restrict__ hessenberg_iter, + size_type stride_hessenberg, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto col_idx = item_ct1.get_group(2); @@ -314,10 +315,9 @@ void update_hessenberg_2_kernel( const stopping_status* stop_status) { queue->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -327,7 +327,8 @@ void update_hessenberg_2_kernel( iter, num_rows, num_cols, next_krylov_basis, stride_next_krylov, hessenberg_iter, stride_hessenberg, stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/solver/idr_kernels.dp.cpp b/dpcpp/solver/idr_kernels.dp.cpp index 3ec0db6d469..f37a9bcd091 100644 --- a/dpcpp/solver/idr_kernels.dp.cpp +++ b/dpcpp/solver/idr_kernels.dp.cpp @@ -111,7 +111,7 @@ template void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* __restrict__ values, size_type stride, sycl::nd_item<3> item_ct1, - UninitializedArray& reduction_helper_array) + ValueType* __restrict__ reduction_helper_array) { const 
auto tidx = thread::get_thread_id_flat(item_ct1); @@ -168,10 +168,9 @@ void orthonormalize_subspace_vectors_kernel( size_type num_rows, size_type num_cols, ValueType* values, size_type stride) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, 0, - sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1(sycl::range<1>(block_size), cgh); cgh.parallel_for( sycl_nd_range(grid, block), [= @@ -179,7 +178,8 @@ void orthonormalize_subspace_vectors_kernel( config::warp_size)]] { orthonormalize_subspace_vectors_kernel( num_rows, num_cols, values, stride, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } @@ -329,13 +329,13 @@ void step_2_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, template -void multidot_kernel( - size_type num_rows, size_type nrhs, const ValueType* __restrict__ p_i, - const ValueType* __restrict__ g_k, size_type g_k_stride, - ValueType* __restrict__ alpha, - const stopping_status* __restrict__ stop_status, sycl::nd_item<3> item_ct1, - UninitializedArray& - reduction_helper_array) +void multidot_kernel(size_type num_rows, size_type nrhs, + const ValueType* __restrict__ p_i, + const ValueType* __restrict__ g_k, size_type g_k_stride, + ValueType* __restrict__ alpha, + const stopping_status* __restrict__ stop_status, + sycl::nd_item<3> item_ct1, + ValueType* __restrict__ reduction_helper_array) { const auto tidx = item_ct1.get_local_id(2); const auto tidy = item_ct1.get_local_id(1); @@ -379,19 +379,20 @@ void multidot_kernel(dim3 grid, dim3 block, size_t dynamic_shared_memory, const stopping_status* stop_status) { stream->submit([&](sycl::handler& cgh) { - sycl::accessor, - 0, sycl::access_mode::read_write, + sycl::accessor - reduction_helper_array_acc_ct1(cgh); + reduction_helper_array_acc_ct1( + sycl::range<1>(default_dot_dim * (default_dot_dim + 1)), cgh); cgh.parallel_for( 
sycl_nd_range(grid, block), [= ](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( default_dot_dim)]] { - multidot_kernel(num_rows, nrhs, p_i, g_k, g_k_stride, alpha, - stop_status, item_ct1, - *reduction_helper_array_acc_ct1.get_pointer()); + multidot_kernel( + num_rows, nrhs, p_i, g_k, g_k_stride, alpha, stop_status, + item_ct1, + static_cast( + reduction_helper_array_acc_ct1.get_pointer())); }); }); } diff --git a/dpcpp/test/components/cooperative_groups.dp.cpp b/dpcpp/test/components/cooperative_groups.dp.cpp index 533f8f363e4..903a5637197 100644 --- a/dpcpp/test/components/cooperative_groups.dp.cpp +++ b/dpcpp/test/components/cooperative_groups.dp.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dpcpp/components/cooperative_groups.dp.hpp" +#include #include #include @@ -51,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/base/types.hpp" #include "core/synthesizer/implementation_selection.hpp" #include "core/test/utils/assertions.hpp" -#include "dpcpp/base/config.hpp" #include "dpcpp/base/dim3.dp.hpp" #include "dpcpp/base/helper.hpp" @@ -62,9 +62,9 @@ namespace { using namespace gko::kernels::dpcpp; using KCFG_1D = gko::ConfigSet<11, 7>; constexpr auto default_config_list = - ::gko::syn::value_list(); + std::integer_sequence(); class CooperativeGroups : public testing::TestWithParam { @@ -116,7 +116,7 @@ class CooperativeGroups : public testing::TestWithParam { // kernel implementation -template +template void cg_shuffle(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -152,16 +152,16 @@ void cg_shuffle_host(dim3 grid, dim3 block, GKO_ENABLE_IMPLEMENTATION_CONFIG_SELECTION(cg_shuffle_config, cg_shuffle_host) // the call -void cg_shuffle_config_call(std::uint32_t desired_cfg, dim3 grid, dim3 block, +void cg_shuffle_config_call(int desired_cfg, dim3 grid, dim3 block, gko::size_type dynamic_shared_memory, sycl::queue* 
queue, bool* s) { cg_shuffle_config( default_config_list, // validate - [&desired_cfg](std::uint32_t cfg) { return cfg == desired_cfg; }, - ::gko::syn::value_list(), ::gko::syn::value_list(), - ::gko::syn::value_list(), ::gko::syn::type_list<>(), + [&desired_cfg](int cfg) { return cfg == desired_cfg; }, + std::integer_sequence(), std::integer_sequence(), + std::integer_sequence(), ::gko::syn::type_list<>(), grid, block, dynamic_shared_memory, queue, s); } @@ -171,7 +171,7 @@ TEST_P(CooperativeGroups, Shuffle) } -template +template void cg_all(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -185,7 +185,7 @@ void cg_all(bool* s, sycl::nd_item<3> item_ct1) group.all(item_ct1.get_local_id(2) < 13) == sg_size < 13; } -template +template inline void cg_all(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -206,7 +206,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_all_call, cg_all, default_config_list) TEST_P(CooperativeGroups, All) { test_all_subgroup(cg_all_call); } -template +template void cg_any(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -219,7 +219,7 @@ void cg_any(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = !group.any(false); } -template +template inline void cg_any(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... args) { @@ -240,7 +240,7 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(cg_any_call, cg_any, default_config_list) TEST_P(CooperativeGroups, Any) { test_all_subgroup(cg_any_call); } -template +template void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) { constexpr auto sg_size = KCFG_1D::decode<1>(config); @@ -254,7 +254,7 @@ void cg_ballot(bool* s, sycl::nd_item<3> item_ct1) s[i + sg_size * 2] = group.ballot(item_ct1.get_local_id(2) < 4) == 0xf; } -template +template inline void cg_ballot(dim3 grid, dim3 block, gko::size_type, sycl::queue* queue, InferredArgs... 
args) { diff --git a/hip/base/kernel_launch_reduction.hip.hpp b/hip/base/kernel_launch_reduction.hip.hpp index e93f811bfb1..01e19c6c1f0 100644 --- a/hip/base/kernel_launch_reduction.hip.hpp +++ b/hip/base/kernel_launch_reduction.hip.hpp @@ -362,7 +362,7 @@ namespace { template -void run_generic_kernel_row_reduction(syn::value_list, +void run_generic_kernel_row_reduction(std::integer_sequence, int64 rows, int64 cols, int64 col_blocks, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -387,7 +387,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_generic_kernel_row_reduction, template -void run_generic_col_reduction_small(syn::value_list, +void run_generic_col_reduction_small(std::integer_sequence, int64 max_blocks, std::shared_ptr exec, KernelFunction fn, ReductionOp op, @@ -441,7 +441,7 @@ void run_kernel_row_reduction(std::shared_ptr exec, dim<2> size, KernelArgs&&... args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -472,8 +472,8 @@ void run_kernel_row_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), rows, cols, 1, fn, op, - finalize, identity, result, static_cast(result_stride), + std::integer_sequence(), syn::type_list<>(), rows, cols, 1, fn, + op, finalize, identity, result, static_cast(result_stride), map_to_device(args)...); } } @@ -488,7 +488,7 @@ void run_kernel_col_reduction(std::shared_ptr exec, KernelArgs&&... 
args) { using subwarp_sizes = - syn::value_list; + std::integer_sequence; constexpr int oversubscription = 16; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); @@ -501,8 +501,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, return compiled_subwarp_size >= cols || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), max_blocks, exec, fn, - op, finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), max_blocks, exec, + fn, op, finalize, identity, result, size, map_to_device(args)...); } else { const auto col_blocks = ceildiv(cols, config::warp_size); const auto row_blocks = diff --git a/hip/factorization/par_ic_kernels.hip.cpp b/hip/factorization/par_ic_kernels.hip.cpp index 47cc332b70e..4b36d854593 100644 --- a/hip/factorization/par_ic_kernels.hip.cpp +++ b/hip/factorization/par_ic_kernels.hip.cpp @@ -59,7 +59,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (sweep) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ic_kernels.hpp.inc" diff --git a/hip/factorization/par_ict_kernels.hip.cpp b/hip/factorization/par_ict_kernels.hip.cpp index 05d08e598a7..18929cad361 100644 --- a/hip/factorization/par_ict_kernels.hip.cpp +++ b/hip/factorization/par_ict_kernels.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ict_spgeam_kernels.hpp.inc" @@ -84,7 +84,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* llh, const matrix::Csr* a, @@ -140,7 +140,7 @@ GKO_ENABLE_IMPLEMENTATION_SELECTION(select_add_candidates, add_candidates); template -void 
compute_factor(syn::value_list, +void compute_factor(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -184,7 +184,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, llh, a, l, l_new); + std::integer_sequence(), syn::type_list<>(), exec, llh, a, l, + l_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -206,7 +207,7 @@ void compute_factor(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp index d484af93d2a..8de4288787c 100644 --- a/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_approx_filter_kernel.hip.cpp @@ -76,7 +76,7 @@ namespace par_ilut_factorization { // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -84,7 +84,7 @@ using compiled_kernels = template -void threshold_filter_approx(syn::value_list, +void threshold_filter_approx(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* m, IndexType rank, Array* tmp, @@ -201,7 +201,7 @@ void threshold_filter_approx(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, m, rank, &tmp, + std::integer_sequence(), syn::type_list<>(), exec, m, rank, &tmp, &threshold, m_out, m_out_coo); } diff --git a/hip/factorization/par_ilut_filter_kernel.hip.cpp 
b/hip/factorization/par_ilut_filter_kernel.hip.cpp index cf691f58f8b..a4843ee89ff 100644 --- a/hip/factorization/par_ilut_filter_kernel.hip.cpp +++ b/hip/factorization/par_ilut_filter_kernel.hip.cpp @@ -72,7 +72,7 @@ constexpr int default_block_size = 512; // subwarp sizes for filter kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_filter_kernels.hpp.inc" @@ -82,7 +82,7 @@ namespace { template -void threshold_filter(syn::value_list, +void threshold_filter(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, remove_complex threshold, @@ -156,8 +156,8 @@ void threshold_filter(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, threshold, m_out, - m_out_coo, lower); + std::integer_sequence(), syn::type_list<>(), exec, a, threshold, + m_out, m_out_coo, lower); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp index f8234cc6a04..02325d508bd 100644 --- a/hip/factorization/par_ilut_spgeam_kernel.hip.cpp +++ b/hip/factorization/par_ilut_spgeam_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for add_candidates kernels using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_spgeam_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void add_candidates(syn::value_list, +void add_candidates(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* lu, const matrix::Csr* a, @@ -175,8 +175,8 @@ void add_candidates(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, lu, a, l, u, l_new, - u_new); + std::integer_sequence(), 
syn::type_list<>(), exec, lu, a, l, u, + l_new, u_new); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/factorization/par_ilut_sweep_kernel.hip.cpp b/hip/factorization/par_ilut_sweep_kernel.hip.cpp index e063fd81d0d..562e83efd7e 100644 --- a/hip/factorization/par_ilut_sweep_kernel.hip.cpp +++ b/hip/factorization/par_ilut_sweep_kernel.hip.cpp @@ -73,7 +73,7 @@ constexpr int default_block_size = 512; // subwarp sizes for all warp-parallel kernels (filter, add_candidates) using compiled_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/factorization/par_ilut_sweep_kernels.hpp.inc" @@ -83,7 +83,7 @@ namespace { template -void compute_l_u_factors(syn::value_list, +void compute_l_u_factors(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, matrix::Csr* l, @@ -138,8 +138,8 @@ void compute_l_u_factors(std::shared_ptr exec, return total_nnz_per_row <= compiled_subwarp_size || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, a, l, l_coo, u, u_coo, - u_csc); + std::integer_sequence(), syn::type_list<>(), exec, a, l, l_coo, u, + u_coo, u_csc); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp index eafbd367c0e..fa4890f6ad2 100644 --- a/hip/matrix/csr_kernels.hip.cpp +++ b/hip/matrix/csr_kernels.hip.cpp @@ -90,13 +90,13 @@ constexpr int classical_overweight = 32; * A compile-time list of the number items per threads for which spmv kernel * should be compiled. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; using classical_kernels = - syn::value_list; + std::integer_sequence; using spgeam_kernels = - syn::value_list; + std::integer_sequence; #include "common/cuda_hip/matrix/csr_kernels.hpp.inc" @@ -106,7 +106,7 @@ namespace host_kernel { template -void merge_path_spmv(syn::value_list, +void merge_path_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -226,7 +226,7 @@ int compute_items_per_thread(std::shared_ptr exec) template -void classical_spmv(syn::value_list, +void classical_spmv(std::integer_sequence, std::shared_ptr exec, const matrix::Csr* a, const matrix::Dense* b, @@ -307,7 +307,7 @@ void spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -356,7 +356,8 @@ void spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c); + std::integer_sequence(), syn::type_list<>(), exec, a, b, + c); } } } @@ -404,8 +405,8 @@ void advanced_spmv(std::shared_ptr exec, [&items_per_thread](int compiled_info) { return items_per_thread == compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, c, alpha, - beta); + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, + alpha, beta); } else { bool try_sparselib = (a->get_strategy()->get_name() == "sparselib" || a->get_strategy()->get_name() == "cusparse"); @@ -453,7 +454,7 @@ void advanced_spmv(std::shared_ptr exec, [&max_length_per_row](int compiled_info) { return max_length_per_row >= compiled_info; }, - syn::value_list(), syn::type_list<>(), exec, a, b, 
c, + std::integer_sequence(), syn::type_list<>(), exec, a, b, c, alpha, beta); } } @@ -542,7 +543,7 @@ namespace { template -void spgeam(syn::value_list, +void spgeam(std::integer_sequence, std::shared_ptr exec, const ValueType* alpha, const IndexType* a_row_ptrs, const IndexType* a_col_idxs, const ValueType* a_vals, const ValueType* beta, @@ -667,7 +668,7 @@ void advanced_spgemm(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), c_tmp_row_ptrs, c_tmp_col_idxs, c_tmp_vals, beta->get_const_values(), d_row_ptrs, d_col_idxs, d_vals, c); @@ -697,7 +698,7 @@ void spgeam(std::shared_ptr exec, return compiled_subwarp_size >= nnz_per_row || compiled_subwarp_size == config::warp_size; }, - syn::value_list(), syn::type_list<>(), exec, + std::integer_sequence(), syn::type_list<>(), exec, alpha->get_const_values(), a->get_const_row_ptrs(), a->get_const_col_idxs(), a->get_const_values(), beta->get_const_values(), b->get_const_row_ptrs(), diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp index 2486a666ab0..cd55dfc266f 100644 --- a/hip/matrix/ell_kernels.hip.cpp +++ b/hip/matrix/ell_kernels.hip.cpp @@ -105,7 +105,7 @@ constexpr int max_thread_per_worker = 32; * 0 is a special case where it uses a sub-warp size of warp_size in * combination with atomic_adds. 
*/ -using compiled_kernels = syn::value_list; +using compiled_kernels = std::integer_sequence; #include "common/cuda_hip/matrix/ell_kernels.hpp.inc" @@ -116,7 +116,7 @@ namespace { template -void abstract_spmv(syn::value_list, int num_worker_per_row, +void abstract_spmv(std::integer_sequence, int num_worker_per_row, const matrix::Ell* a, const matrix::Dense* b, matrix::Dense* c, @@ -253,8 +253,8 @@ void spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, - c); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( @@ -287,8 +287,8 @@ void advanced_spmv(std::shared_ptr exec, select_abstract_spmv( compiled_kernels(), [&info](int compiled_info) { return info == compiled_info; }, - syn::value_list(), syn::type_list<>(), num_worker_per_row, a, b, c, - alpha, beta); + std::integer_sequence(), syn::type_list<>(), num_worker_per_row, a, + b, c, alpha, beta); } GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE( diff --git a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp index 87042e35332..443ca644e00 100644 --- a/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_instantiate.inc.hip.cpp @@ -78,7 +78,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -117,7 +117,7 @@ void advanced_apply( #define DECLARE_JACOBI_ADVANCED_APPLY_INSTANTIATION(ValueType, IndexType) \ void advanced_apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const 
precision_reduction*, const IndexType* block_pointers, \ const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp index 6412fb8ca6b..82483e0fcac 100644 --- a/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_advanced_apply_kernel.hip.cpp @@ -55,7 +55,7 @@ namespace jacobi { template void advanced_apply( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -86,7 +86,7 @@ void apply(std::shared_ptr exec, size_type num_blocks, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, alpha->get_const_values(), diff --git a/hip/preconditioner/jacobi_common.hip.hpp.in b/hip/preconditioner/jacobi_common.hip.hpp.in index 19f7d15bcef..8b79d841355 100644 --- a/hip/preconditioner/jacobi_common.hip.hpp.in +++ b/hip/preconditioner/jacobi_common.hip.hpp.in @@ -30,8 +30,10 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*************************************************************/ +#include + + #include -#include #include "hip/base/config.hip.hpp" @@ -56,7 +58,8 @@ namespace jacobi { #endif -using compiled_kernels = syn::value_list; +using compiled_kernels = + std::integer_sequence; constexpr int get_larger_power(int value, int guess = 1) diff --git a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp index 44ca906769f..313f9c4b9f1 100644 --- a/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_generate_instantiate.inc.hip.cpp @@ -77,7 +77,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -120,7 +120,7 @@ void generate(syn::value_list, #define DECLARE_JACOBI_GENERATE_INSTANTIATION(ValueType, IndexType) \ void generate( \ - syn::value_list, \ + std::integer_sequence, \ const matrix::Csr*, remove_complex, \ ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ diff --git a/hip/preconditioner/jacobi_generate_kernel.hip.cpp b/hip/preconditioner/jacobi_generate_kernel.hip.cpp index a5c8ee94fa6..54a63b47d1e 100644 --- a/hip/preconditioner/jacobi_generate_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_generate_kernel.hip.cpp @@ -71,7 +71,7 @@ namespace jacobi { template -void generate(syn::value_list, +void generate(std::integer_sequence, const matrix::Csr* mtx, remove_complex accuracy, ValueType* block_data, const preconditioner::block_interleaved_storage_scheme& @@ -101,9 +101,9 @@ void generate(std::shared_ptr exec, [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), syn::type_list<>(), - system_matrix, accuracy, blocks.get_data(), storage_scheme, - conditioning.get_data(), block_precisions.get_data(), + std::integer_sequence(), 
+ syn::type_list<>(), system_matrix, accuracy, blocks.get_data(), + storage_scheme, conditioning.get_data(), block_precisions.get_data(), block_pointers.get_const_data(), num_blocks); } diff --git a/hip/preconditioner/jacobi_kernels.hip.cpp b/hip/preconditioner/jacobi_kernels.hip.cpp index d8c365fa12d..76db71d9c08 100644 --- a/hip/preconditioner/jacobi_kernels.hip.cpp +++ b/hip/preconditioner/jacobi_kernels.hip.cpp @@ -163,7 +163,7 @@ namespace { template void transpose_jacobi( - syn::value_list, size_type num_blocks, + std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -217,7 +217,7 @@ void transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); @@ -241,7 +241,7 @@ void conj_transpose_jacobi( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, out_blocks.get_data()); diff --git a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp index c4a63007798..fcf2ca2140c 100644 --- a/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_instantiate.inc.hip.cpp @@ -74,7 +74,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const 
preconditioner::block_interleaved_storage_scheme& @@ -112,7 +112,7 @@ void apply(syn::value_list, size_type num_blocks, #define DECLARE_JACOBI_SIMPLE_APPLY_INSTANTIATION(ValueType, IndexType) \ void apply( \ - syn::value_list, size_type, \ + std::integer_sequence, size_type, \ const precision_reduction*, const IndexType*, const ValueType*, \ const preconditioner::block_interleaved_storage_scheme&, \ const ValueType*, size_type, ValueType*, size_type) diff --git a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp index 9690cd73121..bca05ce445c 100644 --- a/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp +++ b/hip/preconditioner/jacobi_simple_apply_kernel.hip.cpp @@ -68,7 +68,7 @@ namespace jacobi { template -void apply(syn::value_list, size_type num_blocks, +void apply(std::integer_sequence, size_type num_blocks, const precision_reduction* block_precisions, const IndexType* block_pointers, const ValueType* blocks, const preconditioner::block_interleaved_storage_scheme& @@ -96,7 +96,7 @@ void simple_apply( [&](int compiled_block_size) { return max_block_size <= compiled_block_size; }, - syn::value_list(), + std::integer_sequence(), syn::type_list<>(), num_blocks, block_precisions.get_const_data(), block_pointers.get_const_data(), blocks.get_const_data(), storage_scheme, b->get_const_values() + col, b->get_stride(), diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 73976c24089..0ab3f54a79a 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -49,7 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include namespace gko { diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 009c2e182cd..26854ad3cba 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -419,6 +419,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, _enable_macro(CudaExecutor, cuda) +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_ADAPT_SINGLE(_macro) \ + template <> \ + _macro GKO_NOT_IMPLEMENTED +#else +#define GKO_ADAPT_SINGLE(_macro) template _macro +#endif + /** * Instantiates a template for each non-complex value type compiled by Ginkgo. * @@ -427,16 +435,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ template _macro(float); \ - template <> \ - _macro(double) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro) \ - template _macro(float); \ - template _macro(double) -#endif + GKO_ADAPT_SINGLE(_macro(double)) /** @@ -447,18 +448,10 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take one argument, which is replaced by the * value type. */ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ - template _macro(std::complex); \ - template <> \ - _macro(std::complex) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_TYPE(_macro); \ template _macro(std::complex); \ - template _macro(std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex)) /** @@ -471,26 +464,14 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and scalar type, respectively. 
*/ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, float); \ - template <> \ - _macro(std::complex, double) GKO_NOT_IMPLEMENTED; -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ - template _macro(float, float); \ - template _macro(double, double); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, float); \ - template _macro(std::complex, double) -#endif + +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_SCALAR_TYPE(_macro) \ + template _macro(float, float); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ + template _macro(std::complex, std::complex); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + template _macro(std::complex, float); \ + GKO_ADAPT_SINGLE(_macro(std::complex, double)) /** @@ -515,21 +496,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. 
*/ -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ - template _macro(float, int32); \ - template <> \ - _macro(double, int32) GKO_NOT_IMPLEMENTED; \ - template _macro(float, int64); \ - template <> \ - _macro(double, int64) GKO_NOT_IMPLEMENTED -#else #define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro) \ template _macro(float, int32); \ - template _macro(double, int32); \ + GKO_ADAPT_SINGLE(_macro(double, int32)); \ template _macro(float, int64); \ - template _macro(double, int64) -#endif + GKO_ADAPT_SINGLE(_macro(double, int64)) /** @@ -540,46 +511,12 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments, which are replaced by the * value and index types. */ -#if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ template _macro(std::complex, int32); \ - template <> \ - _macro(std::complex, int32) GKO_NOT_IMPLEMENTED; \ + GKO_ADAPT_SINGLE(_macro(std::complex, int32)); \ template _macro(std::complex, int64); \ - template <> \ - _macro(std::complex, int64) GKO_NOT_IMPLEMENTED -#else -#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(_macro) \ - GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_INDEX_TYPE(_macro); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int32); \ - template _macro(std::complex, int64); \ - template _macro(std::complex, int64) -#endif - - -#if GINKGO_DPCPP_SINGLE_MODE -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template <> \ - _macro(float, double) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(double, float) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED; \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED - - -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ - 
GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ - template _macro(float, float); \ - template <> \ - _macro(double, double) GKO_NOT_IMPLEMENTED; \ - template _macro(std::complex, std::complex); \ - template <> \ - _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED -#else + GKO_ADAPT_SINGLE(_macro(std::complex, int64)) /** @@ -591,11 +528,11 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, * Should take two arguments `src` and `dst`, which * are replaced by the source and destination value type. */ -#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ - template _macro(float, double); \ - template _macro(double, float); \ - template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) +#define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ + GKO_ADAPT_SINGLE(_macro(float, double)); \ + GKO_ADAPT_SINGLE(_macro(double, float)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)); \ + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** @@ -610,10 +547,9 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION_OR_COPY(_macro) \ GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro); \ template _macro(float, float); \ - template _macro(double, double); \ + GKO_ADAPT_SINGLE(_macro(double, double)); \ template _macro(std::complex, std::complex); \ - template _macro(std::complex, std::complex) -#endif + GKO_ADAPT_SINGLE(_macro(std::complex, std::complex)) /** diff --git a/include/ginkgo/core/synthesizer/containers.hpp b/include/ginkgo/core/synthesizer/containers.hpp index fcb152da761..c6075ea3021 100644 --- a/include/ginkgo/core/synthesizer/containers.hpp +++ b/include/ginkgo/core/synthesizer/containers.hpp @@ -35,7 +35,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include +#include #include +#include namespace gko { @@ -46,7 +49,6 @@ namespace gko { */ namespace syn { - /** * value_list records several values with the same type in template. * @@ -54,7 +56,7 @@ namespace syn { * @tparam Values the values in the list */ template -struct value_list {}; +using value_list = std::integer_sequence; /** @@ -83,22 +85,36 @@ namespace detail { /** * concatenate_impl base type * - * @tparam List1 the first List - * @tparam List2 the second List + * @tparam Lists a list of std::integer_sequence */ -template +template struct concatenate_impl; /** - * concatenate_impl specializes for two value_list with the same value type. + * concatenate_impl specialization for a single std::integer_sequence + * + * @tparam T the value type of the std::integer_sequence + * @tparam Values the values of the list + */ +template +struct concatenate_impl> { + using type = std::integer_sequence; +}; + +/** + * concatenate_impl specialization for multiple std::integer_sequence with the + * same value type. * - * @tparam T the value type of two value_list - * @tparam Values the values of the first list - * @tparam Values the values of the second list + * @tparam T the value type of two std::integer_sequence + * @tparam Values1 the values of the first list + * @tparam Values2 the values of the second list + * @tparam Tail the lists which have not been concatenated yet */ -template -struct concatenate_impl, value_list> { - using type = value_list; +template +struct concatenate_impl, + std::integer_sequence, Tail...> { + using type = typename concatenate_impl< + std::integer_sequence, Tail...>::type; }; @@ -106,13 +122,13 @@ struct concatenate_impl, value_list> { /** - * concatenate combines two value_list into one value_list. 
+ * concatenate an arbitrary number of std::integer_sequence with the same base + * type into one * - * @tparam List1 the first list - * @tparam List2 the second list + * @tparam Lists a list of std::integer_sequence */ -template -using concatenate = typename detail::concatenate_impl::type; +template +using concatenate = typename detail::concatenate_impl::type; namespace detail { @@ -127,14 +143,14 @@ template struct as_list_impl; /** - * as_list_impl specializes for the value_list + * as_list_impl specializes for the std::integer_sequence * - * @tparam T the value_list type - * @tparam Values the values of value_list + * @tparam T the std::integer_sequence type + * @tparam Values the values of std::integer_sequence */ template -struct as_list_impl> { - using type = value_list; +struct as_list_impl> { + using type = std::integer_sequence; }; /** @@ -158,7 +174,7 @@ struct as_list_impl> { template struct as_list_impl, std::enable_if_t<(Start < End)>> { using type = concatenate< - value_list, + std::integer_sequence, typename as_list_impl>::type>; }; @@ -171,7 +187,7 @@ struct as_list_impl, std::enable_if_t<(Start < End)>> { */ template struct as_list_impl, std::enable_if_t<(Start >= End)>> { - using type = value_list; + using type = std::integer_sequence; }; @@ -189,22 +205,582 @@ using as_list = typename detail::as_list_impl::type; /** - * as_array returns the array from value_list. It will be helpful if using - * for in runtime on the array. + * as_array returns the array from std::integer_sequence. It will be helpful + * if using for in runtime on the array. 
* - * @tparam T the type of value_list - * @tparam Value the values of value_list + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence * - * @param value_list the input value_list + * @param vl the input std::integer_sequence * - * @return std::array the std::array contains the values of value_list + * @return std::array the std::array contains the values of vl */ template -constexpr std::array as_array(value_list vl) +constexpr auto as_array(std::integer_sequence vl) { + std::ignore = vl; return std::array{Value...}; } +/** + * as_value returns the (first) value contained within an + * std::integer_sequence. The empty case is made to fail on purpose. + * + * @tparam T the type of std::integer_sequence + * @tparam Value the values of std::integer_sequence + * + * @param vl the input std::integer_sequence + * + * @return the first value within vl + */ +template +constexpr auto as_value(std::integer_sequence vl) +{ + static_assert(sizeof...(Value) > 0, + "Do not call as_value on an empty set!"); + return as_array(vl)[0]; +} + + +namespace detail { + + +/** + * This is the base type of a helper for sorting. It partitions the values + * within an std::integer_sequence into three parts based on a pivot: values + * lower than required, values equal, values above the requirement. + * + * The results will depend on the ascending or descending order, which reverses + * above and lower. 
+ * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using parts = partition_impl<16, idxs>; // the pivot is 16 + * // Then effectively: + * // parts::lower = std::integer_sequence + * // parts::equal = std::integer_sequence + * // parts::above = std::integer_sequence + * ``` + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct partition_impl; + +/** + * The default case for partitioning an empty std::integer_sequence. All parts + * are the empty set. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + */ +template +struct partition_impl> { + using lower = std::integer_sequence; + using equal = std::integer_sequence; + using above = std::integer_sequence; +}; + +/** + * The recursive case for partitioning an std::integer_sequence. The value v1 is + * put in the matching part. + * + * @tparam i the pivot value + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam v1 the current value being processed + * @tparam Values the values left to process + */ +template +struct partition_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using recurse = + partition_impl>; + + using lower = concatenate< + std::conditional_t<(ascending ? v1 < i : v1 > i), this_elt, empty>, + typename recurse::lower>; + using equal = concatenate, + typename recurse::equal>; + using above = concatenate< + std::conditional_t<(ascending ? v1 > i : v1 < i), this_elt, empty>, + typename recurse::above>; +}; + + +/** + * This is the base type of the sorting structure. It sorts the values within + * an std::integer_sequence by using partition_t as a helper. 
+ * + * The sorting will depend on the ascending or descending order and whether + * duplicates are kept or not. + * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using asc_dups = typename sort_impl::type; + * using asc_nodups = typename sort_impl::type; + * using desc_dups = typename sort_impl::type; + * // Then effectively: + * // asc_dups = std::integer_sequence + * // asc_nodups = std::integer_sequence + * // desc_dups = std::integer_sequence + * ``` + * + * @tparam ascending whether to sort in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct sort_impl; + +/** + * The default case for sorting an empty std::integer_sequence. The result is an + * empty set. + * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + */ +template +struct sort_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for sorting an std::integer_sequence. The value v1 becomes + * the partition_impl pivot. The obtained groups are concatenated in order. + * Duplicates are removed by only populating `this_elt` instead of `this_elt + + * parts::equal`. 
+ * + * @tparam ascending whether to group in ascending or descending order + * @tparam keep_dups whether to keep duplicates or not + * @tparam T the type of the values + * @tparam v1 the current value and pivot for partition_impl + * @tparam Values the values to sort + */ +template +struct sort_impl> { + using this_elt = std::integer_sequence; + using empty = std::integer_sequence; + using parts = + partition_impl>; + using sorted_inf = + typename sort_impl::type; + using sorted_eq = + concatenate>; + using sorted_up = + typename sort_impl::type; + + using type = concatenate; +}; + + +} // namespace detail + +/** + * This is a helper interface for sorting an std::integer_sequence. It always + * removes duplicate values. + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort = typename detail::sort_impl::type; + +/** + * This is sorting variant which keeps duplicates + * + * @see detail::sort_impl + * + * @tparam ascending whether to group in ascending or descending order + * @tparam T the type of the values + * @tparam Values the values to sort + */ +template +using sort_keep = + typename detail::sort_impl::type; + + +namespace detail { + + +/** + * This is the base type of the accessing an element of an std::integer_sequence + * at a given index. 
+ * + * For example, when considering + * ``` + * using idxs = std::integer_sequence; + * using num1 = typename at_index_impl<0, idxs>::type; + * using num2 = typename at_index_impl<2, idxs>::type; + * using num5 = typename at_index_impl<5, idxs>::type; + * // Then effectively: + * // num1 = std::integer_sequence + * // num2 = std::integer_sequence + * // num5 = std::integer_sequence + * ``` + * + * @tparam idx the index to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct at_index_impl; + +/** + * The default case for accessing an element of an empty std::integer_sequence + * at a given index. The result is an empty set. + * + * @tparam idx the index to find + * @tparam T the type of the values + */ +template +struct at_index_impl> { + using type = std::integer_sequence; +}; + +/** + * The recursive case for accessing an element of an std::integer_sequence at a + * given index. Idx is counted down until 0, where v1 is the requested element. + * + * @tparam idx the distance to the index to find + * @tparam T the type of the values + * @tparam v1 the value being processed + * @tparam Values the values + */ +template +struct at_index_impl> { + using recurse = + typename at_index_impl>::type; + using type = + std::conditional_t<(idx <= 0), std::integer_sequence, recurse>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing an std::integer_sequence at a given + * index. + * + * @see detail::at_index_impl + * + * @tparam idx the index of the element to find + * @tparam T the type of the values + * @tparam Values the values + */ +template +using at_index = typename detail::at_index_impl::type; + + +namespace detail { + + +/** + * Access the element at the back of an std::integer_sequence. This is the base + * type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl; + +/** + * Access the element at the back of an std::integer_sequence. 
This is the + * specialization for std::integer_sequence. We simply reuse at_index. We need + * to unpack the std::integer_sequence in order to have the proper size of the + * parameter pack `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct back_impl> { + using type = + at_index>; +}; + + +/** + * Access the median element of an std::integer_sequence. This is the base type. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl; + +/** + * Access the median element of an std::integer_sequence. This is the + * specialization for std::integer_sequence. We simply reuse at_index of the + * middle element after calling sort. We need to unpack the + * std::integer_sequence in order to have the proper size of the parameter pack + * `sizeof...(Values)`, otherwise it is always one. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +struct median_impl> { + using type = + at_index>; +}; + + +} // namespace detail + + +/** + * This is a helper interface for accessing the front of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using front = at_index<0, T, Values...>; + +/** + * This is a helper interface for accessing the back of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using back = typename detail::back_impl::type; + +/** + * This is a helper interface for accessing the median element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using median = typename detail::median_impl>::type; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. 
+ * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using min = front>; + + +/** + * This is a helper interface for accessing the minimum element of an + * std::integer_sequence. + * + * @tparam T the type of the values + * @tparam Values the values + */ +template +using max = front>; + + +namespace detail { + +/** + * This is the base type of a helper to merge_impl. It merges two lists of + * std::integer_sequence by applying EncodingType::encode() on every pair of + * values. In this case, the first list is a single element. This helps + * logically split the double recursion needed for merging two general lists. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_one_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_one_impl; + +/** + * This is the base case for merge_one_impl's recursion. The second list is + * empty. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list + */ +template +struct merge_one_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the recursive case for merge_one_impl. We merge and encode v1 + * with every subsequent v2 from the second std::integer_sequence. 
+ * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam T v2 the value from the second list being merged + * @tparam Values the values not yet processed from the second list + */ +template +struct merge_one_impl, + std::integer_sequence> { + static_assert( + std::is_same::value, + "EncodingType must have encoding functionality."); + + using v1_as_seq = std::integer_sequence; + using values_as_seq = std::integer_sequence; + using recurse = merge_one_impl; + using type = + concatenate, + typename recurse::type>; +}; + + +/** + * This is the base type of merge_impl. It merges two + * std::integer_sequence and calls EncodingType::encode() on every pair of + * values. + * + * For example, when considering + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * using idx1 = std::integer_sequence; + * using idx2 = std::integer_sequence; + * using merged = typename merge_impl::type; + * // Then effectively: + * // merged = std::integer_sequence + * ``` + * + * @see detail::merge_one_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. + * @tparam Lists the lists to merge and encode + */ +template +struct merge_impl; + +/** + * This is the base case for merge_impl's recursion. The first list has been + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using type = std::integer_sequence; +}; + +/** + * This is the first recursive case for merge_impl. In this case, the second + * list is empty. We only encode v1 one after the other. 
+ * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using empty = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using processed_v1 = std::integer_sequence; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + +/** + * This is the recursive case for merge_impl with a non empty Values2 list. We + * call merge_one_impl for every v1 and element of Values2 until Values1 is + * completely consumed. + * + * @tparam EncodingType the type used to encode values + * @tparam T the type of the values + * @tparam T v1 the value from the first list being merged + * @tparam Values1 the values left to consume from the first list + * @tparam Values2 the values of the second list + */ +template +struct merge_impl, + std::integer_sequence> { + using v1_as_seq = std::integer_sequence; + using val1_as_seq = std::integer_sequence; + using val2_as_seq = std::integer_sequence; + using process_v1 = merge_one_impl; + // move to the next v1 + using recurse = merge_impl; + using type = concatenate; +}; + + +} // namespace detail + + +/** + * This is a helper interface for merging two lists of std::integer_sequence + * using EncodingType as an encoder. It only merges two lists at once. + * EncodingType must look like ConfigSet or the following: + * + * ``` + * struct int_encoder { + * using can_encode = std::true_type; + * static constexpr auto encode(int v1, int v2) { return v1*v2; } + * }; + * ``` + * + * @see detail::merge_impl + * + * @tparam EncodingType the type used to encode values. It must at least look + * like the encoder in the example above. ConfigSet is a + * Ginkgo type which can encode. 
+ * @tparam Lists the lists to merge and encode + */ +template +using merge = typename detail::merge_impl::type; + } // namespace syn } // namespace gko diff --git a/omp/base/kernel_launch.hpp b/omp/base/kernel_launch.hpp index eb89aa211bd..c4da5bfa5fc 100644 --- a/omp/base/kernel_launch.hpp +++ b/omp/base/kernel_launch.hpp @@ -66,7 +66,7 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, template -void run_kernel_sized_impl(syn::value_list, +void run_kernel_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, dim<2> size, MappedKernelArgs... args) @@ -124,8 +124,8 @@ void run_kernel_impl(std::shared_ptr exec, KernelFunction fn, select_run_kernel_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, size, - args...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + size, args...); } diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d4a489a258f..97b2d8dd0c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -87,7 +87,7 @@ void run_kernel_reduction_impl(std::shared_ptr exec, template -void run_kernel_reduction_sized_impl(syn::value_list, +void run_kernel_reduction_sized_impl(std::integer_sequence, std::shared_ptr exec, KernelFunction fn, ReductionOp op, FinalizeOp finalize, ValueType identity, @@ -183,8 +183,8 @@ void run_kernel_reduction(std::shared_ptr exec, select_run_kernel_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } @@ -287,7 +287,7 @@ template void run_kernel_col_reduction_sized_impl( - syn::value_list, + std::integer_sequence, std::shared_ptr exec, KernelFunction 
fn, ReductionOp op, FinalizeOp finalize, ValueType identity, ValueType* result, dim<2> size, MappedKernelArgs... args) @@ -395,8 +395,8 @@ void run_kernel_col_reduction(std::shared_ptr exec, select_run_kernel_col_reduction_sized( remainders(), [&](int remainder) { return remainder == cols % block_size; }, - syn::value_list(), syn::type_list<>(), exec, fn, op, - finalize, identity, result, size, map_to_device(args)...); + std::integer_sequence(), syn::type_list<>(), exec, fn, + op, finalize, identity, result, size, map_to_device(args)...); } diff --git a/omp/matrix/fbcsr_kernels.cpp b/omp/matrix/fbcsr_kernels.cpp index ce647f79935..c8823fc896a 100644 --- a/omp/matrix/fbcsr_kernels.cpp +++ b/omp/matrix/fbcsr_kernels.cpp @@ -375,7 +375,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -420,7 +420,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/reference/matrix/fbcsr_kernels.cpp b/reference/matrix/fbcsr_kernels.cpp index 9b093a97b68..8a318084568 100644 --- a/reference/matrix/fbcsr_kernels.cpp +++ b/reference/matrix/fbcsr_kernels.cpp @@ -430,7 +430,7 @@ namespace { template void sort_by_column_index_impl( - syn::value_list, + std::integer_sequence, matrix::Fbcsr* const to_sort) { auto row_ptrs = to_sort->get_const_row_ptrs(); @@ -476,7 +476,7 @@ void sort_by_column_index(const std::shared_ptr exec, select_sort_col_idx( fixedblock::compiled_kernels(), [bs](int compiled_block_size) { return bs == compiled_block_size; }, - syn::value_list(), syn::type_list<>(), to_sort); + std::integer_sequence(), syn::type_list<>(), to_sort); } 
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/third_party/hwloc/CMakeLists.txt b/third_party/hwloc/CMakeLists.txt index a47e9a84877..970ed78689c 100644 --- a/third_party/hwloc/CMakeLists.txt +++ b/third_party/hwloc/CMakeLists.txt @@ -5,8 +5,6 @@ ginkgo_load_and_configure_package(hwloc_external "https://download.open-mpi.org/ "${TPL_HWLOC_PATH}/src/configure" "--disable-nvml" "--disable-cuda" "--disable-rsmi" ) -add_library(hwloc SHARED IMPORTED GLOBAL) -add_dependencies(hwloc hwloc_external ) file(MAKE_DIRECTORY ${TPL_HWLOC_PATH}/lib/) file(GLOB HWLOC_LIBS "${TPL_HWLOC_PATH}/build/hwloc/.libs/libhwloc.so*") configure_file("${TPL_HWLOC_PATH}/build/include/hwloc/autogen/config.h" "${TPL_HWLOC_PATH}/src/include/hwloc/autogen/config.h" COPYONLY) @@ -14,6 +12,9 @@ foreach(lib ${HWLOC_LIBS}) get_filename_component(lib_name ${lib} NAME) configure_file("${lib}" "${TPL_HWLOC_PATH}/lib/${lib_name}" COPYONLY) endforeach() + +add_library(hwloc SHARED IMPORTED GLOBAL) +add_dependencies(hwloc hwloc_external) set(HWLOC_LIBRARIES "${TPL_HWLOC_PATH}/lib/libhwloc.so" CACHE FILEPATH "The path to HWLOC library libhwloc.so" FORCE) set(HWLOC_INCLUDE_DIRS "${TPL_HWLOC_PATH}/src/include" CACHE PATH "The directory containing the hwloc header, hwloc.h" FORCE) set_target_properties(hwloc PROPERTIES IMPORTED_LOCATION ${HWLOC_LIBRARIES})