diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp index 63fd22708e6..ea8594e7446 100644 --- a/benchmark/utils/preconditioners.hpp +++ b/benchmark/utils/preconditioners.hpp @@ -22,7 +22,7 @@ DEFINE_string(preconditioners, "none", "A comma-separated list of preconditioners to use. " "Supported values are: none, jacobi, paric, parict, parilu, " "parilut, ic, ilu, paric-isai, parict-isai, parilu-isai, " - "parilut-isai, ic-isai, ilu-isai, overhead"); + "parilut-isai, ic-isai, ilu-isai, sor, overhead"); DEFINE_uint32(parilu_iterations, 5, "The number of iterations for ParIC(T)/ParILU(T)"); @@ -49,6 +49,12 @@ DEFINE_double(jacobi_accuracy, 1e-1, DEFINE_uint32(jacobi_max_block_size, 32, "Maximal block size of the block-Jacobi preconditioner"); +DEFINE_double(sor_relaxation_factor, 1.0, + "The relaxation factor for the SOR preconditioner"); + +DEFINE_bool(sor_symmetric, false, + "Apply the SOR preconditioner symmetrically, i.e. use SSOR"); + // parses the Jacobi storage optimization command line argument gko::precision_reduction parse_storage_optimization(const std::string& flag) @@ -292,6 +298,15 @@ const std::map( .with_sparsity_power(FLAGS_isai_power) .on(exec); }}, + {"sor", + [](std::shared_ptr exec) { + return gko::preconditioner::Sor::build() + .with_relaxation_factor( + static_cast>( + FLAGS_sor_relaxation_factor)) + .with_symmetric(FLAGS_sor_symmetric) + .on(exec); + }}, {"overhead", [](std::shared_ptr exec) { return gko::Overhead::build() .with_criteria(gko::stop::ResidualNorm::build() diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt index f5a28596d16..267444d2144 100644 --- a/common/cuda_hip/CMakeLists.txt +++ b/common/cuda_hip/CMakeLists.txt @@ -38,6 +38,7 @@ set(CUDA_HIP_SOURCES preconditioner/jacobi_advanced_apply_kernels.cpp preconditioner/jacobi_generate_kernels.cpp preconditioner/jacobi_simple_apply_kernels.cpp + preconditioner/sor_kernels.cpp reorder/rcm_kernels.cpp solver/cb_gmres_kernels.cpp solver/idr_kernels.cpp diff --git a/common/cuda_hip/factorization/factorization_helpers.hpp b/common/cuda_hip/factorization/factorization_helpers.hpp new file mode 100644 index 00000000000..87248740867 --- /dev/null +++ b/common/cuda_hip/factorization/factorization_helpers.hpp @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "common/cuda_hip/base/config.hpp" +#include "common/cuda_hip/base/runtime.hpp" +#include "common/cuda_hip/base/types.hpp" +#include "common/cuda_hip/components/thread_ids.hpp" +#include "core/factorization/factorization_helpers.hpp" + + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace factorization { +namespace helpers { + + +using namespace ::gko::factorization; + + +constexpr int default_block_size{512}; + + +template +__global__ __launch_bounds__(default_block_size) void initialize_l_u( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ l_row_ptrs, + IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values, + const IndexType* __restrict__ u_row_ptrs, + IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values, + LClosure l_closure, UClosure u_closure) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately + // default diagonal to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = l_closure.map_off_diag(val); + ++l_idx; + } + if (row < col) { + u_col_idxs[u_idx] = col; + u_values[u_idx] = u_closure.map_off_diag(val); + ++u_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + auto u_diag_idx = u_row_ptrs[row]; + l_col_idxs[l_diag_idx] = row; + u_col_idxs[u_diag_idx] = row; + l_values[l_diag_idx] = l_closure.map_diag(diag_val); + u_values[u_diag_idx] = u_closure.map_diag(diag_val); + } +} + + +template +__global__ __launch_bounds__(default_block_size) void initialize_l( + size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ l_row_ptrs, + IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values, + LClosure l_closure) +{ + const auto row = thread::get_thread_id_flat(); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + // if there was no diagonal entry, default to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = l_closure.map_off_diag(val); + ++l_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + l_col_idxs[l_diag_idx] = row; + l_values[l_diag_idx] = l_closure.map_diag(diag_val); + } +} + + +} // namespace helpers +} // namespace factorization +} // namespace GKO_DEVICE_NAMESPACE + +} // namespace kernels +} // namespace gko \ No newline at end of file diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp index 3a38175ab70..e790cf19540 100644 --- a/common/cuda_hip/factorization/factorization_kernels.cpp +++ b/common/cuda_hip/factorization/factorization_kernels.cpp @@ -13,6 +13,7 @@ #include "common/cuda_hip/components/intrinsics.hpp" #include "common/cuda_hip/components/searching.hpp" #include "common/cuda_hip/components/thread_ids.hpp" +#include "common/cuda_hip/factorization/factorization_helpers.hpp" #include "core/base/array_access.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" @@ -255,51 +256,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row( } -template -__global__ __launch_bounds__(default_block_size) void initialize_l_u( - size_type num_rows, const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - const IndexType* __restrict__ l_row_ptrs, - IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values, - const IndexType* __restrict__ u_row_ptrs, - IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values) -{ - const auto row = thread::get_thread_id_flat(); - if (row < num_rows) { - auto l_idx = l_row_ptrs[row]; - auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately - // default diagonal to one - auto diag_val = one(); - for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { - const auto col = col_idxs[i]; - const auto val = values[i]; - // save diagonal entry for later - if (col == row) { - diag_val = val; - } - if (col < row) { - l_col_idxs[l_idx] = col; - l_values[l_idx] = val; - ++l_idx; - } - if (row < col) { - u_col_idxs[u_idx] = col; - u_values[u_idx] = val; - ++u_idx; - } - } - // store diagonal entries - auto l_diag_idx = l_row_ptrs[row + 1] - 1; - auto u_diag_idx = u_row_ptrs[row]; - l_col_idxs[l_diag_idx] = row; - u_col_idxs[u_diag_idx] = row; - l_values[l_diag_idx] = one(); - u_values[u_diag_idx] = diag_val; - } -} - - template __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row( size_type num_rows, const IndexType* __restrict__ row_ptrs, @@ -320,48 +276,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row( } -template -__global__ __launch_bounds__(default_block_size) void initialize_l( - size_type num_rows, const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - const IndexType* __restrict__ l_row_ptrs, - IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values, - bool use_sqrt) -{ - const auto row = thread::get_thread_id_flat(); - if (row < num_rows) { - auto l_idx = l_row_ptrs[row]; - // if there was no diagonal entry, default to one - auto diag_val = one(); - for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { - const auto col = col_idxs[i]; - const auto val = values[i]; - // save diagonal entry for later - if (col == row) { - diag_val = val; - } - if (col < row) { - l_col_idxs[l_idx] = col; - l_values[l_idx] = val; - ++l_idx; - } - } - // store diagonal entries - auto l_diag_idx = l_row_ptrs[row + 1] - 1; - l_col_idxs[l_diag_idx] = row; - // compute square root with sentinel - if (use_sqrt) { - diag_val = sqrt(diag_val); - if (!is_finite(diag_val)) { - diag_val = one(); - } - } - l_values[l_diag_idx] = diag_val; - } -} - - } // namespace kernel @@ -481,18 +395,25 @@ void initialize_l_u(std::shared_ptr exec, matrix::Csr* csr_u) { const size_type num_rows{system_matrix->get_size()[0]}; - const auto block_size = default_block_size; + const auto block_size = helpers::default_block_size; const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); if (grid_dim > 0) { - kernel::initialize_l_u<<get_stream()>>>( - num_rows, system_matrix->get_const_row_ptrs(), - system_matrix->get_const_col_idxs(), - as_device_type(system_matrix->get_const_values()), - csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(), - as_device_type(csr_l->get_values()), csr_u->get_const_row_ptrs(), - csr_u->get_col_idxs(), as_device_type(csr_u->get_values())); + helpers:: + initialize_l_u<<get_stream()>>>( + num_rows, system_matrix->get_const_row_ptrs(), + system_matrix->get_const_col_idxs(), + as_device_type(system_matrix->get_const_values()), + csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(), + as_device_type(csr_l->get_values()), + csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(), + as_device_type(csr_u->get_values()), + helpers::triangular_mtx_closure( + [] __device__(auto val) { return one(val); }, + helpers::identity{}), + helpers::triangular_mtx_closure(helpers::identity{}, + helpers::identity{})); } } @@ -534,17 +455,28 @@ void initialize_l(std::shared_ptr exec, matrix::Csr* csr_l, bool diag_sqrt) { const size_type num_rows{system_matrix->get_size()[0]}; - const auto block_size = default_block_size; + const auto block_size = helpers::default_block_size; const auto grid_dim = static_cast( ceildiv(num_rows, static_cast(block_size))); if (grid_dim > 0) { - kernel::initialize_l<<get_stream()>>>( + helpers::initialize_l<<get_stream()>>>( num_rows, system_matrix->get_const_row_ptrs(), system_matrix->get_const_col_idxs(), as_device_type(system_matrix->get_const_values()), csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(), - as_device_type(csr_l->get_values()), diag_sqrt); + as_device_type(csr_l->get_values()), + helpers::triangular_mtx_closure( + [diag_sqrt] __device__(auto val) { + if (diag_sqrt) { + val = sqrt(val); + if (!is_finite(val)) { + val = one(val); + } + } + return val; + }, + helpers::identity{})); } } diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp new file mode 100644 index 00000000000..a415953915f --- /dev/null +++ b/common/cuda_hip/preconditioner/sor_kernels.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/preconditioner/sor_kernels.hpp" + +#include +#include + +#include "common/cuda_hip/factorization/factorization_helpers.hpp" + +namespace gko { +namespace kernels { +namespace GKO_DEVICE_NAMESPACE { +namespace sor { + + +template +void initialize_weighted_l( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const auto block_size = factorization::helpers::default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); + + auto inv_weight = one(weight) / weight; + + if (grid_dim > 0) { + factorization::helpers:: + initialize_l<<get_stream()>>>( + num_rows, system_matrix->get_const_row_ptrs(), + system_matrix->get_const_col_idxs(), + as_device_type(system_matrix->get_const_values()), + l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(), + as_device_type(l_mtx->get_values()), + factorization::helpers::triangular_mtx_closure( + [inv_weight] __device__(auto val) { + return val * inv_weight; + }, + factorization::helpers::identity{})); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); + + +template +void initialize_weighted_l_u( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx, + matrix::Csr* u_mtx) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const auto block_size = factorization::helpers::default_block_size; + const auto grid_dim = static_cast( + ceildiv(num_rows, static_cast(block_size))); + + auto inv_weight = one(weight) / weight; + auto inv_two_minus_weight = + one(weight) / (static_cast>(2.0) - weight); + + if (grid_dim > 0) { + factorization::helpers:: + initialize_l_u<<get_stream()>>>( + num_rows, system_matrix->get_const_row_ptrs(), + system_matrix->get_const_col_idxs(), + as_device_type(system_matrix->get_const_values()), + l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(), + as_device_type(l_mtx->get_values()), + u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(), + as_device_type(u_mtx->get_values()), + factorization::helpers::triangular_mtx_closure( + [inv_weight] __device__(auto val) { + return val * inv_weight; + }, + factorization::helpers::identity{}), + factorization::helpers::triangular_mtx_closure( + [inv_two_minus_weight] __device__(auto val) { + return val * inv_two_minus_weight; + }, + [weight, inv_two_minus_weight] __device__(auto val) { + return val * weight * inv_two_minus_weight; + })); + } +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); + + +} // namespace sor +} // namespace GKO_DEVICE_NAMESPACE +} // namespace kernels +} // namespace gko diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt index 132e04c5d9a..00bc21df0c6 100644 --- a/common/unified/CMakeLists.txt +++ b/common/unified/CMakeLists.txt @@ -19,7 +19,6 @@ set(UNIFIED_SOURCES matrix/diagonal_kernels.cpp multigrid/pgm_kernels.cpp preconditioner/jacobi_kernels.cpp - preconditioner/sor_kernels.cpp solver/bicg_kernels.cpp solver/bicgstab_kernels.cpp solver/cg_kernels.cpp diff --git a/common/unified/preconditioner/sor_kernels.cpp b/common/unified/preconditioner/sor_kernels.cpp deleted file mode 100644 index 8932c1df562..00000000000 --- a/common/unified/preconditioner/sor_kernels.cpp +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors -// -// SPDX-License-Identifier: BSD-3-Clause - -#include "core/preconditioner/sor_kernels.hpp" - -#include -#include - -#include "common/unified/base/kernel_launch.hpp" - - -namespace gko { -namespace kernels { -namespace GKO_DEVICE_NAMESPACE { -namespace sor { - - -template -void initialize_weighted_l( - std::shared_ptr exec, - const matrix::Csr* system_matrix, - remove_complex weight, - matrix::Csr* l_mtx) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); - - -template -void initialize_weighted_l_u( - std::shared_ptr exec, - const matrix::Csr* system_matrix, - remove_complex weight, matrix::Csr* l_mtx, - matrix::Csr* u_mtx) GKO_NOT_IMPLEMENTED; - -GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( - GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); - - -} // namespace sor -} // namespace GKO_DEVICE_NAMESPACE -} // namespace kernels -} // namespace gko diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index 851ef9a3dc6..bf65888a6ab 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -56,6 +56,7 @@ target_sources(ginkgo_dpcpp preconditioner/jacobi_generate_kernel.dp.cpp preconditioner/jacobi_kernels.dp.cpp preconditioner/jacobi_simple_apply_kernel.dp.cpp + preconditioner/sor_kernels.dp.cpp reorder/rcm_kernels.dp.cpp solver/batch_bicgstab_kernels.dp.cpp solver/batch_cg_kernels.dp.cpp diff --git a/dpcpp/factorization/factorization_helpers.dp.hpp b/dpcpp/factorization/factorization_helpers.dp.hpp new file mode 100644 index 00000000000..9779e134e77 --- /dev/null +++ b/dpcpp/factorization/factorization_helpers.dp.hpp @@ -0,0 +1,110 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "core/factorization/factorization_helpers.hpp" +#include "dpcpp/base/config.hpp" +#include "dpcpp/base/dim3.dp.hpp" +#include "dpcpp/base/dpct.hpp" +#include "dpcpp/components/thread_ids.dp.hpp" + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace factorization { +namespace helpers { + +using namespace ::gko::factorization; + + +template +void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ l_row_ptrs, + IndexType* __restrict__ l_col_idxs, + ValueType* __restrict__ l_values, + const IndexType* __restrict__ u_row_ptrs, + IndexType* __restrict__ u_col_idxs, + ValueType* __restrict__ u_values, LClosure l_closure, + UClosure u_closure, sycl::nd_item<3> item_ct1) +{ + const auto row = thread::get_thread_id_flat(item_ct1); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately + // default diagonal to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = l_closure.map_off_diag(val); + ++l_idx; + } + if (row < col) { + u_col_idxs[u_idx] = col; + u_values[u_idx] = u_closure.map_off_diag(val); + ++u_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + auto u_diag_idx = u_row_ptrs[row]; + l_col_idxs[l_diag_idx] = row; + u_col_idxs[u_diag_idx] = row; + l_values[l_diag_idx] = l_closure.map_diag(diag_val); + u_values[u_diag_idx] = u_closure.map_diag(diag_val); + } +} + + +template +void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs, + const IndexType* __restrict__ col_idxs, + const ValueType* __restrict__ values, + const IndexType* __restrict__ l_row_ptrs, + IndexType* __restrict__ l_col_idxs, + ValueType* __restrict__ l_values, LClosure l_closure, + sycl::nd_item<3> item_ct1) +{ + const auto row = thread::get_thread_id_flat(item_ct1); + if (row < num_rows) { + auto l_idx = l_row_ptrs[row]; + // if there was no diagonal entry, default to one + auto diag_val = one(); + for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { + const auto col = col_idxs[i]; + const auto val = values[i]; + // save diagonal entry for later + if (col == row) { + diag_val = val; + } + if (col < row) { + l_col_idxs[l_idx] = col; + l_values[l_idx] = l_closure.map_off_diag(val); + ++l_idx; + } + } + // store diagonal entries + auto l_diag_idx = l_row_ptrs[row + 1] - 1; + l_col_idxs[l_diag_idx] = row; + l_values[l_diag_idx] = l_closure.map_diag(diag_val); + } +} + + +} // namespace helpers +} // namespace factorization +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp index 1d9912b4f12..885fe481609 100644 --- a/dpcpp/factorization/factorization_kernels.dp.cpp +++ b/dpcpp/factorization/factorization_kernels.dp.cpp @@ -18,6 +18,7 @@ #include "dpcpp/components/intrinsics.dp.hpp" #include "dpcpp/components/searching.dp.hpp" #include "dpcpp/components/thread_ids.dp.hpp" +#include "dpcpp/factorization/factorization_helpers.dp.hpp" namespace gko { @@ -320,51 +321,6 @@ void count_nnz_per_l_u_row(dim3 grid, dim3 block, } -template -void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - const IndexType* __restrict__ l_row_ptrs, - IndexType* __restrict__ l_col_idxs, - ValueType* __restrict__ l_values, - const IndexType* __restrict__ u_row_ptrs, - IndexType* __restrict__ u_col_idxs, - ValueType* __restrict__ u_values, sycl::nd_item<3> item_ct1) -{ - const auto row = thread::get_thread_id_flat(item_ct1); - if (row < num_rows) { - auto l_idx = l_row_ptrs[row]; - auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately - // default diagonal to one - auto diag_val = one(); - for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { - const auto col = col_idxs[i]; - const auto val = values[i]; - // save diagonal entry for later - if (col == row) { - diag_val = val; - } - if (col < row) { - l_col_idxs[l_idx] = col; - l_values[l_idx] = val; - ++l_idx; - } - if (row < col) { - u_col_idxs[u_idx] = col; - u_values[u_idx] = val; - ++u_idx; - } - } - // store diagonal entries - auto l_diag_idx = l_row_ptrs[row + 1] - 1; - auto u_diag_idx = u_row_ptrs[row]; - l_col_idxs[l_diag_idx] = row; - u_col_idxs[u_diag_idx] = row; - l_values[l_diag_idx] = one(); - u_values[u_diag_idx] = diag_val; - } -} - template void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_rows, @@ -376,9 +332,14 @@ void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_l_u(num_rows, row_ptrs, col_idxs, values, l_row_ptrs, - l_col_idxs, l_values, u_row_ptrs, u_col_idxs, - u_values, item_ct1); + helpers::initialize_l_u( + num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs, + l_values, u_row_ptrs, u_col_idxs, u_values, + helpers::triangular_mtx_closure( + [](auto) { return one(); }, helpers::identity{}), + helpers::triangular_mtx_closure(helpers::identity{}, + helpers::identity{}), + item_ct1); }); } @@ -418,47 +379,6 @@ void count_nnz_per_l_row(dim3 grid, dim3 block, size_type dynamic_shared_memory, } -template -void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs, - const IndexType* __restrict__ col_idxs, - const ValueType* __restrict__ values, - const IndexType* __restrict__ l_row_ptrs, - IndexType* __restrict__ l_col_idxs, - ValueType* __restrict__ l_values, bool use_sqrt, - sycl::nd_item<3> item_ct1) -{ - const auto row = thread::get_thread_id_flat(item_ct1); - if (row < num_rows) { - auto l_idx = l_row_ptrs[row]; - // if there was no diagonal entry, default to one - auto diag_val = one(); - for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) { - const auto col = col_idxs[i]; - const auto val = values[i]; - // save diagonal entry for later - if (col == row) { - diag_val = val; - } - if (col < row) { - l_col_idxs[l_idx] = col; - l_values[l_idx] = val; - ++l_idx; - } - } - // store diagonal entries - auto l_diag_idx = l_row_ptrs[row + 1] - 1; - l_col_idxs[l_diag_idx] = row; - // compute square root with sentinel - if (use_sqrt) { - diag_val = std::sqrt(diag_val); - if (!is_finite(diag_val)) { - diag_val = one(); - } - } - l_values[l_diag_idx] = diag_val; - } -} - template void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory, sycl::queue* queue, size_type num_rows, @@ -468,8 +388,20 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory, { queue->parallel_for( sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) { - initialize_l(num_rows, row_ptrs, col_idxs, values, l_row_ptrs, - l_col_idxs, l_values, use_sqrt, item_ct1); + helpers::initialize_l(num_rows, row_ptrs, col_idxs, values, + l_row_ptrs, l_col_idxs, l_values, + helpers::triangular_mtx_closure( + [use_sqrt](auto val) { + if (use_sqrt) { + val = sqrt(val); + if (!is_finite(val)) { + val = one(); + } + } + return val; + }, + helpers::identity{}), + item_ct1); }); } diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp new file mode 100644 index 00000000000..fe796586591 --- /dev/null +++ b/dpcpp/preconditioner/sor_kernels.dp.cpp @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/preconditioner/sor_kernels.hpp" + +#include +#include + +#include "dpcpp/factorization/factorization_helpers.dp.hpp" + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace sor { + + +constexpr int default_block_size{256}; + + +template +void initialize_weighted_l( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + auto inv_weight = one(weight) / weight; + + exec->get_queue()->parallel_for( + sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) { + factorization::helpers::initialize_l( + num_rows, system_matrix->get_const_row_ptrs(), + system_matrix->get_const_col_idxs(), + system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(), + l_mtx->get_col_idxs(), l_mtx->get_values(), + factorization::helpers::triangular_mtx_closure( + [inv_weight](auto val) { return val * inv_weight; }, + factorization::helpers::identity{}), + item_ct1); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); + + +template +void initialize_weighted_l_u( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx, + matrix::Csr* u_mtx) +{ + const size_type num_rows{system_matrix->get_size()[0]}; + const dim3 block_size{default_block_size, 1, 1}; + const dim3 grid_dim{static_cast(ceildiv( + num_rows, static_cast(block_size.x))), + 1, 1}; + + auto inv_weight = one(weight) / weight; + auto inv_two_minus_weight = + one(weight) / (static_cast>(2.0) - weight); + + exec->get_queue()->parallel_for( + sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) { + factorization::helpers::initialize_l_u( + num_rows, system_matrix->get_const_row_ptrs(), + system_matrix->get_const_col_idxs(), + system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(), + l_mtx->get_col_idxs(), l_mtx->get_values(), + u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(), + u_mtx->get_values(), + factorization::helpers::triangular_mtx_closure( + [inv_weight](auto val) { return val * inv_weight; }, + factorization::helpers::identity{}), + factorization::helpers::triangular_mtx_closure( + [inv_two_minus_weight](auto val) { + return val * inv_two_minus_weight; + }, + [weight, inv_two_minus_weight](auto val) { + return val * weight * inv_two_minus_weight; + }), + item_ct1); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); + + +} // namespace sor +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt index 41bec80673f..fef0702048f 100644 --- a/omp/CMakeLists.txt +++ b/omp/CMakeLists.txt @@ -40,6 +40,7 @@ target_sources(ginkgo_omp preconditioner/batch_jacobi_kernels.cpp preconditioner/isai_kernels.cpp preconditioner/jacobi_kernels.cpp + preconditioner/sor_kernels.cpp reorder/rcm_kernels.cpp solver/batch_bicgstab_kernels.cpp solver/batch_cg_kernels.cpp diff --git a/omp/factorization/factorization_helpers.hpp b/omp/factorization/factorization_helpers.hpp new file mode 100644 index 00000000000..f1eed7d4d37 --- /dev/null +++ b/omp/factorization/factorization_helpers.hpp @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include + +#include "core/factorization/factorization_helpers.hpp" + + +namespace gko { +namespace kernels { +namespace omp { +namespace factorization { +namespace helpers { + + +using namespace ::gko::factorization; + + +template +void initialize_l_u(const matrix::Csr* system_matrix, + matrix::Csr* csr_l, + matrix::Csr* csr_u, + LClosure&& l_closure, UClosure&& u_closure) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + + const auto row_ptrs_u = csr_u->get_const_row_ptrs(); + auto col_idxs_u = csr_u->get_col_idxs(); + auto vals_u = csr_u->get_values(); + +#pragma omp parallel for + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + size_type current_index_u = + row_ptrs_u[row] + 1; // we treat the diagonal separately + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = l_closure.map_off_diag(val); + ++current_index_l; + } else if (col == row) { + // save value for later + diag_val = val; + } else { // col > row + col_idxs_u[current_index_u] = col; + vals_u[current_index_u] = u_closure.map_off_diag(val); + ++current_index_u; + } + } + // store diagonal entries + size_type l_diag_idx = row_ptrs_l[row + 1] - 1; + size_type u_diag_idx = row_ptrs_u[row]; + col_idxs_l[l_diag_idx] = row; + col_idxs_u[u_diag_idx] = row; + vals_l[l_diag_idx] = l_closure.map_diag(diag_val); + vals_u[u_diag_idx] = u_closure.map_diag(diag_val); + } +} + + +template +void initialize_l(const matrix::Csr* system_matrix, + matrix::Csr* csr_l, Closure&& closure) +{ + const auto row_ptrs = system_matrix->get_const_row_ptrs(); + const auto col_idxs = system_matrix->get_const_col_idxs(); + const auto vals = system_matrix->get_const_values(); + + const auto row_ptrs_l = csr_l->get_const_row_ptrs(); + auto col_idxs_l = csr_l->get_col_idxs(); + auto vals_l = csr_l->get_values(); + +#pragma omp parallel for + for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { + size_type current_index_l = row_ptrs_l[row]; + // if there is no diagonal value, set it to 1 by default + auto diag_val = one(); + for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { + const auto col = col_idxs[el]; + const auto val = vals[el]; + if (col < row) { + col_idxs_l[current_index_l] = col; + vals_l[current_index_l] = val; + ++current_index_l; + } else if (col == row) { + // save value for later + diag_val = val; + } + } + // store diagonal entries + size_type l_diag_idx = row_ptrs_l[row + 1] - 1; + col_idxs_l[l_diag_idx] = row; + vals_l[l_diag_idx] = closure.map_diag(diag_val); + } +} + + +} // namespace helpers +} // namespace factorization +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp index f4b41cbdac5..e7b66f6f887 100644 --- a/omp/factorization/factorization_kernels.cpp +++ b/omp/factorization/factorization_kernels.cpp @@ -12,6 +12,7 @@ #include "core/components/prefix_sum_kernels.hpp" #include "core/matrix/csr_builder.hpp" +#include "omp/factorization/factorization_helpers.hpp" namespace gko { @@ -224,49 +225,12 @@ void initialize_l_u(std::shared_ptr exec, matrix::Csr* csr_l, matrix::Csr* csr_u) { - const auto row_ptrs = system_matrix->get_const_row_ptrs(); - const auto col_idxs = system_matrix->get_const_col_idxs(); - const auto vals = system_matrix->get_const_values(); - - const auto row_ptrs_l = csr_l->get_const_row_ptrs(); - auto col_idxs_l = csr_l->get_col_idxs(); - auto vals_l = csr_l->get_values(); - - const auto row_ptrs_u = csr_u->get_const_row_ptrs(); - auto col_idxs_u = csr_u->get_col_idxs(); - auto vals_u = csr_u->get_values(); - -#pragma omp parallel for - for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { - size_type current_index_l = row_ptrs_l[row]; - size_type current_index_u = - row_ptrs_u[row] + 1; // we treat the diagonal separately - // if there is no diagonal value, set it to 1 by default - auto diag_val = one(); - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - const auto col = col_idxs[el]; - const auto val = vals[el]; - if (col < row) { - col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = val; - ++current_index_l; - } else if (col == row) { - // save value for later - diag_val = val; - } else { // col > row - col_idxs_u[current_index_u] = col; - vals_u[current_index_u] = val; - ++current_index_u; - } - } - // store diagonal entries - size_type l_diag_idx = row_ptrs_l[row + 1] - 1; - size_type u_diag_idx = row_ptrs_u[row]; - col_idxs_l[l_diag_idx] = row; - col_idxs_u[u_diag_idx] = row; - vals_l[l_diag_idx] = one(); - vals_u[u_diag_idx] = diag_val; - } + helpers::initialize_l_u( + system_matrix, csr_l, csr_u, + helpers::triangular_mtx_closure([](auto) { return one(); }, + helpers::identity{}), + helpers::triangular_mtx_closure(helpers::identity{}, + helpers::identity{})); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -309,43 +273,18 @@ void initialize_l(std::shared_ptr exec, const matrix::Csr* system_matrix, matrix::Csr* csr_l, bool diag_sqrt) { - const auto row_ptrs = system_matrix->get_const_row_ptrs(); - const auto col_idxs = system_matrix->get_const_col_idxs(); - const auto vals = system_matrix->get_const_values(); - - const auto row_ptrs_l = csr_l->get_const_row_ptrs(); - auto col_idxs_l = csr_l->get_col_idxs(); - auto vals_l = csr_l->get_values(); - -#pragma omp parallel for - for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) { - size_type current_index_l = row_ptrs_l[row]; - // if there is no diagonal value, set it to 1 by default - auto diag_val = one(); - for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) { - const auto col = col_idxs[el]; - const auto val = vals[el]; - if (col < row) { - col_idxs_l[current_index_l] = col; - vals_l[current_index_l] = val; - ++current_index_l; - } else if (col == row) { - // save value for later - diag_val = val; - } - } - // store diagonal entries - size_type l_diag_idx = row_ptrs_l[row + 1] - 1; - col_idxs_l[l_diag_idx] = row; - // compute square root with sentinel - if (diag_sqrt) { - diag_val = sqrt(diag_val); - if (!is_finite(diag_val)) { - diag_val = one(); - } - } - vals_l[l_diag_idx] = diag_val; - } + helpers::initialize_l(system_matrix, csr_l, + helpers::triangular_mtx_closure( + [diag_sqrt](auto val) { + if (diag_sqrt) { + val = sqrt(val); + if (!is_finite(val)) { + val = one(); + } + } + return val; + }, + helpers::identity{})); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp new file mode 100644 index 00000000000..509946ac15a --- /dev/null +++ b/omp/preconditioner/sor_kernels.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/preconditioner/sor_kernels.hpp" + +#include +#include + +#include "omp/factorization/factorization_helpers.hpp" + +namespace gko { +namespace kernels { +namespace omp { +namespace sor { + + +template +void initialize_weighted_l( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx) +{ + auto inv_weight = one(weight) / weight; + factorization::helpers::initialize_l( + system_matrix, l_mtx, + factorization::helpers::triangular_mtx_closure( + [inv_weight](auto val) { return val * inv_weight; }, + [](auto val) { return val; })); +}; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L); + + +template +void initialize_weighted_l_u( + std::shared_ptr exec, + const matrix::Csr* system_matrix, + remove_complex weight, matrix::Csr* l_mtx, + matrix::Csr* u_mtx) +{ + auto inv_weight = one(weight) / weight; + auto inv_two_minus_weight = + one(weight) / (static_cast>(2.0) - weight); + factorization::helpers::initialize_l_u( + system_matrix, l_mtx, u_mtx, + factorization::helpers::triangular_mtx_closure( + [inv_weight](auto val) { return val * inv_weight; }, + [](auto val) { return val; }), + factorization::helpers::triangular_mtx_closure( + [inv_two_minus_weight](auto val) { + return val * inv_two_minus_weight; + }, + [weight, inv_two_minus_weight](auto val) { + return val * weight * inv_two_minus_weight; + })); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( + GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U); + + +} // namespace sor +} // namespace omp +} // namespace kernels +} // namespace gko diff --git a/test/factorization/CMakeLists.txt b/test/factorization/CMakeLists.txt index e768a48ef05..8b5aa51287b 100644 --- a/test/factorization/CMakeLists.txt +++ b/test/factorization/CMakeLists.txt @@ -1,4 +1,5 @@ ginkgo_create_common_test(cholesky_kernels DISABLE_EXECUTORS dpcpp) +ginkgo_create_common_test(factorization_kernels) ginkgo_create_common_test(lu_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp omp) ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp omp) diff --git a/test/factorization/factorization_kernels.cpp b/test/factorization/factorization_kernels.cpp new file mode 100644 index 00000000000..7887d83e0f7 --- /dev/null +++ b/test/factorization/factorization_kernels.cpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/factorization/factorization_kernels.hpp" + +#include +#include +#include +#include + +#include + +#include + +#include + +#include "core/test/utils.hpp" +#include "core/test/utils/unsort_matrix.hpp" +#include "test/utils/common_fixture.hpp" + + +class Factorization : public CommonTestFixture { +protected: + using Csr = gko::matrix::Csr; + + Factorization() + { + mtx = gko::test::generate_random_matrix( + 52, 52, std::uniform_int_distribution<>(4, 40), + std::uniform_real_distribution<>(1, 2), rand_engine, ref); + gko::utils::ensure_all_diagonal_entries(mtx.get()); + dmtx = gko::clone(exec, mtx); + } + + std::default_random_engine rand_engine{6794}; + std::shared_ptr mtx; + std::shared_ptr dmtx; +}; + + +TEST_F(Factorization, InitializeRowPtrsLSameAsRef) +{ + gko::array l_ptrs{ref, mtx->get_size()[0] + 1}; + gko::array dl_ptrs{exec, mtx->get_size()[0] + 1}; + + gko::kernels::reference::factorization::initialize_row_ptrs_l( + ref, mtx.get(), l_ptrs.get_data()); + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_row_ptrs_l( + exec, dmtx.get(), dl_ptrs.get_data()); + + GKO_ASSERT_ARRAY_EQ(l_ptrs, dl_ptrs); +} + + +TEST_F(Factorization, InitializeLWithoutSqrtSameAsRef) +{ + gko::array l_ptrs{ref, mtx->get_size()[0] + 1}; + gko::kernels::reference::factorization::initialize_row_ptrs_l( + ref, mtx.get(), l_ptrs.get_data()); + auto nnz = + static_cast(l_ptrs.get_data()[mtx->get_size()[0]]); + auto l_mtx = + Csr::create(ref, mtx->get_size(), gko::array(ref, nnz), + gko::array(ref, nnz), l_ptrs); + auto dl_mtx = gko::clone(exec, l_mtx); + + for (auto diag_sqrt : {false, true}) { + SCOPED_TRACE("diag_sqrt: " + std::to_string(diag_sqrt)); + + gko::kernels::reference::factorization::initialize_l( + ref, mtx.get(), l_mtx.get(), diag_sqrt); + gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l( + exec, dmtx.get(), dl_mtx.get(), diag_sqrt); + + GKO_ASSERT_MTX_NEAR(l_mtx, dl_mtx, 0.0); + } +} diff --git a/test/preconditioner/CMakeLists.txt b/test/preconditioner/CMakeLists.txt index b41897efaac..46696e29549 100644 --- a/test/preconditioner/CMakeLists.txt +++ b/test/preconditioner/CMakeLists.txt @@ -1,3 +1,4 @@ ginkgo_create_common_test(batch_jacobi_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(jacobi_kernels DISABLE_EXECUTORS dpcpp) ginkgo_create_common_test(isai_kernels) +ginkgo_create_common_test(sor_kernels) diff --git a/test/preconditioner/sor_kernels.cpp b/test/preconditioner/sor_kernels.cpp new file mode 100644 index 00000000000..cd12514bb28 --- /dev/null +++ b/test/preconditioner/sor_kernels.cpp @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/preconditioner/sor_kernels.hpp" + +#include + +#include + +#include +#include +#include +#include + +#include "core/test/utils.hpp" +#include "core/utils/matrix_utils.hpp" +#include "test/utils/common_fixture.hpp" +#include "test/utils/executor.hpp" + + +class Sor : public CommonTestFixture { +protected: + using Csr = gko::matrix::Csr; + using Dense = gko::matrix::Dense; + + Sor() + { + gko::size_type n = 133; + index_type row_limit = 15; + auto nz_dist = std::uniform_int_distribution(1, row_limit); + auto val_dist = std::uniform_real_distribution(-1., 1.); + auto md = + gko::test::generate_random_matrix_data( + n, n, nz_dist, val_dist, rand_engine); + auto md_l = md; + auto md_u = md; + // make_upper/lower_triangular also removes the diagonal, so it is + // added back with make_unit_diagonal + gko::utils::make_lower_triangular(md_l); + gko::utils::make_unit_diagonal(md_l); + gko::utils::make_upper_triangular(md_u); + gko::utils::make_unit_diagonal(md_u); + + mtx->read(md); + d_mtx->read(md); + + result_l->read(md_l); + result_l->scale(gko::initialize({0.0}, ref)); + d_result_l = gko::clone(exec, result_l); + + result_u->read(md_u); + result_u->scale(gko::initialize({0.0}, ref)); + d_result_u = gko::clone(exec, result_u); + } + + std::default_random_engine rand_engine{42}; + + std::unique_ptr mtx = Csr::create(ref); + std::unique_ptr d_mtx = Csr::create(exec); + + std::unique_ptr result_l = Csr::create(ref); + std::unique_ptr d_result_l = Csr::create(exec); + std::unique_ptr result_u = Csr::create(ref); + std::unique_ptr d_result_u = Csr::create(exec); +}; + + +TEST_F(Sor, InitializeWeightedLFactorIsSameAsReference) +{ + gko::kernels::reference::sor::initialize_weighted_l(ref, mtx.get(), 1.24, + result_l.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l( + exec, d_mtx.get(), 1.24, d_result_l.get()); + + GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r::value); +} + + +TEST_F(Sor, InitializeWeightedLAndUFactorIsSameAsReference) +{ + gko::kernels::reference::sor::initialize_weighted_l_u( + ref, mtx.get(), 1.24, result_l.get(), result_u.get()); + gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l_u( + exec, d_mtx.get(), 1.24, d_result_l.get(), d_result_u.get()); + + GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r::value); + GKO_ASSERT_MTX_NEAR(result_u, d_result_u, r::value); +}