Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion benchmark/utils/preconditioners.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ DEFINE_string(preconditioners, "none",
"A comma-separated list of preconditioners to use. "
"Supported values are: none, jacobi, paric, parict, parilu, "
"parilut, ic, ilu, paric-isai, parict-isai, parilu-isai, "
"parilut-isai, ic-isai, ilu-isai, overhead");
"parilut-isai, ic-isai, ilu-isai, sor, overhead");

DEFINE_uint32(parilu_iterations, 5,
"The number of iterations for ParIC(T)/ParILU(T)");
Expand All @@ -49,6 +49,12 @@ DEFINE_double(jacobi_accuracy, 1e-1,
DEFINE_uint32(jacobi_max_block_size, 32,
"Maximal block size of the block-Jacobi preconditioner");

DEFINE_double(sor_relaxation_factor, 1.0,
"The relaxation factor for the SOR preconditioner");

DEFINE_bool(sor_symmetric, false,
"Apply the SOR preconditioner symmetrically, i.e. use SSOR");


// parses the Jacobi storage optimization command line argument
gko::precision_reduction parse_storage_optimization(const std::string& flag)
Expand Down Expand Up @@ -292,6 +298,15 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
.with_sparsity_power(FLAGS_isai_power)
.on(exec);
}},
{"sor",
[](std::shared_ptr<const gko::Executor> exec) {
return gko::preconditioner::Sor<etype, itype>::build()
.with_relaxation_factor(
static_cast<gko::remove_complex<etype>>(
FLAGS_sor_relaxation_factor))
.with_symmetric(FLAGS_sor_symmetric)
.on(exec);
}},
{"overhead", [](std::shared_ptr<const gko::Executor> exec) {
return gko::Overhead<etype>::build()
.with_criteria(gko::stop::ResidualNorm<etype>::build()
Expand Down
1 change: 1 addition & 0 deletions common/cuda_hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ set(CUDA_HIP_SOURCES
preconditioner/jacobi_advanced_apply_kernels.cpp
preconditioner/jacobi_generate_kernels.cpp
preconditioner/jacobi_simple_apply_kernels.cpp
preconditioner/sor_kernels.cpp
reorder/rcm_kernels.cpp
solver/cb_gmres_kernels.cpp
solver/idr_kernels.cpp
Expand Down
112 changes: 112 additions & 0 deletions common/cuda_hip/factorization/factorization_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include "common/cuda_hip/base/config.hpp"
#include "common/cuda_hip/base/runtime.hpp"
#include "common/cuda_hip/base/types.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "core/factorization/factorization_helpers.hpp"


namespace gko {
namespace kernels {
namespace GKO_DEVICE_NAMESPACE {
namespace factorization {
namespace helpers {


using namespace ::gko::factorization;


constexpr int default_block_size{512};


template <typename ValueType, typename IndexType, typename LClosure,
typename UClosure>
__global__ __launch_bounds__(default_block_size) void initialize_l_u(
size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
const IndexType* __restrict__ u_row_ptrs,
IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values,
LClosure l_closure, UClosure u_closure)
{
const auto row = thread::get_thread_id_flat<IndexType>();
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately
// default diagonal to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = l_closure.map_off_diag(val);
++l_idx;
}
if (row < col) {
u_col_idxs[u_idx] = col;
u_values[u_idx] = u_closure.map_off_diag(val);
++u_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
auto u_diag_idx = u_row_ptrs[row];
l_col_idxs[l_diag_idx] = row;
u_col_idxs[u_diag_idx] = row;
l_values[l_diag_idx] = l_closure.map_diag(diag_val);
u_values[u_diag_idx] = u_closure.map_diag(diag_val);
}
}


template <typename ValueType, typename IndexType, typename LClosure>
__global__ __launch_bounds__(default_block_size) void initialize_l(
size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
LClosure l_closure)
{
const auto row = thread::get_thread_id_flat<IndexType>();
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
// if there was no diagonal entry, default to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = l_closure.map_off_diag(val);
++l_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
l_col_idxs[l_diag_idx] = row;
l_values[l_diag_idx] = l_closure.map_diag(diag_val);
}
}


} // namespace helpers
} // namespace factorization
} // namespace GKO_DEVICE_NAMESPACE

} // namespace kernels
} // namespace gko
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
} // namespace gko
} // namespace gko

nit

128 changes: 30 additions & 98 deletions common/cuda_hip/factorization/factorization_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "common/cuda_hip/components/intrinsics.hpp"
#include "common/cuda_hip/components/searching.hpp"
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/factorization/factorization_helpers.hpp"
#include "core/base/array_access.hpp"
#include "core/components/prefix_sum_kernels.hpp"
#include "core/matrix/csr_builder.hpp"
Expand Down Expand Up @@ -255,51 +256,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void initialize_l_u(
size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
const IndexType* __restrict__ u_row_ptrs,
IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values)
{
const auto row = thread::get_thread_id_flat<IndexType>();
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately
// default diagonal to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = val;
++l_idx;
}
if (row < col) {
u_col_idxs[u_idx] = col;
u_values[u_idx] = val;
++u_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
auto u_diag_idx = u_row_ptrs[row];
l_col_idxs[l_diag_idx] = row;
u_col_idxs[u_diag_idx] = row;
l_values[l_diag_idx] = one<ValueType>();
u_values[u_diag_idx] = diag_val;
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
size_type num_rows, const IndexType* __restrict__ row_ptrs,
Expand All @@ -320,48 +276,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void initialize_l(
size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
bool use_sqrt)
{
const auto row = thread::get_thread_id_flat<IndexType>();
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
// if there was no diagonal entry, default to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = val;
++l_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
l_col_idxs[l_diag_idx] = row;
// compute square root with sentinel
if (use_sqrt) {
diag_val = sqrt(diag_val);
if (!is_finite(diag_val)) {
diag_val = one<ValueType>();
}
}
l_values[l_diag_idx] = diag_val;
}
}


} // namespace kernel


Expand Down Expand Up @@ -481,18 +395,25 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* csr_u)
{
const size_type num_rows{system_matrix->get_size()[0]};
const auto block_size = default_block_size;
const auto block_size = helpers::default_block_size;
const auto grid_dim = static_cast<uint32>(
ceildiv(num_rows, static_cast<size_type>(block_size)));

if (grid_dim > 0) {
kernel::initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
num_rows, system_matrix->get_const_row_ptrs(),
system_matrix->get_const_col_idxs(),
as_device_type(system_matrix->get_const_values()),
csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
as_device_type(csr_l->get_values()), csr_u->get_const_row_ptrs(),
csr_u->get_col_idxs(), as_device_type(csr_u->get_values()));
helpers::
initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
num_rows, system_matrix->get_const_row_ptrs(),
system_matrix->get_const_col_idxs(),
as_device_type(system_matrix->get_const_values()),
csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
as_device_type(csr_l->get_values()),
csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(),
as_device_type(csr_u->get_values()),
helpers::triangular_mtx_closure(
[] __device__(auto val) { return one(val); },
helpers::identity{}),
helpers::triangular_mtx_closure(helpers::identity{},
helpers::identity{}));
}
}

Expand Down Expand Up @@ -534,17 +455,28 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
{
const size_type num_rows{system_matrix->get_size()[0]};
const auto block_size = default_block_size;
const auto block_size = helpers::default_block_size;
const auto grid_dim = static_cast<uint32>(
ceildiv(num_rows, static_cast<size_type>(block_size)));

if (grid_dim > 0) {
kernel::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
helpers::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
num_rows, system_matrix->get_const_row_ptrs(),
system_matrix->get_const_col_idxs(),
as_device_type(system_matrix->get_const_values()),
csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
as_device_type(csr_l->get_values()), diag_sqrt);
as_device_type(csr_l->get_values()),
helpers::triangular_mtx_closure(
[diag_sqrt] __device__(auto val) {
if (diag_sqrt) {
val = sqrt(val);
if (!is_finite(val)) {
val = one(val);
}
}
return val;
},
helpers::identity{}));
}
}

Expand Down
Loading
Loading