ginkgo-project · MarcelKoch · Oct 31, 2024 · Jul 9, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp
@@ -22,7 +22,7 @@ DEFINE_string(preconditioners, "none",
               "A comma-separated list of preconditioners to use. "
               "Supported values are: none, jacobi, paric, parict, parilu, "
               "parilut, ic, ilu, paric-isai, parict-isai, parilu-isai, "
-              "parilut-isai, ic-isai, ilu-isai, overhead");
+              "parilut-isai, ic-isai, ilu-isai, sor, overhead");
 
 DEFINE_uint32(parilu_iterations, 5,
               "The number of iterations for ParIC(T)/ParILU(T)");
@@ -49,6 +49,12 @@ DEFINE_double(jacobi_accuracy, 1e-1,
 DEFINE_uint32(jacobi_max_block_size, 32,
               "Maximal block size of the block-Jacobi preconditioner");
 
+DEFINE_double(sor_relaxation_factor, 1.0,
+              "The relaxation factor for the SOR preconditioner");
+
+DEFINE_bool(sor_symmetric, false,
+            "Apply the SOR preconditioner symmetrically, i.e. use SSOR");
+
 
 // parses the Jacobi storage optimization command line argument
 gko::precision_reduction parse_storage_optimization(const std::string& flag)
@@ -292,6 +298,15 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                  .with_sparsity_power(FLAGS_isai_power)
                  .on(exec);
          }},
+        {"sor",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             return gko::preconditioner::Sor<etype, itype>::build()
+                 .with_relaxation_factor(
+                     static_cast<gko::remove_complex<etype>>(
+                         FLAGS_sor_relaxation_factor))
+                 .with_symmetric(FLAGS_sor_symmetric)
+                 .on(exec);
+         }},
         {"overhead", [](std::shared_ptr<const gko::Executor> exec) {
              return gko::Overhead<etype>::build()
                  .with_criteria(gko::stop::ResidualNorm<etype>::build()

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
@@ -38,6 +38,7 @@ set(CUDA_HIP_SOURCES
     preconditioner/jacobi_advanced_apply_kernels.cpp
     preconditioner/jacobi_generate_kernels.cpp
     preconditioner/jacobi_simple_apply_kernels.cpp
+    preconditioner/sor_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp

diff --git a/common/cuda_hip/factorization/factorization_helpers.hpp b/common/cuda_hip/factorization/factorization_helpers.hpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+constexpr int default_block_size{512};
+
+
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l_u(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    const IndexType* __restrict__ u_row_ptrs,
+    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values,
+    LClosure l_closure, UClosure u_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
+        // default diagonal to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+            if (row < col) {
+                u_col_idxs[u_idx] = col;
+                u_values[u_idx] = u_closure.map_off_diag(val);
+                ++u_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        auto u_diag_idx = u_row_ptrs[row];
+        l_col_idxs[l_diag_idx] = row;
+        u_col_idxs[u_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+        u_values[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename LClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    LClosure l_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        // if there was no diagonal entry, default to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        l_col_idxs[l_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+
+}  // namespace kernels
+}  // namespace gko
-}  // namespace gko
+}  // namespace gko
+
-}  // namespace gko
+}  // namespace gko
+
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -13,6 +13,7 @@
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/searching.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/factorization_helpers.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -255,51 +256,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l_u(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    const IndexType* __restrict__ u_row_ptrs,
-    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        // default diagonal to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-            if (row < col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
-    }
-}
-
-
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
     size_type num_rows, const IndexType* __restrict__ row_ptrs,
@@ -320,48 +276,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    bool use_sqrt)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        // if there was no diagonal entry, default to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        l_col_idxs[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (use_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        l_values[l_diag_idx] = diag_val;
-    }
-}
-
-
 }  // namespace kernel
 
 
@@ -481,18 +395,25 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
-            num_rows, system_matrix->get_const_row_ptrs(),
-            system_matrix->get_const_col_idxs(),
-            as_device_type(system_matrix->get_const_values()),
-            csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), csr_u->get_const_row_ptrs(),
-            csr_u->get_col_idxs(), as_device_type(csr_u->get_values()));
+        helpers::
+            initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
+                as_device_type(csr_l->get_values()),
+                csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(),
+                as_device_type(csr_u->get_values()),
+                helpers::triangular_mtx_closure(
+                    [] __device__(auto val) { return one(val); },
+                    helpers::identity{}),
+                helpers::triangular_mtx_closure(helpers::identity{},
+                                                helpers::identity{}));
     }
 }
 
@@ -534,17 +455,28 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+        helpers::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
             num_rows, system_matrix->get_const_row_ptrs(),
             system_matrix->get_const_col_idxs(),
             as_device_type(system_matrix->get_const_values()),
             csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), diag_sqrt);
+            as_device_type(csr_l->get_values()),
+            helpers::triangular_mtx_closure(
+                [diag_sqrt] __device__(auto val) {
+                    if (diag_sqrt) {
+                        val = sqrt(val);
+                        if (!is_finite(val)) {
+                            val = one(val);
+                        }
+                    }
+                    return val;
+                },
+                helpers::identity{}));
     }
 }