diff --git a/benchmark/utils/preconditioners.hpp b/benchmark/utils/preconditioners.hpp
index 63fd22708e6..ea8594e7446 100644
--- a/benchmark/utils/preconditioners.hpp
+++ b/benchmark/utils/preconditioners.hpp
@@ -22,7 +22,7 @@ DEFINE_string(preconditioners, "none",
               "A comma-separated list of preconditioners to use. "
               "Supported values are: none, jacobi, paric, parict, parilu, "
               "parilut, ic, ilu, paric-isai, parict-isai, parilu-isai, "
-              "parilut-isai, ic-isai, ilu-isai, overhead");
+              "parilut-isai, ic-isai, ilu-isai, sor, overhead");
 
 DEFINE_uint32(parilu_iterations, 5,
               "The number of iterations for ParIC(T)/ParILU(T)");
@@ -49,6 +49,12 @@ DEFINE_double(jacobi_accuracy, 1e-1,
 DEFINE_uint32(jacobi_max_block_size, 32,
               "Maximal block size of the block-Jacobi preconditioner");
 
+DEFINE_double(sor_relaxation_factor, 1.0,
+              "The relaxation factor for the SOR preconditioner");
+
+DEFINE_bool(sor_symmetric, false,
+            "Apply the SOR preconditioner symmetrically, i.e. use SSOR");
+
 
 // parses the Jacobi storage optimization command line argument
 gko::precision_reduction parse_storage_optimization(const std::string& flag)
@@ -292,6 +298,15 @@ const std::map<std::string, std::function<std::unique_ptr<gko::LinOpFactory>(
                  .with_sparsity_power(FLAGS_isai_power)
                  .on(exec);
          }},
+        {"sor",
+         [](std::shared_ptr<const gko::Executor> exec) {
+             return gko::preconditioner::Sor<etype, itype>::build()
+                 .with_relaxation_factor(
+                     static_cast<gko::remove_complex<etype>>(
+                         FLAGS_sor_relaxation_factor))
+                 .with_symmetric(FLAGS_sor_symmetric)
+                 .on(exec);
+         }},
         {"overhead", [](std::shared_ptr<const gko::Executor> exec) {
              return gko::Overhead<etype>::build()
                  .with_criteria(gko::stop::ResidualNorm<etype>::build()
diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index f5a28596d16..267444d2144 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -38,6 +38,7 @@ set(CUDA_HIP_SOURCES
     preconditioner/jacobi_advanced_apply_kernels.cpp
     preconditioner/jacobi_generate_kernels.cpp
     preconditioner/jacobi_simple_apply_kernels.cpp
+    preconditioner/sor_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/cb_gmres_kernels.cpp
     solver/idr_kernels.cpp
diff --git a/common/cuda_hip/factorization/factorization_helpers.hpp b/common/cuda_hip/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..87248740867
--- /dev/null
+++ b/common/cuda_hip/factorization/factorization_helpers.hpp
@@ -0,0 +1,112 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+constexpr int default_block_size{512};
+
+
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l_u(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    const IndexType* __restrict__ u_row_ptrs,
+    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values,
+    LClosure l_closure, UClosure u_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
+        // default diagonal to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+            if (row < col) {
+                u_col_idxs[u_idx] = col;
+                u_values[u_idx] = u_closure.map_off_diag(val);
+                ++u_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        auto u_diag_idx = u_row_ptrs[row];
+        l_col_idxs[l_diag_idx] = row;
+        u_col_idxs[u_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+        u_values[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename LClosure>
+__global__ __launch_bounds__(default_block_size) void initialize_l(
+    size_type num_rows, const IndexType* __restrict__ row_ptrs,
+    const IndexType* __restrict__ col_idxs,
+    const ValueType* __restrict__ values,
+    const IndexType* __restrict__ l_row_ptrs,
+    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
+    LClosure l_closure)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>();
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        // if there was no diagonal entry, default to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        l_col_idxs[l_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace GKO_DEVICE_NAMESPACE
+
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/common/cuda_hip/factorization/factorization_kernels.cpp b/common/cuda_hip/factorization/factorization_kernels.cpp
index 3a38175ab70..e790cf19540 100644
--- a/common/cuda_hip/factorization/factorization_kernels.cpp
+++ b/common/cuda_hip/factorization/factorization_kernels.cpp
@@ -13,6 +13,7 @@
 #include "common/cuda_hip/components/intrinsics.hpp"
 #include "common/cuda_hip/components/searching.hpp"
 #include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/factorization/factorization_helpers.hpp"
 #include "core/base/array_access.hpp"
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
@@ -255,51 +256,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l_u(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    const IndexType* __restrict__ u_row_ptrs,
-    IndexType* __restrict__ u_col_idxs, ValueType* __restrict__ u_values)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        // default diagonal to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-            if (row < col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
-    }
-}
-
-
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
     size_type num_rows, const IndexType* __restrict__ row_ptrs,
@@ -320,48 +276,6 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_row(
 }
 
 
-template <typename ValueType, typename IndexType>
-__global__ __launch_bounds__(default_block_size) void initialize_l(
-    size_type num_rows, const IndexType* __restrict__ row_ptrs,
-    const IndexType* __restrict__ col_idxs,
-    const ValueType* __restrict__ values,
-    const IndexType* __restrict__ l_row_ptrs,
-    IndexType* __restrict__ l_col_idxs, ValueType* __restrict__ l_values,
-    bool use_sqrt)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>();
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        // if there was no diagonal entry, default to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        l_col_idxs[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (use_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        l_values[l_diag_idx] = diag_val;
-    }
-}
-
-
 }  // namespace kernel
 
 
@@ -481,18 +395,25 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
-            num_rows, system_matrix->get_const_row_ptrs(),
-            system_matrix->get_const_col_idxs(),
-            as_device_type(system_matrix->get_const_values()),
-            csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), csr_u->get_const_row_ptrs(),
-            csr_u->get_col_idxs(), as_device_type(csr_u->get_values()));
+        helpers::
+            initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
+                as_device_type(csr_l->get_values()),
+                csr_u->get_const_row_ptrs(), csr_u->get_col_idxs(),
+                as_device_type(csr_u->get_values()),
+                helpers::triangular_mtx_closure(
+                    [] __device__(auto val) { return one(val); },
+                    helpers::identity{}),
+                helpers::triangular_mtx_closure(helpers::identity{},
+                                                helpers::identity{}));
     }
 }
 
@@ -534,17 +455,28 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const auto block_size = default_block_size;
+    const auto block_size = helpers::default_block_size;
     const auto grid_dim = static_cast<uint32>(
         ceildiv(num_rows, static_cast<size_type>(block_size)));
 
     if (grid_dim > 0) {
-        kernel::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+        helpers::initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
             num_rows, system_matrix->get_const_row_ptrs(),
             system_matrix->get_const_col_idxs(),
             as_device_type(system_matrix->get_const_values()),
             csr_l->get_const_row_ptrs(), csr_l->get_col_idxs(),
-            as_device_type(csr_l->get_values()), diag_sqrt);
+            as_device_type(csr_l->get_values()),
+            helpers::triangular_mtx_closure(
+                [diag_sqrt] __device__(auto val) {
+                    if (diag_sqrt) {
+                        val = sqrt(val);
+                        if (!is_finite(val)) {
+                            val = one(val);
+                        }
+                    }
+                    return val;
+                },
+                helpers::identity{}));
     }
 }
 
diff --git a/common/cuda_hip/preconditioner/sor_kernels.cpp b/common/cuda_hip/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..a415953915f
--- /dev/null
+++ b/common/cuda_hip/preconditioner/sor_kernels.cpp
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "common/cuda_hip/factorization/factorization_helpers.hpp"
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto block_size = factorization::helpers::default_block_size;
+    const auto grid_dim = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(block_size)));
+
+    auto inv_weight = one(weight) / weight;
+
+    if (grid_dim > 0) {
+        factorization::helpers::
+            initialize_l<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(),
+                as_device_type(l_mtx->get_values()),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight] __device__(auto val) {
+                        return val * inv_weight;
+                    },
+                    factorization::helpers::identity{}));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const auto block_size = factorization::helpers::default_block_size;
+    const auto grid_dim = static_cast<uint32>(
+        ceildiv(num_rows, static_cast<size_type>(block_size)));
+
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+
+    if (grid_dim > 0) {
+        factorization::helpers::
+            initialize_l_u<<<grid_dim, block_size, 0, exec->get_stream()>>>(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                as_device_type(system_matrix->get_const_values()),
+                l_mtx->get_const_row_ptrs(), l_mtx->get_col_idxs(),
+                as_device_type(l_mtx->get_values()),
+                u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
+                as_device_type(u_mtx->get_values()),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight] __device__(auto val) {
+                        return val * inv_weight;
+                    },
+                    factorization::helpers::identity{}),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_two_minus_weight] __device__(auto val) {
+                        return val * inv_two_minus_weight;
+                    },
+                    [weight, inv_two_minus_weight] __device__(auto val) {
+                        return val * weight * inv_two_minus_weight;
+                    }));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/unified/CMakeLists.txt b/common/unified/CMakeLists.txt
index 132e04c5d9a..00bc21df0c6 100644
--- a/common/unified/CMakeLists.txt
+++ b/common/unified/CMakeLists.txt
@@ -19,7 +19,6 @@ set(UNIFIED_SOURCES
     matrix/diagonal_kernels.cpp
     multigrid/pgm_kernels.cpp
     preconditioner/jacobi_kernels.cpp
-    preconditioner/sor_kernels.cpp
     solver/bicg_kernels.cpp
     solver/bicgstab_kernels.cpp
     solver/cg_kernels.cpp
diff --git a/common/unified/preconditioner/sor_kernels.cpp b/common/unified/preconditioner/sor_kernels.cpp
deleted file mode 100644
index 8932c1df562..00000000000
--- a/common/unified/preconditioner/sor_kernels.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/preconditioner/sor_kernels.hpp"
-
-#include <ginkgo/core/base/math.hpp>
-#include <ginkgo/core/matrix/csr.hpp>
-
-#include "common/unified/base/kernel_launch.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace GKO_DEVICE_NAMESPACE {
-namespace sor {
-
-
-template <typename ValueType, typename IndexType>
-void initialize_weighted_l(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* system_matrix,
-    remove_complex<ValueType> weight,
-    matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
-
-
-template <typename ValueType, typename IndexType>
-void initialize_weighted_l_u(
-    std::shared_ptr<const DefaultExecutor> exec,
-    const matrix::Csr<ValueType, IndexType>* system_matrix,
-    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
-    matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
-
-GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
-    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
-
-
-}  // namespace sor
-}  // namespace GKO_DEVICE_NAMESPACE
-}  // namespace kernels
-}  // namespace gko
diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt
index 851ef9a3dc6..bf65888a6ab 100644
--- a/dpcpp/CMakeLists.txt
+++ b/dpcpp/CMakeLists.txt
@@ -56,6 +56,7 @@ target_sources(ginkgo_dpcpp
     preconditioner/jacobi_generate_kernel.dp.cpp
     preconditioner/jacobi_kernels.dp.cpp
     preconditioner/jacobi_simple_apply_kernel.dp.cpp
+    preconditioner/sor_kernels.dp.cpp
     reorder/rcm_kernels.dp.cpp
     solver/batch_bicgstab_kernels.dp.cpp
     solver/batch_cg_kernels.dp.cpp
diff --git a/dpcpp/factorization/factorization_helpers.dp.hpp b/dpcpp/factorization/factorization_helpers.dp.hpp
new file mode 100644
index 00000000000..9779e134e77
--- /dev/null
+++ b/dpcpp/factorization/factorization_helpers.dp.hpp
@@ -0,0 +1,110 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <CL/sycl.hpp>
+
+#include "core/factorization/factorization_helpers.hpp"
+#include "dpcpp/base/config.hpp"
+#include "dpcpp/base/dim3.dp.hpp"
+#include "dpcpp/base/dpct.hpp"
+#include "dpcpp/components/thread_ids.dp.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace factorization {
+namespace helpers {
+
+using namespace ::gko::factorization;
+
+
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
+                    const IndexType* __restrict__ col_idxs,
+                    const ValueType* __restrict__ values,
+                    const IndexType* __restrict__ l_row_ptrs,
+                    IndexType* __restrict__ l_col_idxs,
+                    ValueType* __restrict__ l_values,
+                    const IndexType* __restrict__ u_row_ptrs,
+                    IndexType* __restrict__ u_col_idxs,
+                    ValueType* __restrict__ u_values, LClosure l_closure,
+                    UClosure u_closure, sycl::nd_item<3> item_ct1)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
+        // default diagonal to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+            if (row < col) {
+                u_col_idxs[u_idx] = col;
+                u_values[u_idx] = u_closure.map_off_diag(val);
+                ++u_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        auto u_diag_idx = u_row_ptrs[row];
+        l_col_idxs[l_diag_idx] = row;
+        u_col_idxs[u_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+        u_values[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename LClosure>
+void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
+                  const IndexType* __restrict__ col_idxs,
+                  const ValueType* __restrict__ values,
+                  const IndexType* __restrict__ l_row_ptrs,
+                  IndexType* __restrict__ l_col_idxs,
+                  ValueType* __restrict__ l_values, LClosure l_closure,
+                  sycl::nd_item<3> item_ct1)
+{
+    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
+    if (row < num_rows) {
+        auto l_idx = l_row_ptrs[row];
+        // if there was no diagonal entry, default to one
+        auto diag_val = one<ValueType>();
+        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
+            const auto col = col_idxs[i];
+            const auto val = values[i];
+            // save diagonal entry for later
+            if (col == row) {
+                diag_val = val;
+            }
+            if (col < row) {
+                l_col_idxs[l_idx] = col;
+                l_values[l_idx] = l_closure.map_off_diag(val);
+                ++l_idx;
+            }
+        }
+        // store diagonal entries
+        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
+        l_col_idxs[l_diag_idx] = row;
+        l_values[l_diag_idx] = l_closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/dpcpp/factorization/factorization_kernels.dp.cpp b/dpcpp/factorization/factorization_kernels.dp.cpp
index 1d9912b4f12..885fe481609 100644
--- a/dpcpp/factorization/factorization_kernels.dp.cpp
+++ b/dpcpp/factorization/factorization_kernels.dp.cpp
@@ -18,6 +18,7 @@
 #include "dpcpp/components/intrinsics.dp.hpp"
 #include "dpcpp/components/searching.dp.hpp"
 #include "dpcpp/components/thread_ids.dp.hpp"
+#include "dpcpp/factorization/factorization_helpers.dp.hpp"
 
 
 namespace gko {
@@ -320,51 +321,6 @@ void count_nnz_per_l_u_row(dim3 grid, dim3 block,
 }
 
 
-template <typename ValueType, typename IndexType>
-void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
-                    const IndexType* __restrict__ col_idxs,
-                    const ValueType* __restrict__ values,
-                    const IndexType* __restrict__ l_row_ptrs,
-                    IndexType* __restrict__ l_col_idxs,
-                    ValueType* __restrict__ l_values,
-                    const IndexType* __restrict__ u_row_ptrs,
-                    IndexType* __restrict__ u_col_idxs,
-                    ValueType* __restrict__ u_values, sycl::nd_item<3> item_ct1)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        // default diagonal to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-            if (row < col) {
-                u_col_idxs[u_idx] = col;
-                u_values[u_idx] = val;
-                ++u_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
-    }
-}
-
 template <typename ValueType, typename IndexType>
 void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                     sycl::queue* queue, size_type num_rows,
@@ -376,9 +332,14 @@ void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->parallel_for(
         sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-            initialize_l_u(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
-                           l_col_idxs, l_values, u_row_ptrs, u_col_idxs,
-                           u_values, item_ct1);
+            helpers::initialize_l_u(
+                num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs,
+                l_values, u_row_ptrs, u_col_idxs, u_values,
+                helpers::triangular_mtx_closure(
+                    [](auto) { return one<ValueType>(); }, helpers::identity{}),
+                helpers::triangular_mtx_closure(helpers::identity{},
+                                                helpers::identity{}),
+                item_ct1);
         });
 }
 
@@ -418,47 +379,6 @@ void count_nnz_per_l_row(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 }
 
 
-template <typename ValueType, typename IndexType>
-void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
-                  const IndexType* __restrict__ col_idxs,
-                  const ValueType* __restrict__ values,
-                  const IndexType* __restrict__ l_row_ptrs,
-                  IndexType* __restrict__ l_col_idxs,
-                  ValueType* __restrict__ l_values, bool use_sqrt,
-                  sycl::nd_item<3> item_ct1)
-{
-    const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
-    if (row < num_rows) {
-        auto l_idx = l_row_ptrs[row];
-        // if there was no diagonal entry, default to one
-        auto diag_val = one<ValueType>();
-        for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
-            const auto col = col_idxs[i];
-            const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                diag_val = val;
-            }
-            if (col < row) {
-                l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
-                ++l_idx;
-            }
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        l_col_idxs[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (use_sqrt) {
-            diag_val = std::sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        l_values[l_diag_idx] = diag_val;
-    }
-}
-
 template <typename ValueType, typename IndexType>
 void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
                   sycl::queue* queue, size_type num_rows,
@@ -468,8 +388,20 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
 {
     queue->parallel_for(
         sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
-            initialize_l(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
-                         l_col_idxs, l_values, use_sqrt, item_ct1);
+            helpers::initialize_l(num_rows, row_ptrs, col_idxs, values,
+                                  l_row_ptrs, l_col_idxs, l_values,
+                                  helpers::triangular_mtx_closure(
+                                      [use_sqrt](auto val) {
+                                          if (use_sqrt) {
+                                              val = sqrt(val);
+                                              if (!is_finite(val)) {
+                                                  val = one<ValueType>();
+                                              }
+                                          }
+                                          return val;
+                                      },
+                                      helpers::identity{}),
+                                  item_ct1);
         });
 }
 
diff --git a/dpcpp/preconditioner/sor_kernels.dp.cpp b/dpcpp/preconditioner/sor_kernels.dp.cpp
new file mode 100644
index 00000000000..fe796586591
--- /dev/null
+++ b/dpcpp/preconditioner/sor_kernels.dp.cpp
@@ -0,0 +1,100 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "dpcpp/factorization/factorization_helpers.dp.hpp"
+
+namespace gko {
+namespace kernels {
+namespace dpcpp {
+namespace sor {
+
+
+constexpr int default_block_size{256};
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+
+    auto inv_weight = one(weight) / weight;
+
+    exec->get_queue()->parallel_for(
+        sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
+            factorization::helpers::initialize_l(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
+                l_mtx->get_col_idxs(), l_mtx->get_values(),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight](auto val) { return val * inv_weight; },
+                    factorization::helpers::identity{}),
+                item_ct1);
+        });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    const size_type num_rows{system_matrix->get_size()[0]};
+    const dim3 block_size{default_block_size, 1, 1};
+    const dim3 grid_dim{static_cast<uint32>(ceildiv(
+                            num_rows, static_cast<size_type>(block_size.x))),
+                        1, 1};
+
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+
+    exec->get_queue()->parallel_for(
+        sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
+            factorization::helpers::initialize_l_u(
+                num_rows, system_matrix->get_const_row_ptrs(),
+                system_matrix->get_const_col_idxs(),
+                system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
+                l_mtx->get_col_idxs(), l_mtx->get_values(),
+                u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
+                u_mtx->get_values(),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_weight](auto val) { return val * inv_weight; },
+                    factorization::helpers::identity{}),
+                factorization::helpers::triangular_mtx_closure(
+                    [inv_two_minus_weight](auto val) {
+                        return val * inv_two_minus_weight;
+                    },
+                    [weight, inv_two_minus_weight](auto val) {
+                        return val * weight * inv_two_minus_weight;
+                    }),
+                item_ct1);
+        });
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace dpcpp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index 41bec80673f..fef0702048f 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -40,6 +40,7 @@ target_sources(ginkgo_omp
     preconditioner/batch_jacobi_kernels.cpp
     preconditioner/isai_kernels.cpp
     preconditioner/jacobi_kernels.cpp
+    preconditioner/sor_kernels.cpp
     reorder/rcm_kernels.cpp
     solver/batch_bicgstab_kernels.cpp
     solver/batch_cg_kernels.cpp
diff --git a/omp/factorization/factorization_helpers.hpp b/omp/factorization/factorization_helpers.hpp
new file mode 100644
index 00000000000..f1eed7d4d37
--- /dev/null
+++ b/omp/factorization/factorization_helpers.hpp
@@ -0,0 +1,114 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "core/factorization/factorization_helpers.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace factorization {
+namespace helpers {
+
+
+using namespace ::gko::factorization;
+
+
+template <typename ValueType, typename IndexType, typename LClosure,
+          typename UClosure>
+void initialize_l_u(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                    matrix::Csr<ValueType, IndexType>* csr_l,
+                    matrix::Csr<ValueType, IndexType>* csr_u,
+                    LClosure&& l_closure, UClosure&& u_closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
+    auto col_idxs_u = csr_u->get_col_idxs();
+    auto vals_u = csr_u->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        size_type current_index_u =
+            row_ptrs_u[row] + 1;  // we treat the diagonal separately
+        // if there is no diagonal value, set it to 1 by default
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = l_closure.map_off_diag(val);
+                ++current_index_l;
+            } else if (col == row) {
+                // save value for later
+                diag_val = val;
+            } else {  // col > row
+                col_idxs_u[current_index_u] = col;
+                vals_u[current_index_u] = u_closure.map_off_diag(val);
+                ++current_index_u;
+            }
+        }
+        // store diagonal entries
+        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
+        size_type u_diag_idx = row_ptrs_u[row];
+        col_idxs_l[l_diag_idx] = row;
+        col_idxs_u[u_diag_idx] = row;
+        vals_l[l_diag_idx] = l_closure.map_diag(diag_val);
+        vals_u[u_diag_idx] = u_closure.map_diag(diag_val);
+    }
+}
+
+
+template <typename ValueType, typename IndexType, typename Closure>
+void initialize_l(const matrix::Csr<ValueType, IndexType>* system_matrix,
+                  matrix::Csr<ValueType, IndexType>* csr_l, Closure&& closure)
+{
+    const auto row_ptrs = system_matrix->get_const_row_ptrs();
+    const auto col_idxs = system_matrix->get_const_col_idxs();
+    const auto vals = system_matrix->get_const_values();
+
+    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
+    auto col_idxs_l = csr_l->get_col_idxs();
+    auto vals_l = csr_l->get_values();
+
+#pragma omp parallel for
+    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+        size_type current_index_l = row_ptrs_l[row];
+        // if there is no diagonal value, set it to 1 by default
+        auto diag_val = one<ValueType>();
+        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
+            const auto col = col_idxs[el];
+            const auto val = vals[el];
+            if (col < row) {
+                col_idxs_l[current_index_l] = col;
+                vals_l[current_index_l] = val;
+                ++current_index_l;
+            } else if (col == row) {
+                // save value for later
+                diag_val = val;
+            }
+        }
+        // store diagonal entries
+        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
+        col_idxs_l[l_diag_idx] = row;
+        vals_l[l_diag_idx] = closure.map_diag(diag_val);
+    }
+}
+
+
+}  // namespace helpers
+}  // namespace factorization
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/omp/factorization/factorization_kernels.cpp b/omp/factorization/factorization_kernels.cpp
index f4b41cbdac5..e7b66f6f887 100644
--- a/omp/factorization/factorization_kernels.cpp
+++ b/omp/factorization/factorization_kernels.cpp
@@ -12,6 +12,7 @@
 
 #include "core/components/prefix_sum_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
+#include "omp/factorization/factorization_helpers.hpp"
 
 
 namespace gko {
@@ -224,49 +225,12 @@ void initialize_l_u(std::shared_ptr<const OmpExecutor> exec,
                     matrix::Csr<ValueType, IndexType>* csr_l,
                     matrix::Csr<ValueType, IndexType>* csr_u)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-    const auto row_ptrs_u = csr_u->get_const_row_ptrs();
-    auto col_idxs_u = csr_u->get_col_idxs();
-    auto vals_u = csr_u->get_values();
-
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        size_type current_index_u =
-            row_ptrs_u[row] + 1;  // we treat the diagonal separately
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save value for later
-                diag_val = val;
-            } else {  // col > row
-                col_idxs_u[current_index_u] = col;
-                vals_u[current_index_u] = val;
-                ++current_index_u;
-            }
-        }
-        // store diagonal entries
-        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
-        size_type u_diag_idx = row_ptrs_u[row];
-        col_idxs_l[l_diag_idx] = row;
-        col_idxs_u[u_diag_idx] = row;
-        vals_l[l_diag_idx] = one<ValueType>();
-        vals_u[u_diag_idx] = diag_val;
-    }
+    helpers::initialize_l_u(
+        system_matrix, csr_l, csr_u,
+        helpers::triangular_mtx_closure([](auto) { return one<ValueType>(); },
+                                        helpers::identity{}),
+        helpers::triangular_mtx_closure(helpers::identity{},
+                                        helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -309,43 +273,18 @@ void initialize_l(std::shared_ptr<const OmpExecutor> exec,
                   const matrix::Csr<ValueType, IndexType>* system_matrix,
                   matrix::Csr<ValueType, IndexType>* csr_l, bool diag_sqrt)
 {
-    const auto row_ptrs = system_matrix->get_const_row_ptrs();
-    const auto col_idxs = system_matrix->get_const_col_idxs();
-    const auto vals = system_matrix->get_const_values();
-
-    const auto row_ptrs_l = csr_l->get_const_row_ptrs();
-    auto col_idxs_l = csr_l->get_col_idxs();
-    auto vals_l = csr_l->get_values();
-
-#pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
-        size_type current_index_l = row_ptrs_l[row];
-        // if there is no diagonal value, set it to 1 by default
-        auto diag_val = one<ValueType>();
-        for (size_type el = row_ptrs[row]; el < row_ptrs[row + 1]; ++el) {
-            const auto col = col_idxs[el];
-            const auto val = vals[el];
-            if (col < row) {
-                col_idxs_l[current_index_l] = col;
-                vals_l[current_index_l] = val;
-                ++current_index_l;
-            } else if (col == row) {
-                // save value for later
-                diag_val = val;
-            }
-        }
-        // store diagonal entries
-        size_type l_diag_idx = row_ptrs_l[row + 1] - 1;
-        col_idxs_l[l_diag_idx] = row;
-        // compute square root with sentinel
-        if (diag_sqrt) {
-            diag_val = sqrt(diag_val);
-            if (!is_finite(diag_val)) {
-                diag_val = one<ValueType>();
-            }
-        }
-        vals_l[l_diag_idx] = diag_val;
-    }
+    helpers::initialize_l(system_matrix, csr_l,
+                          helpers::triangular_mtx_closure(
+                              [diag_sqrt](auto val) {
+                                  if (diag_sqrt) {
+                                      val = sqrt(val);
+                                      if (!is_finite(val)) {
+                                          val = one<ValueType>();
+                                      }
+                                  }
+                                  return val;
+                              },
+                              helpers::identity{}));
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/preconditioner/sor_kernels.cpp b/omp/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..509946ac15a
--- /dev/null
+++ b/omp/preconditioner/sor_kernels.cpp
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+
+#include "omp/factorization/factorization_helpers.hpp"
+
+namespace gko {
+namespace kernels {
+namespace omp {
+namespace sor {
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
+{
+    auto inv_weight = one(weight) / weight;
+    factorization::helpers::initialize_l(
+        system_matrix, l_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },
+            [](auto val) { return val; }));
+};
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
+
+
+template <typename ValueType, typename IndexType>
+void initialize_weighted_l_u(
+    std::shared_ptr<const DefaultExecutor> exec,
+    const matrix::Csr<ValueType, IndexType>* system_matrix,
+    remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
+    matrix::Csr<ValueType, IndexType>* u_mtx)
+{
+    auto inv_weight = one(weight) / weight;
+    auto inv_two_minus_weight =
+        one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);
+    factorization::helpers::initialize_l_u(
+        system_matrix, l_mtx, u_mtx,
+        factorization::helpers::triangular_mtx_closure(
+            [inv_weight](auto val) { return val * inv_weight; },
+            [](auto val) { return val; }),
+        factorization::helpers::triangular_mtx_closure(
+            [inv_two_minus_weight](auto val) {
+                return val * inv_two_minus_weight;
+            },
+            [weight, inv_two_minus_weight](auto val) {
+                return val * weight * inv_two_minus_weight;
+            }));
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
+
+
+}  // namespace sor
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
diff --git a/test/factorization/CMakeLists.txt b/test/factorization/CMakeLists.txt
index e768a48ef05..8b5aa51287b 100644
--- a/test/factorization/CMakeLists.txt
+++ b/test/factorization/CMakeLists.txt
@@ -1,4 +1,5 @@
 ginkgo_create_common_test(cholesky_kernels DISABLE_EXECUTORS dpcpp)
+ginkgo_create_common_test(factorization_kernels)
 ginkgo_create_common_test(lu_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(ic_kernels DISABLE_EXECUTORS dpcpp omp)
 ginkgo_create_common_test(ilu_kernels DISABLE_EXECUTORS dpcpp omp)
diff --git a/test/factorization/factorization_kernels.cpp b/test/factorization/factorization_kernels.cpp
new file mode 100644
index 00000000000..7887d83e0f7
--- /dev/null
+++ b/test/factorization/factorization_kernels.cpp
@@ -0,0 +1,78 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/factorization/factorization_kernels.hpp"
+
+#include <fstream>
+#include <memory>
+#include <random>
+#include <string>
+
+#include <gtest/gtest.h>
+
+#include <core/utils/matrix_utils.hpp>
+
+#include <ginkgo/core/base/executor.hpp>
+
+#include "core/test/utils.hpp"
+#include "core/test/utils/unsort_matrix.hpp"
+#include "test/utils/common_fixture.hpp"
+
+
+class Factorization : public CommonTestFixture {
+protected:
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+
+    Factorization()
+    {
+        mtx = gko::test::generate_random_matrix<Csr>(
+            52, 52, std::uniform_int_distribution<>(4, 40),
+            std::uniform_real_distribution<>(1, 2), rand_engine, ref);
+        gko::utils::ensure_all_diagonal_entries(mtx.get());
+        dmtx = gko::clone(exec, mtx);
+    }
+
+    std::default_random_engine rand_engine{6794};
+    std::shared_ptr<Csr> mtx;
+    std::shared_ptr<Csr> dmtx;
+};
+
+
+TEST_F(Factorization, InitializeRowPtrsLSameAsRef)
+{
+    gko::array<index_type> l_ptrs{ref, mtx->get_size()[0] + 1};
+    gko::array<index_type> dl_ptrs{exec, mtx->get_size()[0] + 1};
+
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        ref, mtx.get(), l_ptrs.get_data());
+    gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_row_ptrs_l(
+        exec, dmtx.get(), dl_ptrs.get_data());
+
+    GKO_ASSERT_ARRAY_EQ(l_ptrs, dl_ptrs);
+}
+
+
+TEST_F(Factorization, InitializeLWithoutSqrtSameAsRef)
+{
+    gko::array<index_type> l_ptrs{ref, mtx->get_size()[0] + 1};
+    gko::kernels::reference::factorization::initialize_row_ptrs_l(
+        ref, mtx.get(), l_ptrs.get_data());
+    auto nnz =
+        static_cast<gko::size_type>(l_ptrs.get_data()[mtx->get_size()[0]]);
+    auto l_mtx =
+        Csr::create(ref, mtx->get_size(), gko::array<value_type>(ref, nnz),
+                    gko::array<index_type>(ref, nnz), l_ptrs);
+    auto dl_mtx = gko::clone(exec, l_mtx);
+
+    for (auto diag_sqrt : {false, true}) {
+        SCOPED_TRACE("diag_sqrt: " + std::to_string(diag_sqrt));
+
+        gko::kernels::reference::factorization::initialize_l(
+            ref, mtx.get(), l_mtx.get(), diag_sqrt);
+        gko::kernels::GKO_DEVICE_NAMESPACE::factorization::initialize_l(
+            exec, dmtx.get(), dl_mtx.get(), diag_sqrt);
+
+        GKO_ASSERT_MTX_NEAR(l_mtx, dl_mtx, 0.0);
+    }
+}
diff --git a/test/preconditioner/CMakeLists.txt b/test/preconditioner/CMakeLists.txt
index b41897efaac..46696e29549 100644
--- a/test/preconditioner/CMakeLists.txt
+++ b/test/preconditioner/CMakeLists.txt
@@ -1,3 +1,4 @@
 ginkgo_create_common_test(batch_jacobi_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(jacobi_kernels DISABLE_EXECUTORS dpcpp)
 ginkgo_create_common_test(isai_kernels)
+ginkgo_create_common_test(sor_kernels)
diff --git a/test/preconditioner/sor_kernels.cpp b/test/preconditioner/sor_kernels.cpp
new file mode 100644
index 00000000000..cd12514bb28
--- /dev/null
+++ b/test/preconditioner/sor_kernels.cpp
@@ -0,0 +1,89 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "core/preconditioner/sor_kernels.hpp"
+
+#include <random>
+
+#include <gtest/gtest.h>
+
+#include <ginkgo/core/base/exception.hpp>
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/matrix/csr.hpp>
+#include <ginkgo/core/matrix/dense.hpp>
+
+#include "core/test/utils.hpp"
+#include "core/utils/matrix_utils.hpp"
+#include "test/utils/common_fixture.hpp"
+#include "test/utils/executor.hpp"
+
+
+class Sor : public CommonTestFixture {
+protected:
+    using Csr = gko::matrix::Csr<value_type, index_type>;
+    using Dense = gko::matrix::Dense<value_type>;
+
+    Sor()
+    {
+        gko::size_type n = 133;
+        index_type row_limit = 15;
+        auto nz_dist = std::uniform_int_distribution<index_type>(1, row_limit);
+        auto val_dist = std::uniform_real_distribution<value_type>(-1., 1.);
+        auto md =
+            gko::test::generate_random_matrix_data<value_type, index_type>(
+                n, n, nz_dist, val_dist, rand_engine);
+        auto md_l = md;
+        auto md_u = md;
+        // make_upper/lower_triangular also removes the diagonal, so it is
+        // added back with make_unit_diagonal
+        gko::utils::make_lower_triangular(md_l);
+        gko::utils::make_unit_diagonal(md_l);
+        gko::utils::make_upper_triangular(md_u);
+        gko::utils::make_unit_diagonal(md_u);
+
+        mtx->read(md);
+        d_mtx->read(md);
+
+        result_l->read(md_l);
+        result_l->scale(gko::initialize<Dense>({0.0}, ref));
+        d_result_l = gko::clone(exec, result_l);
+
+        result_u->read(md_u);
+        result_u->scale(gko::initialize<Dense>({0.0}, ref));
+        d_result_u = gko::clone(exec, result_u);
+    }
+
+    std::default_random_engine rand_engine{42};
+
+    std::unique_ptr<Csr> mtx = Csr::create(ref);
+    std::unique_ptr<Csr> d_mtx = Csr::create(exec);
+
+    std::unique_ptr<Csr> result_l = Csr::create(ref);
+    std::unique_ptr<Csr> d_result_l = Csr::create(exec);
+    std::unique_ptr<Csr> result_u = Csr::create(ref);
+    std::unique_ptr<Csr> d_result_u = Csr::create(exec);
+};
+
+
+TEST_F(Sor, InitializeWeightedLFactorIsSameAsReference)
+{
+    gko::kernels::reference::sor::initialize_weighted_l(ref, mtx.get(), 1.24,
+                                                        result_l.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l(
+        exec, d_mtx.get(), 1.24, d_result_l.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
+}
+
+
+TEST_F(Sor, InitializeWeightedLAndUFactorIsSameAsReference)
+{
+    gko::kernels::reference::sor::initialize_weighted_l_u(
+        ref, mtx.get(), 1.24, result_l.get(), result_u.get());
+    gko::kernels::GKO_DEVICE_NAMESPACE::sor::initialize_weighted_l_u(
+        exec, d_mtx.get(), 1.24, d_result_l.get(), d_result_u.get());
+
+    GKO_ASSERT_MTX_NEAR(result_l, d_result_l, r<value_type>::value);
+    GKO_ASSERT_MTX_NEAR(result_u, d_result_u, r<value_type>::value);
+}