ginkgo-project · gojakuch · Jul 18, 2025 · Jul 26, 2025 · Jul 31, 2025 · Aug 3, 2025
diff --git a/benchmark/solver/distributed/solver.cpp b/benchmark/solver/distributed/solver.cpp
@@ -105,8 +105,8 @@ int main(int argc, char* argv[])
         print_general_information(extra_information, exec);
     }
 
-    std::set<std::string> supported_solvers = {"cg", "fcg", "cgs", "bicgstab",
-                                               "gmres"};
+    std::set<std::string> supported_solvers = {"cg",       "fcg",   "cgs",
+                                               "bicgstab", "gmres", "pipe_cg"};
     auto solvers = split(FLAGS_solvers, ',');
     for (const auto& solver : solvers) {
         if (supported_solvers.find(solver) == supported_solvers.end()) {

diff --git a/common/unified/solver/pipe_cg_kernels.cpp b/common/unified/solver/pipe_cg_kernels.cpp
@@ -38,8 +38,8 @@ void initialize_1(std::shared_ptr<const DefaultExecutor> exec,
                 }
                 r(row, col) = b(row, col);
             },
-            b->get_size(), b->get_stride(), b, default_stride(r),
-            row_vector(prev_rho), *stop_status);
+            b->get_size(), b->get_stride(), b, r, row_vector(prev_rho),
+            *stop_status);
     } else {
         run_kernel(
             exec,
@@ -83,9 +83,7 @@ void initialize_2(std::shared_ptr<const DefaultExecutor> exec,
                 f(row, col) = m(row, col);
                 g(row, col) = n(row, col);
             },
-            p->get_size(), p->get_stride(), default_stride(p),
-            default_stride(q), default_stride(f), default_stride(g),
-            row_vector(beta), default_stride(z), default_stride(w),
+            p->get_size(), p->get_stride(), p, q, f, g, row_vector(beta), z, w,
             default_stride(m), default_stride(n), row_vector(delta));
     } else {
         run_kernel(
@@ -103,8 +101,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_INITIALIZE_2_KERNEL);
 template <typename ValueType>
 void step_1(std::shared_ptr<const DefaultExecutor> exec,
             matrix::Dense<ValueType>* x, matrix::Dense<ValueType>* r,
-            matrix::Dense<ValueType>* z, matrix::Dense<ValueType>* w,
-            const matrix::Dense<ValueType>* p,
+            matrix::Dense<ValueType>* z1, matrix::Dense<ValueType>* z2,
+            matrix::Dense<ValueType>* w, const matrix::Dense<ValueType>* p,
             const matrix::Dense<ValueType>* q,
             const matrix::Dense<ValueType>* f,
             const matrix::Dense<ValueType>* g,
@@ -119,21 +117,21 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
     // w = w - tmp * g
     run_kernel_solver(
         exec,
-        [] GKO_KERNEL(auto row, auto col, auto x, auto r, auto z, auto w,
-                      auto p, auto q, auto f, auto g, auto rho, auto beta,
-                      auto stop) {
+        [] GKO_KERNEL(auto row, auto col, auto x, auto r, auto z1, auto z2,
+                      auto w, auto p, auto q, auto f, auto g, auto rho,
+                      auto beta, auto stop) {
             if (!stop[col].has_stopped()) {
                 auto tmp = safe_divide(rho[col], beta[col]);
                 x(row, col) += tmp * p(row, col);
                 r(row, col) -= tmp * q(row, col);
-                z(row, col) -= tmp * f(row, col);
+                z1(row, col) -= tmp * f(row, col);
+                z2(row, col) = z1(row, col);
                 w(row, col) -= tmp * g(row, col);
             }
         },
-        x->get_size(), r->get_stride(), x, default_stride(r), default_stride(z),
-        default_stride(w), default_stride(p), default_stride(q),
-        default_stride(f), default_stride(g), row_vector(rho), row_vector(beta),
-        *stop_status);
+        x->get_size(), x->get_stride(), default_stride(x), r, z1, z2, w,
+        default_stride(p), default_stride(q), default_stride(f),
+        default_stride(g), row_vector(rho), row_vector(beta), *stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_STEP_1_KERNEL);
@@ -179,10 +177,9 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
             }
         },
         p->get_size(), p->get_stride(), row_vector(beta), default_stride(p),
-        default_stride(q), default_stride(f), default_stride(g),
-        default_stride(z), default_stride(w), default_stride(m),
-        default_stride(n), row_vector(prev_rho), row_vector(rho),
-        row_vector(delta), *stop_status);
+        default_stride(q), default_stride(f), default_stride(g), z, w,
+        default_stride(m), default_stride(n), row_vector(prev_rho),
+        row_vector(rho), row_vector(delta), *stop_status);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_STEP_2_KERNEL);

diff --git a/core/solver/pipe_cg.cpp b/core/solver/pipe_cg.cpp
@@ -18,6 +18,7 @@
 #include "core/distributed/helpers.hpp"
 #include "core/solver/pipe_cg_kernels.hpp"
 #include "core/solver/solver_boilerplate.hpp"
+#include "ginkgo/core/base/range.hpp"
 
 
 namespace gko {
@@ -102,26 +103,75 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
     auto exec = this->get_executor();
     this->setup_workspace();
 
-    GKO_SOLVER_VECTOR(r, dense_b);
-    GKO_SOLVER_VECTOR(z, dense_b);
+    // we combine the two vectors r and w, formerly created with
+    // GKO_SOLVER_VECTOR(r, dense_b);
+    // GKO_SOLVER_VECTOR(w, dense_b);
+    // into rw so that both dot products become one; r and w are views into it
+    auto b_stride = dense_b->get_stride();
+
+    auto local_original_size = ::gko::detail::get_local(dense_b)->get_size();
+    auto global_original_size = dense_b->get_size();
+    dim<2> local_conjoined_size = {local_original_size[0], b_stride * 2};
+    dim<2> global_conjoined_size = {global_original_size[0], b_stride * 2};
+
+    VectorType* rw =
+        this->template create_workspace_op_with_type_of<VectorType>(
+            GKO_SOLVER_TRAITS::rw, dense_b, global_conjoined_size,
+            local_conjoined_size);
+    auto r_unique = rw->create_submatrix(local_span{0, local_original_size[0]},
+                                         local_span{0, local_original_size[1]},
+                                         global_original_size);
+    auto* r = r_unique.get();
+    auto w_unique = rw->create_submatrix(
+        local_span{0, local_original_size[0]},
+        local_span{b_stride, b_stride + local_original_size[1]},
+        global_original_size);
+    auto* w = w_unique.get();
+
+    // z holds two identical halves, z1 and z2, laid out like rw so that one
+    // dot product of rw with z yields both dot(r, z1) and dot(w, z2)
+    GKO_SOLVER_VECTOR(z, rw);
+    auto z1_unique = z->create_submatrix(local_span{0, local_original_size[0]},
+                                         local_span{0, local_original_size[1]},
+                                         global_original_size);
+    auto* z1 = z1_unique.get();
+    auto z2_unique = z->create_submatrix(
+        local_span{0, local_original_size[0]},
+        local_span{b_stride, b_stride + local_original_size[1]},
+        global_original_size);
+    auto* z2 = z2_unique.get();
+
     GKO_SOLVER_VECTOR(p, dense_b);
-    GKO_SOLVER_VECTOR(w, dense_b);
     GKO_SOLVER_VECTOR(m, dense_b);
     GKO_SOLVER_VECTOR(n, dense_b);
     GKO_SOLVER_VECTOR(q, dense_b);
     GKO_SOLVER_VECTOR(f, dense_b);
     GKO_SOLVER_VECTOR(g, dense_b);
 
+    // rho and delta become combined as well
+    GKO_SOLVER_SCALAR(rhodelta, rw);
+    auto rho_unique = rhodelta->create_submatrix(
+        local_span{0, 1}, local_span{0, local_original_size[1]},
+        dim<2>{1, global_original_size[1]});
+    auto* rho = rho_unique.get();
+    auto delta_unique = rhodelta->create_submatrix(
+        local_span{0, 1},
+        local_span{b_stride, b_stride + local_original_size[1]},
+        dim<2>{1, global_original_size[1]});
+    auto* delta = delta_unique.get();
+
     GKO_SOLVER_SCALAR(beta, dense_b);
-    GKO_SOLVER_SCALAR(delta, dense_b);
     GKO_SOLVER_SCALAR(prev_rho, dense_b);
-    GKO_SOLVER_SCALAR(rho, dense_b);
 
     GKO_SOLVER_ONE_MINUS_ONE();
 
     bool one_changed{};
 
-    GKO_SOLVER_STOP_REDUCTION_ARRAYS();
+    // needs to match the size of the combined rhodelta
+    auto& stop_status = this->template create_workspace_array<stopping_status>(
+        GKO_SOLVER_TRAITS::stop, global_original_size[1]);
+    auto& reduction_tmp = this->template create_workspace_array<char>(
+        GKO_SOLVER_TRAITS::tmp);
 
     // r = b
     // prev_rho = 1.0
@@ -131,18 +181,19 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
     // r = r - Ax
     this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r);
     // z = preconditioner * r
-    this->get_preconditioner()->apply(r, z);
+    this->get_preconditioner()->apply(r, z1);
+    // z2 = z1
+    z2->copy_from(z1);
     // w = A * z
-    this->get_system_matrix()->apply(z, w);
+    this->get_system_matrix()->apply(z1, w);
     // m = preconditioner * w
     this->get_preconditioner()->apply(w, m);
     // n = A * m
     this->get_system_matrix()->apply(m, n);
-    // TODO: merge these two dot products:
-    // rho = dot(r, z)
-    r->compute_conj_dot(z, rho, reduction_tmp);
-    // delta = dot(w, z)
-    w->compute_conj_dot(z, delta, reduction_tmp);
+    // merged dot products
+    // rho = dot(r, z1)
+    // delta = dot(w, z2)
+    rw->compute_conj_dot(z, rhodelta, reduction_tmp);
 
     // check for an early termination
     auto stop_criterion = this->get_stop_criterion_factory()->generate(
@@ -171,7 +222,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
     exec->run(pipe_cg::make_initialize_2(
         gko::detail::get_local(p), gko::detail::get_local(q),
         gko::detail::get_local(f), gko::detail::get_local(g), beta,
-        gko::detail::get_local(z), gko::detail::get_local(w),
+        gko::detail::get_local(z1), gko::detail::get_local(w),
         gko::detail::get_local(m), gko::detail::get_local(n), delta));
 
     /* Memory movement summary:
@@ -183,23 +234,25 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
         // r = r - tmp * q
         // z = z - tmp * f
         // w = w - tmp * g
+        // this is the only place where z is updated, so both z1 and z2 are
+        // updated here
         exec->run(pipe_cg::make_step_1(
             gko::detail::get_local(dense_x), gko::detail::get_local(r),
-            gko::detail::get_local(z), gko::detail::get_local(w),
-            gko::detail::get_local(p), gko::detail::get_local(q),
-            gko::detail::get_local(f), gko::detail::get_local(g), rho, beta,
-            &stop_status));
+            gko::detail::get_local(z1), gko::detail::get_local(z2),
+            gko::detail::get_local(w), gko::detail::get_local(p),
+            gko::detail::get_local(q), gko::detail::get_local(f),
+            gko::detail::get_local(g), rho, beta, &stop_status));
+
         // m = preconditioner * w
         this->get_preconditioner()->apply(w, m);
         // n = A * m
         this->get_system_matrix()->apply(m, n);
         // prev_rho = rho
-        swap(prev_rho, rho);
-        // TODO: merge these two dot products:
-        // rho = dot(r, z)
-        r->compute_conj_dot(z, rho, reduction_tmp);
-        // delta = dot(w, z)
-        w->compute_conj_dot(z, delta, reduction_tmp);
+        prev_rho->copy_from(rho);
+        // merged dot products
+        // rho = dot(r, z1)
+        // delta = dot(w, z2)
+        rw->compute_conj_dot(z, rhodelta, reduction_tmp);
         // check
         ++iter;
         bool all_stopped =
@@ -215,6 +268,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
         if (all_stopped) {
             break;
         }
+
         // tmp = rho / prev_rho
         // beta = delta - |tmp|^2 * beta
         // p = z + tmp * p
@@ -224,7 +278,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
         exec->run(pipe_cg::make_step_2(
             beta, gko::detail::get_local(p), gko::detail::get_local(q),
             gko::detail::get_local(f), gko::detail::get_local(g),
-            gko::detail::get_local(z), gko::detail::get_local(w),
+            gko::detail::get_local(z1), gko::detail::get_local(w),
             gko::detail::get_local(m), gko::detail::get_local(n), prev_rho, rho,
             delta, &stop_status));
     }
@@ -259,7 +313,7 @@ int workspace_traits<PipeCg<ValueType>>::num_arrays(const Solver&)
 template <typename ValueType>
 int workspace_traits<PipeCg<ValueType>>::num_vectors(const Solver&)
 {
-    return 15;
+    return 13;
 }
 
 
@@ -268,8 +322,8 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::op_names(
     const Solver&)
 {
     return {
-        "r", "z",    "p",     "w",        "m",   "n",   "q",         "f",
-        "g", "beta", "delta", "prev_rho", "rho", "one", "minus_one",
+        "rw", "z",    "p",        "m",        "n",   "q",         "f",
+        "g",  "beta", "rhodelta", "prev_rho", "one", "minus_one",
     };
 }
 
@@ -285,14 +339,14 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::array_names(
 template <typename ValueType>
 std::vector<int> workspace_traits<PipeCg<ValueType>>::scalars(const Solver&)
 {
-    return {beta, delta, prev_rho, rho};
+    return {beta, rhodelta, prev_rho};
 }
 
 
 template <typename ValueType>
 std::vector<int> workspace_traits<PipeCg<ValueType>>::vectors(const Solver&)
 {
-    return {r, z, p, w, m, n, q, f, g};
+    return {rw, z, p, m, n, q, f, g};
 }
 
 

diff --git a/core/solver/pipe_cg_kernels.hpp b/core/solver/pipe_cg_kernels.hpp
@@ -39,14 +39,14 @@ namespace pipe_cg {
 
 
 #define GKO_DECLARE_PIPE_CG_STEP_1_KERNEL(_type)                              \
-    void step_1(std::shared_ptr<const DefaultExecutor> exec,                  \
-                matrix::Dense<_type>* x, matrix::Dense<_type>* r,             \
-                matrix::Dense<_type>* z, matrix::Dense<_type>* w,             \
-                const matrix::Dense<_type>* p, const matrix::Dense<_type>* q, \
-                const matrix::Dense<_type>* f, const matrix::Dense<_type>* g, \
-                const matrix::Dense<_type>* rho,                              \
-                const matrix::Dense<_type>* beta,                             \
-                const array<stopping_status>* stop_status)
+    void step_1(                                                              \
+        std::shared_ptr<const DefaultExecutor> exec, matrix::Dense<_type>* x, \
+        matrix::Dense<_type>* r, matrix::Dense<_type>* z1,                    \
+        matrix::Dense<_type>* z2, matrix::Dense<_type>* w,                    \
+        const matrix::Dense<_type>* p, const matrix::Dense<_type>* q,         \
+        const matrix::Dense<_type>* f, const matrix::Dense<_type>* g,         \
+        const matrix::Dense<_type>* rho, const matrix::Dense<_type>* beta,    \
+        const array<stopping_status>* stop_status)
 
 
 #define GKO_DECLARE_PIPE_CG_STEP_2_KERNEL(_type)                             \

diff --git a/include/ginkgo/core/solver/pipe_cg.hpp b/include/ginkgo/core/solver/pipe_cg.hpp
@@ -147,36 +147,32 @@ struct workspace_traits<PipeCg<ValueType>> {
     // array containing all varying vectors (dependent on problem size)
     static std::vector<int> vectors(const Solver&);
 
-    // residual vector
-    constexpr static int r = 0;
+    // joint (residual vector | w vector)
+    constexpr static int rw = 0;
     // preconditioned residual vector
     constexpr static int z = 1;
     // p vector
     constexpr static int p = 2;
-    // w vector
-    constexpr static int w = 3;
     // m vector
-    constexpr static int m = 4;
+    constexpr static int m = 3;
     // n vector
-    constexpr static int n = 5;
+    constexpr static int n = 4;
     // q vector
-    constexpr static int q = 6;
+    constexpr static int q = 5;
     // f vector
-    constexpr static int f = 7;
+    constexpr static int f = 6;
     // g vector
-    constexpr static int g = 8;
+    constexpr static int g = 7;
     // beta scalar
-    constexpr static int beta = 9;
-    // delta scalar
-    constexpr static int delta = 10;
+    constexpr static int beta = 8;
+    // (rho|delta) joint scalar
+    constexpr static int rhodelta = 9;
     // previous rho scalar
-    constexpr static int prev_rho = 11;
-    // current rho scalar
-    constexpr static int rho = 12;
+    constexpr static int prev_rho = 10;
     // constant 1.0 scalar
-    constexpr static int one = 13;
+    constexpr static int one = 11;
     // constant -1.0 scalar
-    constexpr static int minus_one = 14;
+    constexpr static int minus_one = 12;
 
     // stopping status array
     constexpr static int stop = 0;

diff --git a/reference/solver/pipe_cg_kernels.cpp b/reference/solver/pipe_cg_kernels.cpp
@@ -77,8 +77,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_INITIALIZE_2_KERNEL);
 template <typename ValueType>
 void step_1(std::shared_ptr<const ReferenceExecutor> exec,
             matrix::Dense<ValueType>* x, matrix::Dense<ValueType>* r,
-            matrix::Dense<ValueType>* z, matrix::Dense<ValueType>* w,
-            const matrix::Dense<ValueType>* p,
+            matrix::Dense<ValueType>* z1, matrix::Dense<ValueType>* z2,
+            matrix::Dense<ValueType>* w, const matrix::Dense<ValueType>* p,
             const matrix::Dense<ValueType>* q,
             const matrix::Dense<ValueType>* f,
             const matrix::Dense<ValueType>* g,
@@ -100,7 +100,8 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
                 auto tmp = rho->at(j) / beta->at(j);
                 x->at(i, j) += tmp * p->at(i, j);
                 r->at(i, j) -= tmp * q->at(i, j);
-                z->at(i, j) -= tmp * f->at(i, j);
+                z1->at(i, j) -= tmp * f->at(i, j);
+                z2->at(i, j) -= tmp * f->at(i, j);
                 w->at(i, j) -= tmp * g->at(i, j);
             }
         }