Skip to content
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmark/solver/distributed/solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ int main(int argc, char* argv[])
print_general_information(extra_information, exec);
}

std::set<std::string> supported_solvers = {"cg", "fcg", "cgs", "bicgstab",
"gmres"};
std::set<std::string> supported_solvers = {"cg", "fcg", "cgs",
"bicgstab", "gmres", "pipe_cg"};
auto solvers = split(FLAGS_solvers, ',');
for (const auto& solver : solvers) {
if (supported_solvers.find(solver) == supported_solvers.end()) {
Expand Down
35 changes: 16 additions & 19 deletions common/unified/solver/pipe_cg_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ void initialize_1(std::shared_ptr<const DefaultExecutor> exec,
}
r(row, col) = b(row, col);
},
b->get_size(), b->get_stride(), b, default_stride(r),
row_vector(prev_rho), *stop_status);
b->get_size(), b->get_stride(), b, r, row_vector(prev_rho),
*stop_status);
} else {
run_kernel(
exec,
Expand Down Expand Up @@ -83,9 +83,7 @@ void initialize_2(std::shared_ptr<const DefaultExecutor> exec,
f(row, col) = m(row, col);
g(row, col) = n(row, col);
},
p->get_size(), p->get_stride(), default_stride(p),
default_stride(q), default_stride(f), default_stride(g),
row_vector(beta), default_stride(z), default_stride(w),
p->get_size(), p->get_stride(), p, q, f, g, row_vector(beta), z, w,
default_stride(m), default_stride(n), row_vector(delta));
} else {
run_kernel(
Expand All @@ -103,8 +101,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_INITIALIZE_2_KERNEL);
template <typename ValueType>
void step_1(std::shared_ptr<const DefaultExecutor> exec,
matrix::Dense<ValueType>* x, matrix::Dense<ValueType>* r,
matrix::Dense<ValueType>* z, matrix::Dense<ValueType>* w,
const matrix::Dense<ValueType>* p,
matrix::Dense<ValueType>* z1, matrix::Dense<ValueType>* z2,
matrix::Dense<ValueType>* w, const matrix::Dense<ValueType>* p,
const matrix::Dense<ValueType>* q,
const matrix::Dense<ValueType>* f,
const matrix::Dense<ValueType>* g,
Expand All @@ -119,21 +117,21 @@ void step_1(std::shared_ptr<const DefaultExecutor> exec,
// w = w - tmp * g
run_kernel_solver(
exec,
[] GKO_KERNEL(auto row, auto col, auto x, auto r, auto z, auto w,
auto p, auto q, auto f, auto g, auto rho, auto beta,
auto stop) {
[] GKO_KERNEL(auto row, auto col, auto x, auto r, auto z1, auto z2,
auto w, auto p, auto q, auto f, auto g, auto rho,
auto beta, auto stop) {
if (!stop[col].has_stopped()) {
auto tmp = safe_divide(rho[col], beta[col]);
x(row, col) += tmp * p(row, col);
r(row, col) -= tmp * q(row, col);
z(row, col) -= tmp * f(row, col);
z1(row, col) -= tmp * f(row, col);
z2(row, col) = z1(row, col);
w(row, col) -= tmp * g(row, col);
}
},
x->get_size(), r->get_stride(), x, default_stride(r), default_stride(z),
default_stride(w), default_stride(p), default_stride(q),
default_stride(f), default_stride(g), row_vector(rho), row_vector(beta),
*stop_status);
x->get_size(), x->get_stride(), default_stride(x), r, z1, z2, w,
default_stride(p), default_stride(q), default_stride(f),
default_stride(g), row_vector(rho), row_vector(beta), *stop_status);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_STEP_1_KERNEL);
Expand Down Expand Up @@ -179,10 +177,9 @@ void step_2(std::shared_ptr<const DefaultExecutor> exec,
}
},
p->get_size(), p->get_stride(), row_vector(beta), default_stride(p),
default_stride(q), default_stride(f), default_stride(g),
default_stride(z), default_stride(w), default_stride(m),
default_stride(n), row_vector(prev_rho), row_vector(rho),
row_vector(delta), *stop_status);
default_stride(q), default_stride(f), default_stride(g), z, w,
default_stride(m), default_stride(n), row_vector(prev_rho),
row_vector(rho), row_vector(delta), *stop_status);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_STEP_2_KERNEL);
Expand Down
114 changes: 84 additions & 30 deletions core/solver/pipe_cg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "core/distributed/helpers.hpp"
#include "core/solver/pipe_cg_kernels.hpp"
#include "core/solver/solver_boilerplate.hpp"
#include "ginkgo/core/base/range.hpp"


namespace gko {
Expand Down Expand Up @@ -102,26 +103,75 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
auto exec = this->get_executor();
this->setup_workspace();

GKO_SOLVER_VECTOR(r, dense_b);
GKO_SOLVER_VECTOR(z, dense_b);
// we combine the two vectors r and w, formerly created with
// GKO_SOLVER_VECTOR(r, dense_b);
// GKO_SOLVER_VECTOR(w, dense_b);
// into rw that we later slice for efficient dot product computation
auto b_stride = dense_b->get_stride();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you need to use the stride from b?
the rw vectors actually have 2 * b_stride not b_stride.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, b_stride is used, say, here:

auto* r = r_unique.get();
    auto w_unique = rw->create_submatrix(
        local_span{0, local_original_size[0]},
        local_span{b_stride, b_stride + local_original_size[1]},
        global_original_size);
    auto* w = w_unique.get();

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we multiply by 2 when needed

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I know where b_stride is from.
My question is more on whether to use b_stride or use the #vectors as stride.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as Pratik told me today, the b_stride is fine bc of padding and it's probably equivalent to #vectors most of the time. I used b_stride because dense_b is a vector so mathematically it made sense to me to use the dimensions of the original vector as reference for a size variable used to create other vectors


auto local_original_size = ::gko::detail::get_local(dense_b)->get_size();
auto global_original_size = dense_b->get_size();
dim<2> local_conjoined_size = {local_original_size[0], b_stride * 2};
dim<2> global_conjoined_size = {global_original_size[0], b_stride * 2};

VectorType* rw =
this->template create_workspace_op_with_type_of<VectorType>(
GKO_SOLVER_TRAITS::rw, dense_b, global_conjoined_size,
local_conjoined_size);
auto r_unique = rw->create_submatrix(local_span{0, local_original_size[0]},
local_span{0, local_original_size[1]},
global_original_size);
auto* r = r_unique.get();
auto w_unique = rw->create_submatrix(
local_span{0, local_original_size[0]},
local_span{b_stride, b_stride + local_original_size[1]},
global_original_size);
auto* w = w_unique.get();

// z now consists of two identical repeating parts: z1 and z2, again, for
// the same reason
GKO_SOLVER_VECTOR(z, rw);
auto z1_unique = z->create_submatrix(local_span{0, local_original_size[0]},
local_span{0, local_original_size[1]},
global_original_size);
auto* z1 = z1_unique.get();
auto z2_unique = z->create_submatrix(
local_span{0, local_original_size[0]},
local_span{b_stride, b_stride + local_original_size[1]},
global_original_size);
auto* z2 = z2_unique.get();

GKO_SOLVER_VECTOR(p, dense_b);
GKO_SOLVER_VECTOR(w, dense_b);
GKO_SOLVER_VECTOR(m, dense_b);
GKO_SOLVER_VECTOR(n, dense_b);
GKO_SOLVER_VECTOR(q, dense_b);
GKO_SOLVER_VECTOR(f, dense_b);
GKO_SOLVER_VECTOR(g, dense_b);

// rho and delta become combined as well
GKO_SOLVER_SCALAR(rhodelta, rw);
auto rho_unique = rhodelta->create_submatrix(
local_span{0, 1}, local_span{0, local_original_size[1]},
dim<2>{1, global_original_size[1]});
auto* rho = rho_unique.get();
auto delta_unique = rhodelta->create_submatrix(
local_span{0, 1},
local_span{b_stride, b_stride + local_original_size[1]},
dim<2>{1, global_original_size[1]});
auto* delta = delta_unique.get();

GKO_SOLVER_SCALAR(beta, dense_b);
GKO_SOLVER_SCALAR(delta, dense_b);
GKO_SOLVER_SCALAR(prev_rho, dense_b);
GKO_SOLVER_SCALAR(rho, dense_b);

GKO_SOLVER_ONE_MINUS_ONE();

bool one_changed{};

GKO_SOLVER_STOP_REDUCTION_ARRAYS();
// needs to match the size of the combined rhodelta
auto& stop_status = this->template create_workspace_array<stopping_status>(
GKO_SOLVER_TRAITS::stop, global_original_size[1]);
auto& reduction_tmp = this->template create_workspace_array<char>(
GKO_SOLVER_TRAITS::tmp, 2 * global_original_size[1]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto& reduction_tmp = this->template create_workspace_array<char>(
GKO_SOLVER_TRAITS::tmp, 2 * global_original_size[1]);
auto& reduction_tmp = this->template create_workspace_array<char>(
GKO_SOLVER_TRAITS::tmp);

reduction tmp will need to go through the reduction step to know proper size, so we do not need to initialize a size for that


// r = b
// prev_rho = 1.0
Expand All @@ -131,18 +181,19 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
// r = r - Ax
this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r);
// z = preconditioner * r
this->get_preconditioner()->apply(r, z);
this->get_preconditioner()->apply(r, z1);
// z2 = z1
z2->copy_from(z1);
// w = A * z
this->get_system_matrix()->apply(z, w);
this->get_system_matrix()->apply(z1, w);
// m = preconditioner * w
this->get_preconditioner()->apply(w, m);
// n = A * m
this->get_system_matrix()->apply(m, n);
// TODO: merge these two dot products:
// rho = dot(r, z)
r->compute_conj_dot(z, rho, reduction_tmp);
// delta = dot(w, z)
w->compute_conj_dot(z, delta, reduction_tmp);
// merged dot products
// rho = dot(r, z1)
// delta = dot(w, z2)
rw->compute_conj_dot(z, rhodelta, reduction_tmp);

// check for an early termination
auto stop_criterion = this->get_stop_criterion_factory()->generate(
Expand Down Expand Up @@ -171,7 +222,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
exec->run(pipe_cg::make_initialize_2(
gko::detail::get_local(p), gko::detail::get_local(q),
gko::detail::get_local(f), gko::detail::get_local(g), beta,
gko::detail::get_local(z), gko::detail::get_local(w),
gko::detail::get_local(z1), gko::detail::get_local(w),
gko::detail::get_local(m), gko::detail::get_local(n), delta));

/* Memory movement summary:
Expand All @@ -183,23 +234,25 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
// r = r - tmp * q
// z = z - tmp * f
// w = w - tmp * g
// it's the only place where z is updated so we updated both z1 and z2
// here
exec->run(pipe_cg::make_step_1(
gko::detail::get_local(dense_x), gko::detail::get_local(r),
gko::detail::get_local(z), gko::detail::get_local(w),
gko::detail::get_local(p), gko::detail::get_local(q),
gko::detail::get_local(f), gko::detail::get_local(g), rho, beta,
&stop_status));
gko::detail::get_local(z1), gko::detail::get_local(z2),
gko::detail::get_local(w), gko::detail::get_local(p),
gko::detail::get_local(q), gko::detail::get_local(f),
gko::detail::get_local(g), rho, beta, &stop_status));

// m = preconditioner * w
this->get_preconditioner()->apply(w, m);
// n = A * m
this->get_system_matrix()->apply(m, n);
// prev_rho = rho
swap(prev_rho, rho);
// TODO: merge these two dot products:
// rho = dot(r, z)
r->compute_conj_dot(z, rho, reduction_tmp);
// delta = dot(w, z)
w->compute_conj_dot(z, delta, reduction_tmp);
prev_rho->copy_from(rho);
// merged dot products
// rho = dot(r, z1)
// delta = dot(w, z2)
rw->compute_conj_dot(z, rhodelta, reduction_tmp);
// check
++iter;
bool all_stopped =
Expand All @@ -215,6 +268,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
if (all_stopped) {
break;
}

// tmp = rho / prev_rho
// beta = delta - |tmp|^2 * beta
// p = z + tmp * p
Expand All @@ -224,7 +278,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b,
exec->run(pipe_cg::make_step_2(
beta, gko::detail::get_local(p), gko::detail::get_local(q),
gko::detail::get_local(f), gko::detail::get_local(g),
gko::detail::get_local(z), gko::detail::get_local(w),
gko::detail::get_local(z1), gko::detail::get_local(w),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note z1 is strided access which might lower your performance

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you think there's a way to avoid this?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

currently no under this storage. You need change z storage and it will also require change the other to fit the need

gko::detail::get_local(m), gko::detail::get_local(n), prev_rho, rho,
delta, &stop_status));
}
Expand Down Expand Up @@ -259,7 +313,7 @@ int workspace_traits<PipeCg<ValueType>>::num_arrays(const Solver&)
template <typename ValueType>
int workspace_traits<PipeCg<ValueType>>::num_vectors(const Solver&)
{
return 15;
return 13;
}


Expand All @@ -268,8 +322,8 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::op_names(
const Solver&)
{
return {
"r", "z", "p", "w", "m", "n", "q", "f",
"g", "beta", "delta", "prev_rho", "rho", "one", "minus_one",
"rw", "z", "p", "m", "n", "q", "f",
"g", "beta", "rhodelta", "prev_rho", "one", "minus_one",
};
}

Expand All @@ -285,14 +339,14 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::array_names(
template <typename ValueType>
std::vector<int> workspace_traits<PipeCg<ValueType>>::scalars(const Solver&)
{
return {beta, delta, prev_rho, rho};
return {beta, rhodelta, prev_rho};
}


template <typename ValueType>
std::vector<int> workspace_traits<PipeCg<ValueType>>::vectors(const Solver&)
{
return {r, z, p, w, m, n, q, f, g};
return {rw, z, p, m, n, q, f, g};
}


Expand Down
16 changes: 8 additions & 8 deletions core/solver/pipe_cg_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,14 @@ namespace pipe_cg {


#define GKO_DECLARE_PIPE_CG_STEP_1_KERNEL(_type) \
void step_1(std::shared_ptr<const DefaultExecutor> exec, \
matrix::Dense<_type>* x, matrix::Dense<_type>* r, \
matrix::Dense<_type>* z, matrix::Dense<_type>* w, \
const matrix::Dense<_type>* p, const matrix::Dense<_type>* q, \
const matrix::Dense<_type>* f, const matrix::Dense<_type>* g, \
const matrix::Dense<_type>* rho, \
const matrix::Dense<_type>* beta, \
const array<stopping_status>* stop_status)
void step_1( \
std::shared_ptr<const DefaultExecutor> exec, matrix::Dense<_type>* x, \
matrix::Dense<_type>* r, matrix::Dense<_type>* z1, \
matrix::Dense<_type>* z2, matrix::Dense<_type>* w, \
const matrix::Dense<_type>* p, const matrix::Dense<_type>* q, \
const matrix::Dense<_type>* f, const matrix::Dense<_type>* g, \
const matrix::Dense<_type>* rho, const matrix::Dense<_type>* beta, \
const array<stopping_status>* stop_status)


#define GKO_DECLARE_PIPE_CG_STEP_2_KERNEL(_type) \
Expand Down
30 changes: 13 additions & 17 deletions include/ginkgo/core/solver/pipe_cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,36 +147,32 @@ struct workspace_traits<PipeCg<ValueType>> {
// array containing all varying vectors (dependent on problem size)
static std::vector<int> vectors(const Solver&);

// residual vector
constexpr static int r = 0;
// joint (residual vector | w vector)
constexpr static int rw = 0;
// preconditioned residual vector
constexpr static int z = 1;
// p vector
constexpr static int p = 2;
// w vector
constexpr static int w = 3;
// m vector
constexpr static int m = 4;
constexpr static int m = 3;
// n vector
constexpr static int n = 5;
constexpr static int n = 4;
// q vector
constexpr static int q = 6;
constexpr static int q = 5;
// f vector
constexpr static int f = 7;
constexpr static int f = 6;
// g vector
constexpr static int g = 8;
constexpr static int g = 7;
// beta scalar
constexpr static int beta = 9;
// delta scalar
constexpr static int delta = 10;
constexpr static int beta = 8;
// (rho|delta) joint scalar
constexpr static int rhodelta = 9;
// previous rho scalar
constexpr static int prev_rho = 11;
// current rho scalar
constexpr static int rho = 12;
constexpr static int prev_rho = 10;
// constant 1.0 scalar
constexpr static int one = 13;
constexpr static int one = 11;
// constant -1.0 scalar
constexpr static int minus_one = 14;
constexpr static int minus_one = 12;

// stopping status array
constexpr static int stop = 0;
Expand Down
7 changes: 4 additions & 3 deletions reference/solver/pipe_cg_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_PIPE_CG_INITIALIZE_2_KERNEL);
template <typename ValueType>
void step_1(std::shared_ptr<const ReferenceExecutor> exec,
matrix::Dense<ValueType>* x, matrix::Dense<ValueType>* r,
matrix::Dense<ValueType>* z, matrix::Dense<ValueType>* w,
const matrix::Dense<ValueType>* p,
matrix::Dense<ValueType>* z1, matrix::Dense<ValueType>* z2,
matrix::Dense<ValueType>* w, const matrix::Dense<ValueType>* p,
const matrix::Dense<ValueType>* q,
const matrix::Dense<ValueType>* f,
const matrix::Dense<ValueType>* g,
Expand All @@ -100,7 +100,8 @@ void step_1(std::shared_ptr<const ReferenceExecutor> exec,
auto tmp = rho->at(j) / beta->at(j);
x->at(i, j) += tmp * p->at(i, j);
r->at(i, j) -= tmp * q->at(i, j);
z->at(i, j) -= tmp * f->at(i, j);
z1->at(i, j) -= tmp * f->at(i, j);
z2->at(i, j) -= tmp * f->at(i, j);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

because z2 is always sync with z1, z2 = z1 should be better

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it still use z2 -= tmp * f

w->at(i, j) -= tmp * g->at(i, j);
}
}
Expand Down
Loading
Loading