-
Notifications
You must be signed in to change notification settings - Fork 99
Merge dot products in PIPECG #1908
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
d0c8f13
93869f2
a756cc6
f7a2001
c74bbba
e0a5644
8b19a53
2487dea
164b999
f7935f0
262cc7e
0a6052e
c510304
ea9fc04
9cebf65
5adbaf7
734b3fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,7 @@ | |
#include "core/distributed/helpers.hpp" | ||
#include "core/solver/pipe_cg_kernels.hpp" | ||
#include "core/solver/solver_boilerplate.hpp" | ||
#include "ginkgo/core/base/range.hpp" | ||
|
||
|
||
namespace gko { | ||
|
@@ -102,26 +103,75 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
auto exec = this->get_executor(); | ||
this->setup_workspace(); | ||
|
||
GKO_SOLVER_VECTOR(r, dense_b); | ||
GKO_SOLVER_VECTOR(z, dense_b); | ||
// we combine the two vectors r and w, formerly created with | ||
// GKO_SOLVER_VECTOR(r, dense_b); | ||
// GKO_SOLVER_VECTOR(w, dense_b); | ||
// into rw that we later slice for efficient dot product computation | ||
auto b_stride = dense_b->get_stride(); | ||
|
||
auto local_original_size = ::gko::detail::get_local(dense_b)->get_size(); | ||
auto global_original_size = dense_b->get_size(); | ||
dim<2> local_conjoined_size = {local_original_size[0], b_stride * 2}; | ||
dim<2> global_conjoined_size = {global_original_size[0], b_stride * 2}; | ||
|
||
VectorType* rw = | ||
this->template create_workspace_op_with_type_of<VectorType>( | ||
GKO_SOLVER_TRAITS::rw, dense_b, global_conjoined_size, | ||
local_conjoined_size); | ||
auto r_unique = rw->create_submatrix(local_span{0, local_original_size[0]}, | ||
local_span{0, local_original_size[1]}, | ||
global_original_size); | ||
auto* r = r_unique.get(); | ||
auto w_unique = rw->create_submatrix( | ||
local_span{0, local_original_size[0]}, | ||
local_span{b_stride, b_stride + local_original_size[1]}, | ||
global_original_size); | ||
auto* w = w_unique.get(); | ||
|
||
// z now consists of two identical repeating parts: z1 and z2, again, for | ||
// the same reason | ||
GKO_SOLVER_VECTOR(z, rw); | ||
auto z1_unique = z->create_submatrix(local_span{0, local_original_size[0]}, | ||
local_span{0, local_original_size[1]}, | ||
global_original_size); | ||
auto* z1 = z1_unique.get(); | ||
auto z2_unique = z->create_submatrix( | ||
local_span{0, local_original_size[0]}, | ||
local_span{b_stride, b_stride + local_original_size[1]}, | ||
global_original_size); | ||
auto* z2 = z2_unique.get(); | ||
|
||
GKO_SOLVER_VECTOR(p, dense_b); | ||
GKO_SOLVER_VECTOR(w, dense_b); | ||
GKO_SOLVER_VECTOR(m, dense_b); | ||
GKO_SOLVER_VECTOR(n, dense_b); | ||
GKO_SOLVER_VECTOR(q, dense_b); | ||
GKO_SOLVER_VECTOR(f, dense_b); | ||
GKO_SOLVER_VECTOR(g, dense_b); | ||
|
||
// rho and delta become combined as well | ||
GKO_SOLVER_SCALAR(rhodelta, rw); | ||
auto rho_unique = rhodelta->create_submatrix( | ||
local_span{0, 1}, local_span{0, local_original_size[1]}, | ||
dim<2>{1, global_original_size[1]}); | ||
auto* rho = rho_unique.get(); | ||
auto delta_unique = rhodelta->create_submatrix( | ||
local_span{0, 1}, | ||
local_span{b_stride, b_stride + local_original_size[1]}, | ||
dim<2>{1, global_original_size[1]}); | ||
auto* delta = delta_unique.get(); | ||
|
||
GKO_SOLVER_SCALAR(beta, dense_b); | ||
GKO_SOLVER_SCALAR(delta, dense_b); | ||
GKO_SOLVER_SCALAR(prev_rho, dense_b); | ||
GKO_SOLVER_SCALAR(rho, dense_b); | ||
|
||
GKO_SOLVER_ONE_MINUS_ONE(); | ||
|
||
bool one_changed{}; | ||
|
||
GKO_SOLVER_STOP_REDUCTION_ARRAYS(); | ||
// needs to match the size of the combined rhodelta | ||
auto& stop_status = this->template create_workspace_array<stopping_status>( | ||
GKO_SOLVER_TRAITS::stop, global_original_size[1]); | ||
auto& reduction_tmp = | ||
this->template create_workspace_array<char>(GKO_SOLVER_TRAITS::tmp); | ||
|
||
// r = b | ||
// prev_rho = 1.0 | ||
|
@@ -131,18 +181,19 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
// r = r - Ax | ||
this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); | ||
// z = preconditioner * r | ||
this->get_preconditioner()->apply(r, z); | ||
this->get_preconditioner()->apply(r, z1); | ||
// z2 = z1 | ||
z2->copy_from(z1); | ||
// w = A * z | ||
this->get_system_matrix()->apply(z, w); | ||
this->get_system_matrix()->apply(z1, w); | ||
// m = preconditioner * w | ||
this->get_preconditioner()->apply(w, m); | ||
// n = A * m | ||
this->get_system_matrix()->apply(m, n); | ||
// TODO: merge these two dot products: | ||
// rho = dot(r, z) | ||
r->compute_conj_dot(z, rho, reduction_tmp); | ||
// delta = dot(w, z) | ||
w->compute_conj_dot(z, delta, reduction_tmp); | ||
// merged dot products | ||
// rho = dot(r, z1) | ||
// delta = dot(w, z2) | ||
rw->compute_conj_dot(z, rhodelta, reduction_tmp); | ||
|
||
// check for an early termination | ||
auto stop_criterion = this->get_stop_criterion_factory()->generate( | ||
|
@@ -171,7 +222,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
exec->run(pipe_cg::make_initialize_2( | ||
gko::detail::get_local(p), gko::detail::get_local(q), | ||
gko::detail::get_local(f), gko::detail::get_local(g), beta, | ||
gko::detail::get_local(z), gko::detail::get_local(w), | ||
gko::detail::get_local(z1), gko::detail::get_local(w), | ||
gko::detail::get_local(m), gko::detail::get_local(n), delta)); | ||
|
||
/* Memory movement summary: | ||
|
@@ -183,23 +234,25 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
// r = r - tmp * q | ||
// z = z - tmp * f | ||
// w = w - tmp * g | ||
// this is the only place where z is updated, so we update both z1 and z2 | ||
// here | ||
exec->run(pipe_cg::make_step_1( | ||
gko::detail::get_local(dense_x), gko::detail::get_local(r), | ||
gko::detail::get_local(z), gko::detail::get_local(w), | ||
gko::detail::get_local(p), gko::detail::get_local(q), | ||
gko::detail::get_local(f), gko::detail::get_local(g), rho, beta, | ||
&stop_status)); | ||
gko::detail::get_local(z1), gko::detail::get_local(z2), | ||
gko::detail::get_local(w), gko::detail::get_local(p), | ||
gko::detail::get_local(q), gko::detail::get_local(f), | ||
gko::detail::get_local(g), rho, beta, &stop_status)); | ||
|
||
// m = preconditioner * w | ||
this->get_preconditioner()->apply(w, m); | ||
// n = A * m | ||
this->get_system_matrix()->apply(m, n); | ||
// prev_rho = rho | ||
swap(prev_rho, rho); | ||
// TODO: merge these two dot products: | ||
// rho = dot(r, z) | ||
r->compute_conj_dot(z, rho, reduction_tmp); | ||
// delta = dot(w, z) | ||
w->compute_conj_dot(z, delta, reduction_tmp); | ||
prev_rho->copy_from(rho); | ||
// merged dot products | ||
// rho = dot(r, z1) | ||
// delta = dot(w, z2) | ||
rw->compute_conj_dot(z, rhodelta, reduction_tmp); | ||
// check | ||
++iter; | ||
bool all_stopped = | ||
|
@@ -215,6 +268,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
if (all_stopped) { | ||
break; | ||
} | ||
|
||
// tmp = rho / prev_rho | ||
// beta = delta - |tmp|^2 * beta | ||
// p = z + tmp * p | ||
|
@@ -224,7 +278,7 @@ void PipeCg<ValueType>::apply_dense_impl(const VectorType* dense_b, | |
exec->run(pipe_cg::make_step_2( | ||
beta, gko::detail::get_local(p), gko::detail::get_local(q), | ||
gko::detail::get_local(f), gko::detail::get_local(g), | ||
gko::detail::get_local(z), gko::detail::get_local(w), | ||
gko::detail::get_local(z1), gko::detail::get_local(w), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Note that z1 is accessed with a stride, which might lower performance. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you think there's a way to avoid this? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Currently not with this storage layout. You would need to change how z is stored, which would also require changing the others to match. |
||
gko::detail::get_local(m), gko::detail::get_local(n), prev_rho, rho, | ||
delta, &stop_status)); | ||
} | ||
|
@@ -259,7 +313,7 @@ int workspace_traits<PipeCg<ValueType>>::num_arrays(const Solver&) | |
template <typename ValueType> | ||
int workspace_traits<PipeCg<ValueType>>::num_vectors(const Solver&) | ||
{ | ||
return 15; | ||
return 13; | ||
} | ||
|
||
|
||
|
@@ -268,8 +322,8 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::op_names( | |
const Solver&) | ||
{ | ||
return { | ||
"r", "z", "p", "w", "m", "n", "q", "f", | ||
"g", "beta", "delta", "prev_rho", "rho", "one", "minus_one", | ||
"rw", "z", "p", "m", "n", "q", "f", | ||
"g", "beta", "rhodelta", "prev_rho", "one", "minus_one", | ||
}; | ||
} | ||
|
||
|
@@ -285,14 +339,14 @@ std::vector<std::string> workspace_traits<PipeCg<ValueType>>::array_names( | |
template <typename ValueType> | ||
std::vector<int> workspace_traits<PipeCg<ValueType>>::scalars(const Solver&) | ||
{ | ||
return {beta, delta, prev_rho, rho}; | ||
return {beta, rhodelta, prev_rho}; | ||
} | ||
|
||
|
||
template <typename ValueType> | ||
std::vector<int> workspace_traits<PipeCg<ValueType>>::vectors(const Solver&) | ||
{ | ||
return {r, z, p, w, m, n, q, f, g}; | ||
return {rw, z, p, m, n, q, f, g}; | ||
} | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need to use the stride from b?
The rw vectors actually have a stride of 2 * b_stride, not b_stride.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, b_stride is used, say, here:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
we multiply by 2 when needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I know where b_stride comes from.
My question is rather whether to use b_stride or the number of vectors (#vectors) as the stride.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As Pratik told me today, using b_stride is fine because of padding, and it is probably equal to #vectors most of the time. I used b_stride because dense_b is a vector, so mathematically it made sense to me to use the dimensions of the original vector as the reference for a size variable used to create other vectors.