Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions contributors.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ Nguyen Phuong <phuong.nguyen@icl.utk.edu> University of Tennessee, Knoxville
Olenik Gregor <go@hpsim.de> HPSim
Ribizel Tobias <mail@upsj.de> Karlsruhe Institute of Technology
Riemer Lukas <lksriemer@gmail.com> Karlsruhe Institute of Technology
Luka Stanisic <luka.stanisic@huawei.com> Huawei Technologies Duesseldorf GmbH
Tsai Yuhsiang <yhmtsai@gmail.com> National Taiwan University
193 changes: 166 additions & 27 deletions omp/matrix/csr_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,138 @@ namespace omp {
namespace csr {


/**
* Computes the begin offsets into A (the number of rows) and B (the number of
* stored element), which intersect the diagonal line. The diagonal line is
* formed by the reaching point with the `diagonal` step from the starting
* point.
*
* @param diagonal the diagonal line to search
* @param row_end_ptrs the pointer to the ending of row offset of A.
* row_end_ptrs[i] gives the ending in the value/column
* array for row_i.
* @param a_len the length of A (the number of rows)
* @param b_len the length of B (the number of stored elements)
*
* @return a pair which contains (x, y) coordinate where diagonal intersects the
* merge path
*/
template <typename IndexType>
inline std::pair<IndexType, IndexType> merge_path_search(
const IndexType diagonal, const IndexType* row_end_ptrs,
const IndexType a_len, const IndexType b_len)
{
auto x_min = std::max(diagonal - b_len, zero<IndexType>());
auto x_max = std::min(diagonal, a_len);

while (x_min < x_max) {
auto x_pivot = x_min + ((x_max - x_min) / 2);
if (row_end_ptrs[x_pivot] <= (diagonal - x_pivot - 1)) {
x_min = x_pivot + 1; // Contract range up A (down B)
} else {
x_max = x_pivot; // Contract range down A (up B)
}
}

return std::make_pair(std::min(x_min, a_len), diagonal - x_min);
}


template <typename MatrixValueType, typename InputValueType,
typename OutputValueType, typename IndexType>
void spmv(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<MatrixValueType, IndexType>* a,
const matrix::Dense<InputValueType>* b,
matrix::Dense<OutputValueType>* c)
typename OutputValueType, typename IndexType, typename AlphaOp,
typename BetaOp>
void merge_spmv(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<MatrixValueType, IndexType>* a,
const matrix::Dense<InputValueType>* b,
matrix::Dense<OutputValueType>* c, AlphaOp alpha_op,
BetaOp beta_op)
{
using arithmetic_type =
highest_precision<MatrixValueType, InputValueType, OutputValueType>;

auto row_ptrs = a->get_const_row_ptrs();
auto col_idxs = a->get_const_col_idxs();

const auto a_vals =
acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
const auto b_vals =
acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);

// Merge-SpMV variables
const auto num_rows = static_cast<IndexType>(a->get_size()[0]);
const auto nnz = static_cast<IndexType>(a->get_num_stored_elements());
const auto num_threads = static_cast<IndexType>(omp_get_max_threads());
// Merge list A: row end ptr
const IndexType* row_end_ptrs = row_ptrs + 1;
// Merge path total length
const auto num_merge_items = num_rows + nnz;
// Merge items per thread
const auto items_per_thread =
static_cast<IndexType>(ceildiv(num_merge_items, num_threads));
array<IndexType> row_carry_over(exec, num_threads);
array<arithmetic_type> value_carry_over(exec, num_threads);
auto row_carry_over_ptr = row_carry_over.get_data();
auto value_carry_over_ptr = value_carry_over.get_data();

// TODO: parallelize with number of cols, too.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to, I think it's pretty simple to move this loop to the innermost loop. You just have to increase the size of value_carry_out to num_threads * c->get_size()[1].

for (size_type j = 0; j < c->get_size()[1]; ++j) {
// TODO: It uses static from the observation of the previous
// experiments. Check it with different system and different kinds of
// schedule.
#pragma omp parallel for schedule(static)
for (IndexType tid = 0; tid < num_threads; tid++) {
const auto start_diagonal =
std::min(items_per_thread * tid, num_merge_items);
const auto end_diagonal =
std::min(start_diagonal + items_per_thread, num_merge_items);

auto [x, y] =
merge_path_search(start_diagonal, row_end_ptrs, num_rows, nnz);
auto [end_x, end_y] =
merge_path_search(end_diagonal, row_end_ptrs, num_rows, nnz);
// Consume merge items, whole rows first
for (; x < end_x; x++) {
auto sum = zero<arithmetic_type>();
for (; y < row_end_ptrs[x]; y++) {
arithmetic_type val = a_vals(y);
auto col = col_idxs[y];
sum += val * b_vals(col, j);
}
c_vals(x, j) = alpha_op(sum) + beta_op(c_vals(x, j));
}

// Consume partial portion of thread's last row
auto sum = zero<arithmetic_type>();
for (; y < end_y; y++) {
arithmetic_type val = a_vals(y);
auto col = col_idxs[y];
sum += val * b_vals(col, j);
}

// Save carry over
row_carry_over_ptr[tid] = end_x;
value_carry_over_ptr[tid] = alpha_op(sum);
}

// Carry over fix-up (rows spanning multiple threads)
// The carry over from thread `tid` to `tid + 1` is added by the thread
// `tid`, thus the last thread has no work.
for (IndexType tid = 0; tid < num_threads - 1; tid++) {
if (row_carry_over_ptr[tid] < num_rows) {
c_vals(row_carry_over_ptr[tid], j) += value_carry_over_ptr[tid];
}
}
}
}


template <typename MatrixValueType, typename InputValueType,
typename OutputValueType, typename IndexType, typename Function>
void classical_spmv(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<MatrixValueType, IndexType>* a,
const matrix::Dense<InputValueType>* b,
matrix::Dense<OutputValueType>* c, Function lambda)
{
using arithmetic_type =
highest_precision<MatrixValueType, InputValueType, OutputValueType>;
Expand All @@ -72,11 +198,31 @@ void spmv(std::shared_ptr<const OmpExecutor> exec,

sum += val * b_vals(col, j);
}
c_vals(row, j) = sum;
c_vals(row, j) = lambda(sum, c_vals(row, j));
}
}
}

template <typename MatrixValueType, typename InputValueType,
typename OutputValueType, typename IndexType>
void spmv(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<MatrixValueType, IndexType>* a,
const matrix::Dense<InputValueType>* b,
matrix::Dense<OutputValueType>* c)
{
using arithmetic_type =
highest_precision<MatrixValueType, InputValueType, OutputValueType>;
if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
// empty output: nothing to do
} else if (a->get_strategy()->get_name() == "merge_path") {
merge_spmv(
exec, a, b, c, [](auto val) { return val; },
[](auto) { return zero<arithmetic_type>(); });
} else {
classical_spmv(exec, a, b, c, [](auto sum, auto) { return sum; });
}
}

GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_SPMV_KERNEL);

Expand All @@ -92,29 +238,22 @@ void advanced_spmv(std::shared_ptr<const OmpExecutor> exec,
{
using arithmetic_type =
highest_precision<MatrixValueType, InputValueType, OutputValueType>;

auto row_ptrs = a->get_const_row_ptrs();
auto col_idxs = a->get_const_col_idxs();
auto valpha = static_cast<arithmetic_type>(alpha->at(0, 0));
auto vbeta = static_cast<arithmetic_type>(beta->at(0, 0));

const auto a_vals =
acc::helper::build_const_rrm_accessor<arithmetic_type>(a);
const auto b_vals =
acc::helper::build_const_rrm_accessor<arithmetic_type>(b);
auto c_vals = acc::helper::build_rrm_accessor<arithmetic_type>(c);
#pragma omp parallel for
for (size_type row = 0; row < a->get_size()[0]; ++row) {
for (size_type j = 0; j < c->get_size()[1]; ++j) {
auto sum = is_zero(vbeta) ? zero(vbeta) : c_vals(row, j) * vbeta;
for (size_type k = row_ptrs[row];
k < static_cast<size_type>(row_ptrs[row + 1]); ++k) {
arithmetic_type val = a_vals(k);
auto col = col_idxs[k];
sum += valpha * val * b_vals(col, j);
}
c_vals(row, j) = sum;
}
if (c->get_size()[0] == 0 || c->get_size()[1] == 0) {
// empty output: nothing to do
} else if (a->get_strategy()->get_name() == "merge_path") {
merge_spmv(
exec, a, b, c, [valpha](auto val) { return valpha * val; },
[vbeta](auto val) {
return is_zero(vbeta) ? zero(vbeta) : val * vbeta;
});
} else {
classical_spmv(exec, a, b, c, [valpha, vbeta](auto sum, auto orig_val) {
auto scaled_orig_val =
is_zero(vbeta) ? zero(vbeta) : orig_val * vbeta;
return valpha * sum + scaled_orig_val;
});
}
}

Expand Down
15 changes: 13 additions & 2 deletions test/matrix/matrix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ struct CsrWithDefaultStrategy : CsrBase {


#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
defined(GKO_COMPILING_DPCPP)
defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP)


struct CsrWithClassicalStrategy : CsrBase {
Expand Down Expand Up @@ -177,6 +177,14 @@ struct CsrWithMergePathStrategy : CsrBase {
}
};


#endif


#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
defined(GKO_COMPILING_DPCPP)


struct CsrWithSparselibStrategy : CsrBase {
static std::unique_ptr<matrix_type> create(
std::shared_ptr<gko::Executor> exec, gko::dim<2> size)
Expand Down Expand Up @@ -827,8 +835,11 @@ class Matrix : public CommonTestFixture {
using MatrixTypes = ::testing::Types<
DenseWithDefaultStride, DenseWithCustomStride, Coo, CsrWithDefaultStrategy,
#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
defined(GKO_COMPILING_DPCPP)
defined(GKO_COMPILING_DPCPP) || defined(GKO_COMPILING_OMP)
CsrWithClassicalStrategy, CsrWithMergePathStrategy,
#endif
#if defined(GKO_COMPILING_CUDA) || defined(GKO_COMPILING_HIP) || \
defined(GKO_COMPILING_DPCPP)
CsrWithSparselibStrategy, CsrWithLoadBalanceStrategy,
CsrWithAutomaticalStrategy,
#endif
Expand Down