Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions common/cuda_hip/matrix/csr_kernels.instantiate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_SPGEMM_REUSE_KERNEL);
GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_CSR_BUILD_LOOKUP_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down
89 changes: 89 additions & 0 deletions common/cuda_hip/matrix/csr_kernels.template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "common/cuda_hip/components/thread_ids.hpp"
#include "common/cuda_hip/components/uninitialized_array.hpp"
#include "core/base/array_access.hpp"
#include "core/base/index_range.hpp"
#include "core/base/mixed_precision_types.hpp"
#include "core/components/fill_array_kernels.hpp"
#include "core/components/format_conversion_kernels.hpp"
Expand Down Expand Up @@ -2735,6 +2736,94 @@ void advanced_spgemm(std::shared_ptr<const DefaultExecutor> exec,
}


namespace kernel {


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void spgemm_reuse(
const IndexType* __restrict__ a_row_ptrs,
const IndexType* __restrict__ a_cols, const ValueType* __restrict__ a_vals,
const IndexType* __restrict__ b_row_ptrs,
const IndexType* __restrict__ b_cols, const ValueType* __restrict__ b_vals,
const IndexType* __restrict__ c_row_ptrs,
const IndexType* __restrict__ c_cols, ValueType* __restrict__ c_vals,
const IndexType* __restrict__ lookup_storage_offsets,
const int32* __restrict__ lookup_storage,
const int64* __restrict__ lookup_descs, IndexType num_rows)
{
constexpr auto subwarp_size = config::warp_size;
const auto subwarp =
group::tiled_partition<subwarp_size>(group::this_thread_block());
const auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
const auto lane = static_cast<IndexType>(subwarp.thread_rank());
if (row >= num_rows) {
return;
}
const auto a_begin = a_row_ptrs[row];
const auto a_end = a_row_ptrs[row + 1];
const auto c_begin = c_row_ptrs[row];
const auto c_end = c_row_ptrs[row + 1];
const auto c_row_lookup = matrix::csr::device_sparsity_lookup<IndexType>{
c_row_ptrs, c_cols, lookup_storage_offsets,
lookup_storage, lookup_descs, static_cast<size_type>(row)};
for (auto i = c_begin + lane; i < c_end; i += subwarp_size) {
c_vals[i] = zero<ValueType>();
}
for (const auto a_nz : irange{a_begin, a_end}) {
const auto a_col = a_cols[a_nz];
const auto a_val = a_vals[a_nz];
const auto b_begin = b_row_ptrs[a_col];
const auto b_end = b_row_ptrs[a_col + 1];
for (auto b_nz = b_begin + lane; b_nz < b_end; b_nz += subwarp_size) {
const auto b_col = b_cols[b_nz];
const auto b_val = b_vals[b_nz];
const auto rel_nz = c_row_lookup[b_col];
GKO_ASSERT(rel_nz != invalid_index<IndexType>());
c_vals[c_begin + rel_nz] += a_val * b_val;
}
// this is necessary to avoid data races between two rows of B
// sharing the same column index
subwarp.sync();
}
}


} // namespace kernel


template <typename ValueType, typename IndexType>
void spgemm_reuse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::csr::lookup_data<IndexType>& c_lookup,
matrix::Csr<ValueType, IndexType>* c)
{
const auto num_rows = static_cast<IndexType>(c->get_size()[0]);
const auto a_row_ptrs = a->get_const_row_ptrs();
const auto b_row_ptrs = b->get_const_row_ptrs();
const auto c_row_ptrs = c->get_const_row_ptrs();
const auto a_cols = a->get_const_col_idxs();
const auto b_cols = b->get_const_col_idxs();
const auto c_cols = c->get_const_col_idxs();
const auto a_vals = as_device_type(a->get_const_values());
const auto b_vals = as_device_type(b->get_const_values());
const auto c_vals = as_device_type(c->get_values());
const auto lookup_storage_offsets =
c_lookup.storage_offsets.get_const_data();
const auto lookup_storage = c_lookup.storage.get_const_data();
const auto lookup_descs = c_lookup.row_descs.get_const_data();
if (num_rows > 0) {
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::spgemm_reuse<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
a_row_ptrs, a_cols, a_vals, b_row_ptrs, b_cols, b_vals, c_row_ptrs,
c_cols, c_vals, lookup_storage_offsets, lookup_storage,
lookup_descs, num_rows);
}
}


template <typename ValueType, typename IndexType>
void transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* orig,
Expand Down
1 change: 1 addition & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,7 @@ GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPMV_KERNEL);
GKO_STUB_MIXED_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPMV_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_REUSE_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEAM_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_CONVERT_TO_ELL_KERNEL);
Expand Down
87 changes: 87 additions & 0 deletions core/matrix/csr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "core/matrix/hybrid_kernels.hpp"
#include "core/matrix/permutation.hpp"
#include "core/matrix/sellp_kernels.hpp"
#include "ginkgo/core/base/temporary_clone.hpp"


namespace gko {
Expand All @@ -45,6 +46,7 @@ GKO_REGISTER_OPERATION(spmv, csr::spmv);
GKO_REGISTER_OPERATION(advanced_spmv, csr::advanced_spmv);
GKO_REGISTER_OPERATION(spgemm, csr::spgemm);
GKO_REGISTER_OPERATION(advanced_spgemm, csr::advanced_spgemm);
GKO_REGISTER_OPERATION(spgemm_reuse, csr::spgemm_reuse);
GKO_REGISTER_OPERATION(spgeam, csr::spgeam);
GKO_REGISTER_OPERATION(convert_idxs_to_ptrs, components::convert_idxs_to_ptrs);
GKO_REGISTER_OPERATION(convert_ptrs_to_idxs, components::convert_ptrs_to_idxs);
Expand Down Expand Up @@ -643,6 +645,91 @@ void Csr<ValueType, IndexType>::write(mat_data& data) const
}


template <typename ValueType, typename IndexType>
std::unique_ptr<Csr<ValueType, IndexType>> Csr<ValueType, IndexType>::multiply(
ptr_param<const Csr> other) const
{
GKO_ASSERT_CONFORMANT(this, other);
auto result_size = gko::dim<2>{this->get_size()[0], other->get_size()[1]};
auto exec = this->get_executor();
auto local_other = make_temporary_clone(exec, other);
auto result = Csr::create(exec, result_size);
exec->run(csr::make_spgemm(this, local_other.get(), result.get()));
return result;
}


template <typename ValueType, typename IndexType>
struct Csr<ValueType, IndexType>::multiply_reuse_info::lookup_data {
dim<2> size;
size_type nnz;
csr::lookup_data<IndexType> data;
};


template <typename ValueType, typename IndexType>
Csr<ValueType, IndexType>::multiply_reuse_info::~multiply_reuse_info() =
default;


template <typename ValueType, typename IndexType>
Csr<ValueType, IndexType>::multiply_reuse_info::multiply_reuse_info(
multiply_reuse_info&&) = default;


template <typename ValueType, typename IndexType>
typename Csr<ValueType, IndexType>::multiply_reuse_info&
Csr<ValueType, IndexType>::multiply_reuse_info::operator=(
multiply_reuse_info&&) = default;


template <typename ValueType, typename IndexType>
Csr<ValueType, IndexType>::multiply_reuse_info::multiply_reuse_info(
std::unique_ptr<lookup_data> data)
: internal{std::move(data)}
{}


template <typename ValueType, typename IndexType>
void Csr<ValueType, IndexType>::multiply_reuse_info::update_values(
ptr_param<const Csr> mtx1, ptr_param<const Csr> mtx2,
ptr_param<Csr> out) const
{
GKO_ASSERT_EQUAL_DIMENSIONS(out, internal->size);
GKO_ASSERT_CONFORMANT(mtx1, mtx2);
GKO_ASSERT_EQUAL_ROWS(mtx1, out);
GKO_ASSERT_EQUAL_COLS(mtx2, out);
GKO_ASSERT_EQ(out->get_num_stored_elements(), internal->nnz);
auto exec = internal->data.storage.get_executor();
auto local_mtx1 = make_temporary_clone(exec, mtx1);
auto local_mtx2 = make_temporary_clone(exec, mtx2);
auto local_out = make_temporary_clone(exec, out);
exec->run(csr::make_spgemm_reuse(local_mtx1.get(), local_mtx2.get(),
internal->data, local_out.get()));
}


template <typename ValueType, typename IndexType>
std::pair<std::unique_ptr<Csr<ValueType, IndexType>>,
typename Csr<ValueType, IndexType>::multiply_reuse_info>
Csr<ValueType, IndexType>::multiply_reuse(ptr_param<const Csr> other) const
{
GKO_ASSERT_CONFORMANT(this, other);
auto result_size = gko::dim<2>{this->get_size()[0], other->get_size()[1]};
auto exec = this->get_executor();
auto local_other = make_temporary_clone(exec, other);
auto result = Csr::create(exec, result_size);
exec->run(csr::make_spgemm(this, local_other.get(), result.get()));
auto lookup = csr::build_lookup(result.get());
auto reuse_info = multiply_reuse_info{
std::make_unique<typename multiply_reuse_info::lookup_data>(
typename multiply_reuse_info::lookup_data{
result_size, result->get_num_stored_elements(),
std::move(lookup)})};
return std::make_pair(std::move(result), std::move(reuse_info));
}


template <typename ValueType, typename IndexType, typename TransformClosure>
std::pair<std::unique_ptr<Csr<ValueType, IndexType>>,
typename Csr<ValueType, IndexType>::permuting_reuse_info>
Expand Down
9 changes: 9 additions & 0 deletions core/matrix/csr_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ namespace kernels {
const matrix::Csr<ValueType, IndexType>* d, \
matrix::Csr<ValueType, IndexType>* c)

#define GKO_DECLARE_CSR_SPGEMM_REUSE_KERNEL(ValueType, IndexType) \
void spgemm_reuse(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<ValueType, IndexType>* a, \
const matrix::Csr<ValueType, IndexType>* b, \
const matrix::csr::lookup_data<IndexType>& c_lookup, \
matrix::Csr<ValueType, IndexType>* c)

#define GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType) \
void spgeam(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<ValueType>* alpha, \
Expand Down Expand Up @@ -278,6 +285,8 @@ namespace kernels {
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_SPGEMM_REUSE_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_SPGEAM_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_CSR_FILL_IN_DENSE_KERNEL(ValueType, IndexType); \
Expand Down
63 changes: 63 additions & 0 deletions dpcpp/matrix/csr_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include "accessor/sycl_helper.hpp"
#include "core/base/array_access.hpp"
#include "core/base/index_range.hpp"
#include "core/base/mixed_precision_types.hpp"
#include "core/base/utils.hpp"
#include "core/components/fill_array_kernels.hpp"
Expand Down Expand Up @@ -2187,6 +2188,68 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);


template <typename ValueType, typename IndexType>
void spgemm_reuse(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* a,
const matrix::Csr<ValueType, IndexType>* b,
const matrix::csr::lookup_data<IndexType>& c_lookup,
matrix::Csr<ValueType, IndexType>* c)
{
const auto num_rows = static_cast<IndexType>(c->get_size()[0]);
const auto a_row_ptrs = a->get_const_row_ptrs();
const auto b_row_ptrs = b->get_const_row_ptrs();
const auto c_row_ptrs = c->get_const_row_ptrs();
const auto a_cols = a->get_const_col_idxs();
const auto b_cols = b->get_const_col_idxs();
const auto c_cols = c->get_const_col_idxs();
const auto a_vals = as_device_type(a->get_const_values());
const auto b_vals = as_device_type(b->get_const_values());
const auto c_vals = as_device_type(c->get_values());
const auto lookup_storage_offsets =
c_lookup.storage_offsets.get_const_data();
const auto lookup_storage = c_lookup.storage.get_const_data();
const auto lookup_descs = c_lookup.row_descs.get_const_data();
exec->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl::range<1>{static_cast<size_type>(num_rows)},
[=](sycl::id<1> idx) {
const auto row = static_cast<IndexType>(idx[0]);
const auto a_begin = a_row_ptrs[row];
const auto a_end = a_row_ptrs[row + 1];
const auto c_begin = c_row_ptrs[row];
const auto c_end = c_row_ptrs[row + 1];
const auto c_row_lookup =
matrix::csr::device_sparsity_lookup<IndexType>{
c_row_ptrs,
c_cols,
lookup_storage_offsets,
lookup_storage,
lookup_descs,
static_cast<size_type>(row)};
for (auto nz = c_begin; nz < c_end; nz++) {
c_vals[nz] = zero(c_vals[nz]);
}
for (const auto a_nz : irange{a_begin, a_end}) {
const auto a_col = a_cols[a_nz];
const auto a_val = a_vals[a_nz];
const auto b_begin = b_row_ptrs[a_col];
const auto b_end = b_row_ptrs[a_col + 1];
for (const auto b_nz : irange{b_begin, b_end}) {
const auto b_col = b_cols[b_nz];
const auto b_val = b_vals[b_nz];
const auto rel_nz = c_row_lookup.lookup_unsafe(b_col);
GKO_ASSERT(rel_nz != invalid_index<IndexType>());
c_vals[c_begin + rel_nz] += a_val * b_val;
}
}
});
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_CSR_SPGEMM_REUSE_KERNEL);


template <typename ValueType, typename IndexType>
void spgeam(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Dense<ValueType>* alpha,
Expand Down
Loading
Loading