
Commit 9264cee

[Code] Add SpGEMtV implementation for cuda backend

1 parent 23f6d96

File tree: 9 files changed, +317 −34 lines

cubool/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -128,6 +128,7 @@ if (CUBOOL_WITH_CUDA)
         sources/cuda/cuda_vector.hpp
         sources/cuda/cuda_vector.cu
         sources/cuda/cuda_vector_mxv.cu
+        sources/cuda/cuda_vector_vxm.cu
         sources/cuda/cuda_vector_ewiseadd.cu
         sources/cuda/cuda_vector_reduce.cu
         sources/cuda/details/meta.hpp
@@ -137,6 +138,7 @@ if (CUBOOL_WITH_CUDA)
         sources/cuda/kernels/slow_sort.cuh
         sources/cuda/kernels/bin_search.cuh
         sources/cuda/kernels/spgemv.cuh
+        sources/cuda/kernels/spgemtv.cuh
         sources/cuda/kernels/spewiseadd.cuh
         sources/cuda/kernels/sptranspose.cuh
         sources/cuda/kernels/sptranspose2.cuh

cubool/sources/cuda/cuda_vector.cu

Lines changed: 0 additions & 5 deletions

@@ -88,11 +88,6 @@ namespace cubool {
         result = getNvals();
     }

-    void CudaVector::multiplyVxM(const VectorBase &vBase, const struct MatrixBase &mBase, bool checkTime) {
-        RAISE_ERROR(NotImplemented, "This function is not implemented");
-
-    }
-
     index CudaVector::getNrows() const {
         return mVectorImpl.m_rows;
     }

cubool/sources/cuda/cuda_vector_vxm.cu

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+/**********************************************************************************/
+/* MIT License */
+/* */
+/* Copyright (c) 2020, 2021 JetBrains-Research */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy */
+/* of this software and associated documentation files (the "Software"), to deal */
+/* in the Software without restriction, including without limitation the rights */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
+/* copies of the Software, and to permit persons to whom the Software is */
+/* furnished to do so, subject to the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
+/* SOFTWARE. */
+/**********************************************************************************/
+
+#include <cuda/cuda_vector.hpp>
+#include <cuda/cuda_matrix.hpp>
+#include <cuda/kernels/spgemtv.cuh>
+#include <core/error.hpp>
+#include <cassert>
+
+namespace cubool {
+
+    void CudaVector::multiplyVxM(const VectorBase &vBase, const struct MatrixBase &mBase, bool checkTime) {
+        const auto* v = dynamic_cast<const CudaVector*>(&vBase);
+        const auto* m = dynamic_cast<const CudaMatrix*>(&mBase);
+
+        CHECK_RAISE_ERROR(v != nullptr, InvalidArgument, "Provided vector does not belong to cuda vector class");
+        CHECK_RAISE_ERROR(m != nullptr, InvalidArgument, "Provided matrix does not belong to cuda matrix class");
+
+        assert(m->getNrows() == v->getNrows());
+        assert(m->getNcols() == this->getNrows());
+
+        m->resizeStorageToDim();
+
+        kernels::SpGEMtV<index, DeviceAlloc<index>> functor;
+        auto result = functor(v->mVectorImpl, m->mMatrixImpl);
+
+        mVectorImpl = std::move(result);
+    }
+
+}
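
For orientation, this new translation unit is reached through the library's vector-matrix product entry point. A minimal sketch of a call site, assuming a `cuBool_VxM(r, v, m, hints)` C API mirroring the shape of the `cuBool_MxV` call used in the tests below (the exact signature is an assumption; verify against cubool.h):

// Sketch (assumed API shape, not confirmed by this commit): compute r = v x M
// over the boolean semiring, which for the cuda backend dispatches to
// CudaVector::multiplyVxM -> kernels::SpGEMtV.
#include <cubool/cubool.h>

cuBool_Status runVxM(cuBool_Vector r, cuBool_Vector v, cuBool_Matrix m) {
    // v x M is equivalent to M^T v, so the matrix row count must match v's size
    return cuBool_VxM(r, v, m, CUBOOL_HINT_NO);
}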

cubool/sources/cuda/kernels/spgemtv.cuh

Lines changed: 235 additions & 0 deletions

@@ -0,0 +1,235 @@
+/**********************************************************************************/
+/* MIT License */
+/* */
+/* Copyright (c) 2020, 2021 JetBrains-Research */
+/* */
+/* Permission is hereby granted, free of charge, to any person obtaining a copy */
+/* of this software and associated documentation files (the "Software"), to deal */
+/* in the Software without restriction, including without limitation the rights */
+/* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell */
+/* copies of the Software, and to permit persons to whom the Software is */
+/* furnished to do so, subject to the following conditions: */
+/* */
+/* The above copyright notice and this permission notice shall be included in all */
+/* copies or substantial portions of the Software. */
+/* */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR */
+/* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, */
+/* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE */
+/* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER */
+/* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, */
+/* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE */
+/* SOFTWARE. */
+/**********************************************************************************/
+
+#ifndef CUBOOL_SPGEMTV_HPP
+#define CUBOOL_SPGEMTV_HPP
+
+#include <cuda/details/sp_vector.hpp>
+#include <cuda/details/meta.hpp>
+#include <nsparse/matrix.h>
+#include <nsparse/detail/meta.h>
+#include <limits>
+
+namespace cubool {
+    namespace kernels {
+
+        template<typename IndexType, size_t threads, size_t blockSize>
+        __global__ void __spgemtv(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
+                                  thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
+                                  thrust::device_ptr<IndexType> x,                // Output dense x vector (x = M^t v)
+                                  thrust::device_ptr<const IndexType> rowConfig,  // Rows to process for each bin
+                                  IndexType rowsCount) {                          // Num of rows to process
+            // Split the block into groups of size `threads`;
+            // each group processes its own row.
+
+            IndexType id = threadIdx.x % threads;           // id of the thread within its row-processing group
+            IndexType interBlockId = threadIdx.x / threads; // id of the group (`threads` threads belong to the same group)
+            IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row assigned to this group
+
+            if (assignedOrder >= rowsCount)
+                assignedOrder = rowsCount - 1;
+
+            IndexType i = rowConfig[assignedOrder]; // Row to process
+
+            size_t rowSize = rowOffsets[i + 1] - rowOffsets[i];
+            size_t rowBegin = rowOffsets[i];
+
+            // Mark the column of each non-zero of this row in the result
+            for (size_t k = id; k < rowSize; k += threads) {
+                x[colIndices[rowBegin + k]] = 0x1u;
+            }
+        }
+
+        template<typename IndexType, typename AllocType>
+        struct SpGEMtV {
+            template<typename T>
+            using ContainerType = thrust::device_vector<T, typename AllocType::template rebind<T>::other>;
+            using MatrixType = nsparse::matrix<bool, IndexType, AllocType>;
+            using VectorType = details::SpVector<IndexType, AllocType>;
+
+            template<typename ... Bins>
+            void dispatch(StreamsWrapper<Config<Bins...>> &streamsWrapper,
+                          thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
+                          thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
+                          thrust::device_ptr<IndexType> x,                // Output dense x vector (x = M^t v)
+                          const std::vector<IndexType> &binSizes,         // Size of each bin in rowConfig
+                          const std::vector<IndexType> &binOffset,        // Offset of each bin in rowConfig
+                          thrust::device_ptr<const IndexType> rowConfig) {// Rows to process for each bin
+
+                EXPAND_SIDE_EFFECTS(
+                        (binSizes[Bins::id] > 0 ?
+                         __spgemtv<IndexType, Bins::threads, Bins::blockSize>
+                                 <<<binSizes[Bins::id] / Bins::dispatchRatio +
+                                    (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0),
+                                    Bins::blockSize,
+                                    0,
+                                    streamsWrapper.streams[Bins::id]>>>
+                                 (rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
+                         : void())
+                );
+            }
+
+            /**
+             * Compute r = M^t x v
+             *
+             * Matrix-vector multiplication algorithm:
+             * 1. Assign each row to a computation group (the group defines the number of threads used for processing)
+             * 2. Run each group (the larger the row, the more threads are assigned)
+             *
+             * @param v Sparse vector
+             * @param m Sparse matrix
+             *
+             * @return Sparse vector
+             */
+            VectorType operator()(const VectorType &v, const MatrixType &m) {
+                static constexpr size_t max = std::numeric_limits<size_t>::max();
+
+                auto N = m.m_cols;
+                auto vnvals = v.m_vals;
+
+                assert(m.m_rows == v.m_rows);
+
+                // Empty result case
+                if (v.m_vals == 0 || m.m_vals == 0)
+                    return VectorType(N);
+
+                // Resize cached buffer to store r as a dense vector
+                if (mOutput.size() < N)
+                    mOutput.resize(N);
+
+                // Empty out buffer
+                thrust::fill_n(mOutput.begin(), N, (IndexType) 0);
+
+                using ConfigType = Config<Bin<4, 32, 1, 8, 0>,
+                                          Bin<8, 32, 8, 16, 1>,
+                                          Bin<16, 32, 16, 32, 2>,
+                                          Bin<32, 32, 32, 64, 3>,
+                                          Bin<64, 64, 64, 128, 4>,
+                                          Bin<128, 128, 128, 256, 5>,
+                                          Bin<256, 256, 256, max, 6>>;
+                ConfigType config;
+
+                // Process only rows where the vector value is non-zero
+                mRowsConfig.resize(vnvals);
+
+                mBinsSize.resize(config.binsCount());
+                mBinsOffsets.resize(config.binsCount());
+
+                thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
+
+                // Eval bin size for each row (rows are looked up by vector values)
+                thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
+                        [config, binSize = mBinsSize.data(), rowsIndices = m.m_row_index.data()]
+                        __device__(IndexType i) {
+                            auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
+                            auto binId = config.selectBin(valsInRow);
+
+                            if (binId == config.unusedBinId())
+                                // Ignore empty rows
+                                return;
+
+                            atomicAdd((binSize + binId).get(), 1);
+                        });
+
+                // Offsets for each bin (each bin has its own section in the permutation buffer)
+                thrust::exclusive_scan(mBinsSize.begin(), mBinsSize.end(), mBinsOffsets.begin(), 0, thrust::plus<IndexType>());
+
+                // Reset bin sizes (reused as running offsets for permutation id assignments)
+                thrust::fill(mBinsSize.begin(), mBinsSize.end(), (IndexType) 0);
+
+                // Assign rows to their bins
+                thrust::for_each(v.m_rows_index.begin(), v.m_rows_index.end(),
+                        [config, binSize = mBinsSize.data(), binOffset = mBinsOffsets.data(), rowsConfig = mRowsConfig.data(),
+                         rowsIndices = m.m_row_index.data()]
+                        __device__(IndexType i) {
+                            auto valsInRow = rowsIndices[i + 1] - rowsIndices[i];
+                            auto binId = config.selectBin(valsInRow);
+
+                            if (binId == config.unusedBinId())
+                                // Ignore empty rows
+                                return;
+
+                            auto order = atomicAdd((binSize + binId).get(), 1);
+                            rowsConfig[binOffset[binId] + order] = i;
+                        });
+
+                // Bring bin sizes and offsets to the host
+                std::vector<IndexType> binOffsets(mBinsOffsets.size());
+                thrust::copy(mBinsOffsets.begin(), mBinsOffsets.end(), binOffsets.begin());
+
+                std::vector<IndexType> binSizes(mBinsSize.size());
+                thrust::copy(mBinsSize.begin(), mBinsSize.end(), binSizes.begin());
+
+                // Stream for each bin
+                StreamsWrapper<ConfigType> streamsWrapper;
+
+                dispatch(streamsWrapper,
+                         m.m_row_index.data(),
+                         m.m_col_index.data(),
+                         mOutput.data(),
+                         binSizes,
+                         binOffsets,
+                         mRowsConfig.data());
+
+                cudaDeviceSynchronize();
+
+                // Nnz of the result
+                auto resultSize = thrust::reduce(mOutput.begin(), mOutput.end(), (IndexType) 0);
+
+                ContainerType<index> result(resultSize);
+
+                // Running offset where the next index is written
+                ContainerType<index> order(1);
+                thrust::fill(order.begin(), order.end(), (IndexType) 0);
+
+                // For each non-zero value of the output, write its index into the result buffer
+                thrust::for_each(thrust::counting_iterator<IndexType>(0), thrust::counting_iterator<IndexType>(N),
+                        [output = mOutput.data(), result = result.data(), order = order.data()]
+                        __device__(IndexType i) {
+                            if (output[i] > 0) {
+                                auto offset = atomicAdd(order.get(), (IndexType) 1);
+                                result[offset] = i;
+                            }
+                        });
+
+                // Sort indices (atomics fill them in arbitrary order)
+                thrust::sort(result.begin(), result.end());
+
+                return VectorType(std::move(result), N, resultSize);
+            }
+
+        private:
+            ContainerType<index> mRowsConfig;
+            ContainerType<index> mBinsSize;
+            ContainerType<index> mBinsOffsets;
+            ContainerType<index> mOutput;
+        };
+
+    }
+}
+
+#endif //CUBOOL_SPGEMTV_HPP
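
A note on the final compaction step above: the functor scatters set flags into a dense buffer, compacts the set positions with an atomic counter, then sorts, because atomics produce indices in arbitrary order. An equivalent formulation is a `thrust::copy_if` stream compaction, which emits indices already in ascending order. A self-contained sketch of that alternative (illustrative only, not the library's code; the device lambda needs nvcc's --extended-lambda flag):

// Sketch: dense flag buffer -> sorted sparse index list via stream compaction.
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/counting_iterator.h>
#include <cstdint>

using index_t = std::uint32_t;

thrust::device_vector<index_t> compactSetFlags(const thrust::device_vector<index_t> &denseFlags) {
    index_t n = (index_t) denseFlags.size();
    thrust::device_vector<index_t> result(n); // worst case: every flag is set
    const index_t* flags = thrust::raw_pointer_cast(denseFlags.data());

    // Keep index i iff denseFlags[i] != 0; copy_if preserves input order,
    // so no extra sort pass is needed (unlike the atomicAdd scheme above).
    auto end = thrust::copy_if(thrust::counting_iterator<index_t>(0),
                               thrust::counting_iterator<index_t>(n),
                               result.begin(),
                               [flags] __device__ (index_t i) { return flags[i] != 0; });

    result.resize(end - result.begin());
    return result;
}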

cubool/sources/cuda/kernels/spgemv.cuh

Lines changed: 5 additions & 7 deletions

@@ -43,14 +43,10 @@ namespace cubool {
                                  IndexType rowsCount) { // Num of rows to process
            // Split the block into groups of size `threads`.
            // Each group processes its own row.
-           //
-           // id: id of the thread withing row processing group
-           // interBlockId: id of the group (number of `threads` belong to the same group)
-           // assignedOrder: row, which is process by number of `threads`

-           IndexType id = threadIdx.x % threads;
-           IndexType interBlockId = threadIdx.x / threads;
-           IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId;
+           IndexType id = threadIdx.x % threads;           // id of the thread within its row-processing group
+           IndexType interBlockId = threadIdx.x / threads; // id of the group (`threads` threads belong to the same group)
+           IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId; // row assigned to this group

            if (assignedOrder >= rowsCount)
                assignedOrder = rowsCount - 1;
@@ -71,6 +67,7 @@ namespace cubool {
            }

            // Reduce accum to a single value
+           #pragma unroll
            for (size_t s = 1; s < threads && s < warpSize; s *= 2) {
                __syncwarp();

@@ -79,6 +76,7 @@ namespace cubool {
                }
            }

+           #pragma unroll
            for (size_t s = warpSize; s < threads; s *= 2) {
                __syncthreads();

cubool/tests/test_vector_mxv.cpp

Lines changed: 3 additions & 10 deletions

@@ -41,19 +41,12 @@ void testMatrixVectorMultiplyAdd(cuBool_Index m, cuBool_Index n, float density)
     ASSERT_EQ(cuBool_Matrix_Build(a, ta.rowsIndex.data(), ta.colsIndex.data(), ta.nvals, CUBOOL_HINT_VALUES_SORTED | CUBOOL_HINT_NO_DUPLICATES), CUBOOL_STATUS_SUCCESS);
     ASSERT_EQ(cuBool_Vector_Build(v, tv.index.data(), tv.nvals, CUBOOL_HINT_VALUES_SORTED | CUBOOL_HINT_NO_DUPLICATES), CUBOOL_STATUS_SUCCESS);

-    // Evaluate naive r += a x b on the cpu to compare results
+    // Evaluate naive result on the cpu to compare results
     testing::MatrixVectorMultiplyFunctor functor;
     testing::Vector tr = functor(ta, tv);

-    testing::TimeQuery query;
-
-    for (int i = 0; i < 10; i++) {
-        testing::TimeScope scope(query);
-        // Evaluate r += a x b
-        ASSERT_EQ(cuBool_MxV(r, a, v, CUBOOL_HINT_NO), CUBOOL_STATUS_SUCCESS);
-    }
-
-    std::cout << query.getAverageTimeMs() << " ms" << std::endl;
+    // Evaluate r = M x v
+    ASSERT_EQ(cuBool_MxV(r, a, v, CUBOOL_HINT_NO), CUBOOL_STATUS_SUCCESS);

     // Compare results
     ASSERT_EQ(tr.areEqual(r), true);