Skip to content

Commit 76612dd

Browse files
committed
[Code] Add more comp bins in spgemv impl
1 parent 12bd69a commit 76612dd

File tree

2 files changed

+49
-21
lines changed

2 files changed

+49
-21
lines changed

cubool/sources/cuda/details/meta.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,13 @@ namespace cubool {
4646
cudaStream_t streams[Config::binsCount()] = {};
4747
};
4848

49-
template<size_t BlocksSize, size_t Max, size_t Min, size_t Id>
49+
template<size_t Threads, size_t BlocksSize, size_t Max, size_t Min, size_t Id>
5050
struct Bin {
51+
static_assert(Threads <= BlocksSize, "Block size must be >= threads in this block");
52+
53+
static constexpr size_t threads = Threads;
5154
static constexpr size_t blockSize = BlocksSize;
55+
static constexpr size_t dispatchRatio = BlocksSize / Threads;
5256
static constexpr size_t min = Max;
5357
static constexpr size_t max = Min;
5458
static constexpr size_t id = Id;

cubool/sources/cuda/kernels/spgemv.cuh

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,22 @@
3434
namespace cubool {
3535
namespace kernels {
3636

37-
template<typename IndexType, size_t blockSize>
37+
template<typename IndexType, size_t threads, size_t blockSize>
3838
__global__ void spgemv(thrust::device_ptr<const IndexType> rowOffsets, // Input csr matrix rows
3939
thrust::device_ptr<const IndexType> colIndices, // Input csr matrix col indices
4040
thrust::device_ptr<const IndexType> v, // Input dense v vector
4141
thrust::device_ptr<IndexType> x, // Output dense x vector (x = M*v)
42-
thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin
43-
IndexType assignedOrder = blockIdx.x;
44-
IndexType id = threadIdx.x;
42+
thrust::device_ptr<const IndexType> rowConfig, // Rows to process for each bin
43+
IndexType rowsCount) { // Num of rows to process
44+
45+
static const size_t WARP_SIZE = 32;
46+
47+
IndexType id = threadIdx.x % threads;
48+
IndexType interBlockId = threadIdx.x / threads;
49+
IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId;
50+
51+
if (assignedOrder >= rowsCount)
52+
assignedOrder = rowsCount - 1;
4553

4654
IndexType i = rowConfig[assignedOrder]; // Row to process
4755

@@ -51,28 +59,38 @@ namespace cubool {
5159
__shared__ IndexType tmp_accum[blockSize];
5260

5361
// Zero-initialize this thread's accumulator slot
54-
tmp_accum[id] = 0;
62+
tmp_accum[threadIdx.x] = 0;
5563
__syncthreads();
5664

5765
// Each thread ORs together its strided share of the row's nnz values
58-
for (size_t k = id; k < rowSize; k += blockSize) {
59-
tmp_accum[id] |= v[colIndices[rowBegin + k]];
66+
for (size_t k = id; k < rowSize; k += threads) {
67+
tmp_accum[threadIdx.x] |= v[colIndices[rowBegin + k]];
6068
}
6169
__syncthreads();
6270

6371
// Reduce the accumulators to a single value
64-
for (size_t s = 1; s < blockSize; s *= 2) {
72+
for (size_t s = 1; s < threads && warpSize; s *= 2) {
73+
if (id % (2 * s) == 0) {
74+
tmp_accum[threadIdx.x] |= tmp_accum[threadIdx.x + s];
75+
}
76+
77+
__syncwarp();
78+
}
79+
80+
__syncthreads();
81+
82+
for (size_t s = WARP_SIZE; s < threads; s *= 2) {
6583
if (id % (2 * s) == 0) {
66-
tmp_accum[id] |= tmp_accum[id + s];
84+
tmp_accum[threadIdx.x] |= tmp_accum[threadIdx.x + s];
6785
}
6886

6987
__syncthreads();
7088
}
7189

7290
// The thread with id == 0 saves the result
7391
if (id == 0) {
74-
if (tmp_accum[0] > 0) {
75-
x[i] = tmp_accum[0];
92+
if (tmp_accum[threadIdx.x] > 0) {
93+
x[i] = tmp_accum[threadIdx.x];
7694
}
7795
}
7896
}
@@ -95,11 +113,14 @@ namespace cubool {
95113
thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin)
96114

97115
EXPAND_SIDE_EFFECTS(
98-
(binSizes[Bins::id] > 0?
99-
spgemv<IndexType, Bins::blockSize>
100-
<<<binSizes[Bins::id], Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
101-
(rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id])
102-
: void())
116+
(binSizes[Bins::id] > 0?
117+
spgemv<IndexType, Bins::threads, Bins::blockSize>
118+
<<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio? 1: 0),
119+
Bins::blockSize,
120+
0,
121+
streamsWrapper.streams[Bins::id]>>>
122+
(rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
123+
: void())
103124
);
104125
}
105126

@@ -143,10 +164,13 @@ namespace cubool {
143164
// Empty out buffer
144165
thrust::fill_n(mOutput.begin(), M, (IndexType) 0);
145166

146-
using ConfigType = Config<Bin<32, 1, 32, 0>,
147-
Bin<64, 32, 64, 1>,
148-
Bin<128,64, 128,2>,
149-
Bin<256,128,max,3>>;
167+
using ConfigType = Config<Bin<4, 32, 1, 8, 0>,
168+
Bin<8, 32, 8, 16, 1>,
169+
Bin<16, 32, 16, 32, 2>,
170+
Bin<32, 32, 32, 64, 3>,
171+
Bin<64, 64, 64, 128,4>,
172+
Bin<128,128,128,256,5>,
173+
Bin<256,256,256,max,6>>;
150174
ConfigType config;
151175

152176
mRowsConfig.resize(M);

0 commit comments

Comments
 (0)