namespace cubool {
    namespace kernels {

-       template <typename IndexType, size_t blockSize>
+       template <typename IndexType, size_t threads, size_t blockSize>
        __global__ void spgemv(thrust::device_ptr<const IndexType> rowOffsets,  // Input csr matrix rows
                               thrust::device_ptr<const IndexType> colIndices,  // Input csr matrix col indices
                               thrust::device_ptr<const IndexType> v,           // Input dense v vector
                               thrust::device_ptr<IndexType> x,                 // Output dense x vector (x = M*v)
-                              thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin
-           IndexType assignedOrder = blockIdx.x;
-           IndexType id = threadIdx.x;
+                              thrust::device_ptr<const IndexType> rowConfig,   // Rows to process for each bin
+                              IndexType rowsCount) {                           // Num of rows to process
+
+           static const size_t WARP_SIZE = 32;
+
+           IndexType id = threadIdx.x % threads;
+           IndexType interBlockId = threadIdx.x / threads;
+           IndexType assignedOrder = blockIdx.x * (blockSize / threads) + interBlockId;
+
+           if (assignedOrder >= rowsCount)
+               assignedOrder = rowsCount - 1;

            IndexType i = rowConfig[assignedOrder]; // Row to process
@@ -51,28 +59,38 @@ namespace cubool {
            __shared__ IndexType tmp_accum[blockSize];

            // Initial zero
-           tmp_accum[id] = 0;
+           tmp_accum[threadIdx.x] = 0;

            __syncthreads();

            // Each thread accum nnz values
-           for (size_t k = id; k < rowSize; k += blockSize) {
-               tmp_accum[id] |= v[colIndices[rowBegin + k]];
+           for (size_t k = id; k < rowSize; k += threads) {
+               tmp_accum[threadIdx.x] |= v[colIndices[rowBegin + k]];
            }

            __syncthreads();

            // Reduce accum to single value
-           for (size_t s = 1; s < blockSize; s *= 2) {
+           for (size_t s = 1; s < threads && s < WARP_SIZE; s *= 2) { // Reduce within each warp first
+               if (id % (2 * s) == 0) {
+                   tmp_accum[threadIdx.x] |= tmp_accum[threadIdx.x + s];
+               }
+
+               __syncwarp();
+           }
+
+           __syncthreads();
+
+           for (size_t s = WARP_SIZE; s < threads; s *= 2) { // Continue across warps when threads > WARP_SIZE
                if (id % (2 * s) == 0) {
-                   tmp_accum[id] |= tmp_accum[id + s];
+                   tmp_accum[threadIdx.x] |= tmp_accum[threadIdx.x + s];
                }

                __syncthreads();
            }

            // 0-thread saves result
            if (id == 0) {
-               if (tmp_accum[0] > 0) {
-                   x[i] = tmp_accum[0];
+               if (tmp_accum[threadIdx.x] > 0) {
+                   x[i] = tmp_accum[threadIdx.x];
                }
            }
        }
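
For reference, a minimal host-side sketch of the new thread-to-row mapping, assuming the smallest bin from the config below (threads = 4, blockSize = 32, i.e. 8 rows per block); tid and bid stand in for threadIdx.x and blockIdx.x and are illustrative only:

    #include <cstdio>

    int main() {
        const unsigned threads = 4, blockSize = 32, rowsCount = 20;
        const unsigned groupsPerBlock = blockSize / threads;        // rows covered by one block

        for (unsigned bid = 0; bid < 3; ++bid) {                    // blockIdx.x
            for (unsigned tid = 0; tid < blockSize; ++tid) {        // threadIdx.x
                unsigned id            = tid % threads;             // lane inside its row group
                unsigned interBlockId  = tid / threads;             // row group inside the block
                unsigned assignedOrder = bid * groupsPerBlock + interBlockId;
                if (assignedOrder >= rowsCount)
                    assignedOrder = rowsCount - 1;                  // tail groups simply redo the last row
                if (id == 0)
                    std::printf("block %u group %u -> rowConfig slot %u\n", bid, interBlockId, assignedOrder);
            }
        }
        return 0;
    }
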
@@ -95,11 +113,14 @@ namespace cubool {
                           thrust::device_ptr<const IndexType> rowConfig) { // Rows to process for each bin

            EXPAND_SIDE_EFFECTS(
-               (binSizes[Bins::id] > 0 ?
-                   spgemv<IndexType, Bins::blockSize>
-                       <<<binSizes[Bins::id], Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
-                       (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id])
-                   : void())
+               (binSizes[Bins::id] > 0 ?
+                   spgemv<IndexType, Bins::threads, Bins::blockSize>
+                       <<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0),
+                          Bins::blockSize,
+                          0,
+                          streamsWrapper.streams[Bins::id]>>>
+                       (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
+                   : void())
            );
        }
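
The grid dimension above is a plain ceiling division: each block now serves Bins::dispatchRatio rows (presumably blockSize / threads), so a bin of binSize rows needs ceil(binSize / dispatchRatio) blocks. A tiny standalone sketch of that arithmetic, with a hypothetical helper name:

    #include <cstdio>

    // Hypothetical helper mirroring the launch arithmetic above.
    static unsigned gridDimFor(unsigned binSize, unsigned dispatchRatio) {
        return binSize / dispatchRatio + (binSize % dispatchRatio ? 1 : 0);
    }

    int main() {
        // E.g. 100 rows in the 4-thread bin, 32 / 4 = 8 rows per block -> 13 blocks.
        std::printf("%u\n", gridDimFor(100, 32 / 4));
        return 0;
    }
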
@@ -143,10 +164,13 @@ namespace cubool {
            // Empty out buffer
            thrust::fill_n(mOutput.begin(), M, (IndexType) 0);

-           using ConfigType = Config<Bin<32, 1, 32, 0>,
-                                     Bin<64, 32, 64, 1>,
-                                     Bin<128, 64, 128, 2>,
-                                     Bin<256, 128, max, 3>>;
+           using ConfigType = Config<Bin<4, 32, 1, 8, 0>,
+                                     Bin<8, 32, 8, 16, 1>,
+                                     Bin<16, 32, 16, 32, 2>,
+                                     Bin<32, 32, 32, 64, 3>,
+                                     Bin<64, 64, 64, 128, 4>,
+                                     Bin<128, 128, 128, 256, 5>,
+                                     Bin<256, 256, 256, max, 6>>;
            ConfigType config;

            mRowsConfig.resize(M);
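
Reading the widened bin list against the old four-bin config, the parameters appear to follow <threads, blockSize, minRowNnz, maxRowNnz, id>. A hedged sketch of how a row's nnz count would be routed to a bin under that reading (BinDesc and selectBin are hypothetical names; half-open nnz ranges are assumed):

    #include <cstddef>
    #include <cstdio>

    // Assumed reading of Bin<threads, blockSize, minNnz, maxNnz, id>; the real
    // Bin/Config templates live elsewhere in the kernels headers.
    struct BinDesc { size_t threads, blockSize, minNnz, maxNnz, id; };

    static size_t selectBin(const BinDesc* bins, size_t count, size_t nnz) {
        for (size_t b = 0; b < count; ++b)
            if (nnz >= bins[b].minNnz && nnz < bins[b].maxNnz)
                return bins[b].id;
        return count - 1; // fall through to the unbounded last bin
    }

    int main() {
        const size_t max = static_cast<size_t>(-1);
        const BinDesc bins[] = {
            {4, 32, 1, 8, 0},     {8, 32, 8, 16, 1},     {16, 32, 16, 32, 2},  {32, 32, 32, 64, 3},
            {64, 64, 64, 128, 4}, {128, 128, 128, 256, 5}, {256, 256, 256, max, 6},
        };
        std::printf("row with 40 nnz -> bin %zu\n", selectBin(bins, 7, 40)); // bin 3: 32 threads
        return 0;
    }
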