
Commit 75a1236

[Code] Update:
- Fix CMake CUDA build flags
- Fix CPU cached values commit for the matrix proxy class
- Split CUDA CSR matrix build/extract into separate .cpp files
- CUDA CSR matrix empty/resize improvements for storage (zeroDim*)
1 parent ffa276d commit 75a1236

15 files changed: +301 -192 lines changed

cubool/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
@@ -72,6 +72,8 @@ if (CUBOOL_WITH_CUDA)
         sources/cuda/instance.cpp
         sources/cuda/matrix_csr.hpp
         sources/cuda/matrix_csr.cu
+        sources/cuda/matrix_csr_build.cpp
+        sources/cuda/matrix_csr_extract.cpp
         sources/cuda/matrix_csr_ewiseadd.cu
         sources/cuda/matrix_csr_kronecker.cu
         sources/cuda/matrix_csr_multiply.cu
@@ -143,12 +145,14 @@ if (CUBOOL_WITH_CUDA)

     # Settings: https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
     target_compile_options(cubool PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-            -arch=sm_30
+            # todo: fix this flag later -arch=sm_30 ?
+            # todo: can we omit arch flag?
             -gencode=arch=compute_30,code=sm_30
-            -gencode=arch=compute_35,code=sm_35
             -gencode=arch=compute_50,code=sm_50
             -gencode=arch=compute_52,code=sm_52
-            -gencode=arch=compute_52,code=compute_52>)
+            -gencode=arch=compute_60,code=sm_60
+            -gencode=arch=compute_61,code=sm_61
+            -gencode=arch=compute_61,code=compute_61>)

     target_compile_options(cubool PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -use_fast_math -Xptxas -O2>)

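Note on the arch flags above: the old list targeted sm_30/sm_35, which newer CUDA toolkits no longer accept, and the new list adds Pascal targets (sm_60/sm_61). As a rough aid when choosing these values, the sketch below (not part of this commit) queries the compute capability the local GPU reports, using only the standard CUDA runtime API:

// Minimal host-side check (illustrative only): prints the compute capability
// reported by device 0, i.e. the sm_XX target the build should cover.
// Link against cudart.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::fprintf(stderr, "No CUDA device found\n");
        return 1;
    }
    // For example, a GTX 1060 reports 6.1, i.e. compile for sm_61
    std::printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}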
cubool/sources/core/library.cpp

Lines changed: 6 additions & 7 deletions
@@ -179,7 +179,7 @@ namespace cubool {
         logDeviceInfo();
     }

-    MatrixBase *Library::createMatrix(size_t nrows, size_t ncols) {
+    Matrix *Library::createMatrix(size_t nrows, size_t ncols) {
         CHECK_RAISE_ERROR(nrows > 0, InvalidArgument, "Cannot create matrix with zero dimension");
         CHECK_RAISE_ERROR(ncols > 0, InvalidArgument, "Cannot create matrix with zero dimension");

@@ -193,17 +193,16 @@ namespace cubool {
         return m;
     }

-    void Library::releaseMatrix(MatrixBase *matrixBase) {
+    void Library::releaseMatrix(Matrix *matrix) {
         if (mRelaxedRelease && !mBackend) return;

-        auto m = (Matrix*)(matrixBase);
-        CHECK_RAISE_ERROR(mAllocated.find(m) != mAllocated.end(), InvalidArgument, "No such matrix was allocated");
+        CHECK_RAISE_ERROR(mAllocated.find(matrix) != mAllocated.end(), InvalidArgument, "No such matrix was allocated");

         LogStream stream(*getLogger());
-        stream << Logger::Level::Info << "Release Matrix " << m->getDebugMarker() << LogStream::cmt;
+        stream << Logger::Level::Info << "Release Matrix " << matrix->getDebugMarker() << LogStream::cmt;

-        mAllocated.erase(m);
-        delete m;
+        mAllocated.erase(matrix);
+        delete matrix;
     }

     void Library::handleError(const std::exception& error) {

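The header change below mirrors this: createMatrix/releaseMatrix now traffic in the concrete core Matrix type, so releaseMatrix can look the pointer up in mAllocated without the old (Matrix*) cast. A standalone sketch of that typed-registry pattern, with hypothetical types and assuming mAllocated is a set of Matrix pointers (as the find/erase calls suggest), not the cuBool sources:

#include <cstddef>
#include <stdexcept>
#include <unordered_set>

struct Matrix {
    std::size_t nrows = 0, ncols = 0;
    Matrix(std::size_t r, std::size_t c) : nrows(r), ncols(c) {}
};

class Registry {
public:
    Matrix* createMatrix(std::size_t nrows, std::size_t ncols) {
        if (nrows == 0 || ncols == 0)
            throw std::invalid_argument("Cannot create matrix with zero dimension");
        auto* m = new Matrix(nrows, ncols);
        mAllocated.insert(m);
        return m;
    }

    void releaseMatrix(Matrix* matrix) {
        // The lookup works directly on the typed pointer; no downcast needed
        if (mAllocated.find(matrix) == mAllocated.end())
            throw std::invalid_argument("No such matrix was allocated");
        mAllocated.erase(matrix);
        delete matrix;
    }

private:
    std::unordered_set<Matrix*> mAllocated;
};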
cubool/sources/core/library.hpp

Lines changed: 2 additions & 2 deletions
@@ -38,8 +38,8 @@ namespace cubool {
         static void finalize();
         static void validate();
         static void setupLogging(const char* logFileName, cuBool_Hints hints);
-        static class MatrixBase *createMatrix(size_t nrows, size_t ncols);
-        static void releaseMatrix(class MatrixBase *matrixBase);
+        static class Matrix *createMatrix(size_t nrows, size_t ncols);
+        static void releaseMatrix(class Matrix *matrix);
         static void handleError(const std::exception& error);
         static void queryCapabilities(cuBool_DeviceCaps& caps);
         static void logDeviceInfo();

cubool/sources/core/matrix.cpp

Lines changed: 24 additions & 6 deletions
@@ -105,7 +105,8 @@ namespace cubool {
         CHECK_RAISE_ERROR(nrows == this->getNrows(), InvalidArgument, "Result matrix has incompatible size for extracted sub-matrix range");
         CHECK_RAISE_ERROR(ncols == this->getNcols(), InvalidArgument, "Result matrix has incompatible size for extracted sub-matrix range");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->extractSubMatrix(*other->mHnd, i, j, nrows, ncols, false));
@@ -129,13 +130,18 @@ namespace cubool {

         CHECK_RAISE_ERROR(other != nullptr, InvalidArgument, "Passed matrix does not belong to core matrix class");

+        if (this == other)
+            return;
+
         auto M = other->getNrows();
         auto N = other->getNcols();

         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Cloned matrix has incompatible size");
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Cloned matrix has incompatible size");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more
+
         mHnd->clone(*other->mHnd);
     }

@@ -151,6 +157,7 @@ namespace cubool {
         CHECK_RAISE_ERROR(N == this->getNrows(), InvalidArgument, "Transposed matrix has incompatible size");

         this->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->transpose(*other->mHnd, false));
@@ -178,7 +185,8 @@ namespace cubool {
         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Matrix has incompatible size");
         CHECK_RAISE_ERROR(1 == this->getNcols(), InvalidArgument, "Matrix has incompatible size");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->reduce(*other->mHnd, false));
@@ -211,7 +219,13 @@ namespace cubool {
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(T == b->getNrows(), InvalidArgument, "Cannot multiply passed matrices");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+
+        if (accumulate)
+            this->commitCache();
+        else
+            this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->multiply(*a->mHnd, *b->mHnd, accumulate, false));
@@ -245,7 +259,9 @@ namespace cubool {
         CHECK_RAISE_ERROR(M * K == this->getNrows(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(N * T == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+        this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->kronecker(*a->mHnd, *b->mHnd, false));
@@ -280,7 +296,9 @@ namespace cubool {
         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+        this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->eWiseAdd(*a->mHnd, *b->mHnd, false));

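The pattern across all of these hunks is the cached-values fix named in the commit message: commitCache() moves to the operand matrices whose values are actually read, while the destination calls releaseCache() because its old values are about to be overwritten; multiply() is the one exception, committing the destination only when accumulate is set. A standalone sketch of that discipline with hypothetical names (not the cuBool sources):

#include <utility>
#include <vector>

struct CachedMatrix {
    std::vector<std::pair<unsigned, unsigned>> cache;  // pending (row, col) entries on the CPU side
    std::vector<std::pair<unsigned, unsigned>> device; // stand-in for backend storage

    void commitCache() {                  // flush pending values to the backend
        device.insert(device.end(), cache.begin(), cache.end());
        cache.clear();
    }
    void releaseCache() { cache.clear(); } // values of this matrix won't be used any more
};

// result = a * b, optionally accumulated into the existing result values
void multiply(CachedMatrix& result, CachedMatrix& a, CachedMatrix& b, bool accumulate) {
    a.commitCache();          // operands are read: commit them
    b.commitCache();

    if (accumulate)
        result.commitCache(); // old result values participate in the result
    else
        result.releaseCache();// old result values are discarded

    // ... backend multiply on the committed device-side data ...
}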
cubool/sources/cuda/matrix_csr.cu

Lines changed: 31 additions & 125 deletions
@@ -25,6 +25,7 @@
 #include <cuda/matrix_csr.hpp>
 #include <core/error.hpp>
 #include <utils/exclusive_scan.hpp>
+#include <utils/timer.hpp>
 #include <algorithm>

 namespace cubool {
@@ -38,131 +39,6 @@ namespace cubool {
         RAISE_ERROR(NotImplemented, "This function is not supported for this matrix class");
     }

-    void MatrixCsr::build(const index *rows, const index *cols, size_t nvals, bool isSorted, bool noDuplicates) {
-        if (nvals == 0) {
-            mMatrixImpl.zero_dim(); // no content, empty matrix
-            return;
-        }
-
-        thrust::host_vector<index, HostAlloc<index>> rowOffsets;
-        rowOffsets.resize(getNrows() + 1, 0);
-
-        thrust::host_vector<index, HostAlloc<index>> colIndices;
-        colIndices.resize(nvals);
-
-        // Compute nnz per row
-        for (size_t idx = 0; idx < nvals; idx++) {
-            index i = rows[idx];
-            index j = cols[idx];
-
-            CHECK_RAISE_ERROR(i < getNrows() && j < getNcols(), InvalidArgument, "Out of matrix bounds value");
-
-            rowOffsets[i] += 1;
-        }
-
-        // Exclusive scan to eval rows offsets
-        ::cubool::exclusive_scan(rowOffsets.begin(), rowOffsets.end(), 0);
-
-        // Write offsets for cols
-        std::vector<size_t> writeOffsets(getNrows(), 0);
-
-        for (size_t idx = 0; idx < nvals; idx++) {
-            index i = rows[idx];
-            index j = cols[idx];
-
-            colIndices[rowOffsets[i] + writeOffsets[i]] = j;
-            writeOffsets[i] += 1;
-        }
-
-        if (!isSorted) {
-            for (size_t i = 0; i < getNrows(); i++) {
-                auto begin = rowOffsets[i];
-                auto end = rowOffsets[i + 1];
-
-                // Sort col values within row
-                thrust::sort(colIndices.begin() + begin, colIndices.begin() + end, [](const index& a, const index& b) {
-                    return a < b;
-                });
-            }
-        }
-
-        // Reduce duplicated values
-        if (!noDuplicates) {
-            size_t unique = 0;
-            for (size_t i = 0; i < getNrows(); i++) {
-                index prev = std::numeric_limits<index>::max();
-
-                for (size_t k = rowOffsets[i]; k < rowOffsets[i + 1]; k++) {
-                    if (prev != colIndices[k]) {
-                        unique += 1;
-                    }
-
-                    prev = colIndices[k];
-                }
-            }
-
-            thrust::host_vector<index, HostAlloc<index>> rowOffsetsReduced;
-            rowOffsetsReduced.resize(getNrows() + 1, 0);
-
-            thrust::host_vector<index, HostAlloc<index>> colIndicesReduced;
-            colIndicesReduced.reserve(unique);
-
-            for (size_t i = 0; i < getNrows(); i++) {
-                index prev = std::numeric_limits<index>::max();
-
-                for (size_t k = rowOffsets[i]; k < rowOffsets[i + 1]; k++) {
-                    if (prev != colIndices[k]) {
-                        rowOffsetsReduced[i] += 1;
-                        colIndicesReduced.push_back(colIndices[k]);
-                    }
-
-                    prev = colIndices[k];
-                }
-            }
-
-            // Exclusive scan to eval rows offsets
-            ::cubool::exclusive_scan(rowOffsetsReduced.begin(), rowOffsetsReduced.end(), 0);
-
-            // Now result in respective place
-            std::swap(rowOffsets, rowOffsetsReduced);
-            std::swap(colIndices, colIndicesReduced);
-        }
-
-        // Create device buffers and copy data from the cpu side
-        thrust::device_vector<index, DeviceAlloc<index>> rowsDeviceVec = rowOffsets;
-        thrust::device_vector<index, DeviceAlloc<index>> colsDeviceVec = colIndices;
-
-        // Move actual data to the matrix implementation
-        mMatrixImpl = std::move(MatrixImplType(std::move(colsDeviceVec), std::move(rowsDeviceVec), getNrows(), getNcols(), colIndices.size()));
-    }
-
-    void MatrixCsr::extract(index *rows, index *cols, size_t &nvals) {
-        assert(nvals >= getNvals());
-
-        // Set nvals to the exact number of nnz values
-        nvals = getNvals();
-
-        if (nvals > 0) {
-            auto& rowsDeviceVec = mMatrixImpl.m_row_index;
-            auto& colsDeviceVec = mMatrixImpl.m_col_index;
-
-            // Copy data to the host
-            thrust::host_vector<index, HostAlloc<index>> rowsVec = rowsDeviceVec;
-            thrust::host_vector<index, HostAlloc<index>> colsVec = colsDeviceVec;
-
-            // Iterate over csr formatted data
-            size_t idx = 0;
-            for (index i = 0; i < getNrows(); i++) {
-                for (index j = rowsVec[i]; j < rowsVec[i + 1]; j++) {
-                    rows[idx] = i;
-                    cols[idx] = colsVec[j];
-
-                    idx += 1;
-                }
-            }
-        }
-    }
-
     void MatrixCsr::clone(const MatrixBase &otherBase) {
         auto other = dynamic_cast<const MatrixCsr*>(&otherBase);

@@ -190,6 +66,16 @@ namespace cubool {
         }
     }

+    void MatrixCsr::clearAndResizeStorageToDim() const {
+        if (mMatrixImpl.m_vals > 0) {
+            // Release only if have some nnz values
+            mMatrixImpl.zero_dim();
+        }
+
+        // Normally resize if no storage is actually allocated
+        this->resizeStorageToDim();
+    }
+
     index MatrixCsr::getNrows() const {
         return mNrows;
     }
@@ -210,4 +96,24 @@ namespace cubool {
         return mMatrixImpl.m_vals == 0;
     }

+    void MatrixCsr::transferToDevice(const std::vector<index> &rowOffsets, const std::vector<index> &colIndices) {
+        // Create device buffers and copy data from the cpu side
+        thrust::device_vector<index, DeviceAlloc<index>> rowsDeviceVec(rowOffsets.size());
+        thrust::device_vector<index, DeviceAlloc<index>> colsDeviceVec(colIndices.size());
+
+        thrust::copy(rowOffsets.begin(), rowOffsets.end(), rowsDeviceVec.begin());
+        thrust::copy(colIndices.begin(), colIndices.end(), colsDeviceVec.begin());
+
+        // Move actual data to the matrix implementation
+        mMatrixImpl = std::move(MatrixImplType(std::move(colsDeviceVec), std::move(rowsDeviceVec), getNrows(), getNcols(), colIndices.size()));
+    }
+
+    void MatrixCsr::transferFromDevice(std::vector<index> &rowOffsets, std::vector<index> &colIndices) const {
+        rowOffsets.resize(mMatrixImpl.m_row_index.size());
+        colIndices.resize(mMatrixImpl.m_col_index.size());
+
+        thrust::copy(mMatrixImpl.m_row_index.begin(), mMatrixImpl.m_row_index.end(), rowOffsets.begin());
+        thrust::copy(mMatrixImpl.m_col_index.begin(), mMatrixImpl.m_col_index.end(), colIndices.begin());
+    }
+
 }

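With build() and extract() removed from the .cu translation unit, the only device-side pieces left here are the new transferToDevice/transferFromDevice helpers; the host-only COO-to-CSR assembly can move into the new matrix_csr_build.cpp. The sketch below restates that assembly step (per-row counts, exclusive scan, scatter) in plain C++, mirroring the removed build() above; the function name and exact layout of the new file are illustrative only:

#include <cstddef>
#include <cstdint>
#include <vector>

using index = std::uint32_t;

// Host-side COO -> CSR assembly; the result would then be handed to
// MatrixCsr::transferToDevice(). Sketch only, not the actual file contents.
void buildCsrOnHost(const index* rows, const index* cols, std::size_t nvals, std::size_t nrows,
                    std::vector<index>& rowOffsets, std::vector<index>& colIndices) {
    rowOffsets.assign(nrows + 1, 0);
    colIndices.resize(nvals);

    // Count nnz per row
    for (std::size_t k = 0; k < nvals; k++)
        rowOffsets[rows[k]] += 1;

    // Exclusive scan turns the counts into row offsets
    index sum = 0;
    for (std::size_t i = 0; i <= nrows; i++) {
        index tmp = rowOffsets[i];
        rowOffsets[i] = sum;
        sum += tmp;
    }

    // Scatter column indices into their rows
    std::vector<index> writePos(nrows, 0);
    for (std::size_t k = 0; k < nvals; k++) {
        index i = rows[k];
        colIndices[rowOffsets[i] + writePos[i]] = cols[k];
        writePos[i] += 1;
    }
    // Sorting within rows and duplicate removal would follow here when needed,
    // as in the removed MatrixCsr::build() above
}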
cubool/sources/cuda/matrix_csr.hpp

Lines changed: 4 additions & 0 deletions
@@ -63,11 +63,15 @@ namespace cubool {

     private:
         void resizeStorageToDim() const;
+        void clearAndResizeStorageToDim() const;
         bool isStorageEmpty() const;
         bool isMatrixEmpty() const;
+        void transferToDevice(const std::vector<index> &rowOffsets, const std::vector<index> &colIndices);
+        void transferFromDevice(std::vector<index> &rowOffsets, std::vector<index> &colIndices) const;

         // Uses nsparse csr matrix implementation as a backend
         mutable MatrixImplType mMatrixImpl;
+
         size_t mNrows = 0;
         size_t mNcols = 0;
         Instance& mInstance;