
Commit 75a1236

[Code] Update:
- Fix CMake CUDA build flags
- Fix CPU cached values commit for the matrix proxy class
- Split CUDA CSR matrix build/extract into separate .cpp files
- CUDA CSR matrix empty/resize improvements for storage (zeroDim*)
1 parent ffa276d commit 75a1236

15 files changed: +301 -192 lines changed

cubool/CMakeLists.txt

Lines changed: 7 additions & 3 deletions
@@ -72,6 +72,8 @@ if (CUBOOL_WITH_CUDA)
         sources/cuda/instance.cpp
         sources/cuda/matrix_csr.hpp
         sources/cuda/matrix_csr.cu
+        sources/cuda/matrix_csr_build.cpp
+        sources/cuda/matrix_csr_extract.cpp
         sources/cuda/matrix_csr_ewiseadd.cu
         sources/cuda/matrix_csr_kronecker.cu
         sources/cuda/matrix_csr_multiply.cu
@@ -143,12 +145,14 @@ if (CUBOOL_WITH_CUDA)

     # Settings: https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
     target_compile_options(cubool PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-            -arch=sm_30
+            # todo: fix this flag later -arch=sm_30 ?
+            # todo: can we omit arch flag?
             -gencode=arch=compute_30,code=sm_30
-            -gencode=arch=compute_35,code=sm_35
             -gencode=arch=compute_50,code=sm_50
             -gencode=arch=compute_52,code=sm_52
-            -gencode=arch=compute_52,code=compute_52>)
+            -gencode=arch=compute_60,code=sm_60
+            -gencode=arch=compute_61,code=sm_61
+            -gencode=arch=compute_61,code=compute_61>)

     target_compile_options(cubool PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -use_fast_math -Xptxas -O2>)

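Note on the arch flags above: the old list targeted sm_30/sm_35, which newer CUDA toolkits no longer accept, and the new list adds Pascal targets (sm_60/sm_61). As a rough aid when choosing these values, the sketch below (not part of this commit) queries the compute capability the local GPU reports, using only the standard CUDA runtime API:

// Minimal host-side check (illustrative only): prints the compute capability
// reported by device 0, i.e. the sm_XX target the build should cover.
// Link against cudart.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop{};
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        std::fprintf(stderr, "No CUDA device found\n");
        return 1;
    }
    // For example, a GTX 1060 reports 6.1, i.e. compile for sm_61
    std::printf("Device 0: %s, compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}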
cubool/sources/core/library.cpp

Lines changed: 6 additions & 7 deletions
@@ -179,7 +179,7 @@ namespace cubool {
         logDeviceInfo();
     }

-    MatrixBase *Library::createMatrix(size_t nrows, size_t ncols) {
+    Matrix *Library::createMatrix(size_t nrows, size_t ncols) {
         CHECK_RAISE_ERROR(nrows > 0, InvalidArgument, "Cannot create matrix with zero dimension");
         CHECK_RAISE_ERROR(ncols > 0, InvalidArgument, "Cannot create matrix with zero dimension");

@@ -193,17 +193,16 @@ namespace cubool {
         return m;
     }

-    void Library::releaseMatrix(MatrixBase *matrixBase) {
+    void Library::releaseMatrix(Matrix *matrix) {
         if (mRelaxedRelease && !mBackend) return;

-        auto m = (Matrix*)(matrixBase);
-        CHECK_RAISE_ERROR(mAllocated.find(m) != mAllocated.end(), InvalidArgument, "No such matrix was allocated");
+        CHECK_RAISE_ERROR(mAllocated.find(matrix) != mAllocated.end(), InvalidArgument, "No such matrix was allocated");

         LogStream stream(*getLogger());
-        stream << Logger::Level::Info << "Release Matrix " << m->getDebugMarker() << LogStream::cmt;
+        stream << Logger::Level::Info << "Release Matrix " << matrix->getDebugMarker() << LogStream::cmt;

-        mAllocated.erase(m);
-        delete m;
+        mAllocated.erase(matrix);
+        delete matrix;
     }

     void Library::handleError(const std::exception& error) {

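The header change below mirrors this: createMatrix/releaseMatrix now traffic in the concrete core Matrix type, so releaseMatrix can look the pointer up in mAllocated without the old (Matrix*) cast. A standalone sketch of that typed-registry pattern, with hypothetical types and assuming mAllocated is a set of Matrix pointers (as the find/erase calls suggest), not the cuBool sources:

#include <cstddef>
#include <stdexcept>
#include <unordered_set>

struct Matrix {
    std::size_t nrows = 0, ncols = 0;
    Matrix(std::size_t r, std::size_t c) : nrows(r), ncols(c) {}
};

class Registry {
public:
    Matrix* createMatrix(std::size_t nrows, std::size_t ncols) {
        if (nrows == 0 || ncols == 0)
            throw std::invalid_argument("Cannot create matrix with zero dimension");
        auto* m = new Matrix(nrows, ncols);
        mAllocated.insert(m);
        return m;
    }

    void releaseMatrix(Matrix* matrix) {
        // The lookup works directly on the typed pointer; no downcast needed
        if (mAllocated.find(matrix) == mAllocated.end())
            throw std::invalid_argument("No such matrix was allocated");
        mAllocated.erase(matrix);
        delete matrix;
    }

private:
    std::unordered_set<Matrix*> mAllocated;
};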
cubool/sources/core/library.hpp

Lines changed: 2 additions & 2 deletions
@@ -38,8 +38,8 @@ namespace cubool {
         static void finalize();
         static void validate();
         static void setupLogging(const char* logFileName, cuBool_Hints hints);
-        static class MatrixBase *createMatrix(size_t nrows, size_t ncols);
-        static void releaseMatrix(class MatrixBase *matrixBase);
+        static class Matrix *createMatrix(size_t nrows, size_t ncols);
+        static void releaseMatrix(class Matrix *matrix);
         static void handleError(const std::exception& error);
         static void queryCapabilities(cuBool_DeviceCaps& caps);
         static void logDeviceInfo();

cubool/sources/core/matrix.cpp

Lines changed: 24 additions & 6 deletions
@@ -105,7 +105,8 @@ namespace cubool {
         CHECK_RAISE_ERROR(nrows == this->getNrows(), InvalidArgument, "Result matrix has incompatible size for extracted sub-matrix range");
         CHECK_RAISE_ERROR(ncols == this->getNcols(), InvalidArgument, "Result matrix has incompatible size for extracted sub-matrix range");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->extractSubMatrix(*other->mHnd, i, j, nrows, ncols, false));
@@ -129,13 +130,18 @@ namespace cubool {

         CHECK_RAISE_ERROR(other != nullptr, InvalidArgument, "Passed matrix does not belong to core matrix class");

+        if (this == other)
+            return;
+
         auto M = other->getNrows();
         auto N = other->getNcols();

         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Cloned matrix has incompatible size");
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Cloned matrix has incompatible size");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more
+
         mHnd->clone(*other->mHnd);
     }

@@ -151,6 +157,7 @@ namespace cubool {
         CHECK_RAISE_ERROR(N == this->getNrows(), InvalidArgument, "Transposed matrix has incompatible size");

         this->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->transpose(*other->mHnd, false));
@@ -178,7 +185,8 @@ namespace cubool {
         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Matrix has incompatible size");
         CHECK_RAISE_ERROR(1 == this->getNcols(), InvalidArgument, "Matrix has incompatible size");

-        this->commitCache();
+        other->commitCache();
+        this->releaseCache(); // Values of this matrix won't be used any more

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->reduce(*other->mHnd, false));
@@ -211,7 +219,13 @@ namespace cubool {
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(T == b->getNrows(), InvalidArgument, "Cannot multiply passed matrices");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+
+        if (accumulate)
+            this->commitCache();
+        else
+            this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->multiply(*a->mHnd, *b->mHnd, accumulate, false));
@@ -245,7 +259,9 @@ namespace cubool {
         CHECK_RAISE_ERROR(M * K == this->getNrows(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(N * T == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+        this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->kronecker(*a->mHnd, *b->mHnd, false));
@@ -280,7 +296,9 @@ namespace cubool {
         CHECK_RAISE_ERROR(M == this->getNrows(), InvalidArgument, "Matrix has incompatible size for operation result");
         CHECK_RAISE_ERROR(N == this->getNcols(), InvalidArgument, "Matrix has incompatible size for operation result");

-        this->commitCache();
+        a->commitCache();
+        b->commitCache();
+        this->releaseCache();

         if (checkTime) {
             TIMER_ACTION(timer, mHnd->eWiseAdd(*a->mHnd, *b->mHnd, false));

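The pattern across all of these hunks is the cached-values fix named in the commit message: commitCache() moves to the operand matrices whose values are actually read, while the destination calls releaseCache() because its old values are about to be overwritten; multiply() is the one exception, committing the destination only when accumulate is set. A standalone sketch of that discipline with hypothetical names (not the cuBool sources):

#include <utility>
#include <vector>

struct CachedMatrix {
    std::vector<std::pair<unsigned, unsigned>> cache;  // pending (row, col) entries on the CPU side
    std::vector<std::pair<unsigned, unsigned>> device; // stand-in for backend storage

    void commitCache() {                  // flush pending values to the backend
        device.insert(device.end(), cache.begin(), cache.end());
        cache.clear();
    }
    void releaseCache() { cache.clear(); } // values of this matrix won't be used any more
};

// result = a * b, optionally accumulated into the existing result values
void multiply(CachedMatrix& result, CachedMatrix& a, CachedMatrix& b, bool accumulate) {
    a.commitCache();          // operands are read: commit them
    b.commitCache();

    if (accumulate)
        result.commitCache(); // old result values participate in the result
    else
        result.releaseCache();// old result values are discarded

    // ... backend multiply on the committed device-side data ...
}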
cubool/sources/cuda/matrix_csr.cu

Lines changed: 31 additions & 125 deletions
@@ -25,6 +25,7 @@
 #include <cuda/matrix_csr.hpp>
 #include <core/error.hpp>
 #include <utils/exclusive_scan.hpp>
+#include <utils/timer.hpp>
 #include <algorithm>

 namespace cubool {
@@ -38,131 +39,6 @@ namespace cubool {
         RAISE_ERROR(NotImplemented, "This function is not supported for this matrix class");
     }

-    void MatrixCsr::build(const index *rows, const index *cols, size_t nvals, bool isSorted, bool noDuplicates) {
-        if (nvals == 0) {
-            mMatrixImpl.zero_dim(); // no content, empty matrix
-            return;
-        }
-
-        thrust::host_vector<index, HostAlloc<index>> rowOffsets;
-        rowOffsets.resize(getNrows() + 1, 0);
-
-        thrust::host_vector<index, HostAlloc<index>> colIndices;
-        colIndices.resize(nvals);
-
-        // Compute nnz per row
-        for (size_t idx = 0; idx < nvals; idx++) {
-            index i = rows[idx];
-            index j = cols[idx];
-
-            CHECK_RAISE_ERROR(i < getNrows() && j < getNcols(), InvalidArgument, "Out of matrix bounds value");
-
-            rowOffsets[i] += 1;
-        }
-
-        // Exclusive scan to eval rows offsets
-        ::cubool::exclusive_scan(rowOffsets.begin(), rowOffsets.end(), 0);
-
-        // Write offsets for cols
-        std::vector<size_t> writeOffsets(getNrows(), 0);
-
-        for (size_t idx = 0; idx < nvals; idx++) {
-            index i = rows[idx];
-            index j = cols[idx];
-
-            colIndices[rowOffsets[i] + writeOffsets[i]] = j;
-            writeOffsets[i] += 1;
-        }
-
-        if (!isSorted) {
-            for (size_t i = 0; i < getNrows(); i++) {
-                auto begin = rowOffsets[i];
-                auto end = rowOffsets[i + 1];
-
-                // Sort col values within row
-                thrust::sort(colIndices.begin() + begin, colIndices.begin() + end, [](const index& a, const index& b) {
-                    return a < b;
-                });
-            }
-        }
-
-        // Reduce duplicated values
-        if (!noDuplicates) {
-            size_t unique = 0;
-            for (size_t i = 0; i < getNrows(); i++) {
-                index prev = std::numeric_limits<index>::max();
-
-                for (size_t k = rowOffsets[i]; k < rowOffsets[i + 1]; k++) {
-                    if (prev != colIndices[k]) {
-                        unique += 1;
-                    }
-
-                    prev = colIndices[k];
-                }
-            }
-
-            thrust::host_vector<index, HostAlloc<index>> rowOffsetsReduced;
-            rowOffsetsReduced.resize(getNrows() + 1, 0);
-
-            thrust::host_vector<index, HostAlloc<index>> colIndicesReduced;
-            colIndicesReduced.reserve(unique);
-
-            for (size_t i = 0; i < getNrows(); i++) {
-                index prev = std::numeric_limits<index>::max();
-
-                for (size_t k = rowOffsets[i]; k < rowOffsets[i + 1]; k++) {
-                    if (prev != colIndices[k]) {
-                        rowOffsetsReduced[i] += 1;
-                        colIndicesReduced.push_back(colIndices[k]);
-                    }
-
-                    prev = colIndices[k];
-                }
-            }
-
-            // Exclusive scan to eval rows offsets
-            ::cubool::exclusive_scan(rowOffsetsReduced.begin(), rowOffsetsReduced.end(), 0);
-
-            // Now result in respective place
-            std::swap(rowOffsets, rowOffsetsReduced);
-            std::swap(colIndices, colIndicesReduced);
-        }
-
-        // Create device buffers and copy data from the cpu side
-        thrust::device_vector<index, DeviceAlloc<index>> rowsDeviceVec = rowOffsets;
-        thrust::device_vector<index, DeviceAlloc<index>> colsDeviceVec = colIndices;
-
-        // Move actual data to the matrix implementation
-        mMatrixImpl = std::move(MatrixImplType(std::move(colsDeviceVec), std::move(rowsDeviceVec), getNrows(), getNcols(), colIndices.size()));
-    }
-
-    void MatrixCsr::extract(index *rows, index *cols, size_t &nvals) {
-        assert(nvals >= getNvals());
-
-        // Set nvals to the exact number of nnz values
-        nvals = getNvals();
-
-        if (nvals > 0) {
-            auto& rowsDeviceVec = mMatrixImpl.m_row_index;
-            auto& colsDeviceVec = mMatrixImpl.m_col_index;
-
-            // Copy data to the host
-            thrust::host_vector<index, HostAlloc<index>> rowsVec = rowsDeviceVec;
-            thrust::host_vector<index, HostAlloc<index>> colsVec = colsDeviceVec;
-
-            // Iterate over csr formatted data
-            size_t idx = 0;
-            for (index i = 0; i < getNrows(); i++) {
-                for (index j = rowsVec[i]; j < rowsVec[i + 1]; j++) {
-                    rows[idx] = i;
-                    cols[idx] = colsVec[j];
-
-                    idx += 1;
-                }
-            }
-        }
-    }
-
     void MatrixCsr::clone(const MatrixBase &otherBase) {
         auto other = dynamic_cast<const MatrixCsr*>(&otherBase);

@@ -190,6 +66,16 @@ namespace cubool {
         }
     }

+    void MatrixCsr::clearAndResizeStorageToDim() const {
+        if (mMatrixImpl.m_vals > 0) {
+            // Release only if have some nnz values
+            mMatrixImpl.zero_dim();
+        }
+
+        // Normally resize if no storage is actually allocated
+        this->resizeStorageToDim();
+    }
+
     index MatrixCsr::getNrows() const {
         return mNrows;
     }
@@ -210,4 +96,24 @@ namespace cubool {
         return mMatrixImpl.m_vals == 0;
     }

+    void MatrixCsr::transferToDevice(const std::vector<index> &rowOffsets, const std::vector<index> &colIndices) {
+        // Create device buffers and copy data from the cpu side
+        thrust::device_vector<index, DeviceAlloc<index>> rowsDeviceVec(rowOffsets.size());
+        thrust::device_vector<index, DeviceAlloc<index>> colsDeviceVec(colIndices.size());
+
+        thrust::copy(rowOffsets.begin(), rowOffsets.end(), rowsDeviceVec.begin());
+        thrust::copy(colIndices.begin(), colIndices.end(), colsDeviceVec.begin());
+
+        // Move actual data to the matrix implementation
+        mMatrixImpl = std::move(MatrixImplType(std::move(colsDeviceVec), std::move(rowsDeviceVec), getNrows(), getNcols(), colIndices.size()));
+    }
+
+    void MatrixCsr::transferFromDevice(std::vector<index> &rowOffsets, std::vector<index> &colIndices) const {
+        rowOffsets.resize(mMatrixImpl.m_row_index.size());
+        colIndices.resize(mMatrixImpl.m_col_index.size());
+
+        thrust::copy(mMatrixImpl.m_row_index.begin(), mMatrixImpl.m_row_index.end(), rowOffsets.begin());
+        thrust::copy(mMatrixImpl.m_col_index.begin(), mMatrixImpl.m_col_index.end(), colIndices.begin());
+    }
+
 }

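With build() and extract() removed from the .cu translation unit, the only device-side pieces left here are the new transferToDevice/transferFromDevice helpers; the host-only COO-to-CSR assembly can move into the new matrix_csr_build.cpp. The sketch below restates that assembly step (per-row counts, exclusive scan, scatter) in plain C++, mirroring the removed build() above; the function name and exact layout of the new file are illustrative only:

#include <cstddef>
#include <cstdint>
#include <vector>

using index = std::uint32_t;

// Host-side COO -> CSR assembly; the result would then be handed to
// MatrixCsr::transferToDevice(). Sketch only, not the actual file contents.
void buildCsrOnHost(const index* rows, const index* cols, std::size_t nvals, std::size_t nrows,
                    std::vector<index>& rowOffsets, std::vector<index>& colIndices) {
    rowOffsets.assign(nrows + 1, 0);
    colIndices.resize(nvals);

    // Count nnz per row
    for (std::size_t k = 0; k < nvals; k++)
        rowOffsets[rows[k]] += 1;

    // Exclusive scan turns the counts into row offsets
    index sum = 0;
    for (std::size_t i = 0; i <= nrows; i++) {
        index tmp = rowOffsets[i];
        rowOffsets[i] = sum;
        sum += tmp;
    }

    // Scatter column indices into their rows
    std::vector<index> writePos(nrows, 0);
    for (std::size_t k = 0; k < nvals; k++) {
        index i = rows[k];
        colIndices[rowOffsets[i] + writePos[i]] = cols[k];
        writePos[i] += 1;
    }
    // Sorting within rows and duplicate removal would follow here when needed,
    // as in the removed MatrixCsr::build() above
}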
cubool/sources/cuda/matrix_csr.hpp

Lines changed: 4 additions & 0 deletions
@@ -63,11 +63,15 @@ namespace cubool {

     private:
         void resizeStorageToDim() const;
+        void clearAndResizeStorageToDim() const;
         bool isStorageEmpty() const;
         bool isMatrixEmpty() const;
+        void transferToDevice(const std::vector<index> &rowOffsets, const std::vector<index> &colIndices);
+        void transferFromDevice(std::vector<index> &rowOffsets, std::vector<index> &colIndices) const;

         // Uses nsparse csr matrix implementation as a backend
         mutable MatrixImplType mMatrixImpl;
+
         size_t mNrows = 0;
         size_t mNcols = 0;
         Instance& mInstance;