[Project] Add algo papers info

EgorOrachyov · EgorOrachyov · commit 1214391d66a6 · 2021-05-08T16:48:40.000+03:00
diff --git a/README.md b/README.md
@@ -333,6 +333,22 @@ cuBool
 }
 ```
 
+## Algorithms
+
+This section lists the related papers, articles and links that
+were used as the algorithmic foundation for the implementation of sparse linear
+boolean algebra operations (sparse matrix-matrix multiplication, sparse matrix-vector
+multiplication, sparse vector-matrix multiplication, matrix-matrix element-wise addition, etc.):
+
+- High-performance and Memory-saving Sparse General Matrix-Matrix Multiplication for NVIDIA Pascal GPU, 
+Yusuke Nagasaka, Akira Nukada, Satoshi Matsuoka
+- GPU Merge Path - A GPU Merging Algorithm, 
+Oded Green, Robert McColl, David A. Bader
+- Efficient Sparse Matrix-Vector Multiplication on GPUs using the CSR Storage Format, 
+Joseph L. Greathouse, Mayank Daga
+- Atomic Reduction Based Sparse Matrix-Transpose Vector Multiplication on GPUs, 
+Yuan Tao, Yangdong Deng, Shuai Mu, Mingfa Zhu, Limin Xiao, Li Ruan, Zhibin Huang
+
 ## License
 
 This project is licensed under MIT License. License text can be found in the 
diff --git a/cubool/CMakeLists.txt b/cubool/CMakeLists.txt
@@ -140,7 +140,7 @@ if (CUBOOL_WITH_CUDA)
         sources/cuda/kernels/slow_sort.cuh
         sources/cuda/kernels/bin_search.cuh
         sources/cuda/kernels/spgemv.cuh
-        sources/cuda/kernels/spgemtv.cuh
+            sources/cuda/kernels/spgemv_t.cuh
         sources/cuda/kernels/spewiseadd.cuh
         sources/cuda/kernels/sptranspose.cuh
         sources/cuda/kernels/sptranspose2.cuh
diff --git a/cubool/sources/cuda/cuda_vector_vxm.cu b/cubool/sources/cuda/cuda_vector_vxm.cu
@@ -24,7 +24,7 @@
 
 #include <cuda/cuda_vector.hpp>
 #include <cuda/cuda_matrix.hpp>
-#include <cuda/kernels/spgemtv.cuh>
+#include <cuda/kernels/spgemv_t.cuh>
 #include <core/error.hpp>
 #include <cassert>
 
@@ -42,7 +42,7 @@ namespace cubool {
 
         m->resizeStorageToDim();
 
-        kernels::SpGEMtV<index, DeviceAlloc<index>> functor;
+        kernels::SpGEMVT<index, DeviceAlloc<index>> functor;
         auto result = functor(v->mVectorImpl, m->mMatrixImpl);
 
         mVectorImpl = std::move(result);
diff --git a/cubool/sources/cuda/kernels/spgemv.cuh b/cubool/sources/cuda/kernels/spgemv.cuh
@@ -111,15 +111,11 @@ namespace cubool {
                           thrust::device_ptr<const IndexType> rowConfig) {   // Rows to process for each bin)
 
                 EXPAND_SIDE_EFFECTS(
-                        (binSizes[Bins::id] > 0 ?
-                         __spgemv<IndexType, Bins::threads, Bins::blockSize>
-                         <<<binSizes[Bins::id] / Bins::dispatchRatio +
-                            (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0),
-                         Bins::blockSize,
-                         0,
-                         streamsWrapper.streams[Bins::id]>>>
-                                 (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
-                                                : void())
+                    (binSizes[Bins::id] > 0 ?
+                    __spgemv<IndexType, Bins::threads, Bins::blockSize>
+                    <<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
+                    (rowOffsets, colIndices, v, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
+                    : void())
                 );
             }
 
diff --git a/cubool/sources/cuda/kernels/spgemv_t.cuh b/cubool/sources/cuda/kernels/spgemv_t.cuh
@@ -35,11 +35,11 @@ namespace cubool {
     namespace kernels {
 
         template<typename IndexType, size_t threads, size_t blockSize>
-        __global__ void __spgemtv(thrust::device_ptr<const IndexType> rowOffsets,  // Input csr matrix rows
-                                  thrust::device_ptr<const IndexType> colIndices,  // Input csr matrix col indices
-                                  thrust::device_ptr<IndexType> x,                 // Output dense x vector (x = M*v)
-                                  thrust::device_ptr<const IndexType> rowConfig,   // Rows to process for each bin
-                                  IndexType rowsCount) {                           // Num of rows to process
+        __global__ void __spgemv_t(thrust::device_ptr<const IndexType> rowOffsets,  // Input csr matrix rows
+                                   thrust::device_ptr<const IndexType> colIndices,  // Input csr matrix col indices
+                                   thrust::device_ptr<IndexType> x,                 // Output dense x vector (x = M*v)
+                                   thrust::device_ptr<const IndexType> rowConfig,   // Rows to process for each bin
+                                   IndexType rowsCount) {                           // Num of rows to process
             // Split block into number of groups of size `threads`.
             // Each group process its own row.
 
@@ -62,7 +62,7 @@ namespace cubool {
         }
 
         template<typename IndexType, typename AllocType>
-        struct SpGEMtV {
+        struct SpGEMVT {
             template<typename T>
             using ContainerType = thrust::device_vector<T, typename AllocType::template rebind<T>::other>;
             using MatrixType = nsparse::matrix<bool, IndexType, AllocType>;
@@ -78,15 +78,11 @@ namespace cubool {
                           thrust::device_ptr<const IndexType> rowConfig) {   // Rows to process for each bin)
 
                 EXPAND_SIDE_EFFECTS(
-                        (binSizes[Bins::id] > 0 ?
-                         __spgemtv<IndexType, Bins::threads, Bins::blockSize>
-                         <<<binSizes[Bins::id] / Bins::dispatchRatio +
-                            (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0),
-                         Bins::blockSize,
-                         0,
-                         streamsWrapper.streams[Bins::id]>>>
-                                 (rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
-                                                : void())
+                    (binSizes[Bins::id] > 0 ?
+                    __spgemv_t<IndexType, Bins::threads, Bins::blockSize>
+                    <<<binSizes[Bins::id] / Bins::dispatchRatio + (binSizes[Bins::id] % Bins::dispatchRatio ? 1 : 0), Bins::blockSize, 0, streamsWrapper.streams[Bins::id]>>>
+                    (rowOffsets, colIndices, x, rowConfig + binOffset[Bins::id], binSizes[Bins::id])
+                    : void())
                 );
             }