From aa4ff99cc6cdcffc16f200316f993972161d7aa0 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 15 May 2024 15:31:04 +0100
Subject: [PATCH 1/3] numBlocks instead of DIM2BLOCKS

---
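numBlocks() is a drop-in replacement for the DIM2BLOCKS macro at kernel
launch sites. The sketch below is a minimal illustration; k_scale, d_data
and n are hypothetical names, not identifiers introduced by this patch:

    // Hypothetical kernel, for illustration only (not part of this patch).
    template<typename T>
    __global__ void k_scale(T *data, T alpha, size_t n) {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;  // global thread index
        if (i < n) data[i] *= alpha;                       // one thread per element
    }

    // numBlocks(n) blocks of THREADS_PER_BLOCK threads cover all n elements.
    k_scale<<<numBlocks(n), THREADS_PER_BLOCK>>>(d_data, 2.0, n);

Unlike the macro, the constexpr function evaluates its argument once,
type-checks it, and can be used in constant expressions.

reshape() (also added here) rewrites only the dimension metadata; the
device buffer is untouched. Assuming the usual defaults for the trailing
constructor arguments:

    DTensor<double> x(12, 1, 1, true);  // 12 x 1 x 1, zero-initialised
    x.reshape(3, 4);                    // same 12 elements, now viewed as 3 x 4 x 1
    x.reshape(5, 5);                    // throws std::invalid_argument (25 != 12)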
 include/tensor.cuh | 87 ++++++++++++++++++++++++++++------------------
 1 file changed, 54 insertions(+), 33 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index a154d63..f0d0ecb 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -17,7 +17,6 @@
  */
 #define DEFAULT_FPX double
 #define THREADS_PER_BLOCK 512
-#define DIM2BLOCKS(n) ((n) / THREADS_PER_BLOCK + ((n) % THREADS_PER_BLOCK != 0))
 #if (__cplusplus >= 201703L) ///< if c++17 or above
 #define TEMPLATE_WITH_TYPE_T template<typename T>
 #else
@@ -29,6 +28,11 @@
 #define TEMPLATE_CONSTRAINT_REQUIRES_FPX
 #endif
 
+constexpr size_t numBlocks(size_t n, size_t threads_pre_block=THREADS_PER_BLOCK) {
+    return (n / threads_pre_block + (n % threads_pre_block != 0));
+}
+
+
 /**
  * Check for errors when calling GPU functions
  */
@@ -338,6 +342,8 @@ public:
      */
     void addAB(const DTensor &A, const DTensor &B, T alpha = 1, T beta = 0);
 
+    void reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats = 1);
+
     /* ------------- OPERATORS ------------- */
 
     DTensor &operator=(const DTensor &other);
@@ -384,6 +390,21 @@ public:
 
 }; /* END OF DTENSOR */
 
+template<typename T>
+void DTensor<T>::reshape(size_t newNumRows, size_t newNumCols, size_t newNumMats) {
+    size_t newNumElements = newNumRows * newNumCols * newNumMats;
+    if (numEl() != newNumElements) {
+        char errMessage[256];
+        sprintf(errMessage,
+                "DTensor[%lu x %lu x %lu] with %lu elements cannot be reshaped into DTensor[%lu x %lu x %lu] (%lu elements)",
+                numRows(), numCols(), numMats(), numEl(), newNumRows, newNumCols, newNumMats, newNumElements);
+        throw std::invalid_argument(errMessage);
+    }
+    m_numRows = newNumRows;
+    m_numCols = newNumCols;
+    m_numMats = newNumMats;
+}
+
 template<typename T>
 DTensor<T>::DTensor(size_t m, size_t n, size_t k, bool zero) {
     m_numRows = m;
@@ -623,7 +644,7 @@ template<>
 inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
     double alpha = scalar;
     gpuErrChk(
-        cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -641,7 +662,7 @@ template<>
 inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
     float alpha = scalar;
     gpuErrChk(
-        cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
    return *this;
 }
 
@@ -649,8 +670,8 @@ template<>
 inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
     const double alpha = 1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -658,8 +679,8 @@ template<>
 inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
     const float alpha = 1.;
     gpuErrChk(
-        cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -675,8 +696,8 @@ template<>
 inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
     const double alpha = -1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -987,7 +1008,7 @@ public:
         for (size_t i = 0; i < m_rank->numMats(); i++) {
             DTensor<T> Si(*m_S, 2, i, i);
             DTensor<unsigned int> rankI(*m_rank, 2, i, i);
-            k_countNonzeroSingularValues<<<DIM2BLOCKS(numElS), THREADS_PER_BLOCK>>>(Si.raw(), numElS,
+            k_countNonzeroSingularValues<<<numBlocks(numElS), THREADS_PER_BLOCK>>>(Si.raw(), numElS,
                                                                                     rankI.raw(), epsilon);
         }
         return *m_rank;
     }
@@ -1021,17 +1042,17 @@ inline bool Svd<double>::factorise() {
         if (m_computeU) Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
 }
@@ -1051,17 +1072,17 @@ inline bool Svd<float>::factorise() {
         if (m_computeU) Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
 }
@@ -1304,7 +1325,7 @@ public:
      * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices
      * @param factorised whether A is the original matrices or the factorised ones (default=original matrices)
      */
-    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
+    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised = false) : m_factorisationDone(factorised) {
         if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
         m_matrix = &A;
         m_numRows = A.numRows();

From 8a3cf98a9172d6c21dbf7b156de74fef819214fa Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 15 May 2024 16:22:51 +0100
Subject: [PATCH 2/3] docs for numBlocks

---
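The new comment pins down the semantics: numBlocks(n) is the ceiling of
n divided by the number of threads per block, computed in integer
arithmetic. Because the function is constexpr, that contract can also be
checked at compile time; the assertions below are illustrative only and
are not added by this patch:

    static_assert(numBlocks(0) == 0, "no work, no blocks");
    static_assert(numBlocks(512) == 1, "exactly one full block");
    static_assert(numBlocks(513) == 2, "remainder spills into a second block");
    static_assert(numBlocks(100, 32) == 4, "4 blocks of 32 threads cover 100 tasks");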
 include/tensor.cuh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index f0d0ecb..5641a28 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -28,6 +28,14 @@
 #define TEMPLATE_CONSTRAINT_REQUIRES_FPX
 #endif
 
+/**
+ * Determines the number of blocks needed for a given number of tasks, n,
+ * and number of threads per block
+ *
+ * @param n problem size
+ * @param threads_pre_block threads per block (defaults to THREADS_PER_BLOCK)
+ * @return number of blocks
+ */
 constexpr size_t numBlocks(size_t n, size_t threads_pre_block=THREADS_PER_BLOCK) {
     return (n / threads_pre_block + (n % threads_pre_block != 0));
 }

From 546f54bb99cfeaca92e0f765f597cfbe1e59624b Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 22 May 2024 12:56:11 +0100
Subject: [PATCH 3/3] fix typo

---
 include/tensor.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index 5641a28..aeb9c60 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -33,11 +33,11 @@
  * and number of threads per block
  *
  * @param n problem size
- * @param threads_pre_block threads per block (defaults to THREADS_PER_BLOCK)
+ * @param threads_per_block threads per block (defaults to THREADS_PER_BLOCK)
  * @return number of blocks
  */
-constexpr size_t numBlocks(size_t n, size_t threads_pre_block=THREADS_PER_BLOCK) {
-    return (n / threads_pre_block + (n % threads_pre_block != 0));
-}
+constexpr size_t numBlocks(size_t n, size_t threads_per_block=THREADS_PER_BLOCK) {
+    return (n / threads_per_block + (n % threads_per_block != 0));
+}