diff --git a/include/tensor.cuh b/include/tensor.cuh
index b2cfaf9..cf7003c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -548,7 +548,7 @@ inline bool DTensor<T>::upload(const std::vector<T> &vec, StorageMode mode) {
     size_t size = vec.size();
     size_t thisSize = m_numRows * m_numCols * m_numMats;
     // make sure vec is of right size
-    if (size != thisSize) throw std::invalid_argument("vec has wrong size");
+    if (size != thisSize) throw std::invalid_argument("[upload] vec has wrong size");
     std::vector<T> vecCm(thisSize);
     if (mode == StorageMode::rowMajor) {
         rm2cm(vec, vecCm);
@@ -611,7 +611,7 @@ inline DTensor<T> DTensor<T>::tr() const {
 template<typename T>
 inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
     if (elsewhere.numEl() < numEl()) {
-        throw std::invalid_argument("tensor does not fit into destination");
+        throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination");
     }
     gpuErrChk(cudaMemcpy(elsewhere.raw(),
                          m_d_data,
@@ -623,7 +623,7 @@
 template<>
 inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
     double alpha = scalar;
     gpuErrChk(
-        cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
@@ -641,7 +641,7 @@
 template<>
 inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
     float alpha = scalar;
     gpuErrChk(
-        cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
@@ -649,8 +649,8 @@
 template<>
 inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
     const double alpha = 1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
@@ -658,8 +658,8 @@
 template<>
 inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
     const float alpha = 1.;
     gpuErrChk(
-        cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
@@ -675,8 +675,8 @@
 template<>
 inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
     const double alpha = -1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
@@ -745,13 +745,14 @@ inline void DTensor<double>::leastSquares(DTensor<double> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
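+    // batched least squares accepts square or tall A (m >= n) with one right-hand-side column per matrix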
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<double *> As = pointersToMatrices();
@@ -775,13 +776,13 @@ inline void DTensor<float>::leastSquares(DTensor<float> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<float *> As = pointersToMatrices();
@@ -899,7 +900,7 @@ private:
      */
     void checkMatrix(DTensor<T> &tensor) const {
         if (tensor.numRows() < tensor.numCols()) {
-            throw std::invalid_argument("your matrix is fat (no offence)");
+            throw std::invalid_argument("[svd] your matrix is fat (no offence)");
         }
     };
 
@@ -1021,17 +1022,18 @@ inline bool Svd<double>::factorise() {
         if (m_computeU) Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
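+        // (*m_info)(0, 0, 0) reads the device status flag; zero means this factorisation succeeded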
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
 }
@@ -1051,17 +1053,17 @@ inline bool Svd<float>::factorise() {
         if (m_computeU) Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
 }
@@ -1095,8 +1097,8 @@ private:
 
 public:
     CholeskyFactoriser(DTensor<T> &A) {
-        if (A.numMats() > 1) throw std::invalid_argument("3D tensors are not supported (for now); only matrices");
-        if (A.numRows() != A.numCols()) throw std::invalid_argument("Matrix A must be square for CF");
+        if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
         m_matrix = &A;
         computeWorkspaceSize();
         m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
@@ -1237,7 +1239,7 @@
 template<typename T> TEMPLATE_CONSTRAINT_REQUIRES_FPX
 inline Nullspace<T>::Nullspace(DTensor<T> &a) {
     size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats();
-    if (m > n) throw std::invalid_argument("I was expecting a square or fat matrix");
+    if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix");
     m_nullspace = std::make_unique<DTensor<T>>(n, n, nMats, true);
     m_projOp = std::make_unique<DTensor<T>>(n, n, nMats, true);
     auto aTranspose = a.tr();
@@ -1276,4 +1278,126 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
 }
 
 
+/* ================================================================================================
+ *  CHOLESKY BATCH FACTORISATION (CBF)
+ * ================================================================================================ */
+
+/**
+ * Cholesky Factorisation (CF) for batches of matrices (tensors).
+ * This object can factorise and then solve,
+ * or you can provide pre-computed Cholesky lower-triangular matrices and then solve.
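+ *
+ * A minimal usage sketch (the names A and b are illustrative: A is assumed to hold a batch of
+ * square s.p.d. matrices and b a matching batch of single-column right-hand sides):
+ * @code
+ * CholeskyBatchFactoriser<double> chol(A); // A holds the original matrices
+ * chol.factorise();                        // A is overwritten with the lower-triangular factors
+ * chol.solve(b);                           // b is overwritten with the solutions
+ * @endcode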
+ * @tparam T data type of tensor (must be float or double)
+ */
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+class CholeskyBatchFactoriser {
+
+private:
+    DTensor<T> *m_matrix; ///< Matrix to factorise, or lower-triangular decomposed matrix
+    size_t m_numRows = 0; ///< Number of rows in `m_matrix`
+    size_t m_numMats = 0; ///< Number of matrices in `m_matrix`
+    std::unique_ptr<DTensor<int>> m_deviceInfo; ///< Info from cusolver functions
+    bool m_factorisationDone = false; ///< Whether `m_matrix` holds the original or the lower-triangular matrix
+
+public:
+    CholeskyBatchFactoriser() = delete;
+
+    /**
+     * Constructor
+     * @param A either the matrices to be factorised or lower-triangular Cholesky decomposition matrices
+     * @param factorised whether A holds the original matrices or the factorised ones (default: original matrices)
+     */
+    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised = false) : m_factorisationDone(factorised) {
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
+        m_matrix = &A;
+        m_numRows = A.numRows();
+        m_numMats = A.numMats();
+        m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
+    }
+
+    /**
+     * Factorise all matrices in the tensor (batched).
+     */
+    void factorise();
+
+    /**
+     * Solve the system Ax = b using the Cholesky lower-triangular matrix of A (batched).
+     * @param b vector in the system Ax = b; it is overwritten with the solution x
+     */
+    void solve(DTensor<T> &b);
+
+};
+
+template<>
+inline void CholeskyBatchFactoriser<double>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+    m_factorisationDone = true;
+}
+
+template<>
+inline void CholeskyBatchFactoriser<float>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+    m_factorisationDone = true;
+}
+
+template<>
+inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<double *> ptrA = m_matrix->pointersToMatrices();
+    DTensor<double *> ptrB = b.pointersToMatrices();
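+    // potrsBatched reuses the factors stored in m_matrix and overwrites b in place with the solution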
+    gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, // potrsBatched only supports nrhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
+template<>
+inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<float *> ptrA = m_matrix->pointersToMatrices();
+    DTensor<float *> ptrB = b.pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, // potrsBatched only supports nrhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
 #endif
diff --git a/test/testTensor.cu b/test/testTensor.cu
index a46a803..5291c68 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -733,7 +733,7 @@ TEST_F(SvdTest, singularValuesMemory) {
  * --------------------------------------- */
 template<typename T>
 requires std::floating_point<T>
-void singularValuesMutlipleMatrices(float epsilon) {
+void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
     DTensor<T> A(aData, 3, 2, 3);
     Svd<T> svd(A, true); // do compute U (A will be destroyed)
@@ -753,15 +753,15 @@ void singularValuesMutlipleMatrices(float epsilon) {
     S.download(actual_s);
     for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
     std::vector<T> expected_u = {
-            -0.428667133548626, -0.566306918848035, -0.703946704147444,
-            0.805963908589298, 0.112382414096594, -0.581199080396110,
-            0.408248290463863, -0.816496580927726, 0.408248290463863,
-            -0.577350269189626, -0.577350269189626, -0.577350269189626,
-            0.816496580927726, -0.408248290463863, -0.408248290463863,
-            0.000000000000000, -0.707106781186548, 0.707106781186547,
-            0, 0, -1,
-            1, 0, 0,
-            0, -1, 0,
+                -0.428667133548626, -0.566306918848035, -0.703946704147444,
+                0.805963908589298, 0.112382414096594, -0.581199080396110,
+                0.408248290463863, -0.816496580927726, 0.408248290463863,
+                -0.577350269189626, -0.577350269189626, -0.577350269189626,
+                0.816496580927726, -0.408248290463863, -0.408248290463863,
+                0.000000000000000, -0.707106781186548, 0.707106781186547,
+                0, 0, -1,
+                1, 0, 0,
+                0, -1, 0,
     };
     std::vector<T> actual_u(27);
     U->download(actual_u);
@@ -769,9 +769,9 @@ void singularValuesMutlipleMatrices(float epsilon) {
     for (size_t i = 0; i < 27; i++) EXPECT_NEAR(expected_u[i], actual_u[i], epsilon);
 }
 
-TEST_F(SvdTest, singularValuesMutlipleMatrices) {
-    singularValuesMutlipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
-    singularValuesMutlipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices) {
+    singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+    singularValuesMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 
@@ -870,6 +870,128 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
     choleskyFactorisationSolution<double>(PRECISION_HIGH);
 }
 
+/* ---------------------------------------
+ * Batched Cholesky factorisation
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorisation(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    CholeskyBatchFactoriser<T> chol(A);
+    chol.factorise();
+    // matrix 0
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 0), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 0), epsilon);
+    // matrix 1
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 1), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 1), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorisation) {
+    choleskyBatchFactorisation<float>(PRECISION_LOW);
+    choleskyBatchFactorisation<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorSolve(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    DTensor<T> L(A); // L = A (copy; A is kept for the residual check)
+    CholeskyBatchFactoriser<T> chol(L);
+    chol.factorise();
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // matrix 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // matrix 1
+    DTensor<T> error = A * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
+    choleskyBatchFactorSolve<float>(PRECISION_LOW);
+    choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve (factor provided)
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchSolve(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    std::vector<T> lowData = {3.162277660168380, 0, 0,
+                              0.632455532033676, 4.427188724235731, 0,
+                              0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
+    DTensor<T> low(3, 3, 2);
+    DTensor<T> low0(low, 2, 0, 0);
+    DTensor<T> low1(low, 2, 1, 1);
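+    // lowData lists the factor row by row, hence the row-major upload below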
+    low0.upload(lowData, rowMajor);
+    low1.upload(lowData, rowMajor);
+    DTensor<T> L(low);
+    CholeskyBatchFactoriser<T> chol(L, true);
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // matrix 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // matrix 1
+    DTensor<T> error = A * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchSolve) {
+    choleskyBatchSolve<float>(PRECISION_LOW);
+    choleskyBatchSolve<double>(PRECISION_HIGH);
+}
+
 
 /* ================================================================================================
  * NULLSPACE TESTS
@@ -983,7 +1105,7 @@ void projectOnNullspaceTensor(T epsilon) {
     DTensor<T> delta1 = y - proj;
     DTensor<T> delta2 = proj - x;
     EXPECT_LT(delta1.dotF(delta2), epsilon);
-    }
+}
 
 TEST_F(NullspaceTest, projectOnNullspaceTensor) {
     projectOnNullspaceTensor<float>(PRECISION_LOW);