From de9c325f09155ab176fbb9b82d1cdc06f0e551c4 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 3 May 2024 21:49:47 +0100
Subject: [PATCH 1/3] implementing CholeskyMultiFactoriser

---
 include/tensor.cuh | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index b2cfaf9..ba79d18 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1276,4 +1276,60 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
 }
 
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+class CholeskyMultiFactoriser {
+
+private:
+    DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+    size_t m_numRows = 0;
+    size_t m_numMats = 0;
+    std::unique_ptr<DTensor<int>> m_deviceInfo;
+    bool m_factorisationDone = false;
+
+public:
+    CholeskyMultiFactoriser() = delete;
+
+    CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+        m_numRows = A.numRows();
+        m_numMats = A.numMats();
+        m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
+    }
+
+    /**
+     * Factorise all matrices in tensor (batched).
+     */
+    void factorise();
+
+    int solve(DTensor<T> &b);
+
+};
+
+template<>
+void CholeskyMultiFactoriser<double>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<double*> ptrA = m_matrix.pointersToMatrices();
+    cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+                            CUBLAS_FILL_MODE_LOWER,
+                            m_numRows,
+                            ptrA.raw(),
+                            m_numRows,
+                            m_deviceInfo->raw(),
+                            m_numMats);
+    m_factorisationDone = true;
+}
+
+template<>
+void CholeskyMultiFactoriser<float>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<float*> ptrA = m_matrix.pointersToMatrices();
+    cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+                            CUBLAS_FILL_MODE_LOWER,
+                            m_numRows,
+                            ptrA.raw(),
+                            m_numRows,
+                            m_deviceInfo->raw(),
+                            m_numMats);
+    m_factorisationDone = true;
+}
+
 #endif
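A minimal usage sketch for the class this patch introduces; it is not part of the diff, and it is only illustrative (it assumes the DTensor slicing constructor and upload() used in the existing unit tests; names and sizes are made up):

    // Sketch: factorise a batch of two 3x3 SPD matrices in place.
    std::vector<double> aData = {10., 2., 3.,
                                 2., 20., -1.,
                                 3., -1., 30.};   // symmetric, so host storage order is immaterial
    DTensor<double> A(3, 3, 2);                   // two 3x3 matrices
    DTensor<double> A0(A, 2, 0, 0);               // slice along axis 2: matrix 0
    DTensor<double> A1(A, 2, 1, 1);               // slice along axis 2: matrix 1
    A0.upload(aData);
    A1.upload(aData);
    CholeskyMultiFactoriser<double> chol(A);      // renamed CholeskyBatchFactoriser in PATCH 2/3
    chol.factorise();                             // A is overwritten by the lower Cholesky factors
                                                  // via a single cusolverDnDpotrfBatched call

Note that cusolverDn<t>potrfBatched only writes the factor into the lower triangle of each matrix; the strict upper triangle is left untouched.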
From 20c9c1578b2544a866d9a1298becc9adfeda28d1 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 13:42:07 +0100
Subject: [PATCH 2/3] added batch solve, provide factorised lower, and unit tests

---
 include/tensor.cuh | 176 ++++++++++++++++++++++++++++-----------------
 test/testTensor.cu | 143 ++++++++++++++++++++++++++++++++----
 2 files changed, 238 insertions(+), 81 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index ba79d18..dca144d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -548,7 +548,7 @@ inline bool DTensor<T>::upload(const std::vector<T> &vec, StorageMode mode) {
     size_t size = vec.size();
     size_t thisSize = m_numRows * m_numCols * m_numMats;
     // make sure vec is of right size
-    if (size != thisSize) throw std::invalid_argument("vec has wrong size");
+    if (size != thisSize) throw std::invalid_argument("[upload] vec has wrong size");
     std::vector<T> vecCm(thisSize);
     if (mode == StorageMode::rowMajor) {
         rm2cm(vec, vecCm);
@@ -611,7 +611,7 @@ inline DTensor<T> DTensor<T>::tr() const {
 template<typename T>
 inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
     if (elsewhere.numEl() < numEl()) {
-        throw std::invalid_argument("tensor does not fit into destination");
+        throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination");
     }
     gpuErrChk(cudaMemcpy(elsewhere.raw(),
                          m_d_data,
@@ -623,7 +623,7 @@ template<> inline
 DTensor<double> &DTensor<double>::operator*=(double scalar) {
     double alpha = scalar;
     gpuErrChk(
-        cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -641,7 +641,7 @@ template<> inline
 DTensor<float> &DTensor<float>::operator*=(float scalar) {
     float alpha = scalar;
     gpuErrChk(
-        cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -649,8 +649,8 @@ template<> inline
 DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
     const double alpha = 1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -658,8 +658,8 @@ template<> inline
 DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
     const float alpha = 1.;
     gpuErrChk(
-        cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -675,8 +675,8 @@ template<> inline
 DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
     const double alpha = -1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -745,13 +745,13 @@ inline void DTensor<double>::leastSquares(DTensor<double> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<double*> As = pointersToMatrices();
@@ -775,13 +775,13 @@ inline void DTensor<float>::leastSquares(DTensor<float> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<float*> As = pointersToMatrices();
@@ -899,7 +899,7 @@ private:
      */
     void checkMatrix(DTensor<T> &tensor) const {
         if (tensor.numRows() < tensor.numCols()) {
-            throw std::invalid_argument("your matrix is fat (no offence)");
+            throw std::invalid_argument("[svd] your matrix is fat (no offence)");
         }
     };
 
@@ -1021,17 +1021,17 @@ inline bool Svd<double>::factorise() {
         std::unique_ptr<DTensor<double>> Ui;
         if (m_computeU) Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
@@ -1051,17 +1051,17 @@ inline bool Svd<float>::factorise() {
         std::unique_ptr<DTensor<float>> Ui;
         if (m_computeU) Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
@@ -1095,8 +1095,8 @@ private:
 
 public:
     CholeskyFactoriser(DTensor<T> &A) {
-        if (A.numMats() > 1) throw std::invalid_argument("3D tensors are not supported (for now); only matrices");
-        if (A.numRows() != A.numCols()) throw std::invalid_argument("Matrix A must be square for CF");
+        if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
         m_matrix = &A;
         computeWorkspaceSize();
         m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
@@ -1237,7 +1237,7 @@ template<typename T> TEMPLATE_CONSTRAINT_REQUIRES_FPX
 inline Nullspace<T>::Nullspace(DTensor<T> &a) {
     size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats();
-    if (m > n) throw std::invalid_argument("I was expecting a square or fat matrix");
+    if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix");
     m_nullspace = std::make_unique<DTensor<T>>(n, n, nMats, true);
     m_projOp = std::make_unique<DTensor<T>>(n, n, nMats, true);
     auto aTranspose = a.tr();
@@ -1276,20 +1276,26 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
 }
 
+/* ================================================================================================
+ *  CHOLESKY BATCH FACTORISATION (CBF)
+ * ================================================================================================ */
+
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
-class CholeskyMultiFactoriser {
+class CholeskyBatchFactoriser {
 
 private:
-    DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+    DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
     size_t m_numRows = 0;
     size_t m_numMats = 0;
     std::unique_ptr<DTensor<int>> m_deviceInfo;
     bool m_factorisationDone = false;
 
 public:
-    CholeskyMultiFactoriser() = delete;
+    CholeskyBatchFactoriser() = delete;
 
-    CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
+        m_matrix = &A;
         m_numRows = A.numRows();
         m_numMats = A.numMats();
         m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
@@ -1300,36 +1306,72 @@ public:
      */
     void factorise();
 
-    int solve(DTensor<T> &b);
+    void solve(DTensor<T> &b);
 
 };
 
 template<>
-void CholeskyMultiFactoriser<double>::factorise() {
+void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double*> ptrA = m_matrix.pointersToMatrices();
-    cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
-                            CUBLAS_FILL_MODE_LOWER,
-                            m_numRows,
-                            ptrA.raw(),
-                            m_numRows,
-                            m_deviceInfo->raw(),
-                            m_numMats);
+    DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
     m_factorisationDone = true;
 }
 
 template<>
-void CholeskyMultiFactoriser<float>::factorise() {
+void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float*> ptrA = m_matrix.pointersToMatrices();
-    cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
-                            CUBLAS_FILL_MODE_LOWER,
-                            m_numRows,
-                            ptrA.raw(),
-                            m_numRows,
-                            m_deviceInfo->raw(),
-                            m_numMats);
+    DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
     m_factorisationDone = true;
 }
 
+template<>
+void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+    DTensor<double*> ptrB = b.pointersToMatrices();
+    gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, ///< only supports rhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
+template<>
+void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+    DTensor<float*> ptrB = b.pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, ///< only supports rhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
 #endif
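To make the review concrete, here is a hedged round-trip sketch of the API after this patch (illustrative only; uploads elided, the full versions live in the unit tests below):

    // Sketch: solve A x = b for a batch of SPD systems.
    DTensor<double> A(3, 3, 2);                 // batch of two SPD 3x3 matrices (upload as in the tests)
    DTensor<double> b(3, 1, 2);                 // one right-hand-side column per matrix
    CholeskyBatchFactoriser<double> chol(A);
    chol.factorise();                           // A now holds the lower factors
    chol.solve(b);                              // b is overwritten with the solutions x,
                                                // since cusolverDn<t>potrsBatched works in place

Note that the cuSOLVER batched solver only supports a single right-hand-side column (nrhs = 1), which is why solve() rejects any b with more than one column.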
diff --git a/test/testTensor.cu b/test/testTensor.cu
index a46a803..d6e4be3 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -733,7 +733,7 @@ TEST_F(SvdTest, singularValuesMemory) {
  * --------------------------------------- */
 template<typename T>
 requires std::floating_point<T>
-void singularValuesMutlipleMatrices(float epsilon) {
+void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
     DTensor<T> A(aData, 3, 2, 3);
     Svd<T> svd(A, true); // do compute U (A will be destroyed)
@@ -753,15 +753,15 @@ void singularValuesMutlipleMatrices(float epsilon) {
     S.download(actual_s);
     for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
     std::vector<T> expected_u = {
-        -0.428667133548626, -0.566306918848035, -0.703946704147444,
-        0.805963908589298, 0.112382414096594, -0.581199080396110,
-        0.408248290463863, -0.816496580927726, 0.408248290463863,
-        -0.577350269189626, -0.577350269189626, -0.577350269189626,
-        0.816496580927726, -0.408248290463863, -0.408248290463863,
-        0.000000000000000, -0.707106781186548, 0.707106781186547,
-        0, 0, -1,
-        1, 0, 0,
-        0, -1, 0,
+            -0.428667133548626, -0.566306918848035, -0.703946704147444,
+            0.805963908589298, 0.112382414096594, -0.581199080396110,
+            0.408248290463863, -0.816496580927726, 0.408248290463863,
+            -0.577350269189626, -0.577350269189626, -0.577350269189626,
+            0.816496580927726, -0.408248290463863, -0.408248290463863,
+            0.000000000000000, -0.707106781186548, 0.707106781186547,
+            0, 0, -1,
+            1, 0, 0,
+            0, -1, 0,
     };
     std::vector<T> actual_u(27);
     U->download(actual_u);
@@ -769,9 +769,9 @@ void singularValuesMutlipleMatrices(float epsilon) {
 }
 
-TEST_F(SvdTest, singularValuesMutlipleMatrices) {
-    singularValuesMutlipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
-    singularValuesMutlipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices) {
+    singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+    singularValuesMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 
@@ -870,6 +870,121 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
     choleskyFactorisationSolution<double>(PRECISION_HIGH);
 }
 
+/* ---------------------------------------
+ * Batched Cholesky factorisation
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorisation(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    CholeskyBatchFactoriser<T> chol(A);
+    chol.factorise();
+    // 0
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 0), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 0), epsilon);
+    // 1
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 1), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 1), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorisation) {
+    choleskyBatchFactorisation<float>(PRECISION_LOW);
+    choleskyBatchFactorisation<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorSolve(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    DTensor<T> L(A); // L = A
+    CholeskyBatchFactoriser<T> chol(L);
+    chol.factorise();
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+    DTensor<T> error = A * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
+    choleskyBatchFactorSolve<float>(PRECISION_LOW);
+    choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve (factor provided)
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchSolve(T epsilon) {
+    /* originalMatrix = {10.0, 2.0, 3.0,
+                         2.0, 20.0, -1.0,
+                         3.0, -1.0, 30.0}; */
+    std::vector<T> lowData = {3.162277660168380, 0, 0,
+                              0.632455532033676, 4.427188724235731, 0,
+                              0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
+    DTensor<T> low(3, 3, 2);
+    DTensor<T> low0(low, 2, 0, 0);
+    DTensor<T> low1(low, 2, 1, 1);
+    low0.upload(lowData);
+    low1.upload(lowData);
+    CholeskyBatchFactoriser<T> chol(low, true);
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+    DTensor<T> error = low * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchSolve) {
+    choleskyBatchSolve<float>(PRECISION_LOW);
+    choleskyBatchSolve<double>(PRECISION_HIGH);
+}
+
 /* ================================================================================================
  * NULLSPACE TESTS
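One detail worth calling out from choleskyBatchFactorSolve above: factorise() overwrites its input, so the test deep-copies A before factorising in order to form the residual A*x - b afterwards. A condensed sketch of that pattern (names illustrative, assuming A and rhs are set up as in the test):

    DTensor<double> L(A);                        // deep copy: A stays intact
    CholeskyBatchFactoriser<double> chol(L);
    chol.factorise();                            // overwrites L, not A
    DTensor<double> sol(rhs);                    // copy the rhs, since solve() is in place
    chol.solve(sol);                             // sol <- x
    DTensor<double> error = A * sol;             // batched product with the untouched A
    error -= rhs;                                // error = A*x - b, expected ~0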
@@ -983,7 +1098,7 @@ void projectOnNullspaceTensor(T epsilon) {
     DTensor<T> delta1 = y - proj;
     DTensor<T> delta2 = proj - x;
     EXPECT_LT(delta1.dotF(delta2), epsilon);
-    }
+}
 
 TEST_F(NullspaceTest, projectOnNullspaceTensor) {
     projectOnNullspaceTensor<float>(PRECISION_LOW);

From afd313cfe4d527e37f15ae1762a5a4e71c0b2dc8 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 16:50:33 +0100
Subject: [PATCH 3/3] rowMajor

---
 include/tensor.cuh | 25 ++++++++++++++++++++-----
 test/testTensor.cu | 20 +++++++++++++-------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index dca144d..cf7003c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1280,19 +1280,30 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
  *  CHOLESKY BATCH FACTORISATION (CBF)
  * ================================================================================================ */
 
+/**
+ * Cholesky Factorisation (CF) for batches of matrices (tensors).
+ * This object can factorise and then solve,
+ * or you can provide pre-computed Cholesky lower-triangular matrices and then solve.
+ * @tparam T data type of tensor (must be float or double)
+ */
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class CholeskyBatchFactoriser {
 
 private:
-    DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
-    size_t m_numRows = 0;
-    size_t m_numMats = 0;
-    std::unique_ptr<DTensor<int>> m_deviceInfo;
-    bool m_factorisationDone = false;
+    DTensor<T> *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix
+    size_t m_numRows = 0; ///< Number of rows in `m_matrix`
+    size_t m_numMats = 0; ///< Number of matrices in `m_matrix`
+    std::unique_ptr<DTensor<int>> m_deviceInfo; ///< Info from cusolver functions
+    bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix
 
 public:
     CholeskyBatchFactoriser() = delete;
 
+    /**
+     * Constructor
+     * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices
+     * @param factorised whether A is the original matrices or the factorised ones (default=original matrices)
+     */
     CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
         if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
         m_matrix = &A;
@@ -1306,6 +1317,10 @@ public:
      */
     void factorise();
 
+    /**
+     * Solve the system A\b using Cholesky lower-triangular matrix of A (batched).
+     * @param b vector in system Ax=b to be solved where x is the solution
+     */
     void solve(DTensor<T> &b);
 
 };

diff --git a/test/testTensor.cu b/test/testTensor.cu
index d6e4be3..5291c68 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -950,18 +950,24 @@
 template<typename T>
 requires std::floating_point<T>
 void choleskyBatchSolve(T epsilon) {
-    /* originalMatrix = {10.0, 2.0, 3.0,
-                         2.0, 20.0, -1.0,
-                         3.0, -1.0, 30.0}; */
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
     std::vector<T> lowData = {3.162277660168380, 0, 0,
                               0.632455532033676, 4.427188724235731, 0,
                               0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
     DTensor<T> low(3, 3, 2);
     DTensor<T> low0(low, 2, 0, 0);
     DTensor<T> low1(low, 2, 1, 1);
-    low0.upload(lowData);
-    low1.upload(lowData);
-    CholeskyBatchFactoriser<T> chol(low, true);
+    low0.upload(lowData, rowMajor);
+    low1.upload(lowData, rowMajor);
+    DTensor<T> L(low);
+    CholeskyBatchFactoriser<T> chol(L, true);
     std::vector<T> bData = {-1., -3., 5.};
     DTensor<T> rhs(3, 1, 2);
     DTensor<T> rhs0(rhs, 2, 0, 0);
@@ -975,7 +981,7 @@ void choleskyBatchSolve(T epsilon) {
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
-    DTensor<T> error = low * sol;
+    DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
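Finally, a hedged sketch of the pre-factorised path this patch exercises: the MATLAB-produced lower factor is written row by row on the host, hence the rowMajor upload, and factorised=true tells the object to skip factorise(). This is illustrative only (single-matrix batch; it assumes the vector-based DTensor constructor used in the SVD tests):

    // Sketch: supply a precomputed lower Cholesky factor and solve directly.
    std::vector<double> lowData = {3.162277660168380, 0, 0,
                                   0.632455532033676, 4.427188724235731, 0,
                                   0.948683298050514, -0.361403161162101, 5.382321781081287};
    DTensor<double> L(3, 3, 1);
    L.upload(lowData, rowMajor);                   // host data is laid out row by row
    CholeskyBatchFactoriser<double> chol(L, true); // factorised=true: L is already the lower factor
    DTensor<double> b(std::vector<double>{-1., -3., 5.}, 3, 1, 1);
    chol.solve(b);                                 // b <- x, where (L L^T) x = b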