From de9c325f09155ab176fbb9b82d1cdc06f0e551c4 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 3 May 2024 21:49:47 +0100
Subject: [PATCH 1/3] implementing CholeskyMultiFactoriser
---
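Notes (for reviewers): a minimal usage sketch of the factoriser this patch adds,
inside any function after including tensor.cuh. It assumes the existing DTensor
upload/slicing API; the data and variable names are illustrative only, not part
of the patch.

    // Batch of two 3x3 symmetric positive definite matrices.
    std::vector<double> aData = {10., 2., 3.,
                                 2., 20., -1.,
                                 3., -1., 30.};
    DTensor<double> A(3, 3, 2);
    DTensor<double> A0(A, 2, 0, 0); // slice along axis 2: matrix 0
    DTensor<double> A1(A, 2, 1, 1); // slice along axis 2: matrix 1
    A0.upload(aData);
    A1.upload(aData);
    CholeskyMultiFactoriser<double> chol(A);
    chol.factorise(); // batched potrf: each matrix's lower triangle now holds its factor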
include/tensor.cuh | 56 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 56 insertions(+)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index b2cfaf9..ba79d18 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1276,4 +1276,60 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
}
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+class CholeskyMultiFactoriser {
+
+private:
+ DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+ size_t m_numRows = 0;
+ size_t m_numMats = 0;
+ std::unique_ptr<DTensor<int>> m_deviceInfo;
+ bool m_factorisationDone = false;
+
+public:
+ CholeskyMultiFactoriser() = delete;
+
+ CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+ m_numRows = A.numRows();
+ m_numMats = A.numMats();
+ m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
+ }
+
+ /**
+ * Factorise all matrices in tensor (batched).
+ */
+ void factorise();
+
+ int solve(DTensor<T> &b);
+
+};
+
+template<>
+void CholeskyMultiFactoriser<double>::factorise() {
+ if (m_factorisationDone) return;
+ DTensor<double*> ptrA = m_matrix.pointersToMatrices();
+ cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ ptrA.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats);
+ m_factorisationDone = true;
+}
+
+template<>
+void CholeskyMultiFactoriser<float>::factorise() {
+ if (m_factorisationDone) return;
+ DTensor<float*> ptrA = m_matrix.pointersToMatrices();
+ cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ ptrA.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats);
+ m_factorisationDone = true;
+}
+
#endif
From 20c9c1578b2544a866d9a1298becc9adfeda28d1 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 13:42:07 +0100
Subject: [PATCH 2/3] added batch solve, provide factorised lower, and unit
tests
---
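Notes (for reviewers): the factoriser is renamed to CholeskyBatchFactoriser and
gains a batched solve. A minimal sketch of the intended flow, mirroring the new
unit tests (names and data illustrative only; A uploaded as in the first patch):

    CholeskyBatchFactoriser<double> chol(A); // A: n-x-n-x-nMats tensor of SPD matrices
    chol.factorise();                        // batched potrf: A now holds the lower factors
    std::vector<double> bData = {-1., -3., 5., -1., -3., 5.};
    DTensor<double> b(3, 1, 2);              // one right-hand-side column per matrix
    b.upload(bData);
    chol.solve(b);                           // batched potrs: b is overwritten with each solution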
include/tensor.cuh | 176 ++++++++++++++++++++++++++++-----------------
test/testTensor.cu | 143 ++++++++++++++++++++++++++++++++----
2 files changed, 238 insertions(+), 81 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index ba79d18..dca144d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -548,7 +548,7 @@ inline bool DTensor<T>::upload(const std::vector<T> &vec, StorageMode mode) {
size_t size = vec.size();
size_t thisSize = m_numRows * m_numCols * m_numMats;
// make sure vec is of right size
- if (size != thisSize) throw std::invalid_argument("vec has wrong size");
+ if (size != thisSize) throw std::invalid_argument("[upload] vec has wrong size");
std::vector<T> vecCm(thisSize);
if (mode == StorageMode::rowMajor) {
rm2cm(vec, vecCm);
@@ -611,7 +611,7 @@ inline DTensor<T> DTensor<T>::tr() const {
template<typename T>
inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
if (elsewhere.numEl() < numEl()) {
- throw std::invalid_argument("tensor does not fit into destination");
+ throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination");
}
gpuErrChk(cudaMemcpy(elsewhere.raw(),
m_d_data,
@@ -623,7 +623,7 @@ template<>
inline DTensor<double> &DTensor<double>::operator*=(double scalar) {
double alpha = scalar;
gpuErrChk(
- cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
return *this;
}
@@ -641,7 +641,7 @@ template<>
inline DTensor<float> &DTensor<float>::operator*=(float scalar) {
float alpha = scalar;
gpuErrChk(
- cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+ cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
return *this;
}
@@ -649,8 +649,8 @@ template<>
inline DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
const double alpha = 1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
- 1, m_d_data, 1));
+ cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ 1, m_d_data, 1));
return *this;
}
@@ -658,8 +658,8 @@ template<>
inline DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
const float alpha = 1.;
gpuErrChk(
- cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
- 1, m_d_data, 1));
+ cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ 1, m_d_data, 1));
return *this;
}
@@ -675,8 +675,8 @@ template<>
inline DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
const double alpha = -1.;
gpuErrChk(
- cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
- 1, m_d_data, 1));
+ cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+ 1, m_d_data, 1));
return *this;
}
@@ -745,13 +745,13 @@ inline void DTensor<double>::leastSquares(DTensor<double> &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
- throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+ throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
if (nColsB != 1)
- throw std::invalid_argument("Least squares rhs are not vectors");
+ throw std::invalid_argument("[Least squares] rhs are not vectors");
if (B.numMats() != batchSize)
- throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+ throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
- throw std::invalid_argument("Least squares supports square or tall matrices only");
+ throw std::invalid_argument("[Least squares] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<double*> As = pointersToMatrices();
@@ -775,13 +775,13 @@ inline void DTensor<float>::leastSquares(DTensor<float> &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
- throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+ throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
if (nColsB != 1)
- throw std::invalid_argument("Least squares rhs are not vectors");
+ throw std::invalid_argument("[Least squares] rhs are not vectors");
if (B.numMats() != batchSize)
- throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+ throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
- throw std::invalid_argument("Least squares supports square or tall matrices only");
+ throw std::invalid_argument("[Least squares] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<float*> As = pointersToMatrices();
@@ -899,7 +899,7 @@ private:
*/
void checkMatrix(DTensor<T> &tensor) const {
if (tensor.numRows() < tensor.numCols()) {
- throw std::invalid_argument("your matrix is fat (no offence)");
+ throw std::invalid_argument("[svd] your matrix is fat (no offence)");
}
};
@@ -1021,17 +1021,17 @@ inline bool Svd<double>::factorise() {
if (m_computeU)
Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
gpuErrChk(
- cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
- (m_computeU) ? 'A' : 'N', 'A',
- m, n,
- Ai.raw(), m,
- Si.raw(),
- (m_computeU) ? Ui->raw() : nullptr, m,
- Vtri.raw(), n,
- m_workspace->raw(),
- m_lwork,
- nullptr, // rwork (used only if SVD fails)
- m_info->raw()));
+ cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+ (m_computeU) ? 'A' : 'N', 'A',
+ m, n,
+ Ai.raw(), m,
+ Si.raw(),
+ (m_computeU) ? Ui->raw() : nullptr, m,
+ Vtri.raw(), n,
+ m_workspace->raw(),
+ m_lwork,
+ nullptr, // rwork (used only if SVD fails)
+ m_info->raw()));
info = info && ((*m_info)(0, 0, 0) == 0);
}
return info;
@@ -1051,17 +1051,17 @@ inline bool Svd<float>::factorise() {
if (m_computeU)
Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
gpuErrChk(
- cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
- (m_computeU) ? 'A' : 'N', 'A',
- m, n,
- Ai.raw(), m,
- Si.raw(),
- (m_computeU) ? Ui->raw() : nullptr, m,
- Vtri.raw(), n,
- m_workspace->raw(),
- m_lwork,
- nullptr, // rwork (used only if SVD fails)
- m_info->raw()));
+ cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+ (m_computeU) ? 'A' : 'N', 'A',
+ m, n,
+ Ai.raw(), m,
+ Si.raw(),
+ (m_computeU) ? Ui->raw() : nullptr, m,
+ Vtri.raw(), n,
+ m_workspace->raw(),
+ m_lwork,
+ nullptr, // rwork (used only if SVD fails)
+ m_info->raw()));
info = info && ((*m_info)(0, 0, 0) == 0);
}
return info;
@@ -1095,8 +1095,8 @@ private:
public:
CholeskyFactoriser(DTensor<T> &A) {
- if (A.numMats() > 1) throw std::invalid_argument("3D tensors are not supported (for now); only matrices");
- if (A.numRows() != A.numCols()) throw std::invalid_argument("Matrix A must be square for CF");
+ if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
+ if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
m_matrix = &A;
computeWorkspaceSize();
m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
@@ -1237,7 +1237,7 @@ template<typename T>
TEMPLATE_CONSTRAINT_REQUIRES_FPX
inline Nullspace<T>::Nullspace(DTensor<T> &a) {
size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats();
- if (m > n) throw std::invalid_argument("I was expecting a square or fat matrix");
+ if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix");
m_nullspace = std::make_unique<DTensor<T>>(n, n, nMats, true);
m_projOp = std::make_unique<DTensor<T>>(n, n, nMats, true);
auto aTranspose = a.tr();
@@ -1276,20 +1276,26 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
}
+/* ================================================================================================
+ * CHOLESKY BATCH FACTORISATION (CBF)
+ * ================================================================================================ */
+
TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
-class CholeskyMultiFactoriser {
+class CholeskyBatchFactoriser {
private:
- DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+ DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
size_t m_numRows = 0;
size_t m_numMats = 0;
std::unique_ptr<DTensor<int>> m_deviceInfo;
bool m_factorisationDone = false;
public:
- CholeskyMultiFactoriser() = delete;
+ CholeskyBatchFactoriser() = delete;
- CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+ CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
+ if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
+ m_matrix = &A;
m_numRows = A.numRows();
m_numMats = A.numMats();
m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
@@ -1300,36 +1306,72 @@ public:
*/
void factorise();
- int solve(DTensor<T> &b);
+ void solve(DTensor<T> &b);
};
template<>
-void CholeskyMultiFactoriser<double>::factorise() {
+void CholeskyBatchFactoriser<double>::factorise() {
if (m_factorisationDone) return;
- DTensor<double*> ptrA = m_matrix.pointersToMatrices();
- cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
- CUBLAS_FILL_MODE_LOWER,
- m_numRows,
- ptrA.raw(),
- m_numRows,
- m_deviceInfo->raw(),
- m_numMats);
+ DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+ gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ ptrA.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats));
m_factorisationDone = true;
}
template<>
-void CholeskyMultiFactoriser<float>::factorise() {
+void CholeskyBatchFactoriser<float>::factorise() {
if (m_factorisationDone) return;
- DTensor<float*> ptrA = m_matrix.pointersToMatrices();
- cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
- CUBLAS_FILL_MODE_LOWER,
- m_numRows,
- ptrA.raw(),
- m_numRows,
- m_deviceInfo->raw(),
- m_numMats);
+ DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+ gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ ptrA.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats));
m_factorisationDone = true;
}
+template<>
+void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
+ if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+ if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+ DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+ DTensor<double*> ptrB = b.pointersToMatrices();
+ gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ 1, ///< only supports rhs = 1
+ ptrA.raw(),
+ m_numRows,
+ ptrB.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats));
+}
+
+template<>
+void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
+ if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+ if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+ DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+ DTensor<float*> ptrB = b.pointersToMatrices();
+ gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
+ CUBLAS_FILL_MODE_LOWER,
+ m_numRows,
+ 1, ///< only supports rhs = 1
+ ptrA.raw(),
+ m_numRows,
+ ptrB.raw(),
+ m_numRows,
+ m_deviceInfo->raw(),
+ m_numMats));
+}
+
#endif
diff --git a/test/testTensor.cu b/test/testTensor.cu
index a46a803..d6e4be3 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -733,7 +733,7 @@ TEST_F(SvdTest, singularValuesMemory) {
* --------------------------------------- */
template<typename T>
requires std::floating_point<T>
-void singularValuesMutlipleMatrices(float epsilon) {
+void singularValuesMultipleMatrices(float epsilon) {
std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
DTensor<T> A(aData, 3, 2, 3);
Svd<T> svd(A, true); // do compute U (A will be destroyed)
@@ -753,15 +753,15 @@ void singularValuesMutlipleMatrices(float epsilon) {
S.download(actual_s);
for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
std::vector<T> expected_u = {
- -0.428667133548626, -0.566306918848035, -0.703946704147444,
- 0.805963908589298, 0.112382414096594, -0.581199080396110,
- 0.408248290463863, -0.816496580927726, 0.408248290463863,
- -0.577350269189626, -0.577350269189626, -0.577350269189626,
- 0.816496580927726, -0.408248290463863, -0.408248290463863,
- 0.000000000000000, -0.707106781186548, 0.707106781186547,
- 0, 0, -1,
- 1, 0, 0,
- 0, -1, 0,
+ -0.428667133548626, -0.566306918848035, -0.703946704147444,
+ 0.805963908589298, 0.112382414096594, -0.581199080396110,
+ 0.408248290463863, -0.816496580927726, 0.408248290463863,
+ -0.577350269189626, -0.577350269189626, -0.577350269189626,
+ 0.816496580927726, -0.408248290463863, -0.408248290463863,
+ 0.000000000000000, -0.707106781186548, 0.707106781186547,
+ 0, 0, -1,
+ 1, 0, 0,
+ 0, -1, 0,
};
std::vector<T> actual_u(27);
U->download(actual_u);
@@ -769,9 +769,9 @@ void singularValuesMutlipleMatrices(float epsilon) {
}
-TEST_F(SvdTest, singularValuesMutlipleMatrices) {
- singularValuesMutlipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
- singularValuesMutlipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices) {
+ singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+ singularValuesMultipleMatrices<double>(PRECISION_HIGH);
}
@@ -870,6 +870,121 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
choleskyFactorisationSolution<double>(PRECISION_HIGH);
}
+/* ---------------------------------------
+ * Batched Cholesky factorisation
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorisation(T epsilon) {
+ std::vector<T> aData = {10.0, 2.0, 3.0,
+ 2.0, 20.0, -1.0,
+ 3.0, -1.0, 30.0};
+ DTensor<T> A(3, 3, 2);
+ DTensor<T> A0(A, 2, 0, 0);
+ DTensor<T> A1(A, 2, 1, 1);
+ A0.upload(aData);
+ A1.upload(aData);
+ CholeskyBatchFactoriser<T> chol(A);
+ chol.factorise();
+ // 0
+ EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
+ EXPECT_NEAR(-0.361403161162101, A(2, 1, 0), epsilon);
+ EXPECT_NEAR(5.382321781081287, A(2, 2, 0), epsilon);
+ // 1
+ EXPECT_NEAR(3.162277660168380, A(0, 0, 1), epsilon);
+ EXPECT_NEAR(-0.361403161162101, A(2, 1, 1), epsilon);
+ EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorisation) {
+ choleskyBatchFactorisation<float>(PRECISION_LOW);
+ choleskyBatchFactorisation<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorSolve(T epsilon) {
+ std::vector<T> aData = {10.0, 2.0, 3.0,
+ 2.0, 20.0, -1.0,
+ 3.0, -1.0, 30.0};
+ DTensor<T> A(3, 3, 2);
+ DTensor<T> A0(A, 2, 0, 0);
+ DTensor<T> A1(A, 2, 1, 1);
+ A0.upload(aData);
+ A1.upload(aData);
+ DTensor<T> L(A); // L = A
+ CholeskyBatchFactoriser<T> chol(L);
+ chol.factorise();
+ std::vector<T> bData = {-1., -3., 5.};
+ DTensor<T> rhs(3, 1, 2);
+ DTensor<T> rhs0(rhs, 2, 0, 0);
+ DTensor<T> rhs1(rhs, 2, 1, 1);
+ rhs0.upload(bData);
+ rhs1.upload(bData);
+ DTensor<T> sol(rhs);
+ chol.solve(sol);
+ std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+ std::vector<T> actual(6);
+ sol.download(actual);
+ for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+ for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+ DTensor<T> error = A * sol;
+ error -= rhs;
+ EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
+ choleskyBatchFactorSolve<float>(PRECISION_LOW);
+ choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve (factor provided)
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchSolve(T epsilon) {
+ /* originalMatrix = {10.0, 2.0, 3.0,
+ 2.0, 20.0, -1.0,
+ 3.0, -1.0, 30.0}; */
+ std::vector<T> lowData = {3.162277660168380, 0, 0,
+ 0.632455532033676, 4.427188724235731, 0,
+ 0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
+ DTensor<T> low(3, 3, 2);
+ DTensor<T> low0(low, 2, 0, 0);
+ DTensor<T> low1(low, 2, 1, 1);
+ low0.upload(lowData);
+ low1.upload(lowData);
+ CholeskyBatchFactoriser<T> chol(low, true);
+ std::vector<T> bData = {-1., -3., 5.};
+ DTensor<T> rhs(3, 1, 2);
+ DTensor<T> rhs0(rhs, 2, 0, 0);
+ DTensor<T> rhs1(rhs, 2, 1, 1);
+ rhs0.upload(bData);
+ rhs1.upload(bData);
+ DTensor<T> sol(rhs);
+ chol.solve(sol);
+ std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+ std::vector<T> actual(6);
+ sol.download(actual);
+ for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+ for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+ DTensor<T> error = low * sol;
+ error -= rhs;
+ EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchSolve) {
+ choleskyBatchSolve<float>(PRECISION_LOW);
+ choleskyBatchSolve<double>(PRECISION_HIGH);
+}
+
/* ================================================================================================
* NULLSPACE TESTS
@@ -983,7 +1098,7 @@ void projectOnNullspaceTensor(T epsilon) {
DTensor<T> delta1 = y - proj;
DTensor<T> delta2 = proj - x;
EXPECT_LT(delta1.dotF(delta2), epsilon);
- }
+}
TEST_F(NullspaceTest, projectOnNullspaceTensor) {
projectOnNullspaceTensor<float>(PRECISION_LOW);
From afd313cfe4d527e37f15ae1762a5a4e71c0b2dc8 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 16:50:33 +0100
Subject: [PATCH 3/3] rowMajor
---
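Notes (for reviewers): the test data for the pre-computed lower factor is written
row by row, so it must be uploaded with the rowMajor storage mode; a column-major
upload of the same initialiser list would transpose the factor. A minimal sketch
of the pre-factorised path (data and names illustrative only):

    // Lower-triangular Cholesky factor, listed row by row.
    std::vector<double> lowData = {3.162277660168380, 0, 0,
                                   0.632455532033676, 4.427188724235731, 0,
                                   0.948683298050514, -0.361403161162101, 5.382321781081287};
    DTensor<double> low(3, 3, 1);
    low.upload(lowData, rowMajor);                   // row-major upload keeps it lower-triangular
    CholeskyBatchFactoriser<double> chol(low, true); // true: already factorised, skip potrf
    std::vector<double> bData = {-1., -3., 5.};
    DTensor<double> b(3, 1, 1);
    b.upload(bData);
    chol.solve(b);                                   // b now holds the solution x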
include/tensor.cuh | 25 ++++++++++++++++++++-----
test/testTensor.cu | 20 +++++++++++++-------
2 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index dca144d..cf7003c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1280,19 +1280,30 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
* CHOLESKY BATCH FACTORISATION (CBF)
* ================================================================================================ */
+/**
+ * Cholesky Factorisation (CF) for batches of matrices (tensors).
+ * This object can factorise and then solve,
+ * or you can provide pre-computed Cholesky lower-triangular matrices and then solve.
+ * @tparam T data type of tensor (must be float or double)
+ */
TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
class CholeskyBatchFactoriser {
private:
- DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
- size_t m_numRows = 0;
- size_t m_numMats = 0;
- std::unique_ptr<DTensor<int>> m_deviceInfo;
- bool m_factorisationDone = false;
+ DTensor<T> *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix
+ size_t m_numRows = 0; ///< Number of rows in `m_matrix`
+ size_t m_numMats = 0; ///< Number of matrices in `m_matrix`
+ std::unique_ptr<DTensor<int>> m_deviceInfo; ///< Info from cusolver functions
+ bool m_factorisationDone = false; ///< True when `m_matrix` already holds the lower-triangular factors
public:
CholeskyBatchFactoriser() = delete;
+ /**
+ * Constructor
+ * @param A either the matrices to be factorised or their lower-triangular Cholesky factors
+ * @param factorised whether A already holds the factorised matrices (default: false, i.e. original matrices)
+ */
CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
m_matrix = &A;
@@ -1306,6 +1317,10 @@ public:
*/
void factorise();
+ /**
+ * Solve the system Ax=b (batched), using the Cholesky lower-triangular factor of A.
+ * @param b right-hand-side vectors of Ax=b; overwritten in place with the solutions x
+ */
+ void solve(DTensor<T> &b);
};
diff --git a/test/testTensor.cu b/test/testTensor.cu
index d6e4be3..5291c68 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -950,18 +950,24 @@ TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
template<typename T>
requires std::floating_point<T>
void choleskyBatchSolve(T epsilon) {
- /* originalMatrix = {10.0, 2.0, 3.0,
- 2.0, 20.0, -1.0,
- 3.0, -1.0, 30.0}; */
+ std::vector<T> aData = {10.0, 2.0, 3.0,
+ 2.0, 20.0, -1.0,
+ 3.0, -1.0, 30.0};
+ DTensor<T> A(3, 3, 2);
+ DTensor<T> A0(A, 2, 0, 0);
+ DTensor<T> A1(A, 2, 1, 1);
+ A0.upload(aData);
+ A1.upload(aData);
std::vector<T> lowData = {3.162277660168380, 0, 0,
0.632455532033676, 4.427188724235731, 0,
0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
DTensor<T> low(3, 3, 2);
DTensor<T> low0(low, 2, 0, 0);
DTensor<T> low1(low, 2, 1, 1);
- low0.upload(lowData);
- low1.upload(lowData);
- CholeskyBatchFactoriser<T> chol(low, true);
+ low0.upload(lowData, rowMajor);
+ low1.upload(lowData, rowMajor);
+ DTensor<T> L(low);
+ CholeskyBatchFactoriser<T> chol(L, true);
std::vector bData = {-1., -3., 5.};
DTensor rhs(3, 1, 2);
DTensor rhs0(rhs, 2, 0, 0);
@@ -975,7 +981,7 @@ void choleskyBatchSolve(T epsilon) {
sol.download(actual);
for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
- DTensor<T> error = low * sol;
+ DTensor<T> error = A * sol;
error -= rhs;
EXPECT_TRUE(error.normF() < epsilon);
}