From de9c325f09155ab176fbb9b82d1cdc06f0e551c4 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Fri, 3 May 2024 21:49:47 +0100
Subject: [PATCH 1/3] implementing CholeskyMultiFactoriser

---
 include/tensor.cuh | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index b2cfaf9..ba79d18 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1276,4 +1276,60 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
 }
 
+TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
+class CholeskyMultiFactoriser {
+
+private:
+    DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+    size_t m_numRows = 0;
+    size_t m_numMats = 0;
+    std::unique_ptr<DTensor<int>> m_deviceInfo;
+    bool m_factorisationDone = false;
+
+public:
+    CholeskyMultiFactoriser() = delete;
+
+    CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+        m_numRows = A.numRows();
+        m_numMats = A.numMats();
+        m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
+    }
+
+    /**
+     * Factorise all matrices in tensor (batched).
+     */
+    void factorise();
+
+    int solve(DTensor<T> &b);
+
+};
+
+template<>
+void CholeskyMultiFactoriser<double>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<double*> ptrA = m_matrix.pointersToMatrices();
+    cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+                            CUBLAS_FILL_MODE_LOWER,
+                            m_numRows,
+                            ptrA.raw(),
+                            m_numRows,
+                            m_deviceInfo->raw(),
+                            m_numMats);
+    m_factorisationDone = true;
+}
+
+template<>
+void CholeskyMultiFactoriser<float>::factorise() {
+    if (m_factorisationDone) return;
+    DTensor<float*> ptrA = m_matrix.pointersToMatrices();
+    cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+                            CUBLAS_FILL_MODE_LOWER,
+                            m_numRows,
+                            ptrA.raw(),
+                            m_numRows,
+                            m_deviceInfo->raw(),
+                            m_numMats);
+    m_factorisationDone = true;
+}
+
 #endif
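A minimal usage sketch for the class this patch introduces; it is not part of the diff, and it is only illustrative (it assumes the DTensor slicing constructor and upload() used in the existing unit tests; names and sizes are made up):

    // Sketch: factorise a batch of two 3x3 SPD matrices in place.
    std::vector<double> aData = {10., 2., 3.,
                                 2., 20., -1.,
                                 3., -1., 30.};   // symmetric, so host storage order is immaterial
    DTensor<double> A(3, 3, 2);                   // two 3x3 matrices
    DTensor<double> A0(A, 2, 0, 0);               // slice along axis 2: matrix 0
    DTensor<double> A1(A, 2, 1, 1);               // slice along axis 2: matrix 1
    A0.upload(aData);
    A1.upload(aData);
    CholeskyMultiFactoriser<double> chol(A);      // renamed CholeskyBatchFactoriser in PATCH 2/3
    chol.factorise();                             // A is overwritten by the lower Cholesky factors
                                                  // via a single cusolverDnDpotrfBatched call

Note that cusolverDn<t>potrfBatched only writes the factor into the lower triangle of each matrix; the strict upper triangle is left untouched.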
From 20c9c1578b2544a866d9a1298becc9adfeda28d1 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 13:42:07 +0100
Subject: [PATCH 2/3] added batch solve, provide factorised lower, and unit tests

---
 include/tensor.cuh | 176 ++++++++++++++++++++++++++++-----------------
 test/testTensor.cu | 143 ++++++++++++++++++++++++++++++++----
 2 files changed, 238 insertions(+), 81 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index ba79d18..dca144d 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -548,7 +548,7 @@ inline bool DTensor<T>::upload(const std::vector<T> &vec, StorageMode mode) {
     size_t size = vec.size();
     size_t thisSize = m_numRows * m_numCols * m_numMats;
     // make sure vec is of right size
-    if (size != thisSize) throw std::invalid_argument("vec has wrong size");
+    if (size != thisSize) throw std::invalid_argument("[upload] vec has wrong size");
     std::vector<T> vecCm(thisSize);
     if (mode == StorageMode::rowMajor) {
         rm2cm(vec, vecCm);
@@ -611,7 +611,7 @@ inline DTensor<T> DTensor<T>::tr() const {
 template<typename T>
 inline void DTensor<T>::deviceCopyTo(DTensor<T> &elsewhere) const {
     if (elsewhere.numEl() < numEl()) {
-        throw std::invalid_argument("tensor does not fit into destination");
+        throw std::invalid_argument("[deviceCopyTo] tensor does not fit into destination");
     }
     gpuErrChk(cudaMemcpy(elsewhere.raw(),
                          m_d_data,
@@ -623,7 +623,7 @@ template<> inline
 DTensor<double> &DTensor<double>::operator*=(double scalar) {
     double alpha = scalar;
     gpuErrChk(
-        cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasDscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -641,7 +641,7 @@ template<> inline
 DTensor<float> &DTensor<float>::operator*=(float scalar) {
     float alpha = scalar;
     gpuErrChk(
-        cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
+            cublasSscal(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, m_d_data, 1));
     return *this;
 }
 
@@ -649,8 +649,8 @@ template<> inline
 DTensor<double> &DTensor<double>::operator+=(const DTensor<double> &rhs) {
     const double alpha = 1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -658,8 +658,8 @@ template<> inline
 DTensor<float> &DTensor<float>::operator+=(const DTensor<float> &rhs) {
     const float alpha = 1.;
     gpuErrChk(
-        cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasSaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -675,8 +675,8 @@ template<> inline
 DTensor<double> &DTensor<double>::operator-=(const DTensor<double> &rhs) {
     const double alpha = -1.;
     gpuErrChk(
-        cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
-                    1, m_d_data, 1));
+            cublasDaxpy(Session::getInstance().cuBlasHandle(), m_numRows * m_numCols * m_numMats, &alpha, rhs.m_d_data,
+                        1, m_d_data, 1));
     return *this;
 }
 
@@ -745,13 +745,13 @@ inline void DTensor<double>::leastSquares(DTensor<double> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<double*> As = pointersToMatrices();
@@ -775,13 +775,13 @@ inline void DTensor<float>::leastSquares(DTensor<float> &B) {
     size_t batchSize = numMats();
     size_t nColsB = B.numCols();
     if (B.numRows() != m_numRows)
-        throw std::invalid_argument("Least squares rhs rows does not equal lhs rows");
+        throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
     if (nColsB != 1)
-        throw std::invalid_argument("Least squares rhs are not vectors");
+        throw std::invalid_argument("[Least squares] rhs are not vectors");
     if (B.numMats() != batchSize)
-        throw std::invalid_argument("Least squares rhs numMats does not equal lhs numMats");
+        throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
     if (m_numCols > m_numRows)
-        throw std::invalid_argument("Least squares supports square or tall matrices only");
+        throw std::invalid_argument("[Least squares] supports square or tall matrices only");
     int info = 0;
     DTensor<int> infoArray(batchSize);
     DTensor<float*> As = pointersToMatrices();
@@ -899,7 +899,7 @@ private:
      */
     void checkMatrix(DTensor<T> &tensor) const {
         if (tensor.numRows() < tensor.numCols()) {
-            throw std::invalid_argument("your matrix is fat (no offence)");
+            throw std::invalid_argument("[svd] your matrix is fat (no offence)");
         }
     };
 
@@ -1021,17 +1021,17 @@ inline bool Svd<double>::factorise() {
         std::unique_ptr<DTensor<double>> Ui;
         if (m_computeU) Ui = std::make_unique<DTensor<double>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnDgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
@@ -1051,17 +1051,17 @@ inline bool Svd<float>::factorise() {
         std::unique_ptr<DTensor<float>> Ui;
         if (m_computeU) Ui = std::make_unique<DTensor<float>>(*m_U, 2, i, i);
         gpuErrChk(
-            cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
-                             (m_computeU) ? 'A' : 'N', 'A',
-                             m, n,
-                             Ai.raw(), m,
-                             Si.raw(),
-                             (m_computeU) ? Ui->raw() : nullptr, m,
-                             Vtri.raw(), n,
-                             m_workspace->raw(),
-                             m_lwork,
-                             nullptr, // rwork (used only if SVD fails)
-                             m_info->raw()));
+                cusolverDnSgesvd(Session::getInstance().cuSolverHandle(),
+                                 (m_computeU) ? 'A' : 'N', 'A',
+                                 m, n,
+                                 Ai.raw(), m,
+                                 Si.raw(),
+                                 (m_computeU) ? Ui->raw() : nullptr, m,
+                                 Vtri.raw(), n,
+                                 m_workspace->raw(),
+                                 m_lwork,
+                                 nullptr, // rwork (used only if SVD fails)
+                                 m_info->raw()));
         info = info && ((*m_info)(0, 0, 0) == 0);
     }
     return info;
@@ -1095,8 +1095,8 @@ private:
 
 public:
     CholeskyFactoriser(DTensor<T> &A) {
-        if (A.numMats() > 1) throw std::invalid_argument("3D tensors are not supported (for now); only matrices");
-        if (A.numRows() != A.numCols()) throw std::invalid_argument("Matrix A must be square for CF");
+        if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
         m_matrix = &A;
         computeWorkspaceSize();
         m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
@@ -1237,7 +1237,7 @@ template<typename T> TEMPLATE_CONSTRAINT_REQUIRES_FPX
 inline Nullspace<T>::Nullspace(DTensor<T> &a) {
     size_t m = a.numRows(), n = a.numCols(), nMats = a.numMats();
-    if (m > n) throw std::invalid_argument("I was expecting a square or fat matrix");
+    if (m > n) throw std::invalid_argument("[nullspace] I was expecting a square or fat matrix");
     m_nullspace = std::make_unique<DTensor<T>>(n, n, nMats, true);
     m_projOp = std::make_unique<DTensor<T>>(n, n, nMats, true);
     auto aTranspose = a.tr();
@@ -1276,20 +1276,26 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
 }
 
+/* ================================================================================================
+ *  CHOLESKY BATCH FACTORISATION (CBF)
+ * ================================================================================================ */
+
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
-class CholeskyMultiFactoriser {
+class CholeskyBatchFactoriser {
 
 private:
-    DTensor<T> &m_matrix; ///< Matrix to factorise (reference)
+    DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
     size_t m_numRows = 0;
     size_t m_numMats = 0;
     std::unique_ptr<DTensor<int>> m_deviceInfo;
     bool m_factorisationDone = false;
 
 public:
-    CholeskyMultiFactoriser() = delete;
+    CholeskyBatchFactoriser() = delete;
 
-    CholeskyMultiFactoriser(DTensor<T> &A) : m_matrix(A) {
+    CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
+        if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
+        m_matrix = &A;
         m_numRows = A.numRows();
         m_numMats = A.numMats();
         m_deviceInfo = std::make_unique<DTensor<int>>(m_numMats, 1, 1, true);
@@ -1300,36 +1306,72 @@ public:
      */
     void factorise();
 
-    int solve(DTensor<T> &b);
+    void solve(DTensor<T> &b);
 
 };
 
 template<>
-void CholeskyMultiFactoriser<double>::factorise() {
+void CholeskyBatchFactoriser<double>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<double*> ptrA = m_matrix.pointersToMatrices();
-    cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
-                            CUBLAS_FILL_MODE_LOWER,
-                            m_numRows,
-                            ptrA.raw(),
-                            m_numRows,
-                            m_deviceInfo->raw(),
-                            m_numMats);
+    DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
     m_factorisationDone = true;
 }
 
 template<>
-void CholeskyMultiFactoriser<float>::factorise() {
+void CholeskyBatchFactoriser<float>::factorise() {
     if (m_factorisationDone) return;
-    DTensor<float*> ptrA = m_matrix.pointersToMatrices();
-    cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
-                            CUBLAS_FILL_MODE_LOWER,
-                            m_numRows,
-                            ptrA.raw(),
-                            m_numRows,
-                            m_deviceInfo->raw(),
-                            m_numMats);
+    DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
     m_factorisationDone = true;
 }
 
+template<>
+void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<double*> ptrA = m_matrix->pointersToMatrices();
+    DTensor<double*> ptrB = b.pointersToMatrices();
+    gpuErrChk(cusolverDnDpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, ///< only supports rhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
+template<>
+void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
+    if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+    if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
+    DTensor<float*> ptrA = m_matrix->pointersToMatrices();
+    DTensor<float*> ptrB = b.pointersToMatrices();
+    gpuErrChk(cusolverDnSpotrsBatched(Session::getInstance().cuSolverHandle(),
+                                      CUBLAS_FILL_MODE_LOWER,
+                                      m_numRows,
+                                      1, ///< only supports rhs = 1
+                                      ptrA.raw(),
+                                      m_numRows,
+                                      ptrB.raw(),
+                                      m_numRows,
+                                      m_deviceInfo->raw(),
+                                      m_numMats));
+}
+
 #endif
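To make the review concrete, here is a hedged round-trip sketch of the API after this patch (illustrative only; uploads elided, the full versions live in the unit tests below):

    // Sketch: solve A x = b for a batch of SPD systems.
    DTensor<double> A(3, 3, 2);                 // batch of two SPD 3x3 matrices (upload as in the tests)
    DTensor<double> b(3, 1, 2);                 // one right-hand-side column per matrix
    CholeskyBatchFactoriser<double> chol(A);
    chol.factorise();                           // A now holds the lower factors
    chol.solve(b);                              // b is overwritten with the solutions x,
                                                // since cusolverDn<t>potrsBatched works in place

Note that the cuSOLVER batched solver only supports a single right-hand-side column (nrhs = 1), which is why solve() rejects any b with more than one column.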
diff --git a/test/testTensor.cu b/test/testTensor.cu
index a46a803..d6e4be3 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -733,7 +733,7 @@ TEST_F(SvdTest, singularValuesMemory) {
  * --------------------------------------- */
 template<typename T>
 requires std::floating_point<T>
-void singularValuesMutlipleMatrices(float epsilon) {
+void singularValuesMultipleMatrices(float epsilon) {
     std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
     DTensor<T> A(aData, 3, 2, 3);
     Svd<T> svd(A, true); // do compute U (A will be destroyed)
@@ -753,15 +753,15 @@ void singularValuesMutlipleMatrices(float epsilon) {
     S.download(actual_s);
     for (size_t i = 0; i < 6; i++) EXPECT_NEAR(expected_s[i], actual_s[i], epsilon);
     std::vector<T> expected_u = {
-        -0.428667133548626, -0.566306918848035, -0.703946704147444,
-        0.805963908589298, 0.112382414096594, -0.581199080396110,
-        0.408248290463863, -0.816496580927726, 0.408248290463863,
-        -0.577350269189626, -0.577350269189626, -0.577350269189626,
-        0.816496580927726, -0.408248290463863, -0.408248290463863,
-        0.000000000000000, -0.707106781186548, 0.707106781186547,
-        0, 0, -1,
-        1, 0, 0,
-        0, -1, 0,
+            -0.428667133548626, -0.566306918848035, -0.703946704147444,
+            0.805963908589298, 0.112382414096594, -0.581199080396110,
+            0.408248290463863, -0.816496580927726, 0.408248290463863,
+            -0.577350269189626, -0.577350269189626, -0.577350269189626,
+            0.816496580927726, -0.408248290463863, -0.408248290463863,
+            0.000000000000000, -0.707106781186548, 0.707106781186547,
+            0, 0, -1,
+            1, 0, 0,
+            0, -1, 0,
     };
     std::vector<T> actual_u(27);
     U->download(actual_u);
@@ -769,9 +769,9 @@ void singularValuesMutlipleMatrices(float epsilon) {
 }
 
-TEST_F(SvdTest, singularValuesMutlipleMatrices) {
-    singularValuesMutlipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
-    singularValuesMutlipleMatrices<double>(PRECISION_HIGH);
+TEST_F(SvdTest, singularValuesMultipleMatrices) {
+    singularValuesMultipleMatrices<float>(10 * PRECISION_LOW); // SVD with float performs quite poorly
+    singularValuesMultipleMatrices<double>(PRECISION_HIGH);
 }
 
 
@@ -870,6 +870,121 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
     choleskyFactorisationSolution<double>(PRECISION_HIGH);
 }
 
+/* ---------------------------------------
+ * Batched Cholesky factorisation
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorisation(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    CholeskyBatchFactoriser<T> chol(A);
+    chol.factorise();
+    // 0
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 0), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 0), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 0), epsilon);
+    // 1
+    EXPECT_NEAR(3.162277660168380, A(0, 0, 1), epsilon);
+    EXPECT_NEAR(-0.361403161162101, A(2, 1, 1), epsilon);
+    EXPECT_NEAR(5.382321781081287, A(2, 2, 1), epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorisation) {
+    choleskyBatchFactorisation<float>(PRECISION_LOW);
+    choleskyBatchFactorisation<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchFactorSolve(T epsilon) {
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
+    DTensor<T> L(A); // L = A
+    CholeskyBatchFactoriser<T> chol(L);
+    chol.factorise();
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+    DTensor<T> error = A * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
+    choleskyBatchFactorSolve<float>(PRECISION_LOW);
+    choleskyBatchFactorSolve<double>(PRECISION_HIGH);
+}
+
+/* ---------------------------------------
+ * Batched Cholesky solve (factor provided)
+ * --------------------------------------- */
+
+template<typename T>
+requires std::floating_point<T>
+void choleskyBatchSolve(T epsilon) {
+    /* originalMatrix = {10.0, 2.0, 3.0,
+                         2.0, 20.0, -1.0,
+                         3.0, -1.0, 30.0}; */
+    std::vector<T> lowData = {3.162277660168380, 0, 0,
+                              0.632455532033676, 4.427188724235731, 0,
+                              0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
+    DTensor<T> low(3, 3, 2);
+    DTensor<T> low0(low, 2, 0, 0);
+    DTensor<T> low1(low, 2, 1, 1);
+    low0.upload(lowData);
+    low1.upload(lowData);
+    CholeskyBatchFactoriser<T> chol(low, true);
+    std::vector<T> bData = {-1., -3., 5.};
+    DTensor<T> rhs(3, 1, 2);
+    DTensor<T> rhs0(rhs, 2, 0, 0);
+    DTensor<T> rhs1(rhs, 2, 1, 1);
+    rhs0.upload(bData);
+    rhs1.upload(bData);
+    DTensor<T> sol(rhs);
+    chol.solve(sol);
+    std::vector<T> expected = {-0.126805213103205, -0.128566396618528, 0.175061641423036};
+    std::vector<T> actual(6);
+    sol.download(actual);
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
+    for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
+    DTensor<T> error = low * sol;
+    error -= rhs;
+    EXPECT_TRUE(error.normF() < epsilon);
+}
+
+TEST_F(CholeskyTest, choleskyBatchSolve) {
+    choleskyBatchSolve<float>(PRECISION_LOW);
+    choleskyBatchSolve<double>(PRECISION_HIGH);
+}
+
 /* ================================================================================================
  * NULLSPACE TESTS
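One detail worth calling out from choleskyBatchFactorSolve above: factorise() overwrites its input, so the test deep-copies A before factorising in order to form the residual A*x - b afterwards. A condensed sketch of that pattern (names illustrative, assuming A and rhs are set up as in the test):

    DTensor<double> L(A);                        // deep copy: A stays intact
    CholeskyBatchFactoriser<double> chol(L);
    chol.factorise();                            // overwrites L, not A
    DTensor<double> sol(rhs);                    // copy the rhs, since solve() is in place
    chol.solve(sol);                             // sol <- x
    DTensor<double> error = A * sol;             // batched product with the untouched A
    error -= rhs;                                // error = A*x - b, expected ~0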
@@ -983,7 +1098,7 @@ void projectOnNullspaceTensor(T epsilon) {
     DTensor<T> delta1 = y - proj;
     DTensor<T> delta2 = proj - x;
     EXPECT_LT(delta1.dotF(delta2), epsilon);
-    }
+}
 
 TEST_F(NullspaceTest, projectOnNullspaceTensor) {
     projectOnNullspaceTensor<float>(PRECISION_LOW);

From afd313cfe4d527e37f15ae1762a5a4e71c0b2dc8 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Tue, 7 May 2024 16:50:33 +0100
Subject: [PATCH 3/3] rowMajor

---
 include/tensor.cuh | 25 ++++++++++++++++++++-----
 test/testTensor.cu | 20 +++++++++++++-------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/include/tensor.cuh b/include/tensor.cuh
index dca144d..cf7003c 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1280,19 +1280,30 @@ inline void Nullspace<T>::project(DTensor<T> &b) {
  *  CHOLESKY BATCH FACTORISATION (CBF)
  * ================================================================================================ */
 
+/**
+ * Cholesky Factorisation (CF) for batches of matrices (tensors).
+ * This object can factorise and then solve,
+ * or you can provide pre-computed Cholesky lower-triangular matrices and then solve.
+ * @tparam T data type of tensor (must be float or double)
+ */
 TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
 class CholeskyBatchFactoriser {
 
 private:
-    DTensor<T> *m_matrix; ///< Matrix to factorise (reference)
-    size_t m_numRows = 0;
-    size_t m_numMats = 0;
-    std::unique_ptr<DTensor<int>> m_deviceInfo;
-    bool m_factorisationDone = false;
+    DTensor<T> *m_matrix; ///< Matrix to factorise or lower-triangular decomposed matrix
+    size_t m_numRows = 0; ///< Number of rows in `m_matrix`
+    size_t m_numMats = 0; ///< Number of matrices in `m_matrix`
+    std::unique_ptr<DTensor<int>> m_deviceInfo; ///< Info from cusolver functions
+    bool m_factorisationDone = false; ///< Whether `m_matrix` holds original or lower-triangular matrix
 
 public:
     CholeskyBatchFactoriser() = delete;
 
+    /**
+     * Constructor
+     * @param A either matrices to be factorised or lower-triangular Cholesky decomposition matrices
+     * @param factorised whether A is the original matrices or the factorised ones (default=original matrices)
+     */
     CholeskyBatchFactoriser(DTensor<T> &A, bool factorised=false) : m_factorisationDone(factorised) {
         if (A.numRows() != A.numCols()) throw std::invalid_argument("[CholeskyBatch] A must be square");
         m_matrix = &A;
@@ -1306,6 +1317,10 @@ public:
      */
     void factorise();
 
+    /**
+     * Solve the system A\b using Cholesky lower-triangular matrix of A (batched).
+     * @param b vector in system Ax=b to be solved where x is the solution
+     */
     void solve(DTensor<T> &b);
 
 };

diff --git a/test/testTensor.cu b/test/testTensor.cu
index d6e4be3..5291c68 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -950,18 +950,24 @@
 template<typename T>
 requires std::floating_point<T>
 void choleskyBatchSolve(T epsilon) {
-    /* originalMatrix = {10.0, 2.0, 3.0,
-                         2.0, 20.0, -1.0,
-                         3.0, -1.0, 30.0}; */
+    std::vector<T> aData = {10.0, 2.0, 3.0,
+                            2.0, 20.0, -1.0,
+                            3.0, -1.0, 30.0};
+    DTensor<T> A(3, 3, 2);
+    DTensor<T> A0(A, 2, 0, 0);
+    DTensor<T> A1(A, 2, 1, 1);
+    A0.upload(aData);
+    A1.upload(aData);
     std::vector<T> lowData = {3.162277660168380, 0, 0,
                               0.632455532033676, 4.427188724235731, 0,
                               0.948683298050514, -0.361403161162101, 5.382321781081287}; // from matlab
     DTensor<T> low(3, 3, 2);
     DTensor<T> low0(low, 2, 0, 0);
     DTensor<T> low1(low, 2, 1, 1);
-    low0.upload(lowData);
-    low1.upload(lowData);
-    CholeskyBatchFactoriser<T> chol(low, true);
+    low0.upload(lowData, rowMajor);
+    low1.upload(lowData, rowMajor);
+    DTensor<T> L(low);
+    CholeskyBatchFactoriser<T> chol(L, true);
     std::vector<T> bData = {-1., -3., 5.};
     DTensor<T> rhs(3, 1, 2);
     DTensor<T> rhs0(rhs, 2, 0, 0);
@@ -975,7 +981,7 @@ void choleskyBatchSolve(T epsilon) {
     sol.download(actual);
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i], epsilon); // 0
     for (size_t i = 0; i < 3; i++) EXPECT_NEAR(expected[i], actual[i + 3], epsilon); // 1
-    DTensor<T> error = low * sol;
+    DTensor<T> error = A * sol;
     error -= rhs;
     EXPECT_TRUE(error.normF() < epsilon);
 }
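Finally, a hedged sketch of the pre-factorised path this patch exercises: the MATLAB-produced lower factor is written row by row on the host, hence the rowMajor upload, and factorised=true tells the object to skip factorise(). This is illustrative only (single-matrix batch; it assumes the vector-based DTensor constructor used in the SVD tests):

    // Sketch: supply a precomputed lower Cholesky factor and solve directly.
    std::vector<double> lowData = {3.162277660168380, 0, 0,
                                   0.632455532033676, 4.427188724235731, 0,
                                   0.948683298050514, -0.361403161162101, 5.382321781081287};
    DTensor<double> L(3, 3, 1);
    L.upload(lowData, rowMajor);                   // host data is laid out row by row
    CholeskyBatchFactoriser<double> chol(L, true); // factorised=true: L is already the lower factor
    DTensor<double> b(std::vector<double>{-1., -3., 5.}, 3, 1, 1);
    chol.solve(b);                                 // b <- x, where (L L^T) x = b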