QR decomposition and least squares #41

Merged: 3 commits, Oct 4, 2024
Changes from 2 commits
README.md — 2 changes: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ DTensor<float> B(bData, m);
Then, we can solve the system by

```c++
-A.leastSquares(B);
+A.leastSquaresBatched(B);
```

The `DTensor` `B` will be overwritten with the solution.
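For context, a fuller sketch of the batched call (a hedged example: the shapes and data here are illustrative, and `createRandomTensor` is borrowed from the tests added in this PR):

```c++
// Sketch: solve k = 3 independent 3x2 systems A_i x_i = b_i.
size_t m = 3, n = 2, k = 3;
DTensor<float> A = DTensor<float>::createRandomTensor(m, n, k, -1, 1);
std::vector<float> bData(m * k, 1.f);  // illustrative right-hand sides
DTensor<float> B(bData, m, 1, k);      // one rhs vector per matrix
A.leastSquaresBatched(B);              // B is overwritten; the first n rows of
                                       // each matrix of B hold the solution x_i
```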
include/tensor.cuh — 244 changes: 232 additions & 12 deletions
@@ -398,7 +398,7 @@ public:
* @param b provided tensor
* @return least squares solution (overwrites b)
*/
-void leastSquares(DTensor &b);
+void leastSquaresBatched(DTensor &b);

/**
* Batched `C <- bC + a*A*B`.
@@ -884,17 +884,17 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
}

template<>
-inline void DTensor<double>::leastSquares(DTensor &B) {
+inline void DTensor<double>::leastSquaresBatched(DTensor &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
throw std::invalid_argument("[Least squares batched] rhs rows does not equal lhs rows");
if (nColsB != 1)
throw std::invalid_argument("[Least squares] rhs are not vectors");
throw std::invalid_argument("[Least squares batched] rhs are not vectors");
if (B.numMats() != batchSize)
throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
throw std::invalid_argument("[Least squares batched] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares] supports square or tall matrices only");
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<double *> As = pointersToMatrices();
@@ -914,17 +914,17 @@ inline void DTensor<double>::leastSquares(DTensor &B) {
}

template<>
-inline void DTensor<float>::leastSquares(DTensor &B) {
+inline void DTensor<float>::leastSquaresBatched(DTensor &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
throw std::invalid_argument("[Least squares batched] rhs rows does not equal lhs rows");
if (nColsB != 1)
throw std::invalid_argument("[Least squares] rhs are not vectors");
throw std::invalid_argument("[Least squares batched] rhs are not vectors");
if (B.numMats() != batchSize)
throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
throw std::invalid_argument("[Least squares batched] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares] supports square or tall matrices only");
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<float *> As = pointersToMatrices();
@@ -1238,7 +1238,7 @@ private:
public:

CholeskyFactoriser(DTensor<T> &A) {
if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser");
if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
m_matrix = &A;
computeWorkspaceSize();
@@ -1328,6 +1328,226 @@ inline int CholeskyFactoriser<float>::solve(DTensor<float> &rhs) {
}


/* ================================================================================================
* QR DECOMPOSITION (QR)
* ================================================================================================ */

/**
* QR decomposition needs a cuSolver workspace to be set up before factorisation.
* This object is set up for a specific type and size of (m,n,1)-tensor (i.e., a matrix).
* Many tensors of the same type and shape can then be factorised reusing this object's workspace.
* @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double)
*/
TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
class QRFactoriser {

private:
int m_workspaceSize = 0; ///< Size of workspace needed for QR
std::unique_ptr<DTensor<int>> m_info; ///< Status code of computation
std::unique_ptr<DTensor<T>> m_householder; ///< For storing Householder reflectors
std::unique_ptr<DTensor<T>> m_workspace; ///< Workspace for QR
DTensor<T> *m_matrix; ///< Lhs matrix template. Do not destroy!

/**
* Computes the workspace size required by cuSolver.
*/
void computeWorkspaceSize();

public:

QRFactoriser(DTensor<T> &A) {
if (A.numMats() > 1) throw std::invalid_argument("[QRFactoriser] 3D tensors require `leastSquaresBatched`");
if (A.numRows() < A.numCols()) throw std::invalid_argument("[QRFactoriser] Matrix A must be tall or square");
m_matrix = &A;
computeWorkspaceSize();
m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
m_householder = std::make_unique<DTensor<T>>(m_matrix->numCols());
m_info = std::make_unique<DTensor<int>>(1);
}

/**
* Factorise matrix.
* @return status code of computation
*/
int factorise();

/**
* Solves the least-squares problem A \ b using the QR factorisation of A.
* A is the factorised matrix and b is the provided right-hand side.
* A and b must have the same number of rows and a single matrix each (numMats = 1).
* A must be tall or square (m >= n).
* @param b provided matrix
* @return status code of computation
*/
int leastSquares(DTensor<T> &);

/**
* Populate the given tensors with Q and R.
* Caution! This is an inefficient method: only to be used for debugging.
* @return status code of computation (Q and R are overwritten)
*/
int getQR(DTensor<T> &, DTensor<T> &);

};
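The intended workflow, mirroring the tests added in this PR (a hedged sketch; note that `factorise` works in place, so keep a copy of A if you need it afterwards):

```c++
// Sketch: factorise one tall 4x3 matrix, then solve A \ b in place.
DTensor<double> A = DTensor<double>::createRandomTensor(4, 3, 1, -100, 100);
QRFactoriser<double> qr(A);      // sizes the cuSolver workspace for this shape
int status = qr.factorise();     // A now holds R and the Householder reflectors
std::vector<double> vecB = {1., 2., 3., 4.};  // illustrative right-hand side
DTensor<double> b(vecB, 4);                   // overwritten by the solve
status = qr.leastSquares(b);     // the first 3 entries of b hold the solution
```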

template<>
inline void QRFactoriser<double>::computeWorkspaceSize() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnDgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
m, n,
nullptr, m,
&m_workspaceSize));
}

template<>
inline void QRFactoriser<float>::computeWorkspaceSize() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnSgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
m, n,
nullptr, m,
&m_workspaceSize));
}

template<>
inline int QRFactoriser<double>::factorise() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnDgeqrf(Session::getInstance().cuSolverHandle(),
m, n,
m_matrix->raw(), m,
m_householder->raw(),
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
return (*m_info)(0);
}


template<>
inline int QRFactoriser<float>::factorise() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnSgeqrf(Session::getInstance().cuSolverHandle(),
m, n,
m_matrix->raw(), m,
m_householder->raw(),
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
return (*m_info)(0);
}
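Both specialisations call `geqrf`, which factorises in place: R ends up in the upper triangle of the matrix, the Householder vectors are stored below the diagonal, and the scalar factors go to `m_householder`. In the standard LAPACK representation,

```latex
A = QR, \qquad Q = H_1 H_2 \cdots H_n, \qquad H_i = I - \tau_i v_i v_i^\top .
```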

template<>
inline int QRFactoriser<double>::leastSquares(DTensor<double> &rhs) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
double alpha = 1.;
gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
m_matrix->raw(), m,
m_householder->raw(),
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
rhs.raw(), m));
return (*m_info)(0);
}

template<>
inline int QRFactoriser<float>::leastSquares(DTensor<float> &rhs) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
float alpha = 1.;
gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
m_matrix->raw(), m,
m_householder->raw(),
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
rhs.raw(), m));
return (*m_info)(0);
}
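For reference, the two calls above implement the standard thin-QR least-squares solve: `ormqr` overwrites the right-hand side with Qᵀb using the stored reflectors, and `trsm` back-solves the n-by-n upper-triangular system formed by its first n entries,

```latex
\min_x \|Ax - b\|_2, \qquad A = QR
\quad \Longrightarrow \quad
R\,x = (Q^\top b)_{1:n} .
```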

template<>
inline int QRFactoriser<double>::getQR(DTensor<double> &Q, DTensor<double> &R) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
if (Q.numRows() != m || Q.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of Q.");
if (R.numRows() != n || R.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of R.");
// Initialise Q to the first n columns of the m-by-m identity
std::vector<double> vecQ(m * n, 0.);
for (size_t r = 0; r < m; r++) {
for (size_t c = 0; c < n; c++) {
if (r == c) { vecQ[r * n + c] = 1.; }
}
}
Q.upload(vecQ, rowMajor);
// Apply Householder reflectors to compute Q
gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
m_matrix->raw(), m,
m_householder->raw(),
Q.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
// Extract upper triangular R
std::vector<double> vecR(n * n, 0.);
for (size_t r = 0; r < n; r++) {
for (size_t c = r; c < n; c++) {
vecR[r * n + c] = (*m_matrix)(r, c);
}
}
R.upload(vecR, rowMajor);
return (*m_info)(0);
}

template<>
inline int QRFactoriser<float>::getQR(DTensor<float> &Q, DTensor<float> &R) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
if (Q.numRows() != m || Q.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of Q.");
if (R.numRows() != n || R.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of R.");
// Initialise Q to the first n columns of the m-by-m identity
std::vector<float> vecQ(m * n, 0.);
for (size_t r = 0; r < m; r++) {
for (size_t c = 0; c < n; c++) {
if (r == c) { vecQ[r * n + c] = 1.; }
}
}
Q.upload(vecQ, rowMajor);
// Apply Householder reflectors to compute Q
gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
m_matrix->raw(), m,
m_householder->raw(),
Q.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
// Extract upper triangular R
std::vector<float> vecR(n * n, 0.);
for (size_t r = 0; r < n; r++) {
for (size_t c = r; c < n; c++) {
vecR[r * n + c] = (*m_matrix)(r, c);
}
}
R.upload(vecR, rowMajor);
return (*m_info)(0);
}
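Both `getQR` specialisations work the same way: Q is materialised by applying the stored reflectors (`ormqr` with `CUBLAS_OP_N`) to an m-by-n matrix holding the first n columns of the identity, and R is copied element-wise from the upper triangle of the factorised matrix, hence the efficiency caveat. A quick consistency check, mirroring the `qrFactorisation` test below (assumes `A0` is a copy of the matrix taken before `factorise()` and `m`, `n` are its dimensions):

```c++
// Sketch: recover Q and R after qr.factorise() and confirm A0 ≈ Q R.
DTensor<double> Q(m, n);
DTensor<double> R(n, n, 1, true);  // zero-initialised
qr.getQR(Q, R);
DTensor<double> QR(m, n);
QR.addAB(Q, R);                    // QR <- Q * R
QR -= A0;                          // residual; QR.normF() should be ~0
```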


/* ================================================================================================
* Nullspace (N)
* ================================================================================================ */
test/testTensor.cu — 84 changes: 83 additions & 1 deletion
@@ -698,7 +698,7 @@ void tensorLeastSquares1(T epsilon) {
DTensor<T> A(A0);
DTensor<T> B(bData, 2, 1, 3);
DTensor<T> sol(B);
-A0.leastSquares(sol);
+A0.leastSquaresBatched(sol);
DTensor<T> C(2, 1, 3);
C.addAB(A, sol);
C -= B;
@@ -1038,6 +1038,88 @@ TEST_F(CholeskyTest, choleskyBatchSolve) {
}


/* ================================================================================================
* QR TESTS
* ================================================================================================ */
class QRTest : public testing::Test {
protected:
QRTest() {}

virtual ~QRTest() {}
};


/* ---------------------------------------
* QR factorisation
* --------------------------------------- */

TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
void qrFactorisation(T epsilon) {
size_t nR = 4;
size_t nC = 3;
DTensor<T> temp(nR, nC);
DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
QRFactoriser<T> qr(temp);
A.deviceCopyTo(temp);
int status = qr.factorise();
EXPECT_EQ(status, 0);
DTensor<T> Q(nR, nC);
DTensor<T> R(nC, nC, 1, true);
DTensor<T> QR(nR, nC);
status = qr.getQR(Q, R);
EXPECT_EQ(status, 0);
QR.addAB(Q, R);
QR -= A;
T nrm = QR.normF();
EXPECT_NEAR(nrm, 0., epsilon);
}

TEST_F(QRTest, qrFactorisation) {
qrFactorisation<float>(PRECISION_LOW);
qrFactorisation<double>(PRECISION_HIGH);
}

/* ---------------------------------------
* QR factorisation: least-squares solve
* --------------------------------------- */

TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
void qrLeastSquares(T epsilon) {
size_t nR = 4;
size_t nC = 3;
DTensor<T> temp(nR, nC);
std::vector<T> vecA = { 85.5638, -59.4001, -80.1992,
99.9464, 5.51393, 5.17935,
6.87488, -26.7536, 36.0914,
-44.3857, -32.1268, 54.8915 }; // Random matrix
std::vector<T> vecB = { -23.3585,
-48.5744,
43.4229,
-56.5081 }; // Random vector
DTensor<T> A(vecA, nR, nC, 1, rowMajor);
DTensor<T> b(vecB, nR);
DTensor<T> xFull(nR);
DTensor<T> x(xFull, 0, 0, nC - 1);
DTensor<T> Ax(nR);
QRFactoriser<T> qr(temp);
A.deviceCopyTo(temp);
int status = qr.factorise();
EXPECT_EQ(status, 0);
b.deviceCopyTo(xFull);
status = qr.leastSquares(xFull);
EXPECT_EQ(status, 0);
Ax.addAB(A, x);
Ax -= b;
T nrm = Ax.normF();
EXPECT_NEAR(nrm, 80.003169364198072, epsilon); // From MATLAB
}

TEST_F(QRTest, qrLeastSquares) {
qrLeastSquares<float>(PRECISION_LOW);
qrLeastSquares<double>(PRECISION_HIGH);
}


/* ================================================================================================
* NULLSPACE TESTS
* ================================================================================================ */