QR decomposition and least squares #41

Merged: 3 commits, Oct 4, 2024
Changes from 2 commits
README.md — 2 changes: 1 addition & 1 deletion
@@ -236,7 +236,7 @@ DTensor<float> B(bData, m);
Then, we can solve the system by

```c++
-A.leastSquares(B);
+A.leastSquaresBatched(B);
```

The `DTensor` `B` will be overwritten with the solution.
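For context, a fuller sketch of the batched call (a hedged example: the shapes and data here are illustrative, and `createRandomTensor` is borrowed from the tests added in this PR):

```c++
// Sketch: solve k = 3 independent 3x2 systems A_i x_i = b_i.
size_t m = 3, n = 2, k = 3;
DTensor<float> A = DTensor<float>::createRandomTensor(m, n, k, -1, 1);
std::vector<float> bData(m * k, 1.f);  // illustrative right-hand sides
DTensor<float> B(bData, m, 1, k);      // one rhs vector per matrix
A.leastSquaresBatched(B);              // B is overwritten; the first n rows of
                                       // each matrix of B hold the solution x_i
```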
include/tensor.cuh — 244 changes: 232 additions & 12 deletions
@@ -398,7 +398,7 @@ public:
* @param b provided tensor
* @return least squares solution (overwrites b)
*/
-void leastSquares(DTensor &b);
+void leastSquaresBatched(DTensor &b);

/**
* Batched `C <- bC + a*A*B`.
@@ -884,17 +884,17 @@ inline void DTensor<float>::addAB(const DTensor<float> &A, const DTensor<float>
}

template<>
-inline void DTensor<double>::leastSquares(DTensor &B) {
+inline void DTensor<double>::leastSquaresBatched(DTensor &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
throw std::invalid_argument("[Least squares batched] rhs rows does not equal lhs rows");
if (nColsB != 1)
throw std::invalid_argument("[Least squares] rhs are not vectors");
throw std::invalid_argument("[Least squares batched] rhs are not vectors");
if (B.numMats() != batchSize)
throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
throw std::invalid_argument("[Least squares batched] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares] supports square or tall matrices only");
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<double *> As = pointersToMatrices();
@@ -914,17 +914,17 @@ inline void DTensor<double>::leastSquares(DTensor &B) {
}

template<>
-inline void DTensor<float>::leastSquares(DTensor &B) {
+inline void DTensor<float>::leastSquaresBatched(DTensor &B) {
size_t batchSize = numMats();
size_t nColsB = B.numCols();
if (B.numRows() != m_numRows)
throw std::invalid_argument("[Least squares] rhs rows does not equal lhs rows");
throw std::invalid_argument("[Least squares batched] rhs rows does not equal lhs rows");
if (nColsB != 1)
throw std::invalid_argument("[Least squares] rhs are not vectors");
throw std::invalid_argument("[Least squares batched] rhs are not vectors");
if (B.numMats() != batchSize)
throw std::invalid_argument("[Least squares] rhs numMats does not equal lhs numMats");
throw std::invalid_argument("[Least squares batched] rhs numMats does not equal lhs numMats");
if (m_numCols > m_numRows)
throw std::invalid_argument("[Least squares] supports square or tall matrices only");
throw std::invalid_argument("[Least squares batched] supports square or tall matrices only");
int info = 0;
DTensor<int> infoArray(batchSize);
DTensor<float *> As = pointersToMatrices();
@@ -1238,7 +1238,7 @@ private:
public:

CholeskyFactoriser(DTensor<T> &A) {
if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser");
if (A.numMats() > 1) throw std::invalid_argument("[Cholesky] 3D tensors require `CholeskyBatchFactoriser`");
if (A.numRows() != A.numCols()) throw std::invalid_argument("[Cholesky] Matrix A must be square");
m_matrix = &A;
computeWorkspaceSize();
@@ -1328,6 +1328,226 @@ inline int CholeskyFactoriser<float>::solve(DTensor<float> &rhs) {
}


/* ================================================================================================
* QR DECOMPOSITION (QR)
* ================================================================================================ */

/**
* QR decomposition needs a cuSolver workspace to be set up before factorisation.
* This object is set up for a specific type and size of (m,n,1)-tensor (i.e., a matrix).
* Many tensors of the same type and shape can then be factorised reusing this object's workspace.
* @tparam T data type of (m,n,1)-tensor to be factorised (must be float or double)
*/
TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
class QRFactoriser {

private:
int m_workspaceSize = 0; ///< Size of workspace needed for QR
std::unique_ptr<DTensor<int>> m_info; ///< Status code of computation
std::unique_ptr<DTensor<T>> m_householder; ///< For storing Householder reflectors
std::unique_ptr<DTensor<T>> m_workspace; ///< Workspace for QR
DTensor<T> *m_matrix; ///< Lhs matrix template. Do not destroy!

/**
* Computes the workspace size required by cuSolver.
*/
void computeWorkspaceSize();

public:

QRFactoriser(DTensor<T> &A) {
if (A.numMats() > 1) throw std::invalid_argument("[QRFactoriser] 3D tensors require `leastSquaresBatched`");
if (A.numRows() < A.numCols()) throw std::invalid_argument("[QRFactoriser] Matrix A must be tall or square");
m_matrix = &A;
computeWorkspaceSize();
m_workspace = std::make_unique<DTensor<T>>(m_workspaceSize);
m_householder = std::make_unique<DTensor<T>>(m_matrix->numCols());
m_info = std::make_unique<DTensor<int>>(1);
}

/**
* Factorise matrix.
* @return status code of computation
*/
int factorise();

/**
* Solves the least-squares problem A \ b using the QR factorisation of A.
* A is the factorised matrix and b is the provided right-hand side.
* A and b must have the same number of rows and a single matrix each (numMats = 1).
* A must be tall or square (m >= n).
* @param b provided matrix
* @return status code of computation
*/
int leastSquares(DTensor<T> &);

/**
* Populate the given tensors with Q and R.
* Caution! This is an inefficient method: only to be used for debugging.
* @return status code of computation (Q and R are overwritten)
*/
int getQR(DTensor<T> &, DTensor<T> &);

};
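The intended workflow, mirroring the tests added in this PR (a hedged sketch; note that `factorise` works in place, so keep a copy of A if you need it afterwards):

```c++
// Sketch: factorise one tall 4x3 matrix, then solve A \ b in place.
DTensor<double> A = DTensor<double>::createRandomTensor(4, 3, 1, -100, 100);
QRFactoriser<double> qr(A);      // sizes the cuSolver workspace for this shape
int status = qr.factorise();     // A now holds R and the Householder reflectors
std::vector<double> vecB = {1., 2., 3., 4.};  // illustrative right-hand side
DTensor<double> b(vecB, 4);                   // overwritten by the solve
status = qr.leastSquares(b);     // the first 3 entries of b hold the solution
```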

template<>
inline void QRFactoriser<double>::computeWorkspaceSize() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnDgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
m, n,
nullptr, m,
&m_workspaceSize));
}

template<>
inline void QRFactoriser<float>::computeWorkspaceSize() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnSgeqrf_bufferSize(Session::getInstance().cuSolverHandle(),
m, n,
nullptr, m,
&m_workspaceSize));
}

template<>
inline int QRFactoriser<double>::factorise() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnDgeqrf(Session::getInstance().cuSolverHandle(),
m, n,
m_matrix->raw(), m,
m_householder->raw(),
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
return (*m_info)(0);
}


template<>
inline int QRFactoriser<float>::factorise() {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
gpuErrChk(cusolverDnSgeqrf(Session::getInstance().cuSolverHandle(),
m, n,
m_matrix->raw(), m,
m_householder->raw(),
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
return (*m_info)(0);
}
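Both specialisations call `geqrf`, which factorises in place: R ends up in the upper triangle of the matrix, the Householder vectors are stored below the diagonal, and the scalar factors go to `m_householder`. In the standard LAPACK representation,

```latex
A = QR, \qquad Q = H_1 H_2 \cdots H_n, \qquad H_i = I - \tau_i v_i v_i^\top .
```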

template<>
inline int QRFactoriser<double>::leastSquares(DTensor<double> &rhs) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
double alpha = 1.;
gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
m_matrix->raw(), m,
m_householder->raw(),
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
gpuErrChk(cublasDtrsm(Session::getInstance().cuBlasHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
rhs.raw(), m));
return (*m_info)(0);
}

template<>
inline int QRFactoriser<float>::leastSquares(DTensor<float> &rhs) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
float alpha = 1.;
gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, 1, n,
m_matrix->raw(), m,
m_householder->raw(),
rhs.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
gpuErrChk(cublasStrsm(Session::getInstance().cuBlasHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT, n, 1,
&alpha,
m_matrix->raw(), m,
rhs.raw(), m));
return (*m_info)(0);
}
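For reference, the two calls above implement the standard thin-QR least-squares solve: `ormqr` overwrites the right-hand side with Qᵀb using the stored reflectors, and `trsm` back-solves the n-by-n upper-triangular system formed by its first n entries,

```latex
\min_x \|Ax - b\|_2, \qquad A = QR
\quad \Longrightarrow \quad
R\,x = (Q^\top b)_{1:n} .
```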

template<>
inline int QRFactoriser<double>::getQR(DTensor<double> &Q, DTensor<double> &R) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
if (Q.numRows() != m || Q.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of Q.");
if (R.numRows() != n || R.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of R.");
// Initialise Q to the first n columns of the m-by-m identity
std::vector<double> vecQ(m * n, 0.);
for (size_t r = 0; r < m; r++) {
for (size_t c = 0; c < n; c++) {
if (r == c) { vecQ[r * n + c] = 1.; }
}
}
Q.upload(vecQ, rowMajor);
// Apply Householder reflectors to compute Q
gpuErrChk(cusolverDnDormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
m_matrix->raw(), m,
m_householder->raw(),
Q.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
// Extract upper triangular R
std::vector<double> vecR(n * n, 0.);
for (size_t r = 0; r < n; r++) {
for (size_t c = r; c < n; c++) {
vecR[r * n + c] = (*m_matrix)(r, c);
}
}
R.upload(vecR, rowMajor);
return (*m_info)(0);
}

template<>
inline int QRFactoriser<float>::getQR(DTensor<float> &Q, DTensor<float> &R) {
size_t m = m_matrix->numRows();
size_t n = m_matrix->numCols();
if (Q.numRows() != m || Q.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of Q.");
if (R.numRows() != n || R.numCols() != n)
throw std::invalid_argument("[QRFactoriser] invalid shape of R.");
// Initialise Q to the first n columns of the m-by-m identity
std::vector<float> vecQ(m * n, 0.);
for (size_t r = 0; r < m; r++) {
for (size_t c = 0; c < n; c++) {
if (r == c) { vecQ[r * n + c] = 1.; }
}
}
Q.upload(vecQ, rowMajor);
// Apply Householder reflectors to compute Q
gpuErrChk(cusolverDnSormqr(Session::getInstance().cuSolverHandle(),
CUBLAS_SIDE_LEFT, CUBLAS_OP_N, m, n, n,
m_matrix->raw(), m,
m_householder->raw(),
Q.raw(), m,
m_workspace->raw(), m_workspaceSize,
m_info->raw()));
// Extract upper triangular R
std::vector<float> vecR(n * n, 0.);
for (size_t r = 0; r < n; r++) {
for (size_t c = r; c < n; c++) {
vecR[r * n + c] = (*m_matrix)(r, c);
}
}
R.upload(vecR, rowMajor);
return (*m_info)(0);
}
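Both `getQR` specialisations work the same way: Q is materialised by applying the stored reflectors (`ormqr` with `CUBLAS_OP_N`) to an m-by-n matrix holding the first n columns of the identity, and R is copied element-wise from the upper triangle of the factorised matrix, hence the efficiency caveat. A quick consistency check, mirroring the `qrFactorisation` test below (assumes `A0` is a copy of the matrix taken before `factorise()` and `m`, `n` are its dimensions):

```c++
// Sketch: recover Q and R after qr.factorise() and confirm A0 ≈ Q R.
DTensor<double> Q(m, n);
DTensor<double> R(n, n, 1, true);  // zero-initialised
qr.getQR(Q, R);
DTensor<double> QR(m, n);
QR.addAB(Q, R);                    // QR <- Q * R
QR -= A0;                          // residual; QR.normF() should be ~0
```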


/* ================================================================================================
* Nullspace (N)
* ================================================================================================ */
test/testTensor.cu — 84 changes: 83 additions & 1 deletion
@@ -698,7 +698,7 @@ void tensorLeastSquares1(T epsilon) {
DTensor<T> A(A0);
DTensor<T> B(bData, 2, 1, 3);
DTensor<T> sol(B);
-A0.leastSquares(sol);
+A0.leastSquaresBatched(sol);
DTensor<T> C(2, 1, 3);
C.addAB(A, sol);
C -= B;
@@ -1038,6 +1038,88 @@ TEST_F(CholeskyTest, choleskyBatchSolve) {
}


/* ================================================================================================
* QR TESTS
* ================================================================================================ */
class QRTest : public testing::Test {
protected:
QRTest() {}

virtual ~QRTest() {}
};


/* ---------------------------------------
* QR factorisation
* --------------------------------------- */

TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
void qrFactorisation(T epsilon) {
size_t nR = 4;
size_t nC = 3;
DTensor<T> temp(nR, nC);
DTensor<T> A = DTensor<T>::createRandomTensor(nR, nC, 1, -100, 100);
QRFactoriser<T> qr(temp);
A.deviceCopyTo(temp);
int status = qr.factorise();
EXPECT_EQ(status, 0);
DTensor<T> Q(nR, nC);
DTensor<T> R(nC, nC, 1, true);
DTensor<T> QR(nR, nC);
status = qr.getQR(Q, R);
EXPECT_EQ(status, 0);
QR.addAB(Q, R);
QR -= A;
T nrm = QR.normF();
EXPECT_NEAR(nrm, 0., epsilon);
}

TEST_F(QRTest, qrFactorisation) {
qrFactorisation<float>(PRECISION_LOW);
qrFactorisation<double>(PRECISION_HIGH);
}

/* ---------------------------------------
* QR factorisation: least-squares solve
* --------------------------------------- */

TEMPLATE_WITH_TYPE_T TEMPLATE_CONSTRAINT_REQUIRES_FPX
void qrLeastSquares(T epsilon) {
size_t nR = 4;
size_t nC = 3;
DTensor<T> temp(nR, nC);
std::vector<T> vecA = { 85.5638, -59.4001, -80.1992,
99.9464, 5.51393, 5.17935,
6.87488, -26.7536, 36.0914,
-44.3857, -32.1268, 54.8915 }; // Random matrix
std::vector<T> vecB = { -23.3585,
-48.5744,
43.4229,
-56.5081 }; // Random vector
DTensor<T> A(vecA, nR, nC, 1, rowMajor);
DTensor<T> b(vecB, nR);
DTensor<T> xFull(nR);
DTensor<T> x(xFull, 0, 0, nC - 1);
DTensor<T> Ax(nR);
QRFactoriser<T> qr(temp);
A.deviceCopyTo(temp);
int status = qr.factorise();
EXPECT_EQ(status, 0);
b.deviceCopyTo(xFull);
status = qr.leastSquares(xFull);
EXPECT_EQ(status, 0);
Ax.addAB(A, x);
Ax -= b;
T nrm = Ax.normF();
EXPECT_NEAR(nrm, 80.003169364198072, epsilon); // From MATLAB
}

TEST_F(QRTest, qrLeastSquares) {
qrLeastSquares<float>(PRECISION_LOW);
qrLeastSquares<double>(PRECISION_HIGH);
}


/* ================================================================================================
* NULLSPACE TESTS
* ================================================================================================ */