From 3c6f258a86afa7910fcca487a97800222daa9557 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 11:15:45 +0100
Subject: [PATCH 01/15] c++17-proofing testTensor.cu
---
test/testTensor.cu | 100 ++++++++++++++++++++++-----------------------
1 file changed, 50 insertions(+), 50 deletions(-)
diff --git a/test/testTensor.cu b/test/testTensor.cu
index 5291c68..20ddc1b 100644
--- a/test/testTensor.cu
+++ b/test/testTensor.cu
@@ -24,7 +24,7 @@ protected:
* Zero Tensor (Constructor)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorConstructionZero() {
DTensor<T> zero(2, 3, 4, true);
EXPECT_EQ(2, zero.numRows());
@@ -46,7 +46,7 @@ TEST_F(TensorTest, tensorConstructionZero) {
* Row- and column-major data
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorConstructionStorageMode() {
size_t rows = 3;
size_t cols = 2;
@@ -98,7 +98,7 @@ TEST_F(TensorTest, tensorConstructionStorageMode) {
* Move constructor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorMoveConstructor() {
DTensor<T> zero(2, 3, 4, true);
DTensor<T> x(std::move(zero));
@@ -118,7 +118,7 @@ TEST_F(TensorTest, tensorMoveConstructor) {
* Constructor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorConstructionFromVector() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -138,7 +138,7 @@ TEST_F(TensorTest, tensorConstructionFromVector) {
* Tensor: Copy constructor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorCopyConstructor() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -164,7 +164,7 @@ TEST_F(TensorTest, tensorCopyConstructor) {
* axis = 2 (matrices)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis2() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tens(data, 2, 3, 4);
@@ -186,7 +186,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis2) {
* axis = 1 (columns)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis1() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -211,7 +211,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis1) {
* axis = 0 (columns)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorSlicingConstructorAxis0() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -235,7 +235,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis0) {
* Tensor: Upload data
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorUpload() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(2, 3, 4);
@@ -258,7 +258,7 @@ TEST_F(TensorTest, tensorUpload) {
* Tensor: deviceCopyTo
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorDeviceCopyTo() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -282,7 +282,7 @@ TEST_F(TensorTest, tensorDeviceCopyTo) {
* Tensor: Frobenius dot product
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorDotF(T epsilon) {
// as vectors
std::vector<T> dataA = TENSOR_DATA_234A;
@@ -307,7 +307,7 @@ TEST_F(TensorTest, tensorDotF) {
* Tensor: Frobenius norm
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorNormF(T epsilon) {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -324,7 +324,7 @@ TEST_F(TensorTest, tensorNormF) {
* all elements
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorSumAbs() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -341,7 +341,7 @@ TEST_F(TensorTest, tensorNormFtensorSumAbs) {
* e.g., t(2, 3, 4)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorBracketOperator() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -360,7 +360,7 @@ TEST_F(TensorTest, tensorBracketOperator) {
* Tensor assignment operator
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorAssignmentOperator() {
std::vector<T> data = TENSOR_DATA_234A;
DTensor<T> tenz(data, 2, 3, 4);
@@ -382,7 +382,7 @@ TEST_F(TensorTest, tensorAssignmentOperator) {
* Tensor times-equals scalar
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorTimesEqualsScalar() {
std::vector<T> data = TENSOR_DATA_234A;
std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
@@ -403,7 +403,7 @@ TEST_F(TensorTest, tensorTimesEqualsScalar) {
* Scalar times tensor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorTimesScalar() {
std::vector<T> data = TENSOR_DATA_234A;
std::vector<T> dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12,
@@ -424,7 +424,7 @@ TEST_F(TensorTest, tensorTimesScalar) {
* Tensor plus-equals tensor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorPlusEqualsTensor() {
std::vector<T> dataA = TENSOR_DATA_234A;
std::vector<T> dataB = TENSOR_DATA_234B;
@@ -446,7 +446,7 @@ TEST_F(TensorTest, tensorPlusEqualsTensor) {
* Tensor minus-equals tensor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorMinusEqualsTensor() {
std::vector<T> dataA = TENSOR_DATA_234A;
std::vector<T> dataB = TENSOR_DATA_234B;
@@ -468,7 +468,7 @@ TEST_F(TensorTest, tensorMinusEqualsTensor) {
* Tensor + Tensor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorPlusTensor() {
std::vector<T> dataA = TENSOR_DATA_234A;
std::vector<T> dataB = TENSOR_DATA_234B;
@@ -490,7 +490,7 @@ TEST_F(TensorTest, tensorPlusTensor) {
* Tensor - Tensor
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorMinusTensor() {
std::vector<T> dataA = TENSOR_DATA_234A;
std::vector<T> dataB = TENSOR_DATA_234B;
@@ -512,7 +512,7 @@ TEST_F(TensorTest, tensorMinusTensor) {
* Tensor: pointers to matrices (on device)
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorPointersToMatrices() {
std::vector<T> dataA = TENSOR_DATA_234A;
DTensor<T> A(dataA, 2, 3, 4);
@@ -536,7 +536,7 @@ TEST_F(TensorTest, tensorPointersToMatrices) {
* Tensor: C = AB
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorAddAB() {
std::vector<T> aData = {1, 2, 3, 4, 5, 6,
7, 8, 9, 10, 11, 12,
@@ -563,7 +563,7 @@ TEST_F(TensorTest, tensorAddAB) {
* Tensor: getRows
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorGetRows() {
std::vector<T> aData = {10.5, 25.0, 60.0,
-21.0, 720.0, -1.0,
@@ -595,7 +595,7 @@ TEST_F(TensorTest, tensorGetRows) {
* Tensor: transpose
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorTranspose() {
std::vector<T> aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
DTensor<T> A(aData, 3, 2, 2);
@@ -629,7 +629,7 @@ protected:
* Tensor: Least squares
* --------------------------------------- */
-template<typename T>
+TEMPLATE_WITH_TYPE_T
void tensorLeastSquares1(T epsilon) {
// TODO test with tall matrices too
std::vector<T> aData = {1, 2,
@@ -672,8 +672,8 @@ protected:
* and matrix rank
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void singularValuesComputation(float epsilon) {
std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
2, 7, 7, 7, 7, 7, 7, 7,
@@ -699,8 +699,8 @@ TEST_F(SvdTest, singularValuesComputation) {
* Singular values - memory mgmt
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void singularValuesMemory(float epsilon) {
std::vector<T> bData = {1, 6, 6, 6, 6, 6, 6, 6,
2, 7, 7, 7, 7, 7, 7, 7,
@@ -731,8 +731,8 @@ TEST_F(SvdTest, singularValuesMemory) {
/* ---------------------------------------
* SVD with multiple matrices
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void singularValuesMultipleMatrices(float epsilon) {
std::vector<T> aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1};
DTensor<T> A(aData, 3, 2, 3);
@@ -779,8 +779,8 @@ TEST_F(SvdTest, singularValuesMultipleMatrices) {
* SVD for rank computation of multiple
* matrices
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void singularValuesRankMultipleMatrices(float epsilon) {
std::vector<T> aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0,
1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12,
@@ -815,8 +815,8 @@ protected:
* Cholesky factorisation
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void choleskyFactorisation(T epsilon) {
std::vector<T> aData = {10.0, 2.0, 3.0,
2.0, 20.0, -1.0,
@@ -838,8 +838,8 @@ TEST_F(CholeskyTest, choleskyFactorisation) {
* Cholesky factorisation: solve system
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void choleskyFactorisationSolution(T epsilon) {
std::vector<T> aData = {10.0, 2.0, 3.0,
2.0, 20.0, -1.0,
@@ -874,8 +874,8 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) {
* Batched Cholesky factorisation
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void choleskyBatchFactorisation(T epsilon) {
std::vector<T> aData = {10.0, 2.0, 3.0,
2.0, 20.0, -1.0,
@@ -906,8 +906,8 @@ TEST_F(CholeskyTest, choleskyBatchFactorisation) {
* Batched Cholesky solve
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void choleskyBatchFactorSolve(T epsilon) {
std::vector<T> aData = {10.0, 2.0, 3.0,
2.0, 20.0, -1.0,
@@ -947,8 +947,8 @@ TEST_F(CholeskyTest, choleskyBatchFactorSolve) {
* Batched Cholesky solve (factor provided)
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void choleskyBatchSolve(T epsilon) {
std::vector<T> aData = {10.0, 2.0, 3.0,
2.0, 20.0, -1.0,
@@ -1007,8 +1007,8 @@ protected:
* Basic nullspace test
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void computeNullspaceTensor(T epsilon) {
std::vector<T> aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0,
1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9,
@@ -1048,8 +1048,8 @@ TEST_F(NullspaceTest, computeNullspaceTensor) {
* Nullspace is trivial
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void computeNullspaceTrivial(T epsilon) {
std::vector<T> data{4, 5, 7,
4, 1, 8,
@@ -1072,8 +1072,8 @@ TEST_F(NullspaceTest, computeNullspaceTrivial) {
* Project onto nullspace
* --------------------------------------- */
-template<typename T>
-requires std::floating_point<T>
+TEMPLATE_WITH_TYPE_T
+TEMPLATE_CONSTRAINT_REQUIRES_FPX
void projectOnNullspaceTensor(T epsilon) {
// offline
size_t m = 3;
From 156048db1e927b02408f7d8f4a7d3b9c0c9d4a9a Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 11:56:08 +0100
Subject: [PATCH 02/15] CMakeLists: can compile on both A40 and Orin
---
CMakeLists.txt | 78 ++++++++++++++++++++++++++++++--------------------
1 file changed, 47 insertions(+), 31 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d39ae0..2d13e09 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,36 +2,52 @@
# GPUtils
# ====================================================================
cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
+cmake_policy(SET CMP0135 NEW)
project(GPUtils
- DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
- HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
- LANGUAGES CXX
-)
+ DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
+ HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
+ LANGUAGES CXX
+ )
+
+
+option(CPPVERSION "C++ version" 20) # A40: 20, Orin: 17
+option(SM_ARCH "SM architecture" 86) # A40: 86, Orin: 87
+
+set (cppversion ${CPPVERSION})
+set (cppstd "c++${CPPVERSION}")
+set (cxxstd cxx_std_${CPPVERSION})
+
# ----
+
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels
-set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.3/bin/nvcc")
-set(CMAKE_CUDA_ARCHITECTURES 86)
-set(CMAKE_CUDA_STANDARD 20)
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CUDA_FLAGS "-std=c++20")
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++20)
+set(CMAKE_CUDA_COMPILER "/usr/local/cuda-11.4/bin/nvcc")
+set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH})
+set(CMAKE_CUDA_STANDARD ${cppversion})
+set(CMAKE_CXX_STANDARD ${cppversion})
+set(CMAKE_CUDA_FLAGS "-std=${cppstd}")
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=${cppstd})
enable_language(CUDA)
+
# ----
+
add_library(device_compiler_flags INTERFACE)
-target_compile_features(device_compiler_flags INTERFACE cxx_std_20)
+target_compile_features(device_compiler_flags INTERFACE ${cxxstd})
set(CMAKE_CXX_EXTENSIONS OFF)
+
# ----
+
add_library(developer_flags INTERFACE)
set(cxx_flags -Wall)
-set(cuda_flags -arch=sm_60 -std=c++20 -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
+set(cuda_flags -arch=sm_${SM_ARCH} -std=${cppstd} -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
target_compile_options(developer_flags
- INTERFACE
- # flags for CXX builds
- $<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
- # flags for CUDA builds
- $<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
-)
+ INTERFACE
+ # flags for CXX builds
+ $<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
+ # flags for CUDA builds
+ $<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
+ )
target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:developer_flags>)
+
# ----
@@ -40,21 +56,21 @@ target_link_libraries(device_compiler_flags INTERFACE $
Date: Wed, 8 May 2024 12:44:45 +0100
Subject: [PATCH 03/15] now works with cmake v3.20
---
CMakeLists.txt | 44 +++++++++++++++++++++-----------------------
1 file changed, 21 insertions(+), 23 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d13e09..4b87d7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,43 +2,42 @@
# GPUtils
# ====================================================================
cmake_minimum_required(VERSION 3.20 FATAL_ERROR)
-cmake_policy(SET CMP0135 NEW)
+
+if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.29")
+ cmake_policy(SET CMP0135 NEW)
+endif()
+
+# Set C++ version and SM architecture
+if (NOT DEFINED CPPVERSION)
+ set(CPPVERSION 20) # A40: 20, Orin: 17
+endif()
+if (NOT DEFINED SM_ARCH)
+ set(SM_ARCH 86)# A40: 86, Orin: 87
+endif()
+
+
project(GPUtils
DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
LANGUAGES CXX
)
-
-
-option(CPPVERSION "C++ version" 20) # A40: 20, Orin: 17
-option(SM_ARCH "SM architecture" 86) # A40: 86, Orin: 87
-
-set (cppversion ${CPPVERSION})
-set (cppstd "c++${CPPVERSION}")
-set (cxxstd cxx_std_${CPPVERSION})
-
# ----
-
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels
-set(CMAKE_CUDA_COMPILER "/usr/local/cuda-11.4/bin/nvcc")
+set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH})
-set(CMAKE_CUDA_STANDARD ${cppversion})
-set(CMAKE_CXX_STANDARD ${cppversion})
-set(CMAKE_CUDA_FLAGS "-std=${cppstd}")
-set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=${cppstd})
+set(CMAKE_CUDA_STANDARD ${CPPVERSION})
+set(CMAKE_CXX_STANDARD ${CPPVERSION})
+set(CMAKE_CUDA_FLAGS "-std=c++${CPPVERSION}")
+set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "-std=c++${CPPVERSION}")
enable_language(CUDA)
-
# ----
-
add_library(device_compiler_flags INTERFACE)
-target_compile_features(device_compiler_flags INTERFACE ${cxxstd})
+target_compile_features(device_compiler_flags INTERFACE cxx_std_${CPPVERSION})
set(CMAKE_CXX_EXTENSIONS OFF)
-
# ----
-
add_library(developer_flags INTERFACE)
set(cxx_flags -Wall)
-set(cuda_flags -arch=sm_${SM_ARCH} -std=${cppstd} -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
+set(cuda_flags -arch=sm_${SM_ARCH} -std=c++${CPPVERSION} -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
target_compile_options(developer_flags
INTERFACE
# flags for CXX builds
@@ -47,7 +46,6 @@ target_compile_options(developer_flags
$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
)
target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:developer_flags>)
-
# ----
From e069c71063112fb62a2f2e262e0970eec8f55f84 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:09:44 +0100
Subject: [PATCH 04/15] CI on Orin (first attempt)
---
ci/script.sh | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/ci/script.sh b/ci/script.sh
index b3c8af8..13bc39f 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -2,12 +2,24 @@
set -euxo pipefail
tests() {
+ # Where are we?
+ hwInfoOrin = `lshw | grep Orin`
+ if [ ! -z "$hwInfoOrin" ]; then
+ echo "Running on Orin";
+ sm_arch=87
+ cpp_version=17
+ else
+ echo "Not running on Orin";
+ sm_arch=86
+ cpp_version=20
+ fi
+
# ------------------------------------
# Run tensor gtests
# ------------------------------------
# -- create build files
- cmake -S . -B ./build -Wno-dev
+ cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
# -- build files in build folder
cmake --build ./build
@@ -17,7 +29,7 @@ tests() {
# -- run compute sanitizer
cd ./build/test
- mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
+ mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
grep "0 errors" <<< "$mem"
cd ../..
@@ -27,7 +39,7 @@ tests() {
# -- create build files
cd example
- cmake -S . -B ./build -Wno-dev
+ cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
# -- build files in build folder
cmake --build ./build
@@ -37,7 +49,7 @@ tests() {
# -- run compute sanitizer
cd ./build
- mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
+ mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
grep "0 errors" <<< "$mem"
}
From cd3ced4a57a3756309dc56406e986f41f65ae419 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:11:35 +0100
Subject: [PATCH 05/15] remove unnecessary space
---
ci/script.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/ci/script.sh b/ci/script.sh
index 13bc39f..eebf5f6 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -3,7 +3,7 @@ set -euxo pipefail
tests() {
# Where are we?
- hwInfoOrin = `lshw | grep Orin`
+ hwInfoOrin=`lshw | grep Orin`
if [ ! -z "$hwInfoOrin" ]; then
echo "Running on Orin";
sm_arch=87
From 540d23bce4f7f5ce72c1755632d395892e502fc1 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:21:53 +0100
Subject: [PATCH 06/15] prevent grep from failing
---
ci/script.sh | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/ci/script.sh b/ci/script.sh
index eebf5f6..2c6a07b 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -2,9 +2,9 @@
set -euxo pipefail
tests() {
- # Where are we?
- hwInfoOrin=`lshw | grep Orin`
- if [ ! -z "$hwInfoOrin" ]; then
+ # Where are we? (A40 or Orin?)
+ hwInfoOrin=`lshw | grep Orin` ||
+ if [ ! -z "$(hwInfoOrin)" ]; then
echo "Running on Orin";
sm_arch=87
cpp_version=17
From eac38ca5152acc962f8060e8397c449faf208939 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:27:57 +0100
Subject: [PATCH 07/15] running on Orin for the first time
---
ci/script.sh | 1 +
1 file changed, 1 insertion(+)
diff --git a/ci/script.sh b/ci/script.sh
index 2c6a07b..026ad19 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -1,6 +1,7 @@
#!/bin/bash
set -euxo pipefail
+
tests() {
# Where are we? (A40 or Orin?)
hwInfoOrin=`lshw | grep Orin` ||
From 53b7c9a340caec86bd8562e047054261c0a05660 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:33:54 +0100
Subject: [PATCH 08/15] runs on orin
---
.github/workflows/ci.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 791cd73..ca95d13 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,7 +7,7 @@ name: Continuous integration
jobs:
ci:
- runs-on: self-hosted
+ runs-on: [self-hosted, orin]
steps:
- name: checkout code
uses: actions/checkout@v4
From dd6843cfd93b34d6bb3971adf50b6499a4aac7e7 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:37:36 +0100
Subject: [PATCH 09/15] dealing with unbound variable error
---
ci/script.sh | 2 ++
1 file changed, 2 insertions(+)
diff --git a/ci/script.sh b/ci/script.sh
index 026ad19..d57cd83 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -4,6 +4,8 @@ set -euxo pipefail
tests() {
# Where are we? (A40 or Orin?)
+ cpp_version=17 # default
+ sm_arch=86 # default
hwInfoOrin=`lshw | grep Orin` ||
if [ ! -z "$(hwInfoOrin)" ]; then
echo "Running on Orin";
From e0ecf751fc8a003b504bed3c6b6897ba8ff9944e Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:42:29 +0100
Subject: [PATCH 10/15] trying to run on both hosts
---
.github/workflows/ci.yml | 5 ++++-
ci/script.sh | 4 ++++
2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index ca95d13..3b7ae72 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,7 +7,10 @@ name: Continuous integration
jobs:
ci:
- runs-on: [self-hosted, orin]
+ runs-on: ${{ matrix.runner }}
+ strategy:
+ matrix:
+ runner: [alphaville, pop-os]
steps:
- name: checkout code
uses: actions/checkout@v4
diff --git a/ci/script.sh b/ci/script.sh
index d57cd83..8c4cd4c 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -30,6 +30,10 @@ tests() {
# -- run tests
ctest --test-dir ./build/test --output-on-failure
+ if [ ! -z "$(hwInfoOrin)" ]; then
+ return;
+ fi
+
# -- run compute sanitizer
cd ./build/test
mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
From 498a8a49c2ec4a47de449f0d6118f2f33c8ea24d Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:45:47 +0100
Subject: [PATCH 11/15] trying to run on both hosts
---
.github/workflows/ci.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3b7ae72..32c33a6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,7 +10,7 @@ jobs:
runs-on: ${{ matrix.runner }}
strategy:
matrix:
- runner: [alphaville, pop-os]
+ runner: [orin, a40]
steps:
- name: checkout code
uses: actions/checkout@v4
From bc0491541a48c31440c9f25390a69e20824e99f7 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:48:39 +0100
Subject: [PATCH 12/15] memcheck only on a40
---
ci/script.sh | 43 +++++++++++++++++++++----------------------
1 file changed, 21 insertions(+), 22 deletions(-)
diff --git a/ci/script.sh b/ci/script.sh
index 8c4cd4c..7cc9070 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -30,34 +30,33 @@ tests() {
# -- run tests
ctest --test-dir ./build/test --output-on-failure
- if [ ! -z "$(hwInfoOrin)" ]; then
- return;
- fi
+ if [ -z "$(hwInfoOrin)" ]; then
- # -- run compute sanitizer
- cd ./build/test
- mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
- grep "0 errors" <<< "$mem"
- cd ../..
+ # -- run compute sanitizer
+ cd ./build/test
+ mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
+ grep "0 errors" <<< "$mem"
+ cd ../..
- # ------------------------------------
- # Run example executable
- # ------------------------------------
+ # ------------------------------------
+ # Run example executable
+ # ------------------------------------
- # -- create build files
- cd example
- cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
+ # -- create build files
+ cd example
+ cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev
- # -- build files in build folder
- cmake --build ./build
+ # -- build files in build folder
+ cmake --build ./build
- # -- run main.cu
- ./build/example_main
+ # -- run main.cu
+ ./build/example_main
- # -- run compute sanitizer
- cd ./build
- mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
- grep "0 errors" <<< "$mem"
+ # -- run compute sanitizer
+ cd ./build
+ mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
+ grep "0 errors" <<< "$mem"
+ fi
}
From 83ff0ea36b96a2f6965c307618e9dbddcd565134 Mon Sep 17 00:00:00 2001
From: Pantelis Sopasakis
Date: Wed, 8 May 2024 13:55:16 +0100
Subject: [PATCH 13/15] another attempt to fix the bug trying with curly
brackets now
---
ci/script.sh | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/ci/script.sh b/ci/script.sh
index 7cc9070..64190bb 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -7,7 +7,7 @@ tests() {
cpp_version=17 # default
sm_arch=86 # default
hwInfoOrin=`lshw | grep Orin` ||
- if [ ! -z "$(hwInfoOrin)" ]; then
+ if [ ! -z "${hwInfoOrin}" ]; then
echo "Running on Orin";
sm_arch=87
cpp_version=17
@@ -30,7 +30,7 @@ tests() {
# -- run tests
ctest --test-dir ./build/test --output-on-failure
- if [ -z "$(hwInfoOrin)" ]; then
+ if [ -z "${hwInfoOrin}" ]; then
# -- run compute sanitizer
cd ./build/test
From 5c1ed5dd6ab39245655f7b3614d2e686419c885b Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Wed, 8 May 2024 13:59:27 +0100
Subject: [PATCH 14/15] fix cholesky
---
include/tensor.cuh | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index cf7003c..a6033b7 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1326,7 +1326,7 @@ public:
};
template<>
-void CholeskyBatchFactoriser<double>::factorise() {
+inline void CholeskyBatchFactoriser<double>::factorise() {
if (m_factorisationDone) return;
DTensor ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
@@ -1340,7 +1340,7 @@ void CholeskyBatchFactoriser::factorise() {
}
template<>
-void CholeskyBatchFactoriser<float>::factorise() {
+inline void CholeskyBatchFactoriser<float>::factorise() {
if (m_factorisationDone) return;
DTensor ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
@@ -1354,8 +1354,11 @@ void CholeskyBatchFactoriser::factorise() {
}
template<>
-void CholeskyBatchFactoriser::solve(DTensor &b) {
+inline void CholeskyBatchFactoriser::solve(DTensor &b) {
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+ if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
+ throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
+
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
DTensor ptrA = m_matrix->pointersToMatrices();
DTensor ptrB = b.pointersToMatrices();
@@ -1372,8 +1375,11 @@ void CholeskyBatchFactoriser::solve(DTensor &b) {
}
template<>
-void CholeskyBatchFactoriser::solve(DTensor &b) {
+inline void CholeskyBatchFactoriser::solve(DTensor &b) {
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
+ if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
+ throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
+ }
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
DTensor ptrA = m_matrix->pointersToMatrices();
DTensor ptrB = b.pointersToMatrices();
From 8d7c49d932a3d3ea2eb22291bb403c6219052ec2 Mon Sep 17 00:00:00 2001
From: Ruairi Moran
Date: Wed, 8 May 2024 14:03:42 +0100
Subject: [PATCH 15/15] how did that happen
---
include/tensor.cuh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/tensor.cuh b/include/tensor.cuh
index a6033b7..a154d63 100644
--- a/include/tensor.cuh
+++ b/include/tensor.cuh
@@ -1358,7 +1358,7 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) {
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
-
+ }
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
DTensor ptrA = m_matrix->pointersToMatrices();
DTensor ptrB = b.pointersToMatrices();