From 3c6f258a86afa7910fcca487a97800222daa9557 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 11:15:45 +0100 Subject: [PATCH 01/15] c++17-proofing testTensor.cu --- test/testTensor.cu | 100 ++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/test/testTensor.cu b/test/testTensor.cu index 5291c68..20ddc1b 100644 --- a/test/testTensor.cu +++ b/test/testTensor.cu @@ -24,7 +24,7 @@ protected: * Zero Tensor (Constructor) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorConstructionZero() { DTensor zero(2, 3, 4, true); EXPECT_EQ(2, zero.numRows()); @@ -46,7 +46,7 @@ TEST_F(TensorTest, tensorConstructionZero) { * Row- and column-major data * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorConstructionStorageMode() { size_t rows = 3; size_t cols = 2; @@ -98,7 +98,7 @@ TEST_F(TensorTest, tensorConstructionStorageMode) { * Move constructor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorMoveConstructor() { DTensor zero(2, 3, 4, true); DTensor x(std::move(zero)); @@ -118,7 +118,7 @@ TEST_F(TensorTest, tensorMoveConstructor) { * Constructor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorConstructionFromVector() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -138,7 +138,7 @@ TEST_F(TensorTest, tensorConstructionFromVector) { * Tensor: Copy constructor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorCopyConstructor() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -164,7 +164,7 @@ TEST_F(TensorTest, tensorCopyConstructor) { * axis = 2 (matrices) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorSlicingConstructorAxis2() { std::vector data = TENSOR_DATA_234A; DTensor tens(data, 2, 3, 4); @@ -186,7 +186,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis2) { * axis = 1 (columns) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorSlicingConstructorAxis1() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -211,7 +211,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis1) { * axis = 0 (columns) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorSlicingConstructorAxis0() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -235,7 +235,7 @@ TEST_F(TensorTest, tensorSlicingConstructorAxis0) { * Tensor: Upload data * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorUpload() { std::vector data = TENSOR_DATA_234A; DTensor tenz(2, 3, 4); @@ -258,7 +258,7 @@ TEST_F(TensorTest, tensorUpload) { * Tensor: deviceCopyTo * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorDeviceCopyTo() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -282,7 +282,7 @@ TEST_F(TensorTest, tensorDeviceCopyTo) { * Tensor: Frobenius dot product * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorDotF(T epsilon) { // as vectors std::vector dataA = TENSOR_DATA_234A; @@ -307,7 +307,7 @@ TEST_F(TensorTest, tensorDotF) { * Tensor: Frobenius norm * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorNormF(T epsilon) { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -324,7 +324,7 @@ TEST_F(TensorTest, tensorNormF) { * all elements * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorSumAbs() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -341,7 +341,7 @@ TEST_F(TensorTest, tensorNormFtensorSumAbs) { * e.g., t(2, 3, 4) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorBracketOperator() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -360,7 +360,7 @@ TEST_F(TensorTest, tensorBracketOperator) { * Tensor assignment operator * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorAssignmentOperator() { std::vector data = TENSOR_DATA_234A; DTensor tenz(data, 2, 3, 4); @@ -382,7 +382,7 @@ TEST_F(TensorTest, tensorAssignmentOperator) { * Tensor times-equals scalar * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorTimesEqualsScalar() { std::vector data = TENSOR_DATA_234A; std::vector dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, @@ -403,7 +403,7 @@ TEST_F(TensorTest, tensorTimesEqualsScalar) { * Scalar times tensor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorTimesScalar() { std::vector data = TENSOR_DATA_234A; std::vector dataTimes3 = {3, 6, 9, 12, 15, 18, 21, 24, 27, 24, 21, 30, 15, 12, 9, 6, 3, -3, 12, 9, 12, 9, 12, @@ -424,7 +424,7 @@ TEST_F(TensorTest, tensorTimesScalar) { * Tensor plus-equals tensor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorPlusEqualsTensor() { std::vector dataA = TENSOR_DATA_234A; std::vector dataB = TENSOR_DATA_234B; @@ -446,7 +446,7 @@ TEST_F(TensorTest, tensorPlusEqualsTensor) { * Tensor minus-equals tensor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorMinusEqualsTensor() { std::vector dataA = TENSOR_DATA_234A; std::vector dataB = TENSOR_DATA_234B; @@ -468,7 +468,7 @@ TEST_F(TensorTest, tensorMinusEqualsTensor) { * Tensor + Tensor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorPlusTensor() { std::vector dataA = TENSOR_DATA_234A; std::vector dataB = TENSOR_DATA_234B; @@ -490,7 +490,7 @@ TEST_F(TensorTest, tensorPlusTensor) { * Tensor - Tensor * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorMinusTensor() { std::vector dataA = TENSOR_DATA_234A; std::vector dataB = TENSOR_DATA_234B; @@ -512,7 +512,7 @@ TEST_F(TensorTest, tensorMinusTensor) { * Tensor: pointers to matrices (on device) * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorPointersToMatrices() { std::vector dataA = TENSOR_DATA_234A; DTensor A(dataA, 2, 3, 4); @@ -536,7 +536,7 @@ TEST_F(TensorTest, tensorPointersToMatrices) { * Tensor: C = AB * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorAddAB() { std::vector aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, @@ -563,7 +563,7 @@ TEST_F(TensorTest, tensorAddAB) { * Tensor: getRows * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorGetRows() { std::vector aData = {10.5, 25.0, 60.0, -21.0, 720.0, -1.0, @@ -595,7 +595,7 @@ TEST_F(TensorTest, tensorGetRows) { * Tensor: transpose * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorTranspose() { std::vector aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; DTensor A(aData, 3, 2, 2); @@ -629,7 +629,7 @@ protected: * Tensor: Least squares * --------------------------------------- */ -template +TEMPLATE_WITH_TYPE_T void tensorLeastSquares1(T epsilon) { // TODO test with tall matrices too std::vector aData = {1, 2, @@ -672,8 +672,8 @@ protected: * and matrix rank * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesComputation(float epsilon) { std::vector bData = {1, 6, 6, 6, 6, 6, 6, 6, 2, 7, 7, 7, 7, 7, 7, 7, @@ -699,8 +699,8 @@ TEST_F(SvdTest, singularValuesComputation) { * Singular values - memory mgmt * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesMemory(float epsilon) { std::vector bData = {1, 6, 6, 6, 6, 6, 6, 6, 2, 7, 7, 7, 7, 7, 7, 7, @@ -731,8 +731,8 @@ TEST_F(SvdTest, singularValuesMemory) { /* --------------------------------------- * SVD with multiple matrices * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesMultipleMatrices(float epsilon) { std::vector aData = {1, 2, 3, 4, 5, 6, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0, 0, 1}; DTensor A(aData, 3, 2, 3); @@ -779,8 +779,8 @@ TEST_F(SvdTest, singularValuesMultipleMatrices) { * SVD for rank computation of multiple * matrices * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void singularValuesRankMultipleMatrices(float epsilon) { std::vector aData = {1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 0, 1, 4, 7, 10, 2, 5, 8, 11, 3, 6, 9, 12, @@ -815,8 +815,8 @@ protected: * Cholesky factorisation * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyFactorisation(T epsilon) { std::vector aData = {10.0, 2.0, 3.0, 2.0, 20.0, -1.0, @@ -838,8 +838,8 @@ TEST_F(CholeskyTest, choleskyFactorisation) { * Cholesky factorisation: solve system * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyFactorisationSolution(T epsilon) { std::vector aData = {10.0, 2.0, 3.0, 2.0, 20.0, -1.0, @@ -874,8 +874,8 @@ TEST_F(CholeskyTest, choleskyFactorisationSolution) { * Batched Cholesky factorisation * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchFactorisation(T epsilon) { std::vector aData = {10.0, 2.0, 3.0, 2.0, 20.0, -1.0, @@ -906,8 +906,8 @@ TEST_F(CholeskyTest, choleskyBatchFactorisation) { * Batched Cholesky solve * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchFactorSolve(T epsilon) { std::vector aData = {10.0, 2.0, 3.0, 2.0, 20.0, -1.0, @@ -947,8 +947,8 @@ TEST_F(CholeskyTest, choleskyBatchFactorSolve) { * Batched Cholesky solve (factor provided) * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void choleskyBatchSolve(T epsilon) { std::vector aData = {10.0, 2.0, 3.0, 2.0, 20.0, -1.0, @@ -1007,8 +1007,8 @@ protected: * Basic nullspace test * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void computeNullspaceTensor(T epsilon) { std::vector aData = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 7, 8, 9, @@ -1048,8 +1048,8 @@ TEST_F(NullspaceTest, computeNullspaceTensor) { * Nullspace is trivial * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void computeNullspaceTrivial(T epsilon) { std::vector data{4, 5, 7, 4, 1, 8, @@ -1072,8 +1072,8 @@ TEST_F(NullspaceTest, computeNullspaceTrivial) { * Project onto nullspace * --------------------------------------- */ -template -requires std::floating_point +TEMPLATE_WITH_TYPE_T +TEMPLATE_CONSTRAINT_REQUIRES_FPX void projectOnNullspaceTensor(T epsilon) { // offline size_t m = 3; From 156048db1e927b02408f7d8f4a7d3b9c0c9d4a9a Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 11:56:08 +0100 Subject: [PATCH 02/15] CMakeLists: can compile on both A40 and Orin --- CMakeLists.txt | 78 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d39ae0..2d13e09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,36 +2,52 @@ # GPUtils # ==================================================================== cmake_minimum_required(VERSION 3.20 FATAL_ERROR) +cmake_policy(SET CMP0135 NEW) project(GPUtils - DESCRIPTION "Easy use of vectors and matrices on GPGPU devices." - HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils" - LANGUAGES CXX -) + DESCRIPTION "Easy use of vectors and matrices on GPGPU devices." + HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils" + LANGUAGES CXX + ) + + +option(CPPVERSION "C++ version" 20) # A40: 20, Orin: 17 +option(SM_ARCH "SM architecture" 86) # A40: 86, Orin: 87 + +set (cppversion ${CPPVERSION}) +set (cppstd "c++${CPPVERSION}") +set (cxxstd cxx_std_${CPPVERSION}) + # ---- + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels -set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.3/bin/nvcc") -set(CMAKE_CUDA_ARCHITECTURES 86) -set(CMAKE_CUDA_STANDARD 20) -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CUDA_FLAGS "-std=c++20") -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++20) +set(CMAKE_CUDA_COMPILER "/usr/local/cuda-11.4/bin/nvcc") +set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH}) +set(CMAKE_CUDA_STANDARD ${cppversion}) +set(CMAKE_CXX_STANDARD ${cppversion}) +set(CMAKE_CUDA_FLAGS "-std=${cppstd}") +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=${cppstd}) enable_language(CUDA) + # ---- + add_library(device_compiler_flags INTERFACE) -target_compile_features(device_compiler_flags INTERFACE cxx_std_20) +target_compile_features(device_compiler_flags INTERFACE ${cxxstd}) set(CMAKE_CXX_EXTENSIONS OFF) + # ---- + add_library(developer_flags INTERFACE) set(cxx_flags -Wall) -set(cuda_flags -arch=sm_60 -std=c++20 -Xcompiler=-Wall -Xcudafe=--display_error_number -g) +set(cuda_flags -arch=sm_${SM_ARCH} -std=${cppstd} -Xcompiler=-Wall -Xcudafe=--display_error_number -g) target_compile_options(developer_flags - INTERFACE - # flags for CXX builds - $<$:${cxx_flags}> - # flags for CUDA builds - $<$:${cuda_flags}> -) + INTERFACE + # flags for CXX builds + $<$:${cxx_flags}> + # flags for CUDA builds + $<$:${cuda_flags}> + ) target_link_libraries(device_compiler_flags INTERFACE $) + # ---- @@ -40,21 +56,21 @@ target_link_libraries(device_compiler_flags INTERFACE $ Date: Wed, 8 May 2024 12:44:45 +0100 Subject: [PATCH 03/15] now works with cmake v3.20 --- CMakeLists.txt | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d13e09..4b87d7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,43 +2,42 @@ # GPUtils # ==================================================================== cmake_minimum_required(VERSION 3.20 FATAL_ERROR) -cmake_policy(SET CMP0135 NEW) + +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.29") + cmake_policy(SET CMP0135 NEW) +endif() + +# Set C++ version and SM architecture +if (NOT DEFINED CPPVERSION) + set(CPPVERSION 20) # A40: 20, Orin: 17 +endif() +if (NOT DEFINED SM_ARCH) + set(SM_ARCH 86)# A40: 86, Orin: 87 +endif() + + project(GPUtils DESCRIPTION "Easy use of vectors and matrices on GPGPU devices." HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils" LANGUAGES CXX ) - - -option(CPPVERSION "C++ version" 20) # A40: 20, Orin: 17 -option(SM_ARCH "SM architecture" 86) # A40: 86, Orin: 87 - -set (cppversion ${CPPVERSION}) -set (cppstd "c++${CPPVERSION}") -set (cxxstd cxx_std_${CPPVERSION}) - # ---- - set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels -set(CMAKE_CUDA_COMPILER "/usr/local/cuda-11.4/bin/nvcc") +set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc") set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH}) -set(CMAKE_CUDA_STANDARD ${cppversion}) -set(CMAKE_CXX_STANDARD ${cppversion}) -set(CMAKE_CUDA_FLAGS "-std=${cppstd}") -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=${cppstd}) +set(CMAKE_CUDA_STANDARD ${CPPVERSION}) +set(CMAKE_CXX_STANDARD ${CPPVERSION}) +set(CMAKE_CUDA_FLAGS "-std=c++${CPPVERSION}") +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "-std=c++${CPPVERSION}") enable_language(CUDA) - # ---- - add_library(device_compiler_flags INTERFACE) -target_compile_features(device_compiler_flags INTERFACE ${cxxstd}) +target_compile_features(device_compiler_flags INTERFACE cxx_std_${CPPVERSION}) set(CMAKE_CXX_EXTENSIONS OFF) - # ---- - add_library(developer_flags INTERFACE) set(cxx_flags -Wall) -set(cuda_flags -arch=sm_${SM_ARCH} -std=${cppstd} -Xcompiler=-Wall -Xcudafe=--display_error_number -g) +set(cuda_flags -arch=sm_${SM_ARCH} -std=c++${CPPVERSION} -Xcompiler=-Wall -Xcudafe=--display_error_number -g) target_compile_options(developer_flags INTERFACE # flags for CXX builds @@ -47,7 +46,6 @@ target_compile_options(developer_flags $<$:${cuda_flags}> ) target_link_libraries(device_compiler_flags INTERFACE $) - # ---- From e069c71063112fb62a2f2e262e0970eec8f55f84 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:09:44 +0100 Subject: [PATCH 04/15] CI on Orin (first attempt) --- ci/script.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ci/script.sh b/ci/script.sh index b3c8af8..13bc39f 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -2,12 +2,24 @@ set -euxo pipefail tests() { + # Where are we? + hwInfoOrin = `lshw | grep Orin` + if [ ! -z "$hwInfoOrin" ]; then + echo "Running on Orin"; + sm_arch=87 + cpp_version=17 + else + echo "Not running on Orin"; + sm_arch=86 + cpp_version=20 + fi + # ------------------------------------ # Run tensor gtests # ------------------------------------ # -- create build files - cmake -S . -B ./build -Wno-dev + cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev # -- build files in build folder cmake --build ./build @@ -17,7 +29,7 @@ tests() { # -- run compute sanitizer cd ./build/test - mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test) + mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test) grep "0 errors" <<< "$mem" cd ../.. @@ -27,7 +39,7 @@ tests() { # -- create build files cd example - cmake -S . -B ./build -Wno-dev + cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev # -- build files in build folder cmake --build ./build @@ -37,7 +49,7 @@ tests() { # -- run compute sanitizer cd ./build - mem=$(/usr/local/cuda-12.3/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main) + mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main) grep "0 errors" <<< "$mem" } From cd3ced4a57a3756309dc56406e986f41f65ae419 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:11:35 +0100 Subject: [PATCH 05/15] remove unnecessary space --- ci/script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/script.sh b/ci/script.sh index 13bc39f..eebf5f6 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -3,7 +3,7 @@ set -euxo pipefail tests() { # Where are we? - hwInfoOrin = `lshw | grep Orin` + hwInfoOrin=`lshw | grep Orin` if [ ! -z "$hwInfoOrin" ]; then echo "Running on Orin"; sm_arch=87 From 540d23bce4f7f5ce72c1755632d395892e502fc1 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:21:53 +0100 Subject: [PATCH 06/15] prevent grep from failing --- ci/script.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/script.sh b/ci/script.sh index eebf5f6..2c6a07b 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -2,9 +2,9 @@ set -euxo pipefail tests() { - # Where are we? - hwInfoOrin=`lshw | grep Orin` - if [ ! -z "$hwInfoOrin" ]; then + # Where are we? (A40 or Orin?) + hwInfoOrin=`lshw | grep Orin` || + if [ ! -z "$(hwInfoOrin)" ]; then echo "Running on Orin"; sm_arch=87 cpp_version=17 From eac38ca5152acc962f8060e8397c449faf208939 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:27:57 +0100 Subject: [PATCH 07/15] running on Orin for the first time --- ci/script.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/script.sh b/ci/script.sh index 2c6a07b..026ad19 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -1,6 +1,7 @@ #!/bin/bash set -euxo pipefail + tests() { # Where are we? (A40 or Orin?) hwInfoOrin=`lshw | grep Orin` || From 53b7c9a340caec86bd8562e047054261c0a05660 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:33:54 +0100 Subject: [PATCH 08/15] runs on orin --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 791cd73..ca95d13 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,7 @@ name: Continuous integration jobs: ci: - runs-on: self-hosted + runs-on: [self-hosted, orin] steps: - name: checkout code uses: actions/checkout@v4 From dd6843cfd93b34d6bb3971adf50b6499a4aac7e7 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:37:36 +0100 Subject: [PATCH 09/15] dealing with unbound variable error --- ci/script.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/script.sh b/ci/script.sh index 026ad19..d57cd83 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -4,6 +4,8 @@ set -euxo pipefail tests() { # Where are we? (A40 or Orin?) + cpp_version=17 # default + sm_arch=86 # default hwInfoOrin=`lshw | grep Orin` || if [ ! -z "$(hwInfoOrin)" ]; then echo "Running on Orin"; From e0ecf751fc8a003b504bed3c6b6897ba8ff9944e Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:42:29 +0100 Subject: [PATCH 10/15] trying to run on both hosts --- .github/workflows/ci.yml | 5 ++++- ci/script.sh | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca95d13..3b7ae72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,7 +7,10 @@ name: Continuous integration jobs: ci: - runs-on: [self-hosted, orin] + runs-on: ${{ matrix.runner }} + strategy: + matrix: + runner: [alphaville, pop-os] steps: - name: checkout code uses: actions/checkout@v4 diff --git a/ci/script.sh b/ci/script.sh index d57cd83..8c4cd4c 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -30,6 +30,10 @@ tests() { # -- run tests ctest --test-dir ./build/test --output-on-failure + if [ ! -z "$(hwInfoOrin)" ]; then + return; + fi + # -- run compute sanitizer cd ./build/test mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test) From 498a8a49c2ec4a47de449f0d6118f2f33c8ea24d Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:45:47 +0100 Subject: [PATCH 11/15] trying to run on both hosts --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3b7ae72..32c33a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -10,7 +10,7 @@ jobs: runs-on: ${{ matrix.runner }} strategy: matrix: - runner: [alphaville, pop-os] + runner: [orin, a40] steps: - name: checkout code uses: actions/checkout@v4 From bc0491541a48c31440c9f25390a69e20824e99f7 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:48:39 +0100 Subject: [PATCH 12/15] memcheck only on a40 --- ci/script.sh | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/ci/script.sh b/ci/script.sh index 8c4cd4c..7cc9070 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -30,34 +30,33 @@ tests() { # -- run tests ctest --test-dir ./build/test --output-on-failure - if [ ! -z "$(hwInfoOrin)" ]; then - return; - fi + if [ -z "$(hwInfoOrin)" ]; then - # -- run compute sanitizer - cd ./build/test - mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test) - grep "0 errors" <<< "$mem" - cd ../.. + # -- run compute sanitizer + cd ./build/test + mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test) + grep "0 errors" <<< "$mem" + cd ../.. - # ------------------------------------ - # Run example executable - # ------------------------------------ + # ------------------------------------ + # Run example executable + # ------------------------------------ - # -- create build files - cd example - cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev + # -- create build files + cd example + cmake -DCPPVERSION=${cpp_version} -DSM_ARCH=${sm_arch} -S . -B ./build -Wno-dev - # -- build files in build folder - cmake --build ./build + # -- build files in build folder + cmake --build ./build - # -- run main.cu - ./build/example_main + # -- run main.cu + ./build/example_main - # -- run compute sanitizer - cd ./build - mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main) - grep "0 errors" <<< "$mem" + # -- run compute sanitizer + cd ./build + mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main) + grep "0 errors" <<< "$mem" + fi } From 83ff0ea36b96a2f6965c307618e9dbddcd565134 Mon Sep 17 00:00:00 2001 From: Pantelis Sopasakis Date: Wed, 8 May 2024 13:55:16 +0100 Subject: [PATCH 13/15] another attempt to fix the bug trying with curly brackets now --- ci/script.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/script.sh b/ci/script.sh index 7cc9070..64190bb 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -7,7 +7,7 @@ tests() { cpp_version=17 # default sm_arch=86 # default hwInfoOrin=`lshw | grep Orin` || - if [ ! -z "$(hwInfoOrin)" ]; then + if [ ! -z "${hwInfoOrin}" ]; then echo "Running on Orin"; sm_arch=87 cpp_version=17 @@ -30,7 +30,7 @@ tests() { # -- run tests ctest --test-dir ./build/test --output-on-failure - if [ -z "$(hwInfoOrin)" ]; then + if [ -z "${hwInfoOrin}" ]; then # -- run compute sanitizer cd ./build/test From 5c1ed5dd6ab39245655f7b3614d2e686419c885b Mon Sep 17 00:00:00 2001 From: Ruairi Moran Date: Wed, 8 May 2024 13:59:27 +0100 Subject: [PATCH 14/15] fix cholesky --- include/tensor.cuh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index cf7003c..a6033b7 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -1326,7 +1326,7 @@ public: }; template<> -void CholeskyBatchFactoriser::factorise() { +inline void CholeskyBatchFactoriser::factorise() { if (m_factorisationDone) return; DTensor ptrA = m_matrix->pointersToMatrices(); gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(), @@ -1340,7 +1340,7 @@ void CholeskyBatchFactoriser::factorise() { } template<> -void CholeskyBatchFactoriser::factorise() { +inline void CholeskyBatchFactoriser::factorise() { if (m_factorisationDone) return; DTensor ptrA = m_matrix->pointersToMatrices(); gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(), @@ -1354,8 +1354,11 @@ void CholeskyBatchFactoriser::factorise() { } template<> -void CholeskyBatchFactoriser::solve(DTensor &b) { +inline void CholeskyBatchFactoriser::solve(DTensor &b) { if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with"); + if (m_numRows != b.numRows() || m_numMats != b.numMats()) { + throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible"); + if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column"); DTensor ptrA = m_matrix->pointersToMatrices(); DTensor ptrB = b.pointersToMatrices(); @@ -1372,8 +1375,11 @@ void CholeskyBatchFactoriser::solve(DTensor &b) { } template<> -void CholeskyBatchFactoriser::solve(DTensor &b) { +inline void CholeskyBatchFactoriser::solve(DTensor &b) { if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with"); + if (m_numRows != b.numRows() || m_numMats != b.numMats()) { + throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible"); + } if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column"); DTensor ptrA = m_matrix->pointersToMatrices(); DTensor ptrB = b.pointersToMatrices(); From 8d7c49d932a3d3ea2eb22291bb403c6219052ec2 Mon Sep 17 00:00:00 2001 From: Ruairi Moran Date: Wed, 8 May 2024 14:03:42 +0100 Subject: [PATCH 15/15] how did that happen --- include/tensor.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tensor.cuh b/include/tensor.cuh index a6033b7..a154d63 100644 --- a/include/tensor.cuh +++ b/include/tensor.cuh @@ -1358,7 +1358,7 @@ inline void CholeskyBatchFactoriser::solve(DTensor &b) { if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with"); if (m_numRows != b.numRows() || m_numMats != b.numMats()) { throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible"); - + } if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column"); DTensor ptrA = m_matrix->pointersToMatrices(); DTensor ptrB = b.pointersToMatrices();