Skip to content

C++17-proofing testTensor.cu #30

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ name: Continuous integration

jobs:
ci:
runs-on: self-hosted
runs-on: ${{ matrix.runner }}
strategy:
matrix:
runner: [orin, a40]
steps:
- name: checkout code
uses: actions/checkout@v4
Expand Down
76 changes: 45 additions & 31 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,49 @@
# GPUtils
# ====================================================================
cmake_minimum_required(VERSION 3.20 FATAL_ERROR)

if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.29")
cmake_policy(SET CMP0135 NEW)
endif()

# Set C++ version and SM architecture
if (NOT DEFINED CPPVERSION)
set(CPPVERSION 20) # A40: 20, Orin: 17
endif()
if (NOT DEFINED SM_ARCH)
set(SM_ARCH 86)# A40: 86, Orin: 87
endif()


project(GPUtils
DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
LANGUAGES CXX
)
DESCRIPTION "Easy use of vectors and matrices on GPGPU devices."
HOMEPAGE_URL "https://github.com/GPUEngineering/GPUtils"
LANGUAGES CXX
)
# ----
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # required for calling cuda kernels from cuda kernels
set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.3/bin/nvcc")
set(CMAKE_CUDA_ARCHITECTURES 86)
set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CUDA_FLAGS "-std=c++20")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -std=c++20)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda/bin/nvcc")
set(CMAKE_CUDA_ARCHITECTURES ${SM_ARCH})
set(CMAKE_CUDA_STANDARD ${CPPVERSION})
set(CMAKE_CXX_STANDARD ${CPPVERSION})
set(CMAKE_CUDA_FLAGS "-std=c++${CPPVERSION}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "-std=c++${CPPVERSION}")
enable_language(CUDA)
# ----
add_library(device_compiler_flags INTERFACE)
target_compile_features(device_compiler_flags INTERFACE cxx_std_20)
target_compile_features(device_compiler_flags INTERFACE cxx_std_${CPPVERSION})
set(CMAKE_CXX_EXTENSIONS OFF)
# ----
add_library(developer_flags INTERFACE)
set(cxx_flags -Wall)
set(cuda_flags -arch=sm_60 -std=c++20 -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
set(cuda_flags -arch=sm_${SM_ARCH} -std=c++${CPPVERSION} -Xcompiler=-Wall -Xcudafe=--display_error_number -g)
target_compile_options(developer_flags
INTERFACE
# flags for CXX builds
$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
# flags for CUDA builds
$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
)
INTERFACE
# flags for CXX builds
$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>
# flags for CUDA builds
$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>
)
target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:developer_flags>)
# ----

Expand All @@ -40,21 +54,21 @@ target_link_libraries(device_compiler_flags INTERFACE $<BUILD_INTERFACE:develope
# ====================================================================
add_executable(main)
target_sources(main
PRIVATE
main.cu
)
PRIVATE
main.cu
)
target_link_libraries(main
PRIVATE
device_compiler_flags
cublas
cusolver
cudadevrt
)
PRIVATE
device_compiler_flags
cublas
cusolver
cudadevrt
)
target_include_directories(main
PRIVATE
"${PROJECT_BINARY_DIR}"
"${PROJECT_SOURCE_DIR}/include"
)
PRIVATE
"${PROJECT_BINARY_DIR}"
"${PROJECT_SOURCE_DIR}/include"
)
# ----
add_subdirectory(test)
# ----
58 changes: 38 additions & 20 deletions ci/script.sh
Original file line number Diff line number Diff line change
@@ -1,44 +1,62 @@
#!/bin/bash
set -euxo pipefail


tests() {
    # Detect the host: Jetson Orin (sm_87, C++17) vs A40 (sm_86, C++20).
    cpp_version=17 # default
    sm_arch=86     # default
    # `|| true` keeps `set -e` from aborting when grep finds no match.
    # NOTE(review): the previous form ended the assignment with a bare `||`,
    # which made the whole `if` below the right-hand side of `||` — it only
    # ran when grep FAILED, so on a real Orin (grep succeeds) the defaults
    # (sm_arch=86) were silently kept. The `if` must run unconditionally.
    hwInfoOrin=$(lshw | grep Orin) || true
    if [ ! -z "${hwInfoOrin}" ]; then
        echo "Running on Orin"
        sm_arch=87
        cpp_version=17
    else
        echo "Not running on Orin"
        sm_arch=86
        cpp_version=20
    fi

    # ------------------------------------
    # Run tensor gtests
    # ------------------------------------

    # -- create build files (C++ standard and SM architecture chosen above)
    cmake -DCPPVERSION="${cpp_version}" -DSM_ARCH="${sm_arch}" -S . -B ./build -Wno-dev

    # -- build files in build folder
    cmake --build ./build

    # -- run tests
    ctest --test-dir ./build/test --output-on-failure

    # Compute sanitizer and the example build run only off-Orin (i.e. on A40).
    if [ -z "${hwInfoOrin}" ]; then

        # -- run compute sanitizer on the test binary; fail unless it reports "0 errors"
        cd ./build/test
        mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./device_test)
        grep "0 errors" <<< "$mem"
        cd ../..

        # ------------------------------------
        # Run example executable
        # ------------------------------------

        # -- create build files
        cd example
        cmake -DCPPVERSION="${cpp_version}" -DSM_ARCH="${sm_arch}" -S . -B ./build -Wno-dev

        # -- build files in build folder
        cmake --build ./build

        # -- run main.cu
        ./build/example_main

        # -- run compute sanitizer on the example; fail unless it reports "0 errors"
        cd ./build
        mem=$(/usr/local/cuda/bin/compute-sanitizer --tool memcheck --leak-check=full ./example_main)
        grep "0 errors" <<< "$mem"
    fi
}


Expand Down
14 changes: 10 additions & 4 deletions include/tensor.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1326,7 +1326,7 @@ public:
};

template<>
void CholeskyBatchFactoriser<double>::factorise() {
inline void CholeskyBatchFactoriser<double>::factorise() {
if (m_factorisationDone) return;
DTensor<double *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnDpotrfBatched(Session::getInstance().cuSolverHandle(),
Expand All @@ -1340,7 +1340,7 @@ void CholeskyBatchFactoriser<double>::factorise() {
}

template<>
void CholeskyBatchFactoriser<float>::factorise() {
inline void CholeskyBatchFactoriser<float>::factorise() {
if (m_factorisationDone) return;
DTensor<float *> ptrA = m_matrix->pointersToMatrices();
gpuErrChk(cusolverDnSpotrfBatched(Session::getInstance().cuSolverHandle(),
Expand All @@ -1354,8 +1354,11 @@ void CholeskyBatchFactoriser<float>::factorise() {
}

template<>
void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
inline void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
DTensor<double *> ptrA = m_matrix->pointersToMatrices();
DTensor<double *> ptrB = b.pointersToMatrices();
Expand All @@ -1372,8 +1375,11 @@ void CholeskyBatchFactoriser<double>::solve(DTensor<double> &b) {
}

template<>
void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
inline void CholeskyBatchFactoriser<float>::solve(DTensor<float> &b) {
if (!m_factorisationDone) throw std::logic_error("[CholeskyBatchSolve] no factor to solve with");
if (m_numRows != b.numRows() || m_numMats != b.numMats()) {
throw std::invalid_argument("[CholeskyBatchSolve] A and b incompatible");
}
if (b.numCols() != 1) throw std::invalid_argument("[CholeskyBatchSolve] only supports `b` with one column");
DTensor<float *> ptrA = m_matrix->pointersToMatrices();
DTensor<float *> ptrB = b.pointersToMatrices();
Expand Down
Loading