From 50099eb4ddcfd4360408e10aa3e32bcc1ce219ee Mon Sep 17 00:00:00 2001 From: Brian Staber Date: Sun, 21 Sep 2025 23:28:41 +0200 Subject: [PATCH 1/4] :rocket: More CUDA simple examples --- Justfile | 2 +- crates/cuda_hello/CMakeLists.txt | 6 +++--- crates/cuda_hello/{main.cu => hello.cu} | 0 crates/cuda_hello/reduce.cu | 0 crates/cuda_hello/vecadd.cu | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename crates/cuda_hello/{main.cu => hello.cu} (100%) create mode 100644 crates/cuda_hello/reduce.cu create mode 100644 crates/cuda_hello/vecadd.cu diff --git a/Justfile b/Justfile index 8eee4a4..a8d947c 100644 --- a/Justfile +++ b/Justfile @@ -20,7 +20,7 @@ default: configure build test # CMake lifecycle # ------------------------- configure: - cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} + cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} -DBUILD_CUDA=ON build: configure cmake --build {{BUILD_DIR}} -j diff --git a/crates/cuda_hello/CMakeLists.txt b/crates/cuda_hello/CMakeLists.txt index aaf93a0..0d93de5 100644 --- a/crates/cuda_hello/CMakeLists.txt +++ b/crates/cuda_hello/CMakeLists.txt @@ -7,9 +7,9 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON) -add_executable(cuda_hello main.cu) +add_executable(cuda_hello_world hello.cu) find_package(CUDAToolkit REQUIRED) -target_link_libraries(cuda_hello PRIVATE CUDA::cudart) +target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart) -set_property(TARGET cuda_hello PROPERTY CUDA_ARCHITECTURES native) +set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native) diff --git a/crates/cuda_hello/main.cu b/crates/cuda_hello/hello.cu similarity index 100% rename from crates/cuda_hello/main.cu rename to crates/cuda_hello/hello.cu diff --git a/crates/cuda_hello/reduce.cu b/crates/cuda_hello/reduce.cu new file mode 100644 index 0000000..e69de29 diff --git a/crates/cuda_hello/vecadd.cu b/crates/cuda_hello/vecadd.cu new file mode 
100644 index 0000000..e69de29 From f159d87ef20eb4107bc5765d3e41360e5b72077e Mon Sep 17 00:00:00 2001 From: Brian Staber Date: Mon, 22 Sep 2025 21:56:02 +0200 Subject: [PATCH 2/4] add saxpy --- crates/cuda_hello/CMakeLists.txt | 3 ++ crates/cuda_hello/saxpy.cu | 78 ++++++++++++++++++++++++++++++++ crates/cuda_hello/vecadd.cu | 0 3 files changed, 81 insertions(+) create mode 100644 crates/cuda_hello/saxpy.cu delete mode 100644 crates/cuda_hello/vecadd.cu diff --git a/crates/cuda_hello/CMakeLists.txt b/crates/cuda_hello/CMakeLists.txt index 0d93de5..9ffb0d8 100644 --- a/crates/cuda_hello/CMakeLists.txt +++ b/crates/cuda_hello/CMakeLists.txt @@ -8,8 +8,11 @@ set(CMAKE_BUILD_RPATH_USE_ORIGIN ON) set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON) add_executable(cuda_hello_world hello.cu) +add_executable(cuda_saxpy saxpy.cu) find_package(CUDAToolkit REQUIRED) target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart) +target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart) set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native) +set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native) diff --git a/crates/cuda_hello/saxpy.cu b/crates/cuda_hello/saxpy.cu new file mode 100644 index 0000000..8c606cc --- /dev/null +++ b/crates/cuda_hello/saxpy.cu @@ -0,0 +1,78 @@ +#include +#include +#include + +__global__ void saxpy(int n, float a, float *x, float *y){ + // threadIdx.x: thread index within the block + // blockIdx.x: block index within the grid + // blockDim.x: number of threads per block + // gridDim.x: number of blocks in the grid + + // global_id: unique index for each thread in the entire grid + int global_id = threadIdx.x + blockDim.x * blockIdx.x; + + // Example: gridDim.x = 2, blockDim.x = 4 + + // Block 0: threadIdx.x = [0,1,2,3] → global_id = [0,1,2,3] + // Block 1: threadIdx.x = [0,1,2,3] → global_id = [4,5,6,7] + + // stride: total number of threads in the grid + int stride = blockDim.x * gridDim.x; + + // Each thread processes multiple elements, 
striding by the total number of threads + // Striding ensures all elements are processed even if n > total threads + for (int i=global_id; i < n; i += stride) + { + y[i] = a * x[i] + y[i]; + } +} + +int main() { + // Set up data + const int N = 100; + float alpha = 3.14f; + float *h_x, *h_y; + float *d_x, *d_y; + size_t size = N * sizeof(float); + + // Allocate device memory + cudaMalloc(&d_x, size); + cudaMalloc(&d_y, size); + + // Initialize host data + h_x = (float*)malloc(size); + h_y = (float*)malloc(size); + + for (int i = 0; i < N; i++) { + h_x[i] = rand() / (float)RAND_MAX; + h_y[i] = rand() / (float)RAND_MAX; + } + + // Copy data to device + cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice); + + // Define block size (number of threads per block) + int blockSize = 4; + + // Calculate number of blocks needed + int numBlocks = (N + blockSize - 1) / blockSize; + + // Launch kernel + saxpy<<<numBlocks, blockSize>>>(N, alpha, d_x, d_y); + cudaDeviceSynchronize(); + + // Copy result back to host + cudaMemcpy(h_y, d_y, size, cudaMemcpyDeviceToHost); + for (int i = 0; i < N; i++) { + printf("h_y[%d] = %f\n", i, h_y[i]); + } + + // Clean up + free(h_x); + free(h_y); + cudaFree(d_x); + cudaFree(d_y); + + return 0; +} \ No newline at end of file diff --git a/crates/cuda_hello/vecadd.cu b/crates/cuda_hello/vecadd.cu deleted file mode 100644 index e69de29..0000000 From 79e48c48cb57f73c319fb1568e268aafb4e7d2d3 Mon Sep 17 00:00:00 2001 From: Brian Staber Date: Mon, 22 Sep 2025 22:04:57 +0200 Subject: [PATCH 3/4] a few tweaks --- crates/cuda_hello/saxpy.cu | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/crates/cuda_hello/saxpy.cu b/crates/cuda_hello/saxpy.cu index 8c606cc..9438cf8 100644 --- a/crates/cuda_hello/saxpy.cu +++ b/crates/cuda_hello/saxpy.cu @@ -1,6 +1,6 @@ #include #include -#include +#include __global__ void saxpy(int n, float a, float *x, float *y){ // threadIdx.x: thread index within
the block @@ -31,7 +31,6 @@ int main() { // Set up data const int N = 100; float alpha = 3.14f; - float *h_x, *h_y; float *d_x, *d_y; size_t size = N * sizeof(float); @@ -40,17 +39,15 @@ int main() { cudaMalloc(&d_y, size); // Initialize host data - h_x = (float*)malloc(size); - h_y = (float*)malloc(size); - - for (int i = 0; i < N; i++) { - h_x[i] = rand() / (float)RAND_MAX; - h_y[i] = rand() / (float)RAND_MAX; + std::vector<float> h_x(N), h_y(N); + for (int i = 0; i < N; ++i) { + h_x[i] = std::rand() / (float)RAND_MAX; + h_y[i] = std::rand() / (float)RAND_MAX; } // Copy data to device - cudaMemcpy(d_x, h_x, size, cudaMemcpyHostToDevice); - cudaMemcpy(d_y, h_y, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_x, h_x.data(), size, cudaMemcpyHostToDevice); + cudaMemcpy(d_y, h_y.data(), size, cudaMemcpyHostToDevice); // Define block size (number of threads per block) int blockSize = 4; @@ -63,14 +60,12 @@ int main() { cudaDeviceSynchronize(); // Copy result back to host - cudaMemcpy(h_y, d_y, size, cudaMemcpyDeviceToHost); + cudaMemcpy(h_y.data(), d_y, size, cudaMemcpyDeviceToHost); for (int i = 0; i < N; i++) { printf("h_y[%d] = %f\n", i, h_y[i]); } // Clean up - free(h_x); - free(h_y); cudaFree(d_x); cudaFree(d_y); From 85aa2392398b1b0b254fcd89894b244e6e7909dc Mon Sep 17 00:00:00 2001 From: Brian Staber Date: Mon, 22 Sep 2025 22:34:34 +0200 Subject: [PATCH 4/4] remove reduce for now --- crates/cuda_hello/CMakeLists.txt | 2 +- crates/cuda_hello/reduce.cu | 0 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 crates/cuda_hello/reduce.cu diff --git a/crates/cuda_hello/CMakeLists.txt b/crates/cuda_hello/CMakeLists.txt index 9ffb0d8..e3296ff 100644 --- a/crates/cuda_hello/CMakeLists.txt +++ b/crates/cuda_hello/CMakeLists.txt @@ -15,4 +15,4 @@ target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart) target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart) set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native) -set_property(TARGET cuda_saxpy
PROPERTY CUDA_ARCHITECTURES native) +set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native) \ No newline at end of file diff --git a/crates/cuda_hello/reduce.cu b/crates/cuda_hello/reduce.cu deleted file mode 100644 index e69de29..0000000