diff --git a/Justfile b/Justfile
index 8eee4a4..a8d947c 100644
--- a/Justfile
+++ b/Justfile
@@ -20,7 +20,7 @@ default: configure build test
 # CMake lifecycle
 # -------------------------
 configure:
-    cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}}
+    cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} -DBUILD_CUDA=ON
 
 build: configure
     cmake --build {{BUILD_DIR}} -j
diff --git a/crates/cuda_hello/CMakeLists.txt b/crates/cuda_hello/CMakeLists.txt
index aaf93a0..e3296ff 100644
--- a/crates/cuda_hello/CMakeLists.txt
+++ b/crates/cuda_hello/CMakeLists.txt
@@ -7,9 +7,12 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
 
-add_executable(cuda_hello main.cu)
+add_executable(cuda_hello_world hello.cu)
+add_executable(cuda_saxpy saxpy.cu)
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(cuda_hello PRIVATE CUDA::cudart)
+target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart)
+target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart)
 
-set_property(TARGET cuda_hello PROPERTY CUDA_ARCHITECTURES native)
+set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native)
+set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native)
\ No newline at end of file
diff --git a/crates/cuda_hello/main.cu b/crates/cuda_hello/hello.cu
similarity index 100%
rename from crates/cuda_hello/main.cu
rename to crates/cuda_hello/hello.cu
diff --git a/crates/cuda_hello/saxpy.cu b/crates/cuda_hello/saxpy.cu
new file mode 100644
index 0000000..9438cf8
--- /dev/null
+++ b/crates/cuda_hello/saxpy.cu
@@ -0,0 +1,93 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+// Abort with file/line context when a CUDA runtime call fails; otherwise a
+// failed allocation, copy or launch goes unnoticed and garbage is printed.
+#define CUDA_CHECK(call)                                                   \
+    do {                                                                   \
+        cudaError_t err_ = (call);                                         \
+        if (err_ != cudaSuccess) {                                         \
+            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,       \
+                         __LINE__, cudaGetErrorString(err_));              \
+            std::exit(EXIT_FAILURE);                                       \
+        }                                                                  \
+    } while (0)
+
+// Computes y[i] = a * x[i] + y[i] for every i in [0, n).
+// x and y are device pointers to at least n floats; x is only read.
+// Valid for any 1D launch configuration (see the stride loop below).
+__global__ void saxpy(int n, float a, const float *x, float *y){
+    // threadIdx.x: thread index within the block
+    // blockIdx.x: block index within the grid
+    // blockDim.x: number of threads per block
+    // gridDim.x: number of blocks in the grid
+
+    // global_id: unique index for each thread in the entire grid
+    int global_id = threadIdx.x + blockDim.x * blockIdx.x;
+
+    // Example: gridDim.x = 2, blockDim.x = 4
+
+    // Block 0: threadIdx.x = [0,1,2,3] → global_id = [0,1,2,3]
+    // Block 1: threadIdx.x = [0,1,2,3] → global_id = [4,5,6,7]
+
+    // stride: total number of threads in the grid
+    int stride = blockDim.x * gridDim.x;
+
+    // Each thread processes multiple elements, striding by the total number of threads
+    // Striding ensures all elements are processed even if n > total threads
+    for (int i = global_id; i < n; i += stride)
+    {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+int main() {
+    // Set up data
+    const int N = 100;
+    float alpha = 3.14f;
+    float *d_x = nullptr, *d_y = nullptr;
+    size_t size = N * sizeof(float);
+
+    // Allocate device memory
+    CUDA_CHECK(cudaMalloc(&d_x, size));
+    CUDA_CHECK(cudaMalloc(&d_y, size));
+
+    // Initialize host data
+    std::vector<float> h_x(N), h_y(N);
+    for (int i = 0; i < N; ++i) {
+        h_x[i] = std::rand() / (float)RAND_MAX;
+        h_y[i] = std::rand() / (float)RAND_MAX;
+    }
+
+    // Copy data to device
+    CUDA_CHECK(cudaMemcpy(d_x, h_x.data(), size, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_y, h_y.data(), size, cudaMemcpyHostToDevice));
+
+    // Define block size (number of threads per block)
+    // 4 matches the indexing example in the kernel comments; real workloads
+    // should use a multiple of the 32-thread warp size (e.g. 256)
+    int blockSize = 4;
+
+    // Calculate number of blocks needed
+    int numBlocks = (N + blockSize - 1) / blockSize;
+
+    // Launch kernel
+    saxpy<<<numBlocks, blockSize>>>(N, alpha, d_x, d_y);
+    // A launch returns no status: fetch launch-configuration errors explicitly,
+    // then synchronize to surface errors raised while the kernel was running
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Copy result back to host
+    CUDA_CHECK(cudaMemcpy(h_y.data(), d_y, size, cudaMemcpyDeviceToHost));
+    for (int i = 0; i < N; i++) {
+        std::printf("h_y[%d] = %f\n", i, h_y[i]);
+    }
+
+    // Clean up
+    CUDA_CHECK(cudaFree(d_x));
+    CUDA_CHECK(cudaFree(d_y));
+
+    return 0;
+}