Skip to content

Commit 4b17d77

Browse files
authored
🚀 CUDA saxpy (#8)
1 parent 159997b commit 4b17d77

File tree

4 files changed

+80
-4
lines changed

4 files changed

+80
-4
lines changed

Justfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ default: configure build test
2020
# CMake lifecycle
2121
# -------------------------
2222
configure:
23-
cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}}
23+
cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} -DBUILD_CUDA=ON
2424

2525
build: configure
2626
cmake --build {{BUILD_DIR}} -j

crates/cuda_hello/CMakeLists.txt

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,12 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
77
set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
88
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
99

10-
add_executable(cuda_hello main.cu)
10+
add_executable(cuda_hello_world hello.cu)
11+
add_executable(cuda_saxpy saxpy.cu)
1112

1213
find_package(CUDAToolkit REQUIRED)
13-
target_link_libraries(cuda_hello PRIVATE CUDA::cudart)
14+
target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart)
15+
target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart)
1416

15-
set_property(TARGET cuda_hello PROPERTY CUDA_ARCHITECTURES native)
17+
set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native)
18+
set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native)
File renamed without changes.

crates/cuda_hello/saxpy.cu

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <vector>
4+
5+
// SAXPY kernel: computes y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Parameters:
//   n - number of elements to process; n <= 0 makes the kernel a no-op
//   a - scalar multiplier applied to each x[i]
//   x - input vector (read-only), must hold at least n floats and be
//       accessible from device code
//   y - input/output vector, must hold at least n floats and be accessible
//       from device code; updated in place
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size produces the same
// result, because each thread strides through the data by the total number
// of threads in the grid.
__global__ void saxpy(int n, float a, const float *x, float *y){
    // threadIdx.x: thread index within the block
    // blockIdx.x: block index within the grid
    // blockDim.x: number of threads per block
    // gridDim.x: number of blocks in the grid

    // global_id: unique index for each thread in the entire grid.
    // Computed in 64 bits so blockDim.x * blockIdx.x cannot wrap around.
    const long long global_id =
        (long long)blockDim.x * blockIdx.x + threadIdx.x;

    // Example: gridDim.x = 2, blockDim.x = 4

    // Block 0: threadIdx.x = [0,1,2,3] → global_id = [0,1,2,3]
    // Block 1: threadIdx.x = [0,1,2,3] → global_id = [4,5,6,7]

    // stride: total number of threads in the grid (64-bit for the same reason)
    const long long stride = (long long)blockDim.x * gridDim.x;

    // Each thread processes multiple elements, striding by the total number of threads.
    // Striding ensures all elements are processed even if n > total threads.
    // The 64-bit loop index keeps `i += stride` from overflowing when n is
    // close to INT_MAX, which would otherwise yield a negative index.
    for (long long i = global_id; i < n; i += stride)
    {
        y[i] = a * x[i] + y[i];
    }
}
29+
30+
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Demo driver: fills two host vectors with random values in [0, 1], runs
// y = alpha * x + y on the GPU via the saxpy kernel, and prints the result.
//
// Returns 0 on success and 1 if any CUDA call fails (the failure is reported
// on stderr). Device buffers are released on every path.
int main() {
    // Set up data
    const int N = 100;
    const float alpha = 3.14f;
    const size_t size = N * sizeof(float);

    // Null-initialised so the clean-up below is safe even if allocation fails
    // (cudaFree(nullptr) is a no-op).
    float *d_x = nullptr;
    float *d_y = nullptr;

    // Initialize host data
    std::vector<float> h_x(N), h_y(N);
    for (int i = 0; i < N; ++i) {
        h_x[i] = std::rand() / (float)RAND_MAX;
        h_y[i] = std::rand() / (float)RAND_MAX;
    }

    // Allocate device memory and copy the inputs over. The && chain stops at
    // the first failure so later calls never run on invalid buffers.
    bool ok = cudaOk(cudaMalloc(&d_x, size), "cudaMalloc(d_x)")
           && cudaOk(cudaMalloc(&d_y, size), "cudaMalloc(d_y)")
           && cudaOk(cudaMemcpy(d_x, h_x.data(), size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(h_x -> d_x)")
           && cudaOk(cudaMemcpy(d_y, h_y.data(), size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(h_y -> d_y)");

    if (ok) {
        // Define block size (number of threads per block).
        // NOTE(review): 4 is far below the 32-thread warp size; presumably kept
        // small to mirror the kernel's worked example. Prefer 128-256 for
        // real workloads.
        const int blockSize = 4;

        // Calculate number of blocks needed (ceiling division)
        const int numBlocks = (N + blockSize - 1) / blockSize;

        // Launch kernel
        saxpy<<<numBlocks, blockSize>>>(N, alpha, d_x, d_y);

        // A launch returns no status itself: cudaGetLastError() catches bad
        // launch configurations, and the synchronize surfaces faults raised
        // while the kernel was running.
        ok = cudaOk(cudaGetLastError(), "saxpy launch")
          && cudaOk(cudaDeviceSynchronize(), "saxpy execution")
          // Copy result back to host
          && cudaOk(cudaMemcpy(h_y.data(), d_y, size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(d_y -> h_y)");
    }

    // Only print when the results are known to be valid.
    if (ok) {
        for (int i = 0; i < N; i++) {
            printf("h_y[%d] = %f\n", i, h_y[i]);
        }
    }

    // Clean up (runs on success and failure alike)
    ok = cudaOk(cudaFree(d_x), "cudaFree(d_x)") && ok;
    ok = cudaOk(cudaFree(d_y), "cudaFree(d_y)") && ok;

    return ok ? 0 : 1;
}

0 commit comments

Comments
 (0)