diff --git a/Justfile b/Justfile
index 8eee4a4..a8d947c 100644
--- a/Justfile
+++ b/Justfile
@@ -20,7 +20,7 @@ default: configure build test
 # CMake lifecycle
 # -------------------------
 configure:
-    cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}}
+    cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} -DBUILD_CUDA=ON
 
 build: configure
     cmake --build {{BUILD_DIR}} -j
diff --git a/crates/cuda_hello/CMakeLists.txt b/crates/cuda_hello/CMakeLists.txt
index aaf93a0..e3296ff 100644
--- a/crates/cuda_hello/CMakeLists.txt
+++ b/crates/cuda_hello/CMakeLists.txt
@@ -7,9 +7,12 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
 
-add_executable(cuda_hello main.cu)
+add_executable(cuda_hello_world hello.cu)
+add_executable(cuda_saxpy saxpy.cu)
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(cuda_hello PRIVATE CUDA::cudart)
+target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart)
+target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart)
 
-set_property(TARGET cuda_hello PROPERTY CUDA_ARCHITECTURES native)
+set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native)
+set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native)
\ No newline at end of file
diff --git a/crates/cuda_hello/main.cu b/crates/cuda_hello/hello.cu
similarity index 100%
rename from crates/cuda_hello/main.cu
rename to crates/cuda_hello/hello.cu
diff --git a/crates/cuda_hello/saxpy.cu b/crates/cuda_hello/saxpy.cu
new file mode 100644
index 0000000..9438cf8
--- /dev/null
+++ b/crates/cuda_hello/saxpy.cu
@@ -0,0 +1,74 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cuda_runtime.h>
+#include <vector>
+
+// SAXPY kernel: computes y[i] = a * x[i] + y[i] for every i in [0, n).
+//   n: number of elements (no work is done when n <= 0); a: scalar multiplier;
+//   x: device-accessible input array, only read; y: device-accessible array, updated in place.
+// Uses a grid-stride loop, so any 1D launch configuration covers all n elements.
+__global__ void saxpy(int n, float a, const float *x, float *y){
+    // threadIdx.x: thread index within the block
+    // blockIdx.x: block index within the grid
+    // blockDim.x: number of threads per block
+    // gridDim.x: number of blocks in the grid
+
+    // Example: gridDim.x = 2, blockDim.x = 4
+    // Block 0: threadIdx.x = [0,1,2,3] → first = [0,1,2,3]
+    // Block 1: threadIdx.x = [0,1,2,3] → first = [4,5,6,7]
+
+    // first: unique index of this thread in the whole grid; stride: total thread count.
+    // Both use 64-bit math so neither the product nor `i += stride` can overflow.
+    const long long first = threadIdx.x + (long long)blockDim.x * blockIdx.x;
+    const long long stride = (long long)blockDim.x * gridDim.x;
+
+    // Striding lets each thread take several elements, covering all n even if n > total threads.
+    for (long long i = first; i < n; i += stride) {
+        y[i] = a * x[i] + y[i];
+    }
+}
+
+// Host driver: fills two N-element vectors with pseudo-random values, runs saxpy on the
+// device and prints y. A failed CUDA call is reported on stderr and exits with EXIT_FAILURE.
+int main() {
+    auto check = [](cudaError_t err, const char *what) {
+        if (err == cudaSuccess) return;
+        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
+        std::exit(EXIT_FAILURE);
+    };
+    // Set up host data
+    const int N = 100;
+    const float alpha = 3.14f;
+    const size_t size = N * sizeof(float);
+    std::vector<float> h_x(N), h_y(N);
+    for (int i = 0; i < N; ++i) {
+        h_x[i] = std::rand() / (float)RAND_MAX;
+        h_y[i] = std::rand() / (float)RAND_MAX;
+    }
+
+    // Allocate device memory and copy the inputs to it
+    float *d_x = nullptr, *d_y = nullptr;
+    check(cudaMalloc(&d_x, size), "cudaMalloc(d_x)");
+    check(cudaMalloc(&d_y, size), "cudaMalloc(d_y)");
+    check(cudaMemcpy(d_x, h_x.data(), size, cudaMemcpyHostToDevice), "copy of x to device");
+    check(cudaMemcpy(d_y, h_y.data(), size, cudaMemcpyHostToDevice), "copy of y to device");
+
+    // Threads per block, and the ceiling of N / blockSize blocks so every element is covered
+    const int blockSize = 4;
+    const int numBlocks = (N + blockSize - 1) / blockSize;
+
+    // A launch returns no status: query launch errors, then sync to surface execution errors.
+    saxpy<<<numBlocks, blockSize>>>(N, alpha, d_x, d_y);
+    check(cudaGetLastError(), "saxpy launch");
+    check(cudaDeviceSynchronize(), "saxpy execution");
+
+    // Copy result back to host and print it
+    check(cudaMemcpy(h_y.data(), d_y, size, cudaMemcpyDeviceToHost), "copy of y to host");
+    for (int i = 0; i < N; i++) {
+        printf("h_y[%d] = %f\n", i, h_y[i]);
+    }
+    // Clean up
+    check(cudaFree(d_x), "cudaFree(d_x)");
+    check(cudaFree(d_y), "cudaFree(d_y)");
+    return 0;
+}
\ No newline at end of file