Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ default: configure build test
# CMake lifecycle
# -------------------------
configure:
cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}}
cmake -S . -B {{BUILD_DIR}} {{GENERATOR}} {{CMAKE_FLAGS}} -DBUILD_CUDA=ON

build: configure
cmake --build {{BUILD_DIR}} -j
Expand Down
9 changes: 6 additions & 3 deletions crates/cuda_hello/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,12 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_BUILD_RPATH_USE_ORIGIN ON)
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)

add_executable(cuda_hello main.cu)
add_executable(cuda_hello_world hello.cu)
add_executable(cuda_saxpy saxpy.cu)

find_package(CUDAToolkit REQUIRED)
target_link_libraries(cuda_hello PRIVATE CUDA::cudart)
target_link_libraries(cuda_hello_world PRIVATE CUDA::cudart)
target_link_libraries(cuda_saxpy PRIVATE CUDA::cudart)

set_property(TARGET cuda_hello PROPERTY CUDA_ARCHITECTURES native)
set_property(TARGET cuda_hello_world PROPERTY CUDA_ARCHITECTURES native)
set_property(TARGET cuda_saxpy PROPERTY CUDA_ARCHITECTURES native)
File renamed without changes.
73 changes: 73 additions & 0 deletions crates/cuda_hello/saxpy.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <vector>

/**
 * SAXPY kernel: computes y[i] = a * x[i] + y[i] for every i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks. Any grid/block size produces the same
 * result because each thread walks the arrays with a grid-stride loop.
 * Uses no shared memory. Does nothing when n <= 0.
 *
 * @param n  number of elements of x and y to process
 * @param a  scalar multiplier applied to each x[i]
 * @param x  input vector, only read; must be readable from device code
 * @param y  input/output vector, updated in place; must be writable from
 *           device code
 */
__global__ void saxpy(int n, float a, const float *x, float *y) {
    // threadIdx.x: thread index within the block
    // blockIdx.x:  block index within the grid
    // blockDim.x:  number of threads per block
    // gridDim.x:   number of blocks in the grid

    // Indices are kept in 64-bit signed integers: with plain int, `i += stride`
    // can run past INT_MAX when n is close to INT_MAX, which is signed overflow.
    // A signed type also keeps the "n <= 0 does nothing" behaviour.

    // global_id: unique index for each thread in the entire grid
    //
    // Example: gridDim.x = 2, blockDim.x = 4
    //   Block 0: threadIdx.x = [0,1,2,3] -> global_id = [0,1,2,3]
    //   Block 1: threadIdx.x = [0,1,2,3] -> global_id = [4,5,6,7]
    const long long global_id =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    // stride: total number of threads in the grid
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;

    // Each thread processes multiple elements, striding by the total number of
    // threads, so all elements are processed even if n > total threads.
    for (long long i = global_id; i < n; i += stride) {
        y[i] = a * x[i] + y[i];
    }
}

// Prints a diagnostic naming the failed step and terminates the process if a
// CUDA runtime call did not return cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error during %s: %s\n", what,
                     cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

/**
 * Host driver for the saxpy kernel.
 *
 * Fills two N-element host vectors with pseudo-random values in [0, 1],
 * copies them to the device, runs y = alpha * x + y on the GPU, copies y back
 * and prints every element. Every CUDA runtime call is checked; on failure the
 * program prints the error and exits with EXIT_FAILURE.
 *
 * @return 0 on success
 */
int main() {
    // Set up data
    const int N = 100;
    const float alpha = 3.14f;
    const size_t size = N * sizeof(float);

    // Allocate device memory
    float *d_x = nullptr;
    float *d_y = nullptr;
    checkCuda(cudaMalloc(&d_x, size), "cudaMalloc(d_x)");
    checkCuda(cudaMalloc(&d_y, size), "cudaMalloc(d_y)");

    // Initialize host data (std::rand is left unseeded, so runs are repeatable)
    std::vector<float> h_x(N), h_y(N);
    for (int i = 0; i < N; ++i) {
        h_x[i] = std::rand() / static_cast<float>(RAND_MAX);
        h_y[i] = std::rand() / static_cast<float>(RAND_MAX);
    }

    // Copy data to device
    checkCuda(cudaMemcpy(d_x, h_x.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_x -> d_x)");
    checkCuda(cudaMemcpy(d_y, h_y.data(), size, cudaMemcpyHostToDevice),
              "cudaMemcpy(h_y -> d_y)");

    // Threads per block: a multiple of the 32-lane warp size so no warp is
    // launched mostly empty (a block of 4 would leave 28 lanes idle per warp).
    const int blockSize = 256;

    // Number of blocks needed to cover N elements (ceiling division)
    const int numBlocks = (N + blockSize - 1) / blockSize;

    // Launch kernel
    saxpy<<<numBlocks, blockSize>>>(N, alpha, d_x, d_y);
    // A launch returns no status itself: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronization.
    checkCuda(cudaGetLastError(), "saxpy launch");
    checkCuda(cudaDeviceSynchronize(), "saxpy execution");

    // Copy result back to host
    checkCuda(cudaMemcpy(h_y.data(), d_y, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_y -> h_y)");
    for (int i = 0; i < N; i++) {
        printf("h_y[%d] = %f\n", i, h_y[i]);
    }

    // Clean up
    checkCuda(cudaFree(d_x), "cudaFree(d_x)");
    checkCuda(cudaFree(d_y), "cudaFree(d_y)");

    return 0;
}