1+ #include < cstdio>
2+ #include < cuda_runtime.h>
3+ #include < vector>
4+
5+ __global__ void saxpy (int n, float a, float *x, float *y){
6+ // threadIdx.x: thread index within the block
7+ // blockIdx.x: block index within the grid
8+ // blockDim.x: number of threads per block
9+ // gridDim.x: number of blocks in the grid
10+
11+ // global_id: unique index for each thread in the entire grid
12+ int global_id = threadIdx .x + blockDim .x * blockIdx .x ;
13+
14+ // Example: gridDim.x = 2, blockDim.x = 4
15+
16+ // Block 0: threadIdx.x = [0,1,2,3] → global_id = [0,1,2,3]
17+ // Block 1: threadIdx.x = [0,1,2,3] → global_id = [4,5,6,7]
18+
19+ // stride: total number of threads in the grid
20+ int stride = blockDim .x * gridDim .x ;
21+
22+ // Each thread processes multiple elements, striding by the total number of threads
23+ // Striding ensures all elements are processed even if n > total threads
24+ for (int i=global_id; i < n; i += stride)
25+ {
26+ y[i] = a * x[i] + y[i];
27+ }
28+ }
29+
30+ int main () {
31+ // Set up data
32+ const int N = 100 ;
33+ float alpha = 3 .14f ;
34+ float *d_x, *d_y;
35+ size_t size = N * sizeof (float );
36+
37+ // Allocate device memory
38+ cudaMalloc (&d_x, size);
39+ cudaMalloc (&d_y, size);
40+
41+ // Initialize host data
42+ std::vector<float > h_x (N), h_y (N);
43+ for (int i = 0 ; i < N; ++i) {
44+ h_x[i] = std::rand () / (float )RAND_MAX;
45+ h_y[i] = std::rand () / (float )RAND_MAX;
46+ }
47+
48+ // Copy data to device
49+ cudaMemcpy (d_x, h_x.data (), size, cudaMemcpyHostToDevice);
50+ cudaMemcpy (d_y, h_y.data (), size, cudaMemcpyHostToDevice);
51+
52+ // Define block size (number of threads per block)
53+ int blockSize = 4 ;
54+
55+ // Calculate number of blocks needed
56+ int numBlocks = (N + blockSize - 1 ) / blockSize;
57+
58+ // Launch kernel
59+ saxpy<<<numBlocks, blockSize>>> (N, alpha, d_x, d_y);
60+ cudaDeviceSynchronize ();
61+
62+ // Copy result back to host
63+ cudaMemcpy (h_y.data (), d_y, size, cudaMemcpyDeviceToHost);
64+ for (int i = 0 ; i < N; i++) {
65+ printf (" h_y[%d] = %f\n " , i, h_y[i]);
66+ }
67+
68+ // Clean up
69+ cudaFree (d_x);
70+ cudaFree (d_y);
71+
72+ return 0 ;
73+ }
0 commit comments