Skip to content

Add CUDA support for low-level field arithmetic #15

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions gpu-poly/src/cuda/fft.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <cuda.h>
#include "frarith.h"
#include <iostream>


/*==================== CUDA KERNELS ====================*/

/*
Inner FFT loop
*/
__device__ void inplace_fft_inner(uint64_t *__restrict__ A, uint64_t *__restrict__ tw, int j, int k, int m, int n) {
if (j + k + m / 2 < n && k < m / 2) {
unsigned r = j + k + m / 2;
unsigned l = j + k;
unsigned g = k * (n / m);

Mul(A + (r << 2), tw + (g << 2), A + (r << 2));

uint64_t t[4]{
A[(l << 2) + 0],
A[(l << 2) + 1],
A[(l << 2) + 2],
A[(l << 2) + 3]
};

Add(A + (l << 2), A + (r << 2), A + (l << 2));
Sub(t, A + (r << 2), A + (r << 2));
}
}

/*
Reorders array by bit-reversing the indices
*/
__global__ void bit_reverse(uint64_t *__restrict__ A, uint64_t *__restrict__ tmp, int s, size_t nthr) {
int id = blockIdx.x * nthr + threadIdx.x;
int n = 1 << s;
int shifted = __brev(id) >> (32 - s);
if (id < n && shifted < n) {
A[shifted * 4 + 0] = tmp[id * 4 + 0];
A[shifted * 4 + 1] = tmp[id * 4 + 1];
A[shifted * 4 + 2] = tmp[id * 4 + 2];
A[shifted * 4 + 3] = tmp[id * 4 + 3];
}
}

/*
FFT if we have enough threads
*/
__global__ void inplace_fft(uint64_t *__restrict__ A, uint64_t *__restrict__ tw, int j, int m, int n, size_t nthr) {
int k = blockIdx.x * nthr + threadIdx.x;
inplace_fft_inner(A, tw, j, k, m, n);
}

/*
FFT if we don't have enough threads
*/
__global__ void inplace_fft_outer(uint64_t *__restrict__ A, uint64_t *__restrict__ tw, int m, int n, size_t nthr) {
int j = (blockIdx.x * nthr + threadIdx.x) * m;
for (int k = 0; k < m / 2; k++) {
inplace_fft_inner(A, tw, j, k, m, n);
}
}

// #define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
// void checkLast(const char* const file, const int line)
// {
// cudaError_t err{cudaGetLastError()};
// if (err != cudaSuccess)
// {
// std::cerr << "CUDA Runtime Error at: " << file << ":" << line
// << std::endl;
// std::cerr << cudaGetErrorString(err) << std::endl;
// // We don't exit when we encounter CUDA errors in this example.
// // std::exit(EXIT_FAILURE);
// }
// }

extern "C" {
void fft(uint64_t *a, uint64_t *omegas, size_t n, bool invert, int balance, size_t threads) {
// NOTE: n is the number of ELEMENTS,
// The total list size is n * S
uint64_t list_size = n * S;

// Create array from input vector
uint64_t buffer_size = list_size * sizeof(uint64_t);

// Allocate array
uint64_t *data_array = (uint64_t *) malloc(buffer_size);
for (int i = 0; i < list_size; i++) {
data_array[i] = a[i];
}

// Copy data to FPGA using 2 arrays for bitreverse
uint64_t *A, *tmp, *tw;
cudaMalloc((void **) &A, buffer_size);
cudaMalloc((void **) &tmp, buffer_size);
cudaMalloc((void **) &tw, buffer_size); // TODO: I think this will/can be smaller than buffer_size
cudaMemcpy(tmp, data_array, buffer_size, cudaMemcpyHostToDevice);
cudaMemcpy(tw, omegas, buffer_size / 2, cudaMemcpyHostToDevice);

// Bit reverse ordering
int s = log2(n);
bit_reverse<<<ceil((float)n / threads), threads>>>(A, tmp, s, threads);

// Synchronize
cudaDeviceSynchronize();
// Iterative FFT with loop parallelism balancing
for (int i = 1; i <= s; i++) {
int m = 1 << i;
if (n / m > balance) {
inplace_fft_outer<<<ceil((float)n / m / threads), threads>>>(A, tw, m, n, threads);
} else {
for (int j = 0; j < n; j += m) {
float repeats = m / 2;
inplace_fft<<<ceil(repeats / threads), threads>>>(A, tw, j, m, n, threads);
}
}
}

// Copy back result
uint64_t *result;
result = (uint64_t *) malloc(buffer_size);
cudaMemcpy(result, A, buffer_size, cudaMemcpyDeviceToHost);

for (int i = 0; i < list_size; i++) {
a[i] = result[i];
}

free(data_array);
free(result);
cudaFree(A);
cudaFree(tmp);
cudaFree(tw);

return;
}
}
33 changes: 33 additions & 0 deletions gpu-poly/src/cuda/includes/bn254.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifndef __BN254_H__
#define __BN254_H__

// K -> Num bits to store a field element
#define K 256

// S -> Number of words to store a field element
#define S 4

#include <cuda.h>

// Constants for BN254
__device__ uint64_t MODULUS[S]{
4891460686036598785,
2896914383306846353,
13281191951274694749,
3486998266802970665
};
__device__ uint64_t R[S]{
12436184717236109307,
3962172157175319849,
7381016538464732718,
1011752739694698287
};
__device__ uint64_t UNITY[S]{
7164790868263648668,
11685701338293206998,
6216421865291908056,
1756667274303109607
};
__device__ uint64_t montConstant = 14042775128853446655;

#endif
Loading