diff --git a/Cargo.toml b/Cargo.toml
index 6ce1952d..b806f3c7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,18 +28,23 @@ num = "0.4"
 rand = { version = "0.8.5", default-features = false, features = ["small_rng"] }
 rand_distr = { version = "0.4", optional = true }
 serde = { version = "1", features = ["derive"], optional = true }
+wgpu = { version = "25.0.2", optional = true }
+pollster = { version = "0.4.0", optional = true }
+bytemuck = { version = "1.23.0", optional = true }
+lazy_static = { version = "1.5.0", optional = true }
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 typetag = { version = "0.2", optional = true }
 
 [features]
-default = []
+default = ["gpu"]
 serde = ["dep:serde", "dep:typetag"]
 ndarray-bindings = ["dep:ndarray"]
 datasets = ["dep:rand_distr", "std_rand", "serde"]
 std_rand = ["rand/std_rng", "rand/std"]
 # used by wasm32-unknown-unknown for in-browser usage
 js = ["getrandom/js"]
+gpu = ["wgpu", "pollster", "bytemuck", "lazy_static"]
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]
 getrandom = { version = "0.2.8", optional = true }
diff --git a/src/error/gpu_error.rs b/src/error/gpu_error.rs
new file mode 100644
index 00000000..0c497b67
--- /dev/null
+++ b/src/error/gpu_error.rs
@@ -0,0 +1,37 @@
+
+use std::fmt;
+
+#[derive(Debug)]
+pub enum GpuError {
+    NoAdapter(String),
+    NoShader,
+    NoDevice(String),
+    InvalidWorkgroupSize,
+    MutexLock(String),
+    WorkerConversion,
+    ParamsBufferNotFound,
+    Generic(String),
+}
+
+impl std::error::Error for GpuError {}
+
+impl fmt::Display for GpuError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::NoAdapter(err) => write!(f, "Unable to create GPU adapter, error: {}", err),
+            Self::NoShader => write!(f, "GPU adapter does not support compute shaders."),
+            Self::NoDevice(err) => write!(f, "Unable to create device on GPU, error: {}", err),
+            Self::InvalidWorkgroupSize => write!(f, "Workgroup size must be 64, 128, 256, 512 or 1024"),
+            Self::MutexLock(msg) => write!(f, "Unable to lock mutex: {}", msg),
+            Self::WorkerConversion => write!(f, "Unable to convert into GpuWorker"),
+            Self::ParamsBufferNotFound => write!(f, "Unable to update params buffer: this worker has no params buffer"),
+            Self::Generic(msg) => write!(f, "{}", msg),
+        }
+    }
+}
+
+impl From<std::io::Error> for GpuError {
+    fn from(err: std::io::Error) -> Self {
+        GpuError::Generic(err.to_string())
+    }
+}
diff --git a/src/error/mod.rs b/src/error/mod.rs
index b6b1d982..5bea25de 100644
--- a/src/error/mod.rs
+++ b/src/error/mod.rs
@@ -2,6 +2,12 @@
 use std::error::Error;
 use std::fmt;
 
+#[cfg(feature = "gpu")]
+pub use self::gpu_error::GpuError;
+
+#[cfg(feature = "gpu")]
+mod gpu_error;
+
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
diff --git a/src/gpu/adapter.rs b/src/gpu/adapter.rs
new file mode 100644
index 00000000..b69ae4a2
--- /dev/null
+++ b/src/gpu/adapter.rs
@@ -0,0 +1,46 @@
+
+use crate::error::GpuError;
+
+#[derive(Clone)]
+pub struct GpuAdapter {
+    adapter: wgpu::Adapter,
+    pub device: wgpu::Device,
+    pub queue: wgpu::Queue,
+    pub max_workgroup_size: u32,
+}
+
+impl GpuAdapter {
+    pub fn new() -> Result<Self, GpuError> {
+
+        let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
+        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))
+            .map_err(|e| GpuError::NoAdapter(e.to_string()))?;
+        println!("Running on Adapter: {:#?}", adapter.get_info());
+
+        // Ensure the adapter supports compute shaders
+        let downlevel_capabilities = adapter.get_downlevel_capabilities();
+        if !downlevel_capabilities.flags.contains(wgpu::DownlevelFlags::COMPUTE_SHADERS) {
+            return Err(GpuError::NoShader);
+        }
+
+        // Create device and queue
+        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
+            label: None,
+            required_features: wgpu::Features::empty(),
+            required_limits: wgpu::Limits::downlevel_defaults(),
+            memory_hints: wgpu::MemoryHints::MemoryUsage,
+            trace: wgpu::Trace::Off,
+        }))
+        .map_err(|e| GpuError::NoDevice(e.to_string()))?;
+
+        // The device limit caps the workgroup sizes we may request
+        let limits = device.limits();
+        let max_workgroup_size: u32 = limits.max_compute_invocations_per_workgroup;
+
+        Ok(Self { adapter, device, queue, max_workgroup_size })
+    }
+}
diff --git a/src/gpu/algorithm.rs b/src/gpu/algorithm.rs
new file mode 100644
index 00000000..50e62fa8
--- /dev/null
+++ b/src/gpu/algorithm.rs
@@ -0,0 +1,5 @@
+
+// Not yet wired into the module tree; src/gpu/mod.rs currently defines GpuAlgorithm itself.
+pub enum GpuAlgorithm {
+    LogisticRegressionGradientDescentBinaryClassification,
+    LogisticRegressionGradientDescentMultiClassification,
+}
diff --git a/src/gpu/buffer.rs b/src/gpu/buffer.rs
new file mode 100644
index 00000000..f0c57e14
--- /dev/null
+++ b/src/gpu/buffer.rs
@@ -0,0 +1,93 @@
+
+use wgpu::util::DeviceExt;
+use crate::numbers::basenum::Number;
+use super::{GpuAdapter, GpuMatrix};
+
+#[derive(Clone, Copy, Eq, PartialEq, Hash)]
+pub enum GpuBuffer {
+    Samples,
+    Targets,
+    Weights,
+    TempStorage,
+    Params,
+    Download,
+}
+
+impl GpuBuffer {
+    pub fn included_in_bind_group(&self) -> bool {
+        match self {
+            GpuBuffer::Download => false,
+            _ => true,
+        }
+    }
+
+    pub fn is_read_only(&self) -> bool {
+        match self {
+            GpuBuffer::Weights => false,
+            GpuBuffer::TempStorage => false,
+            _ => true,
+        }
+    }
+}
+
+pub fn create_samples(adapter: &GpuAdapter, matrix: &GpuMatrix) -> wgpu::Buffer {
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Samples"),
+        contents: bytemuck::cast_slice(&matrix.data),
+        usage: wgpu::BufferUsages::STORAGE,
+    })
+}
+
+pub fn create_targets<T>(adapter: &GpuAdapter, targets: &Vec<T>) -> wgpu::Buffer
+where
+    T: Number + Ord,
+{
+    let u_targets = targets.iter().filter_map(|&val| val.to_u32()).collect::<Vec<u32>>();
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Targets"),
+        contents: bytemuck::cast_slice(&u_targets),
+        usage: wgpu::BufferUsages::STORAGE,
+    })
+}
+
+pub fn create_weights(adapter: &GpuAdapter, num_features: usize) -> wgpu::Buffer {
+    let zeros = vec![0.0f32; num_features];
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Weights"),
+        contents: bytemuck::cast_slice(&zeros),
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+    })
+}
+
+pub fn create_temp_storage(adapter: &GpuAdapter, buffer_size: u64) -> wgpu::Buffer {
+    adapter.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("Temp Storage"),
+        size: buffer_size,
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    })
+}
+
+pub fn create_params(adapter: &GpuAdapter, data: &Vec<u32>) -> wgpu::Buffer {
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Params"),
+        contents: bytemuck::cast_slice(data.as_slice()),
+        usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+    })
+}
+
+pub fn create_download(adapter: &GpuAdapter, buffer_size: u64) -> wgpu::Buffer {
+    adapter.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("Download"),
+        size: buffer_size,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    })
+}
diff --git a/src/gpu/layout.rs b/src/gpu/layout.rs
new file mode 100644
index 00000000..41dee2ee
--- /dev/null
+++ b/src/gpu/layout.rs
@@ -0,0 +1,83 @@
+
+use super::{GpuAdapter, GpuBuffer};
+
+#[derive(Default, Copy, Clone, Eq, PartialEq, Hash)]
+pub enum GpuLayout {
+    #[default]
+    Supervised, // Samples + Targets + Weights + TempStorage + Params
+    //Clustering, // Samples + Centroids + Assignments + Params
+    //Decomposition, // Samples + Vectors + Values + TempStorage + Params
+}
+
+#[derive(Clone)]
+pub struct GpuResourceLayout {
+    pub bind_group_layout: wgpu::BindGroupLayout,
+    pub pipeline_layout: wgpu::PipelineLayout,
+}
+
+impl GpuLayout {
+    pub fn create_resource_layout(&self, adapter: &GpuAdapter) -> GpuResourceLayout {
+
+        // Bind group layout
+        let bind_group_layout = self.create_bind_group_layout(adapter);
+
+        // Pipeline layout
+        let pipeline_layout = adapter.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&bind_group_layout],
+            push_constant_ranges: &[],
+        });
+
+        GpuResourceLayout {
+            bind_group_layout,
+            pipeline_layout,
+        }
+    }
+
+    pub fn get_buffer_templates(&self) -> Vec<GpuBuffer> {
+        match self {
+            GpuLayout::Supervised => vec![GpuBuffer::Samples, GpuBuffer::Targets, GpuBuffer::Weights, GpuBuffer::TempStorage, GpuBuffer::Params],
+        }
+    }
+
+    pub fn get_buffer_index(&self, buffer: GpuBuffer) -> Option<usize> {
+        let templates = self.get_buffer_templates();
+        templates.iter().position(|&buf| buf == buffer)
+    }
+
+    fn create_bind_group_layout(&self, adapter: &GpuAdapter) -> wgpu::BindGroupLayout {
+
+        let templates = self.get_buffer_templates();
+        let mut layout_entries: Vec<wgpu::BindGroupLayoutEntry> = Vec::new();
+
+        let mut binding_num = 0;
+        for template in templates.iter() {
+
+            if !template.included_in_bind_group() {
+                continue;
+            }
+
+            // The params buffer is created with UNIFORM usage and declared as a
+            // uniform in WGSL; everything else is a storage buffer.
+            let ty = if *template == GpuBuffer::Params {
+                wgpu::BufferBindingType::Uniform
+            } else {
+                wgpu::BufferBindingType::Storage { read_only: template.is_read_only() }
+            };
+
+            layout_entries.push(wgpu::BindGroupLayoutEntry {
+                binding: binding_num,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty,
+                    has_dynamic_offset: false,
+                    min_binding_size: None,
+                },
+                count: None,
+            });
+
+            binding_num += 1;
+        }
+
+        adapter.device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &layout_entries,
+        })
+    }
+}
diff --git a/src/gpu/matrix.rs b/src/gpu/matrix.rs
new file mode 100644
index 00000000..e30b1c70
--- /dev/null
+++ b/src/gpu/matrix.rs
@@ -0,0 +1,52 @@
+
+use super::STATION;
+
+#[derive(Default)]
+pub struct GpuMatrix {
+    pub rows: usize,
+    pub cols: usize,
+    pub data: Vec<f32>,
+}
+
+impl GpuMatrix {
+    pub fn get_workgroup_size(&self) -> usize {
+        let mut size: usize = match self.cols {
+            c if c <= 64 => 64,
+            c if c <= 128 => 128,
+            c if c <= 256 => 256,
+            c if c <= 512 => 512,
+            _ => 1024,
+        };
+
+        // Clamp to what the device actually supports
+        if let Ok(adapter) = STATION.get_adapter() {
+            if size > adapter.max_workgroup_size as usize {
+                size = adapter.max_workgroup_size as usize;
+            }
+        }
+
+        size
+    }
+}
+
+impl From<Vec<Vec<f32>>> for GpuMatrix {
+    fn from(data: Vec<Vec<f32>>) -> Self {
+        Self {
+            rows: data.len(),
+            cols: data[0].len(),
+            data: data.into_iter().flatten().collect(),
+        }
+    }
+}
+
+impl From<&Vec<Vec<f32>>> for GpuMatrix {
+    fn from(data: &Vec<Vec<f32>>) -> Self {
+        Self {
+            rows: data.len(),
+            cols: data[0].len(),
+            data: data.iter().flatten().copied().collect(),
+        }
+    }
+}
diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs
new file mode 100644
index 00000000..fb501884
--- /dev/null
+++ b/src/gpu/mod.rs
@@ -0,0 +1,38 @@
+
+use crate::error::GpuError;
+
+use lazy_static::lazy_static;
+
+lazy_static! {
+    pub static ref STATION: GpuStation = GpuStation::new();
+}
+
+pub use self::adapter::GpuAdapter;
+pub use self::buffer::GpuBuffer;
+pub use self::layout::{GpuLayout, GpuResourceLayout};
+pub use self::matrix::GpuMatrix;
+pub use self::params::GpuParams;
+pub use self::station::{GpuStation, GpuWorkgroup};
+pub use self::worker::GpuWorker;
+
+mod adapter;
+pub mod buffer;
+mod layout;
+mod matrix;
+pub mod models;
+mod params;
+mod station;
+mod worker;
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+pub enum GpuAlgorithm {
+    LogisticRegressionGradientDescentBinaryClassification,
+    LogisticRegressionGradientDescentMultiClassification,
+}
+
+pub trait GpuModule {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError>;
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String;
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32>;
+}
diff --git a/src/gpu/models/logistic_regression/gradient_descent.rs b/src/gpu/models/logistic_regression/gradient_descent.rs
new file mode 100644
index 00000000..934b94b0
--- /dev/null
+++ b/src/gpu/models/logistic_regression/gradient_descent.rs
@@ -0,0 +1,268 @@
+
+use std::convert::TryFrom;
+use crate::linear::logistic_regression::{LogisticRegressionParameters, LogisticRegressionSolverName};
+use crate::gpu::{GpuModule, GpuAlgorithm, GpuLayout, GpuMatrix, GpuParams};
+use crate::error::GpuError;
+
+pub struct BinaryClassifier {
+    pub is_f32: bool,
+    pub learning_rate: f32,
+}
+
+pub struct MultiClassifier {
+    pub is_f32: bool,
+    pub learning_rate: f32,
+}
+
+impl GpuModule for BinaryClassifier {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError> {
+
+        if num_classes > 2 {
+            return Err(GpuError::Generic(
+                "Binary classification requested with more than two classes.".to_string(),
+            ));
+        }
+
+        Ok(GpuParams::new(GpuAlgorithm::LogisticRegressionGradientDescentBinaryClassification, GpuLayout::Supervised, matrix, num_classes as u32, 3, "main", self.learning_rate))
+    }
+
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32> {
+        // Field order must match the WGSL Params struct; the f32 learning rate is bit-cast into the u32 stream.
+        vec![params.step, params.num_features, params.num_samples, params.learning_rate.to_bits()]
+    }
+
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String {
+
+        let code = r#"
+@group(0) @binding(0)
+var<storage, read> input: array<f32>;
+@group(0) @binding(1)
+var<storage, read> targets: array<u32>;
+@group(0) @binding(2)
+var<storage, read_write> weights: array<f32>;
+@group(0) @binding(3)
+var<storage, read_write> output: array<f32>;
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+struct Params {
+    step: u32, // 0=predictions, 1=gradients, 2=update
+    num_features: u32,
+    num_samples: u32,
+    learning_rate: f32,
+}
+
+// Sigmoid activation function
+fn sigmoid(x: f32) -> f32 {
+    return 1.0 / (1.0 + exp(-x));
+}
+
+@compute @workgroup_size(~workgroup_size~)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+
+    // Get index
+    let idx = global_id.x;
+
+    switch params.step {
+        case 0u: {
+
+            // Predictions: one invocation per sample
+            if (idx >= params.num_samples) { return; }
+
+            var prediction = 0.0;
+            for (var j = 0u; j < params.num_features; j++) {
+                let x_val = input[idx * params.num_features + j];
+                let w_val = weights[j];
+                prediction += x_val * w_val;
+            }
+
+            // Apply sigmoid and store prediction
+            output[idx] = sigmoid(prediction);
+        }
+        case 1u: {
+
+            // Gradient calculation: one invocation per feature
+            if (idx >= params.num_features) { return; }
+
+            var gradient = 0.0;
+            for (var i = 0u; i < params.num_samples; i++) {
+                let x_val = input[i * params.num_features + idx];
+                let y_pred = output[i]; // predictions from step 0
+                let y_true = f32(targets[i]);
+                let error = y_pred - y_true;
+                gradient += x_val * error;
+            }
+
+            // Average the gradient
+            gradient = gradient / f32(params.num_samples);
+            output[idx] = gradient;
+        }
+        case 2u: {
+
+            // Update weights: one invocation per feature
+            if (idx >= params.num_features) { return; }
+
+            let gradient = output[idx]; // gradients from step 1
+            weights[idx] = weights[idx] - params.learning_rate * gradient;
+        }
+        default: {}
+    }
+}
+"#;
+
+        // Substitute the workgroup size into the shader template
+        let workgroup_size = format!("{}", matrix.get_workgroup_size());
+        code.replace("~workgroup_size~", &workgroup_size)
+    }
+}
+
+impl GpuModule for MultiClassifier {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError> {
+
+        if num_classes < 3 {
+            return Err(GpuError::Generic(
+                "Multi-class classification requested with fewer than three classes.".to_string(),
+            ));
+        }
+
+        Ok(GpuParams::new(GpuAlgorithm::LogisticRegressionGradientDescentMultiClassification, GpuLayout::Supervised, matrix, num_classes as u32, 3, "main", self.learning_rate))
+    }
+
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32> {
+        // Field order must match the WGSL Params struct below: learning_rate precedes num_classes.
+        vec![params.step, params.num_features, params.num_samples, params.learning_rate.to_bits(), params.num_classes]
+    }
+
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String {
+
+        let code = format!(r#"
+@group(0) @binding(0)
+var<storage, read> input: array<f32>;          // X matrix (flattened)
+@group(0) @binding(1)
+var<storage, read> targets: array<u32>;        // y vector (target values)
+@group(0) @binding(2)
+var<storage, read_write> weights: array<f32>;  // w matrix (num_classes * num_features)
+@group(0) @binding(3)
+var<storage, read_write> output: array<f32>;   // temporary storage for predictions/gradients
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+struct Params {{
+    step: u32,          // 0=predictions, 1=gradients, 2=update
+    num_features: u32,  // number of columns in X
+    num_samples: u32,   // number of rows in X
+    learning_rate: f32, // alpha for weight updates
+    num_classes: u32,   // number of classes
+}}
+
+// Softmax activation function for multi-class
+fn softmax(values: ptr<function, array<f32, {}>>, length: u32, idx: u32) -> f32 {{
+    // Find max value for numerical stability
+    var max_val: f32 = -3.402823e+38;
+    for (var i = 0u; i < length; i++) {{
+        if ((*values)[i] > max_val) {{
+            max_val = (*values)[i];
+        }}
+    }}
+
+    // Compute sum of exponentials
+    var sum: f32 = 0.0;
+    for (var i = 0u; i < length; i++) {{
+        sum += exp((*values)[i] - max_val);
+    }}
+
+    // Return softmax for the specified index
+    return exp((*values)[idx] - max_val) / sum;
+}}
+
+@compute @workgroup_size(~workgroup_size~)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+    let idx = global_id.x;
+
+    switch params.step {{
+        case 0u: {{
+            // Step 0: Calculate predictions
+            let sample_idx = idx;
+            if (sample_idx >= params.num_samples) {{ return; }}
+
+            // Local array for class scores
+            var class_scores: array<f32, {}>;
+
+            // Calculate score for each class
+            for (var c = 0u; c < params.num_classes; c++) {{
+                var score = 0.0;
+                for (var j = 0u; j < params.num_features; j++) {{
+                    let x_val = input[sample_idx * params.num_features + j];
+                    let w_val = weights[c * params.num_features + j]; // row-major: class c, feature j
+                    score += x_val * w_val;
+                }}
+                class_scores[c] = score;
+            }}
+
+            // Apply softmax and store predictions
+            for (var c = 0u; c < params.num_classes; c++) {{
+                let prob = softmax(&class_scores, params.num_classes, c);
+                output[sample_idx * params.num_classes + c] = prob;
+            }}
+        }}
+        case 1u: {{
+            // Step 1: Calculate gradients
+            let weight_idx = idx;
+            if (weight_idx >= params.num_classes * params.num_features) {{ return; }}
+
+            let class_idx = weight_idx / params.num_features;
+            let feature_idx = weight_idx % params.num_features;
+
+            var gradient = 0.0;
+            for (var i = 0u; i < params.num_samples; i++) {{
+                let x_val = input[i * params.num_features + feature_idx];
+                let y_pred = output[i * params.num_classes + class_idx]; // predictions from step 0
+
+                // One-hot encoding: target is 1.0 for the true class, 0.0 for the others
+                var y_true = 0.0;
+                if (targets[i] == class_idx) {{
+                    y_true = 1.0;
+                }}
+
+                let error = y_pred - y_true;
+                gradient += x_val * error;
+            }}
+
+            // Average the gradient
+            gradient = gradient / f32(params.num_samples);
+            output[weight_idx] = gradient;
+        }}
+        case 2u: {{
+            // Step 2: Update weights
+            let weight_idx = idx;
+            if (weight_idx >= params.num_classes * params.num_features) {{ return; }}
+
+            let gradient = output[weight_idx]; // gradients from step 1
+            weights[weight_idx] = weights[weight_idx] - params.learning_rate * gradient;
+        }}
+        default: {{}}
+    }}
+}}
+"#, params.num_classes, params.num_classes);
+
+        // Substitute the workgroup size into the shader template
+        let workgroup_size = format!("{}", matrix.get_workgroup_size());
+        code.replace("~workgroup_size~", &workgroup_size)
+    }
+}
+
+impl TryFrom<(&LogisticRegressionParameters<f32>, usize)> for Box<dyn GpuModule> {
+    type Error = GpuError;
+
+    fn try_from(value: (&LogisticRegressionParameters<f32>, usize)) -> Result<Self, Self::Error> {
+        let (params, num_classes) = value;
+        if params.solver == LogisticRegressionSolverName::GradientDescent {
+            let learning_rate = params.alpha;
+
+            if num_classes > 2 {
+                return Ok(Box::new(MultiClassifier { is_f32: true, learning_rate }));
+            } else {
+                return Ok(Box::new(BinaryClassifier { is_f32: true, learning_rate }));
+            }
+        }
+
+        Err(GpuError::WorkerConversion)
+    }
+}
diff --git a/src/gpu/models/logistic_regression/mod.rs b/src/gpu/models/logistic_regression/mod.rs
new file mode 100644
index 00000000..b0beb3f9
--- /dev/null
+++ b/src/gpu/models/logistic_regression/mod.rs
@@ -0,0 +1,4 @@
+
+mod gradient_descent;
diff --git a/src/gpu/models/mod.rs b/src/gpu/models/mod.rs
new file mode 100644
index 00000000..c91de7b5
--- /dev/null
+++ b/src/gpu/models/mod.rs
@@ -0,0 +1,4 @@
+
+pub mod logistic_regression;
diff --git a/src/gpu/params.rs b/src/gpu/params.rs
new file mode 100644
index 00000000..90a141b1
--- /dev/null
+++ b/src/gpu/params.rs
@@ -0,0 +1,32 @@
+
+use super::{GpuLayout, GpuAlgorithm, GpuMatrix};
+
+pub struct GpuParams {
+    pub algorithm: GpuAlgorithm,
+    pub layout: GpuLayout,
+    pub step: u32,
+    pub total_steps: usize,
+    pub entry_point: String,
+    pub num_features: u32,
+    pub num_samples: u32,
+    pub num_classes: u32,
+    pub learning_rate: f32,
+}
+
+impl GpuParams {
+    pub fn new(algorithm: GpuAlgorithm, layout: GpuLayout, matrix: &GpuMatrix, num_classes: u32, total_steps: usize, entry_point: &str, learning_rate: f32) -> Self {
+        Self {
+            algorithm,
+            layout,
+            total_steps,
+            num_classes,
+            step: 0,
+            num_features: matrix.cols as u32,
+            num_samples: matrix.rows as u32,
+            entry_point: entry_point.to_string(),
+            learning_rate,
+        }
+    }
+}
diff --git a/src/gpu/station.rs b/src/gpu/station.rs
new file mode 100644
index 00000000..d31d0541
--- /dev/null
+++ b/src/gpu/station.rs
@@ -0,0 +1,112 @@
+
+use std::collections::HashMap;
+use std::sync::Mutex;
+use crate::error::GpuError;
+use super::{GpuAdapter, GpuLayout, GpuResourceLayout, GpuMatrix, GpuModule, GpuAlgorithm, GpuParams};
+
+#[derive(Default)]
+pub struct GpuStation {
+    cargo: Mutex<GpuStationCargo>,
+}
+
+#[derive(Default)]
+struct GpuStationCargo {
+    adapter: Option<GpuAdapter>,
+    layouts: HashMap<GpuLayout, GpuResourceLayout>,
+    workgroups: HashMap<GpuWorkgroupParams, GpuWorkgroup>,
+}
+
+#[derive(Copy, Clone, Eq, PartialEq, Hash)]
+struct GpuWorkgroupParams {
+    algorithm: GpuAlgorithm,
+    workgroup_size: usize, // must be 64, 128, 256, 512, or 1024
+}
+
+#[derive(Clone)]
+pub struct GpuWorkgroup {
+    pub shader: wgpu::ShaderModule,
+    pub pipeline: wgpu::ComputePipeline,
+}
+
+impl GpuStation {
+    pub fn new() -> Self {
+        Self {
+            cargo: Mutex::new(GpuStationCargo::default()),
+        }
+    }
+
+    pub fn get_adapter(&self) -> Result<GpuAdapter, GpuError> {
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        if let Some(ref adapter) = cargo.adapter {
+            return Ok(adapter.clone());
+        }
+        let adapter = GpuAdapter::new()?;
+        cargo.adapter = Some(adapter.clone());
+        Ok(adapter)
+    }
+
+    pub fn get_layout(&self, layout: GpuLayout, adapter: &GpuAdapter) -> Result<GpuResourceLayout, GpuError> {
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        if let Some(group) = cargo.layouts.get(&layout) {
+            return Ok(group.clone());
+        }
+
+        let group = layout.create_resource_layout(adapter);
+        cargo.layouts.insert(layout, group.clone());
+        Ok(group)
+    }
+
+    pub fn get_workgroup<M>(&self, module: &M, matrix: &GpuMatrix, params: &GpuParams, adapter: &GpuAdapter, resources: &GpuResourceLayout) -> Result<GpuWorkgroup, GpuError>
+    where
+        M: GpuModule,
+    {
+
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        let wg_params = GpuWorkgroupParams::new(params.algorithm, matrix.get_workgroup_size())?;
+
+        if let Some(group) = cargo.workgroups.get(&wg_params) {
+            return Ok(group.clone());
+        }
+
+        let shader_source = module.get_wgsl_code(matrix, params);
+        let shader = adapter.device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(shader_source.into()),
+        });
+
+        let pipeline = adapter.device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: None,
+            layout: Some(&resources.pipeline_layout),
+            module: &shader,
+            entry_point: Some(&params.entry_point),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
+        let group = GpuWorkgroup { shader, pipeline };
+        cargo.workgroups.insert(wg_params, group.clone());
+
+        Ok(group)
+    }
+}
+
+impl GpuWorkgroupParams {
+    pub fn new(algorithm: GpuAlgorithm, workgroup_size: usize) -> Result<Self, GpuError> {
+        if ![64, 128, 256, 512, 1024].contains(&workgroup_size) {
+            return Err(GpuError::InvalidWorkgroupSize);
+        }
+
+        Ok(Self { algorithm, workgroup_size })
+    }
+}
diff --git a/src/gpu/worker.rs b/src/gpu/worker.rs
new file mode 100644
index 00000000..a7d91b4b
--- /dev/null
+++ b/src/gpu/worker.rs
@@ -0,0 +1,158 @@
+
+use crate::error::GpuError;
+use crate::numbers::basenum::Number;
+use crate::linalg::basic::arrays::ArrayView1;
+use super::{buffer, STATION, GpuAdapter, GpuMatrix, GpuModule, GpuParams, GpuResourceLayout, GpuWorkgroup, GpuBuffer};
+
+pub struct GpuWorker<M, T>
+where
+    M: GpuModule,
+    T: Number + Ord,
+{
+    module: M,
+    adapter: GpuAdapter,
+    params: GpuParams,
+    resources: GpuResourceLayout,
+    workgroup: GpuWorkgroup,
+    matrix: GpuMatrix,
+    targets: Vec<T>,
+    buffers: Vec<(GpuBuffer, wgpu::Buffer)>,
+}
+
+impl<M, T> GpuWorker<M, T>
+where
+    M: GpuModule,
+    T: Number + Ord,
+{
+    pub fn new(module: M, matrix: GpuMatrix, targets: Vec<T>) -> Result<Self, GpuError> {
+
+        // Get adapter
+        let adapter = STATION.get_adapter()?;
+
+        // Get params
+        let num_classes = targets.unique().len();
+        let params = module.get_params(&matrix, num_classes)?;
+
+        // Get resource layout
+        let resources = STATION.get_layout(params.layout, &adapter)?;
+
+        // Get workgroup
+        let workgroup = STATION.get_workgroup(&module, &matrix, &params, &adapter, &resources)?;
+
+        Ok(Self {
+            module,
+            adapter,
+            params,
+            resources,
+            workgroup,
+            matrix,
+            targets,
+            buffers: vec![],
+        })
+    }
+
+    pub fn run(&mut self) -> Result<(), GpuError> {
+
+        // Create buffers
+        self.create_buffers();
+
+        // Create bind group
+        let bind_group = self.create_bind_group();
+
+        // Get workgroup counts
+        let workgroup_count = if self.matrix.cols > 1024 {
+            self.matrix.rows.div_ceil(self.matrix.get_workgroup_size()) * self.matrix.cols.div_ceil(1024)
+        } else {
+            self.matrix.rows.div_ceil(self.matrix.get_workgroup_size())
+        };
+
+        // Run one submission per step so each dispatch sees its own step value.
+        // Queued buffer writes take effect at the next submit, so batching all
+        // passes into a single submission would run every pass with the final step.
+        for step in 0..self.params.total_steps {
+            self.set_step(step as u32)?;
+
+            let mut encoder = self.adapter.device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: None,
+                timestamp_writes: None,
+            });
+            pass.set_pipeline(&self.workgroup.pipeline);
+            pass.set_bind_group(0, &bind_group, &[]);
+            pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            drop(pass);
+
+            // Submit this step's work to the GPU
+            self.adapter.queue.submit(Some(encoder.finish()));
+        }
+
+        Ok(())
+    }
+
+    fn create_buffers(&mut self) {
+
+        // Get templates
+        let templates = self.params.layout.get_buffer_templates();
+        let mut input_size: u64 = 0;
+
+        // Create buffers; temp/download buffers are sized to match the samples buffer
+        for template in templates.iter() {
+            let buffer = match template {
+                GpuBuffer::Samples => buffer::create_samples(&self.adapter, &self.matrix),
+                GpuBuffer::Targets => buffer::create_targets(&self.adapter, &self.targets),
+                GpuBuffer::Weights => buffer::create_weights(&self.adapter, self.matrix.cols),
+                GpuBuffer::TempStorage => buffer::create_temp_storage(&self.adapter, input_size),
+                GpuBuffer::Params => buffer::create_params(&self.adapter, &self.module.get_params_buffer_data(&self.params)),
+                GpuBuffer::Download => buffer::create_download(&self.adapter, input_size),
+            };
+
+            if *template == GpuBuffer::Samples {
+                input_size = buffer.size();
+            }
+            self.buffers.push((*template, buffer));
+        }
+    }
+
+    fn create_bind_group(&self) -> wgpu::BindGroup {
+
+        let mut binding_num = 0;
+        let mut bind_group_entries: Vec<wgpu::BindGroupEntry> = Vec::new();
+
+        for index in 0..self.buffers.len() {
+
+            if !self.buffers[index].0.included_in_bind_group() {
+                continue;
+            }
+
+            bind_group_entries.push(wgpu::BindGroupEntry {
+                binding: binding_num,
+                resource: self.buffers[index].1.as_entire_binding(),
+            });
+            binding_num += 1;
+        }
+
+        // Create bind group
+        self.adapter.device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &self.resources.bind_group_layout,
+            entries: &bind_group_entries,
+        })
+    }
+
+    pub fn set_step(&mut self, step: u32) -> Result<(), GpuError> {
+        self.params.step = step;
+        self.update_params_buffer()
+    }
+
+    pub fn set_learning_rate(&mut self, learning_rate: f32) -> Result<(), GpuError> {
+        self.params.learning_rate = learning_rate;
+        self.update_params_buffer()
+    }
+
+    fn update_params_buffer(&self) -> Result<(), GpuError> {
+        let data = self.module.get_params_buffer_data(&self.params);
+        match self.buffers.iter().position(|buf| buf.0 == GpuBuffer::Params) {
+            Some(index) => {
+                self.adapter.queue.write_buffer(&self.buffers[index].1, 0, bytemuck::cast_slice(&data));
+                Ok(())
+            }
+            None => Err(GpuError::ParamsBufferNotFound),
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index c6f9349c..cf70d228 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![allow(warnings)]
 #![allow(
     clippy::type_complexity,
     clippy::too_many_arguments,
@@ -107,6 +108,9 @@ pub mod decomposition;
 /// Ensemble methods, including Random Forest classifier and regressor
 pub mod ensemble;
 pub mod error;
+/// GPU acceleration
+#[cfg(feature = "gpu")]
+pub mod gpu;
 /// Diverse collection of linear algebra abstractions and methods that power smartcore algorithms
 pub mod linalg;
 /// Supervised classification and regression models that assume linear relationship between dependent and explanatory variables.
diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs
index c28dc347..58645545 100644
--- a/src/linear/logistic_regression.rs
+++ b/src/linear/logistic_regression.rs
@@ -71,12 +71,15 @@ use crate::optimization::line_search::Backtracking;
 use crate::optimization::FunctionOrder;
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "gpu", derive(Hash, Copy))]
 #[derive(Debug, Clone, Eq, PartialEq, Default)]
-/// Solver options for Logistic regression. Right now only LBFGS solver is supported.
+/// Solver options for Logistic regression. LBFGS and gradient descent solvers are supported.
 pub enum LogisticRegressionSolverName {
     /// Limited-memory Broyden–Fletcher–Goldfarb–Shanno method, see [LBFGS paper](http://users.iems.northwestern.edu/~nocedal/lbfgsb.html)
     #[default]
     LBFGS,
+    /// Gradient descent, with GPU acceleration support
+    GradientDescent,
 }
 
 /// Logistic Regression parameters
@@ -549,7 +552,6 @@ impl<TX: Number + FloatNumber + RealNumber, TY: Number + Ord, X: Array2<TX>, Y:
         objective: impl ObjectiveFunction<TX, Vec<TX>>,
     ) -> OptimizerResult<TX, Vec<TX>> {
         let f = |w: &Vec<TX>| -> TX { objective.f(w) };
-        let df = |g: &mut Vec<TX>, w: &Vec<TX>| objective.df(g, w);
 
         let ls: Backtracking<TX> = Backtracking {
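
Below the patch, a minimal usage sketch of how the new solver is meant to be selected from the public API. This is not part of the patch, and it is hedged: the diff adds the GradientDescent variant and the GPU worker plumbing, but does not yet route LogisticRegression::fit through the GPU path; the toy x/y data and the Result-returning from_2d_array call assume smartcore's 0.4-style API.

    // Cargo.toml: smartcore = { version = "*", features = ["gpu"] }
    use smartcore::linalg::basic::matrix::DenseMatrix;
    use smartcore::linear::logistic_regression::{
        LogisticRegression, LogisticRegressionParameters, LogisticRegressionSolverName,
    };

    fn main() {
        // Toy binary-classification data: four samples, two features.
        let x = DenseMatrix::from_2d_array(&[
            &[1.0, 2.0],
            &[3.0, 4.0],
            &[5.0, 6.0],
            &[7.0, 8.0],
        ])
        .unwrap();
        let y: Vec<u32> = vec![0, 0, 1, 1];

        // Select the new solver; `alpha` doubles as the gradient-descent
        // learning rate that the GPU module reuses.
        let params = LogisticRegressionParameters::default()
            .with_solver(LogisticRegressionSolverName::GradientDescent)
            .with_alpha(0.01);

        let model = LogisticRegression::fit(&x, &y, params).unwrap();
        let predictions = model.predict(&x).unwrap();
        println!("{:?}", predictions);
    }

Note the design the patch assumes: the three shader steps (predictions, gradients, weight update) share one pipeline and are distinguished only by the `step` field of the uniform params buffer, which is why GpuWorker::run must submit each dispatch separately.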