diff --git a/Cargo.toml b/Cargo.toml
index 6ce1952d..b806f3c7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -28,18 +28,23 @@ num = "0.4"
 rand = { version = "0.8.5", default-features = false, features = ["small_rng"] }
 rand_distr = { version = "0.4", optional = true }
 serde = { version = "1", features = ["derive"], optional = true }
+wgpu = { version = "25.0.2", optional = true }
+pollster = { version = "0.4.0", optional = true }
+bytemuck = { version = "1.23.0", optional = true }
+lazy_static = { version = "1.5.0", optional = true }
 
 [target.'cfg(not(target_arch = "wasm32"))'.dependencies]
 typetag = { version = "0.2", optional = true }
 
 [features]
-default = []
+default = ["gpu"]
 serde = ["dep:serde", "dep:typetag"]
 ndarray-bindings = ["dep:ndarray"]
 datasets = ["dep:rand_distr", "std_rand", "serde"]
 std_rand = ["rand/std_rng", "rand/std"]
 # used by wasm32-unknown-unknown for in-browser usage
 js = ["getrandom/js"]
+gpu = ["wgpu", "pollster", "bytemuck", "lazy_static"]
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]
 getrandom = { version = "0.2.8", optional = true }
diff --git a/src/error/gpu_error.rs b/src/error/gpu_error.rs
new file mode 100644
index 00000000..0c497b67
--- /dev/null
+++ b/src/error/gpu_error.rs
@@ -0,0 +1,37 @@
+
+use std::fmt;
+
+#[derive(Debug)]
+pub enum GpuError {
+    NoAdapter(String),
+    NoShader,
+    NoDevice(String),
+    InvalidWorkgroupSize,
+    MutexLock(String),
+    WorkerConversion,
+    ParamsBufferNotFound,
+    Generic(String),
+}
+
+impl std::error::Error for GpuError {}
+
+impl fmt::Display for GpuError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Self::NoAdapter(err) => write!(f, "Unable to create GPU adapter, error: {}", err),
+            Self::NoShader => write!(f, "GPU adapter does not support compute shaders."),
+            Self::NoDevice(err) => write!(f, "Unable to create device on GPU, error: {}", err),
+            Self::InvalidWorkgroupSize => write!(f, "Workgroup size must be 64, 128, 256, 512 or 1024"),
+            Self::MutexLock(msg) => write!(f, "Unable to lock mutex: {}", msg),
+            Self::WorkerConversion => write!(f, "Unable to convert into GpuWorker"),
+            Self::ParamsBufferNotFound => write!(f, "Unable to update params buffer: this worker has no params buffer"),
+            Self::Generic(msg) => write!(f, "{}", msg),
+        }
+    }
+}
+
+impl From<std::io::Error> for GpuError {
+    fn from(err: std::io::Error) -> Self {
+        GpuError::Generic(err.to_string())
+    }
+}
diff --git a/src/error/mod.rs b/src/error/mod.rs
index b6b1d982..5bea25de 100644
--- a/src/error/mod.rs
+++ b/src/error/mod.rs
@@ -2,6 +2,12 @@
 use std::error::Error;
 use std::fmt;
 
+#[cfg(feature = "gpu")]
+pub use self::gpu_error::GpuError;
+
+#[cfg(feature = "gpu")]
+mod gpu_error;
+
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
diff --git a/src/gpu/adapter.rs b/src/gpu/adapter.rs
new file mode 100644
index 00000000..b69ae4a2
--- /dev/null
+++ b/src/gpu/adapter.rs
@@ -0,0 +1,46 @@
+
+use crate::error::GpuError;
+
+#[derive(Clone)]
+pub struct GpuAdapter {
+    adapter: wgpu::Adapter,
+    pub device: wgpu::Device,
+    pub queue: wgpu::Queue,
+    pub max_workgroup_size: u32,
+}
+
+impl GpuAdapter {
+    pub fn new() -> Result<Self, GpuError> {
+
+        let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
+        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions::default()))
+            .map_err(|e| GpuError::NoAdapter(e.to_string()))?;
+        println!("Running on Adapter: {:#?}", adapter.get_info());
+
+        // Ensure the adapter supports compute shaders
+        let downlevel_capabilities = adapter.get_downlevel_capabilities();
+        if !downlevel_capabilities.flags.contains(wgpu::DownlevelFlags::COMPUTE_SHADERS) {
+            return Err(GpuError::NoShader);
+        }
+
+        // Create device and queue
+        let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
+            label: None,
+            required_features: wgpu::Features::empty(),
+            required_limits: wgpu::Limits::downlevel_defaults(),
+            memory_hints: wgpu::MemoryHints::MemoryUsage,
+            trace: wgpu::Trace::Off,
+        }))
+        .map_err(|e| GpuError::NoDevice(e.to_string()))?;
+
+        // The device limit caps the workgroup sizes we may request
+        let limits = device.limits();
+        let max_workgroup_size: u32 = limits.max_compute_invocations_per_workgroup;
+
+        Ok(Self { adapter, device, queue, max_workgroup_size })
+    }
+}
diff --git a/src/gpu/algorithm.rs b/src/gpu/algorithm.rs
new file mode 100644
index 00000000..50e62fa8
--- /dev/null
+++ b/src/gpu/algorithm.rs
@@ -0,0 +1,5 @@
+
+// Not yet wired into the module tree; src/gpu/mod.rs currently defines GpuAlgorithm itself.
+pub enum GpuAlgorithm {
+    LogisticRegressionGradientDescentBinaryClassification,
+    LogisticRegressionGradientDescentMultiClassification,
+}
diff --git a/src/gpu/buffer.rs b/src/gpu/buffer.rs
new file mode 100644
index 00000000..f0c57e14
--- /dev/null
+++ b/src/gpu/buffer.rs
@@ -0,0 +1,93 @@
+
+use wgpu::util::DeviceExt;
+use crate::numbers::basenum::Number;
+use super::{GpuAdapter, GpuMatrix};
+
+#[derive(Clone, Copy, Eq, PartialEq, Hash)]
+pub enum GpuBuffer {
+    Samples,
+    Targets,
+    Weights,
+    TempStorage,
+    Params,
+    Download,
+}
+
+impl GpuBuffer {
+    pub fn included_in_bind_group(&self) -> bool {
+        match self {
+            GpuBuffer::Download => false,
+            _ => true,
+        }
+    }
+
+    pub fn is_read_only(&self) -> bool {
+        match self {
+            GpuBuffer::Weights => false,
+            GpuBuffer::TempStorage => false,
+            _ => true,
+        }
+    }
+}
+
+pub fn create_samples(adapter: &GpuAdapter, matrix: &GpuMatrix) -> wgpu::Buffer {
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Samples"),
+        contents: bytemuck::cast_slice(&matrix.data),
+        usage: wgpu::BufferUsages::STORAGE,
+    })
+}
+
+pub fn create_targets<T>(adapter: &GpuAdapter, targets: &Vec<T>) -> wgpu::Buffer
+where
+    T: Number + Ord,
+{
+    let u_targets = targets.iter().filter_map(|&val| val.to_u32()).collect::<Vec<u32>>();
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Targets"),
+        contents: bytemuck::cast_slice(&u_targets),
+        usage: wgpu::BufferUsages::STORAGE,
+    })
+}
+
+pub fn create_weights(adapter: &GpuAdapter, num_features: usize) -> wgpu::Buffer {
+    let zeros = vec![0.0f32; num_features];
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Weights"),
+        contents: bytemuck::cast_slice(&zeros),
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+    })
+}
+
+pub fn create_temp_storage(adapter: &GpuAdapter, buffer_size: u64) -> wgpu::Buffer {
+    adapter.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("Temp Storage"),
+        size: buffer_size,
+        usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
+        mapped_at_creation: false,
+    })
+}
+
+pub fn create_params(adapter: &GpuAdapter, data: &Vec<u32>) -> wgpu::Buffer {
+    adapter.device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("Params"),
+        contents: bytemuck::cast_slice(data.as_slice()),
+        usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
+    })
+}
+
+pub fn create_download(adapter: &GpuAdapter, buffer_size: u64) -> wgpu::Buffer {
+    adapter.device.create_buffer(&wgpu::BufferDescriptor {
+        label: Some("Download"),
+        size: buffer_size,
+        usage: wgpu::BufferUsages::COPY_DST | wgpu::BufferUsages::MAP_READ,
+        mapped_at_creation: false,
+    })
+}
diff --git a/src/gpu/layout.rs b/src/gpu/layout.rs
new file mode 100644
index 00000000..41dee2ee
--- /dev/null
+++ b/src/gpu/layout.rs
@@ -0,0 +1,83 @@
+
+use super::{GpuAdapter, GpuBuffer};
+
+#[derive(Default, Copy, Clone, Eq, PartialEq, Hash)]
+pub enum GpuLayout {
+    #[default]
+    Supervised, // Samples + Targets + Weights + TempStorage + Params
+    //Clustering, // Samples + Centroids + Assignments + Params
+    //Decomposition, // Samples + Vectors + Values + TempStorage + Params
+}
+
+#[derive(Clone)]
+pub struct GpuResourceLayout {
+    pub bind_group_layout: wgpu::BindGroupLayout,
+    pub pipeline_layout: wgpu::PipelineLayout,
+}
+
+impl GpuLayout {
+    pub fn create_resource_layout(&self, adapter: &GpuAdapter) -> GpuResourceLayout {
+
+        // Bind group layout
+        let bind_group_layout = self.create_bind_group_layout(adapter);
+
+        // Pipeline layout
+        let pipeline_layout = adapter.device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&bind_group_layout],
+            push_constant_ranges: &[],
+        });
+
+        GpuResourceLayout {
+            bind_group_layout,
+            pipeline_layout,
+        }
+    }
+
+    pub fn get_buffer_templates(&self) -> Vec<GpuBuffer> {
+        match self {
+            GpuLayout::Supervised => vec![GpuBuffer::Samples, GpuBuffer::Targets, GpuBuffer::Weights, GpuBuffer::TempStorage, GpuBuffer::Params],
+        }
+    }
+
+    pub fn get_buffer_index(&self, buffer: GpuBuffer) -> Option<usize> {
+        let templates = self.get_buffer_templates();
+        templates.iter().position(|&buf| buf == buffer)
+    }
+
+    fn create_bind_group_layout(&self, adapter: &GpuAdapter) -> wgpu::BindGroupLayout {
+
+        let templates = self.get_buffer_templates();
+        let mut layout_entries: Vec<wgpu::BindGroupLayoutEntry> = Vec::new();
+
+        let mut binding_num = 0;
+        for template in templates.iter() {
+
+            if !template.included_in_bind_group() {
+                continue;
+            }
+
+            // The params buffer is created with UNIFORM usage and declared as a
+            // uniform in WGSL; everything else is a storage buffer.
+            let ty = if *template == GpuBuffer::Params {
+                wgpu::BufferBindingType::Uniform
+            } else {
+                wgpu::BufferBindingType::Storage { read_only: template.is_read_only() }
+            };
+
+            layout_entries.push(wgpu::BindGroupLayoutEntry {
+                binding: binding_num,
+                visibility: wgpu::ShaderStages::COMPUTE,
+                ty: wgpu::BindingType::Buffer {
+                    ty,
+                    has_dynamic_offset: false,
+                    min_binding_size: None,
+                },
+                count: None,
+            });
+
+            binding_num += 1;
+        }
+
+        adapter.device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+            label: None,
+            entries: &layout_entries,
+        })
+    }
+}
diff --git a/src/gpu/matrix.rs b/src/gpu/matrix.rs
new file mode 100644
index 00000000..e30b1c70
--- /dev/null
+++ b/src/gpu/matrix.rs
@@ -0,0 +1,52 @@
+
+use super::STATION;
+
+#[derive(Default)]
+pub struct GpuMatrix {
+    pub rows: usize,
+    pub cols: usize,
+    pub data: Vec<f32>,
+}
+
+impl GpuMatrix {
+    pub fn get_workgroup_size(&self) -> usize {
+        let mut size: usize = match self.cols {
+            c if c <= 64 => 64,
+            c if c <= 128 => 128,
+            c if c <= 256 => 256,
+            c if c <= 512 => 512,
+            _ => 1024,
+        };
+
+        // Clamp to what the device actually supports
+        if let Ok(adapter) = STATION.get_adapter() {
+            if size > adapter.max_workgroup_size as usize {
+                size = adapter.max_workgroup_size as usize;
+            }
+        }
+
+        size
+    }
+}
+
+impl From<Vec<Vec<f32>>> for GpuMatrix {
+    fn from(data: Vec<Vec<f32>>) -> Self {
+        Self {
+            rows: data.len(),
+            cols: data[0].len(),
+            data: data.into_iter().flatten().collect(),
+        }
+    }
+}
+
+impl From<&Vec<Vec<f32>>> for GpuMatrix {
+    fn from(data: &Vec<Vec<f32>>) -> Self {
+        Self {
+            rows: data.len(),
+            cols: data[0].len(),
+            data: data.iter().flatten().copied().collect(),
+        }
+    }
+}
diff --git a/src/gpu/mod.rs b/src/gpu/mod.rs
new file mode 100644
index 00000000..fb501884
--- /dev/null
+++ b/src/gpu/mod.rs
@@ -0,0 +1,38 @@
+
+use crate::error::GpuError;
+
+use lazy_static::lazy_static;
+
+lazy_static! {
+    pub static ref STATION: GpuStation = GpuStation::new();
+}
+
+pub use self::adapter::GpuAdapter;
+pub use self::buffer::GpuBuffer;
+pub use self::layout::{GpuLayout, GpuResourceLayout};
+pub use self::matrix::GpuMatrix;
+pub use self::params::GpuParams;
+pub use self::station::{GpuStation, GpuWorkgroup};
+pub use self::worker::GpuWorker;
+
+mod adapter;
+pub mod buffer;
+mod layout;
+mod matrix;
+pub mod models;
+mod params;
+mod station;
+mod worker;
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+pub enum GpuAlgorithm {
+    LogisticRegressionGradientDescentBinaryClassification,
+    LogisticRegressionGradientDescentMultiClassification,
+}
+
+pub trait GpuModule {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError>;
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String;
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32>;
+}
diff --git a/src/gpu/models/logistic_regression/gradient_descent.rs b/src/gpu/models/logistic_regression/gradient_descent.rs
new file mode 100644
index 00000000..934b94b0
--- /dev/null
+++ b/src/gpu/models/logistic_regression/gradient_descent.rs
@@ -0,0 +1,268 @@
+
+use std::convert::TryFrom;
+use crate::linear::logistic_regression::{LogisticRegressionParameters, LogisticRegressionSolverName};
+use crate::gpu::{GpuModule, GpuAlgorithm, GpuLayout, GpuMatrix, GpuParams};
+use crate::error::GpuError;
+
+pub struct BinaryClassifier {
+    pub is_f32: bool,
+    pub learning_rate: f32,
+}
+
+pub struct MultiClassifier {
+    pub is_f32: bool,
+    pub learning_rate: f32,
+}
+
+impl GpuModule for BinaryClassifier {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError> {
+
+        if num_classes > 2 {
+            return Err(GpuError::Generic(
+                "Binary classification requested with more than two classes.".to_string(),
+            ));
+        }
+
+        Ok(GpuParams::new(GpuAlgorithm::LogisticRegressionGradientDescentBinaryClassification, GpuLayout::Supervised, matrix, num_classes as u32, 3, "main", self.learning_rate))
+    }
+
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32> {
+        // Field order must match the WGSL Params struct; the f32 learning rate is bit-cast into the u32 stream.
+        vec![params.step, params.num_features, params.num_samples, params.learning_rate.to_bits()]
+    }
+
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String {
+
+        let code = r#"
+@group(0) @binding(0)
+var<storage, read> input: array<f32>;
+@group(0) @binding(1)
+var<storage, read> targets: array<u32>;
+@group(0) @binding(2)
+var<storage, read_write> weights: array<f32>;
+@group(0) @binding(3)
+var<storage, read_write> output: array<f32>;
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+struct Params {
+    step: u32, // 0=predictions, 1=gradients, 2=update
+    num_features: u32,
+    num_samples: u32,
+    learning_rate: f32,
+}
+
+// Sigmoid activation function
+fn sigmoid(x: f32) -> f32 {
+    return 1.0 / (1.0 + exp(-x));
+}
+
+@compute @workgroup_size(~workgroup_size~)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+
+    // Get index
+    let idx = global_id.x;
+
+    switch params.step {
+        case 0u: {
+
+            // Predictions: one invocation per sample
+            if (idx >= params.num_samples) { return; }
+
+            var prediction = 0.0;
+            for (var j = 0u; j < params.num_features; j++) {
+                let x_val = input[idx * params.num_features + j];
+                let w_val = weights[j];
+                prediction += x_val * w_val;
+            }
+
+            // Apply sigmoid and store prediction
+            output[idx] = sigmoid(prediction);
+        }
+        case 1u: {
+
+            // Gradient calculation: one invocation per feature
+            if (idx >= params.num_features) { return; }
+
+            var gradient = 0.0;
+            for (var i = 0u; i < params.num_samples; i++) {
+                let x_val = input[i * params.num_features + idx];
+                let y_pred = output[i]; // predictions from step 0
+                let y_true = f32(targets[i]);
+                let error = y_pred - y_true;
+                gradient += x_val * error;
+            }
+
+            // Average the gradient
+            gradient = gradient / f32(params.num_samples);
+            output[idx] = gradient;
+        }
+        case 2u: {
+
+            // Update weights: one invocation per feature
+            if (idx >= params.num_features) { return; }
+
+            let gradient = output[idx]; // gradients from step 1
+            weights[idx] = weights[idx] - params.learning_rate * gradient;
+        }
+        default: {}
+    }
+}
+"#;
+
+        // Substitute the workgroup size into the shader template
+        let workgroup_size = format!("{}", matrix.get_workgroup_size());
+        code.replace("~workgroup_size~", &workgroup_size)
+    }
+}
+
+impl GpuModule for MultiClassifier {
+    fn get_params(&self, matrix: &GpuMatrix, num_classes: usize) -> Result<GpuParams, GpuError> {
+
+        if num_classes < 3 {
+            return Err(GpuError::Generic(
+                "Multi-class classification requested with fewer than three classes.".to_string(),
+            ));
+        }
+
+        Ok(GpuParams::new(GpuAlgorithm::LogisticRegressionGradientDescentMultiClassification, GpuLayout::Supervised, matrix, num_classes as u32, 3, "main", self.learning_rate))
+    }
+
+    fn get_params_buffer_data(&self, params: &GpuParams) -> Vec<u32> {
+        // Field order must match the WGSL Params struct below: learning_rate precedes num_classes.
+        vec![params.step, params.num_features, params.num_samples, params.learning_rate.to_bits(), params.num_classes]
+    }
+
+    fn get_wgsl_code(&self, matrix: &GpuMatrix, params: &GpuParams) -> String {
+
+        let code = format!(r#"
+@group(0) @binding(0)
+var<storage, read> input: array<f32>;          // X matrix (flattened)
+@group(0) @binding(1)
+var<storage, read> targets: array<u32>;        // y vector (target values)
+@group(0) @binding(2)
+var<storage, read_write> weights: array<f32>;  // w matrix (num_classes * num_features)
+@group(0) @binding(3)
+var<storage, read_write> output: array<f32>;   // temporary storage for predictions/gradients
+@group(0) @binding(4)
+var<uniform> params: Params;
+
+struct Params {{
+    step: u32,          // 0=predictions, 1=gradients, 2=update
+    num_features: u32,  // number of columns in X
+    num_samples: u32,   // number of rows in X
+    learning_rate: f32, // alpha for weight updates
+    num_classes: u32,   // number of classes
+}}
+
+// Softmax activation function for multi-class
+fn softmax(values: ptr<function, array<f32, {}>>, length: u32, idx: u32) -> f32 {{
+    // Find max value for numerical stability
+    var max_val: f32 = -3.402823e+38;
+    for (var i = 0u; i < length; i++) {{
+        if ((*values)[i] > max_val) {{
+            max_val = (*values)[i];
+        }}
+    }}
+
+    // Compute sum of exponentials
+    var sum: f32 = 0.0;
+    for (var i = 0u; i < length; i++) {{
+        sum += exp((*values)[i] - max_val);
+    }}
+
+    // Return softmax for the specified index
+    return exp((*values)[idx] - max_val) / sum;
+}}
+
+@compute @workgroup_size(~workgroup_size~)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {{
+    let idx = global_id.x;
+
+    switch params.step {{
+        case 0u: {{
+            // Step 0: Calculate predictions
+            let sample_idx = idx;
+            if (sample_idx >= params.num_samples) {{ return; }}
+
+            // Local array for class scores
+            var class_scores: array<f32, {}>;
+
+            // Calculate score for each class
+            for (var c = 0u; c < params.num_classes; c++) {{
+                var score = 0.0;
+                for (var j = 0u; j < params.num_features; j++) {{
+                    let x_val = input[sample_idx * params.num_features + j];
+                    let w_val = weights[c * params.num_features + j]; // row-major: class c, feature j
+                    score += x_val * w_val;
+                }}
+                class_scores[c] = score;
+            }}
+
+            // Apply softmax and store predictions
+            for (var c = 0u; c < params.num_classes; c++) {{
+                let prob = softmax(&class_scores, params.num_classes, c);
+                output[sample_idx * params.num_classes + c] = prob;
+            }}
+        }}
+        case 1u: {{
+            // Step 1: Calculate gradients
+            let weight_idx = idx;
+            if (weight_idx >= params.num_classes * params.num_features) {{ return; }}
+
+            let class_idx = weight_idx / params.num_features;
+            let feature_idx = weight_idx % params.num_features;
+
+            var gradient = 0.0;
+            for (var i = 0u; i < params.num_samples; i++) {{
+                let x_val = input[i * params.num_features + feature_idx];
+                let y_pred = output[i * params.num_classes + class_idx]; // predictions from step 0
+
+                // One-hot encoding: target is 1.0 for the true class, 0.0 for the others
+                var y_true = 0.0;
+                if (targets[i] == class_idx) {{
+                    y_true = 1.0;
+                }}
+
+                let error = y_pred - y_true;
+                gradient += x_val * error;
+            }}
+
+            // Average the gradient
+            gradient = gradient / f32(params.num_samples);
+            output[weight_idx] = gradient;
+        }}
+        case 2u: {{
+            // Step 2: Update weights
+            let weight_idx = idx;
+            if (weight_idx >= params.num_classes * params.num_features) {{ return; }}
+
+            let gradient = output[weight_idx]; // gradients from step 1
+            weights[weight_idx] = weights[weight_idx] - params.learning_rate * gradient;
+        }}
+        default: {{}}
+    }}
+}}
+"#, params.num_classes, params.num_classes);
+
+        // Substitute the workgroup size into the shader template
+        let workgroup_size = format!("{}", matrix.get_workgroup_size());
+        code.replace("~workgroup_size~", &workgroup_size)
+    }
+}
+
+impl TryFrom<(&LogisticRegressionParameters<f32>, usize)> for Box<dyn GpuModule> {
+    type Error = GpuError;
+
+    fn try_from(value: (&LogisticRegressionParameters<f32>, usize)) -> Result<Self, Self::Error> {
+        let (params, num_classes) = value;
+        if params.solver == LogisticRegressionSolverName::GradientDescent {
+            let learning_rate = params.alpha;
+
+            if num_classes > 2 {
+                return Ok(Box::new(MultiClassifier { is_f32: true, learning_rate }));
+            } else {
+                return Ok(Box::new(BinaryClassifier { is_f32: true, learning_rate }));
+            }
+        }
+
+        Err(GpuError::WorkerConversion)
+    }
+}
diff --git a/src/gpu/models/logistic_regression/mod.rs b/src/gpu/models/logistic_regression/mod.rs
new file mode 100644
index 00000000..b0beb3f9
--- /dev/null
+++ b/src/gpu/models/logistic_regression/mod.rs
@@ -0,0 +1,4 @@
+
+mod gradient_descent;
diff --git a/src/gpu/models/mod.rs b/src/gpu/models/mod.rs
new file mode 100644
index 00000000..c91de7b5
--- /dev/null
+++ b/src/gpu/models/mod.rs
@@ -0,0 +1,4 @@
+
+pub mod logistic_regression;
diff --git a/src/gpu/params.rs b/src/gpu/params.rs
new file mode 100644
index 00000000..90a141b1
--- /dev/null
+++ b/src/gpu/params.rs
@@ -0,0 +1,32 @@
+
+use super::{GpuLayout, GpuAlgorithm, GpuMatrix};
+
+pub struct GpuParams {
+    pub algorithm: GpuAlgorithm,
+    pub layout: GpuLayout,
+    pub step: u32,
+    pub total_steps: usize,
+    pub entry_point: String,
+    pub num_features: u32,
+    pub num_samples: u32,
+    pub num_classes: u32,
+    pub learning_rate: f32,
+}
+
+impl GpuParams {
+    pub fn new(algorithm: GpuAlgorithm, layout: GpuLayout, matrix: &GpuMatrix, num_classes: u32, total_steps: usize, entry_point: &str, learning_rate: f32) -> Self {
+        Self {
+            algorithm,
+            layout,
+            total_steps,
+            num_classes,
+            step: 0,
+            num_features: matrix.cols as u32,
+            num_samples: matrix.rows as u32,
+            entry_point: entry_point.to_string(),
+            learning_rate,
+        }
+    }
+}
diff --git a/src/gpu/station.rs b/src/gpu/station.rs
new file mode 100644
index 00000000..d31d0541
--- /dev/null
+++ b/src/gpu/station.rs
@@ -0,0 +1,112 @@
+
+use std::collections::HashMap;
+use std::sync::Mutex;
+use crate::error::GpuError;
+use super::{GpuAdapter, GpuLayout, GpuResourceLayout, GpuMatrix, GpuModule, GpuAlgorithm, GpuParams};
+
+#[derive(Default)]
+pub struct GpuStation {
+    cargo: Mutex<GpuStationCargo>,
+}
+
+#[derive(Default)]
+struct GpuStationCargo {
+    adapter: Option<GpuAdapter>,
+    layouts: HashMap<GpuLayout, GpuResourceLayout>,
+    workgroups: HashMap<GpuWorkgroupParams, GpuWorkgroup>,
+}
+
+#[derive(Copy, Clone, Eq, PartialEq, Hash)]
+struct GpuWorkgroupParams {
+    algorithm: GpuAlgorithm,
+    workgroup_size: usize, // must be 64, 128, 256, 512, or 1024
+}
+
+#[derive(Clone)]
+pub struct GpuWorkgroup {
+    pub shader: wgpu::ShaderModule,
+    pub pipeline: wgpu::ComputePipeline,
+}
+
+impl GpuStation {
+    pub fn new() -> Self {
+        Self {
+            cargo: Mutex::new(GpuStationCargo::default()),
+        }
+    }
+
+    pub fn get_adapter(&self) -> Result<GpuAdapter, GpuError> {
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        if let Some(ref adapter) = cargo.adapter {
+            return Ok(adapter.clone());
+        }
+        let adapter = GpuAdapter::new()?;
+        cargo.adapter = Some(adapter.clone());
+        Ok(adapter)
+    }
+
+    pub fn get_layout(&self, layout: GpuLayout, adapter: &GpuAdapter) -> Result<GpuResourceLayout, GpuError> {
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        if let Some(group) = cargo.layouts.get(&layout) {
+            return Ok(group.clone());
+        }
+
+        let group = layout.create_resource_layout(adapter);
+        cargo.layouts.insert(layout, group.clone());
+        Ok(group)
+    }
+
+    pub fn get_workgroup<M>(&self, module: &M, matrix: &GpuMatrix, params: &GpuParams, adapter: &GpuAdapter, resources: &GpuResourceLayout) -> Result<GpuWorkgroup, GpuError>
+    where
+        M: GpuModule,
+    {
+
+        let mut cargo = self.cargo.lock()
+            .map_err(|e| GpuError::MutexLock(e.to_string()))?;
+
+        let wg_params = GpuWorkgroupParams::new(params.algorithm, matrix.get_workgroup_size())?;
+
+        if let Some(group) = cargo.workgroups.get(&wg_params) {
+            return Ok(group.clone());
+        }
+
+        let shader_source = module.get_wgsl_code(matrix, params);
+        let shader = adapter.device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(shader_source.into()),
+        });
+
+        let pipeline = adapter.device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
+            label: None,
+            layout: Some(&resources.pipeline_layout),
+            module: &shader,
+            entry_point: Some(&params.entry_point),
+            compilation_options: wgpu::PipelineCompilationOptions::default(),
+            cache: None,
+        });
+
+        let group = GpuWorkgroup { shader, pipeline };
+        cargo.workgroups.insert(wg_params, group.clone());
+
+        Ok(group)
+    }
+}
+
+impl GpuWorkgroupParams {
+    pub fn new(algorithm: GpuAlgorithm, workgroup_size: usize) -> Result<Self, GpuError> {
+        if ![64, 128, 256, 512, 1024].contains(&workgroup_size) {
+            return Err(GpuError::InvalidWorkgroupSize);
+        }
+
+        Ok(Self { algorithm, workgroup_size })
+    }
+}
diff --git a/src/gpu/worker.rs b/src/gpu/worker.rs
new file mode 100644
index 00000000..a7d91b4b
--- /dev/null
+++ b/src/gpu/worker.rs
@@ -0,0 +1,158 @@
+
+use crate::error::GpuError;
+use crate::numbers::basenum::Number;
+use crate::linalg::basic::arrays::ArrayView1;
+use super::{buffer, STATION, GpuAdapter, GpuMatrix, GpuModule, GpuParams, GpuResourceLayout, GpuWorkgroup, GpuBuffer};
+
+pub struct GpuWorker<M, T>
+where
+    M: GpuModule,
+    T: Number + Ord,
+{
+    module: M,
+    adapter: GpuAdapter,
+    params: GpuParams,
+    resources: GpuResourceLayout,
+    workgroup: GpuWorkgroup,
+    matrix: GpuMatrix,
+    targets: Vec<T>,
+    buffers: Vec<(GpuBuffer, wgpu::Buffer)>,
+}
+
+impl<M, T> GpuWorker<M, T>
+where
+    M: GpuModule,
+    T: Number + Ord,
+{
+    pub fn new(module: M, matrix: GpuMatrix, targets: Vec<T>) -> Result<Self, GpuError> {
+
+        // Get adapter
+        let adapter = STATION.get_adapter()?;
+
+        // Get params
+        let num_classes = targets.unique().len();
+        let params = module.get_params(&matrix, num_classes)?;
+
+        // Get resource layout
+        let resources = STATION.get_layout(params.layout, &adapter)?;
+
+        // Get workgroup
+        let workgroup = STATION.get_workgroup(&module, &matrix, &params, &adapter, &resources)?;
+
+        Ok(Self {
+            module,
+            adapter,
+            params,
+            resources,
+            workgroup,
+            matrix,
+            targets,
+            buffers: vec![],
+        })
+    }
+
+    pub fn run(&mut self) -> Result<(), GpuError> {
+
+        // Create buffers
+        self.create_buffers();
+
+        // Create bind group
+        let bind_group = self.create_bind_group();
+
+        // Get workgroup counts
+        let workgroup_count = if self.matrix.cols > 1024 {
+            self.matrix.rows.div_ceil(self.matrix.get_workgroup_size()) * self.matrix.cols.div_ceil(1024)
+        } else {
+            self.matrix.rows.div_ceil(self.matrix.get_workgroup_size())
+        };
+
+        // Run one submission per step so each dispatch sees its own step value.
+        // Queued buffer writes take effect at the next submit, so batching all
+        // passes into a single submission would run every pass with the final step.
+        for step in 0..self.params.total_steps {
+            self.set_step(step as u32)?;
+
+            let mut encoder = self.adapter.device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
+            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
+                label: None,
+                timestamp_writes: None,
+            });
+            pass.set_pipeline(&self.workgroup.pipeline);
+            pass.set_bind_group(0, &bind_group, &[]);
+            pass.dispatch_workgroups(workgroup_count as u32, 1, 1);
+            drop(pass);
+
+            // Submit this step's work to the GPU
+            self.adapter.queue.submit(Some(encoder.finish()));
+        }
+
+        Ok(())
+    }
+
+    fn create_buffers(&mut self) {
+
+        // Get templates
+        let templates = self.params.layout.get_buffer_templates();
+        let mut input_size: u64 = 0;
+
+        // Create buffers; temp/download buffers are sized to match the samples buffer
+        for template in templates.iter() {
+            let buffer = match template {
+                GpuBuffer::Samples => buffer::create_samples(&self.adapter, &self.matrix),
+                GpuBuffer::Targets => buffer::create_targets(&self.adapter, &self.targets),
+                GpuBuffer::Weights => buffer::create_weights(&self.adapter, self.matrix.cols),
+                GpuBuffer::TempStorage => buffer::create_temp_storage(&self.adapter, input_size),
+                GpuBuffer::Params => buffer::create_params(&self.adapter, &self.module.get_params_buffer_data(&self.params)),
+                GpuBuffer::Download => buffer::create_download(&self.adapter, input_size),
+            };
+
+            if *template == GpuBuffer::Samples {
+                input_size = buffer.size();
+            }
+            self.buffers.push((*template, buffer));
+        }
+    }
+
+    fn create_bind_group(&self) -> wgpu::BindGroup {
+
+        let mut binding_num = 0;
+        let mut bind_group_entries: Vec<wgpu::BindGroupEntry> = Vec::new();
+
+        for index in 0..self.buffers.len() {
+
+            if !self.buffers[index].0.included_in_bind_group() {
+                continue;
+            }
+
+            bind_group_entries.push(wgpu::BindGroupEntry {
+                binding: binding_num,
+                resource: self.buffers[index].1.as_entire_binding(),
+            });
+            binding_num += 1;
+        }
+
+        // Create bind group
+        self.adapter.device.create_bind_group(&wgpu::BindGroupDescriptor {
+            label: None,
+            layout: &self.resources.bind_group_layout,
+            entries: &bind_group_entries,
+        })
+    }
+
+    pub fn set_step(&mut self, step: u32) -> Result<(), GpuError> {
+        self.params.step = step;
+        self.update_params_buffer()
+    }
+
+    pub fn set_learning_rate(&mut self, learning_rate: f32) -> Result<(), GpuError> {
+        self.params.learning_rate = learning_rate;
+        self.update_params_buffer()
+    }
+
+    fn update_params_buffer(&self) -> Result<(), GpuError> {
+        let data = self.module.get_params_buffer_data(&self.params);
+        match self.buffers.iter().position(|buf| buf.0 == GpuBuffer::Params) {
+            Some(index) => {
+                self.adapter.queue.write_buffer(&self.buffers[index].1, 0, bytemuck::cast_slice(&data));
+                Ok(())
+            }
+            None => Err(GpuError::ParamsBufferNotFound),
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index c6f9349c..cf70d228 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+#![allow(warnings)]
 #![allow(
     clippy::type_complexity,
     clippy::too_many_arguments,
@@ -107,6 +108,9 @@ pub mod decomposition;
 /// Ensemble methods, including Random Forest classifier and regressor
 pub mod ensemble;
 pub mod error;
+/// GPU acceleration
+#[cfg(feature = "gpu")]
+pub mod gpu;
 /// Diverse collection of linear algebra abstractions and methods that power smartcore algorithms
 pub mod linalg;
 /// Supervised classification and regression models that assume linear relationship between dependent and explanatory variables.
diff --git a/src/linear/logistic_regression.rs b/src/linear/logistic_regression.rs
index c28dc347..58645545 100644
--- a/src/linear/logistic_regression.rs
+++ b/src/linear/logistic_regression.rs
@@ -71,12 +71,15 @@ use crate::optimization::line_search::Backtracking;
 use crate::optimization::FunctionOrder;
 
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
+#[cfg_attr(feature = "gpu", derive(Hash, Copy))]
 #[derive(Debug, Clone, Eq, PartialEq, Default)]
-/// Solver options for Logistic regression. Right now only LBFGS solver is supported.
+/// Solver options for Logistic regression. LBFGS and gradient descent solvers are supported.
 pub enum LogisticRegressionSolverName {
     /// Limited-memory Broyden–Fletcher–Goldfarb–Shanno method, see [LBFGS paper](http://users.iems.northwestern.edu/~nocedal/lbfgsb.html)
     #[default]
     LBFGS,
+    /// Gradient descent, with GPU acceleration support
+    GradientDescent,
 }
 
 /// Logistic Regression parameters
@@ -549,7 +552,6 @@ impl<TX: Number + FloatNumber + RealNumber, TY: Number + Ord, X: Array2<TX>, Y:
         objective: impl ObjectiveFunction<TX, Vec<TX>>,
     ) -> OptimizerResult<TX, Vec<TX>> {
         let f = |w: &Vec<TX>| -> TX { objective.f(w) };
-        let df = |g: &mut Vec<TX>, w: &Vec<TX>| objective.df(g, w);
 
         let ls: Backtracking<TX> = Backtracking {
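
Below the patch, a minimal usage sketch of how the new solver is meant to be selected from the public API. This is not part of the patch, and it is hedged: the diff adds the GradientDescent variant and the GPU worker plumbing, but does not yet route LogisticRegression::fit through the GPU path; the toy x/y data and the Result-returning from_2d_array call assume smartcore's 0.4-style API.

    // Cargo.toml: smartcore = { version = "*", features = ["gpu"] }
    use smartcore::linalg::basic::matrix::DenseMatrix;
    use smartcore::linear::logistic_regression::{
        LogisticRegression, LogisticRegressionParameters, LogisticRegressionSolverName,
    };

    fn main() {
        // Toy binary-classification data: four samples, two features.
        let x = DenseMatrix::from_2d_array(&[
            &[1.0, 2.0],
            &[3.0, 4.0],
            &[5.0, 6.0],
            &[7.0, 8.0],
        ])
        .unwrap();
        let y: Vec<u32> = vec![0, 0, 1, 1];

        // Select the new solver; `alpha` doubles as the gradient-descent
        // learning rate that the GPU module reuses.
        let params = LogisticRegressionParameters::default()
            .with_solver(LogisticRegressionSolverName::GradientDescent)
            .with_alpha(0.01);

        let model = LogisticRegression::fit(&x, &y, params).unwrap();
        let predictions = model.predict(&x).unwrap();
        println!("{:?}", predictions);
    }

Note the design the patch assumes: the three shader steps (predictions, gradients, weight update) share one pipeline and are distinguished only by the `step` field of the uniform params buffer, which is why GpuWorker::run must submit each dispatch separately.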