Feat: start adding cublas gemm

RDambrosio016 · RDambrosio016 · commit 23b71e87f421 · 2022-03-24T21:16:50.000-04:00
diff --git a/crates/blastoff/Cargo.toml b/crates/blastoff/Cargo.toml
@@ -10,6 +10,7 @@ bitflags = "1.3.2"
 cublas_sys = { version = "0.1", path = "../cublas_sys" }
 cust = { version = "0.3", path = "../cust", features = ["impl_num_complex"] }
 num-complex = "0.4.0"
+half = { version = "1.8.0", optional = true }
 
 [package.metadata.docs.rs]
-rustdoc-args = ["--html-in-header", "katex-header.html"]
+rustdoc-args = ["--html-in-header", "katex-header.html", "--cfg", "docsrs"]
diff --git a/crates/blastoff/src/context.rs b/crates/blastoff/src/context.rs
@@ -63,6 +63,9 @@ bitflags::bitflags! {
 /// - [Construct the modified givens rotation matrix that zeros the second entry of a vector<span style="float:right;">`rotmg`</span>](CublasContext::rotmg)
 /// - [Scale a vector by a scalar <span style="float:right;">`scal`</span>](CublasContext::scal)
 /// - [Swap two vectors <span style="float:right;">`swap`</span>](CublasContext::swap)
+///
+/// ## Level 3 Methods (Matrix-based operations)
+/// - [Matrix Multiplication <span style="float:right;">`gemm`</span>](CublasContext::gemm)
 #[derive(Debug)]
 pub struct CublasContext {
     pub(crate) raw: sys::v2::cublasHandle_t,
diff --git a/crates/blastoff/src/level3.rs b/crates/blastoff/src/level3.rs
@@ -0,0 +1,132 @@
+use crate::{
+    context::CublasContext,
+    error::{Error, ToResult},
+    raw::GemmOps,
+    GemmDatatype, MatrixOp,
+};
+use cust::memory::{GpuBox, GpuBuffer};
+use cust::stream::Stream;
+
+/// Module-local `Result` alias: the success type defaults to `()` and the error type
+/// defaults to this crate's [`Error`].
+type Result<T = (), E = Error> = std::result::Result<T, E>;
+
+/// Asserts that the sizes passed to a GEMM call are mutually consistent.
+///
+/// Each buffer is checked as a matrix whose leading dimension must cover its stored
+/// row count and whose length must cover `leading dimension * stored column count`:
+/// - `a` is checked as `m x k` when `op_a` is [`MatrixOp::None`], otherwise as `k x m`.
+/// - `b` is checked as `k x n` when `op_b` is [`MatrixOp::None`], otherwise as `n x k`.
+/// - `c` is always checked as `m x n`.
+///
+/// # Panics
+///
+/// Panics with a message naming the violated condition if any check fails. Because of
+/// `#[track_caller]`, the panic location is reported at the caller.
+#[track_caller]
+fn check_gemm<T: GemmDatatype + GemmOps>(
+    m: usize,
+    n: usize,
+    k: usize,
+    a: &impl GpuBuffer<T>,
+    lda: usize,
+    op_a: MatrixOp,
+    b: &impl GpuBuffer<T>,
+    ldb: usize,
+    op_b: MatrixOp,
+    c: &mut impl GpuBuffer<T>,
+    ldc: usize,
+) {
+    assert!(m > 0 && n > 0 && k > 0, "m, n, and k must be at least 1");
+
+    if op_a == MatrixOp::None {
+        assert!(lda >= m, "lda must be at least m if op_a is None");
+
+        assert!(
+            a.len() >= lda * k,
+            "matrix A's length must be at least lda * k"
+        );
+    } else {
+        // Transposed / conjugate-transposed A: the roles of m and k swap.
+        assert!(lda >= k, "lda must be at least k if op_a is not None");
+
+        assert!(
+            a.len() >= lda * m,
+            "matrix A's length must be at least lda * m"
+        );
+    }
+
+    if op_b == MatrixOp::None {
+        assert!(ldb >= k, "ldb must be at least k if op_b is None");
+
+        assert!(
+            b.len() >= ldb * n,
+            "matrix B's length must be at least ldb * n"
+        );
+    } else {
+        // Transposed / conjugate-transposed B: the roles of k and n swap.
+        assert!(ldb >= n, "ldb must be at least n if op_b is not None");
+
+        // This bound concerns B's buffer, so it must be checked against `b`, not `a`;
+        // otherwise an undersized B would reach the unchecked cuBLAS call.
+        assert!(
+            b.len() >= ldb * k,
+            "matrix B's length must be at least ldb * k"
+        );
+    }
+
+    assert!(ldc >= m, "ldc must be at least m");
+
+    assert!(
+        c.len() >= ldc * n,
+        "matrix C's length must be at least ldc * n"
+    );
+}
+
+impl CublasContext {
+    /// Generic Matrix Multiplication.
+    ///
+    /// Validates the arguments, then forwards them to the raw `T::gemm` binding with
+    /// `stream` selected through `with_stream`. `alpha`, `beta`, `a`, `b`, and `c` are all
+    /// handed to the binding as device pointers. `op_a` and `op_b` select whether `a`
+    /// and `b` are used as is, transposed, or conjugate transposed.
+    ///
+    /// # Panics
+    ///
+    /// Panics if any of the following conditions are not met:
+    /// - `m > 0 && n > 0 && k > 0`
+    /// - `lda >= m` if `op_a == MatrixOp::None`
+    /// - `a.len() >= lda * k` if `op_a == MatrixOp::None`
+    /// - `lda >= k` if `op_a == MatrixOp::Transpose` or `MatrixOp::ConjugateTranspose`
+    /// - `a.len() >= lda * m` if `op_a == MatrixOp::Transpose` or `MatrixOp::ConjugateTranspose`
+    /// - `ldb >= k` if `op_b == MatrixOp::None`
+    /// - `b.len() >= ldb * n` if `op_b == MatrixOp::None`
+    /// - `ldb >= n` if `op_b == MatrixOp::Transpose` or `MatrixOp::ConjugateTranspose`
+    /// - `b.len() >= ldb * k` if `op_b == MatrixOp::Transpose` or `MatrixOp::ConjugateTranspose`
+    /// - `ldc >= m`
+    /// - `c.len() >= ldc * n`
+    /// - `m`, `n`, `k`, `lda`, `ldb`, and `ldc` each fit in an `i32`
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the kernel execution failed or the selected precision is `half` and the device does not support half precision.
+    #[track_caller]
+    pub fn gemm<T: GemmDatatype + GemmOps>(
+        &mut self,
+        stream: &Stream,
+        m: usize,
+        n: usize,
+        k: usize,
+        alpha: &impl GpuBox<T>,
+        a: &impl GpuBuffer<T>,
+        lda: usize,
+        op_a: MatrixOp,
+        beta: &impl GpuBox<T>,
+        b: &impl GpuBuffer<T>,
+        ldb: usize,
+        op_b: MatrixOp,
+        c: &mut impl GpuBuffer<T>,
+        ldc: usize,
+    ) -> Result {
+        /// Converts a size to the `i32` the raw binding takes, panicking rather than
+        /// silently truncating a value that does not fit.
+        #[track_caller]
+        fn to_raw_dim(value: usize, name: &str) -> i32 {
+            assert!(value <= i32::MAX as usize, "{} must fit in an i32", name);
+            value as i32
+        }
+
+        check_gemm(m, n, k, a, lda, op_a, b, ldb, op_b, c, ldc);
+
+        // Narrow only after `check_gemm`, which needs the original `usize` values to
+        // compare against the buffer lengths.
+        let (m, n, k) = (to_raw_dim(m, "m"), to_raw_dim(n, "n"), to_raw_dim(k, "k"));
+        let (lda, ldb, ldc) = (
+            to_raw_dim(lda, "lda"),
+            to_raw_dim(ldb, "ldb"),
+            to_raw_dim(ldc, "ldc"),
+        );
+
+        let transa = op_a.to_raw();
+        let transb = op_b.to_raw();
+
+        // SAFETY: `check_gemm` has asserted that `a`, `b`, and `c` are at least as long as
+        // the dimensions and leading dimensions require, and the conversions above ensure
+        // the sizes reach the binding unchanged.
+        // NOTE(review): `alpha` and `beta` are passed as device pointers, which presumably
+        // relies on the handle using device pointer mode — confirm against the context setup.
+        self.with_stream(stream, |ctx| unsafe {
+            Ok(T::gemm(
+                ctx.raw,
+                transa,
+                transb,
+                m,
+                n,
+                k,
+                alpha.as_device_ptr().as_ptr(),
+                a.as_device_ptr().as_ptr(),
+                lda,
+                b.as_device_ptr().as_ptr(),
+                ldb,
+                beta.as_device_ptr().as_ptr(),
+                c.as_device_ptr().as_mut_ptr(),
+                ldc,
+            )
+            .to_result()?)
+        })
+    }
+}
diff --git a/crates/blastoff/src/lib.rs b/crates/blastoff/src/lib.rs
@@ -8,6 +8,7 @@
 //! [`amin`](crate::context::CublasContext::amin) returns a 1-based index.**
 
 #![allow(clippy::too_many_arguments)]
+#![cfg_attr(docsrs, feature(doc_cfg))]
 
 pub use cublas_sys as sys;
 use num_complex::{Complex32, Complex64};
@@ -17,8 +18,23 @@ pub use context::*;
 mod context;
 pub mod error;
 mod level1;
+mod level3;
 pub mod raw;
 
+/// A possible datatype for a generic matrix mul operation. This is just [`BlasDatatype`] except optionally
+/// containing `f16` with the `half` feature.
+///
+/// The trait requires `private::Sealed`, which lives in a crate-private module, so it
+/// cannot be implemented for types outside this crate.
+pub trait GemmDatatype: private::Sealed + cust::memory::DeviceCopy {}
+
+// `half` is an optional dependency, so both impls for `half::f16` are compiled only
+// when the `half` feature is enabled.
+#[cfg(feature = "half")]
+impl private::Sealed for half::f16 {}
+// When built with `--cfg docsrs`, annotate this impl in the rendered docs as requiring
+// the `half` feature.
+#[cfg_attr(docsrs, doc(cfg(feature = "half")))]
+#[cfg(feature = "half")]
+impl GemmDatatype for half::f16 {}
+// The real and complex single/double precision types are always available.
+impl GemmDatatype for f32 {}
+impl GemmDatatype for f64 {}
+impl GemmDatatype for Complex32 {}
+impl GemmDatatype for Complex64 {}
+
 pub trait BlasDatatype: private::Sealed + cust::memory::DeviceCopy {
     /// The corresponding float type. For complex numbers this means their backing
     /// precision, and for floats it is just themselves.
@@ -74,3 +90,32 @@ pub(crate) mod private {
     impl Sealed for Complex32 {}
     impl Sealed for Complex64 {}
 }
+
+/// An optional operation to apply to a matrix before a matrix operation. This includes
+/// no operation, transpose, or conjugate transpose.
+///
+/// The operation is forwarded to cuBLAS as a flag (see [`MatrixOp::to_raw`]).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum MatrixOp {
+    /// No operation, leave the matrix as is. This is the default.
+    None,
+    /// Use the transpose of the matrix (`CUBLAS_OP_T`).
+    Transpose,
+    /// Use the conjugate transpose of the matrix (`CUBLAS_OP_C`).
+    ConjugateTranspose,
+}
+
+impl Default for MatrixOp {
+    /// Returns [`MatrixOp::None`], i.e. the matrix is used as is.
+    fn default() -> Self {
+        Self::None
+    }
+}
+
+impl MatrixOp {
+    /// Maps this operation onto the matching raw `cublasOperation_t` value:
+    /// `None` to `CUBLAS_OP_N`, `Transpose` to `CUBLAS_OP_T`, and
+    /// `ConjugateTranspose` to `CUBLAS_OP_C`.
+    pub fn to_raw(self) -> sys::v2::cublasOperation_t {
+        // Local shorthand for the raw type to keep the match arms short.
+        type Raw = sys::v2::cublasOperation_t;
+
+        match self {
+            Self::None => Raw::CUBLAS_OP_N,
+            Self::Transpose => Raw::CUBLAS_OP_T,
+            Self::ConjugateTranspose => Raw::CUBLAS_OP_C,
+        }
+    }
+}
diff --git a/crates/blastoff/src/raw/level1.rs b/crates/blastoff/src/raw/level1.rs
@@ -1,7 +1,6 @@
-use std::os::raw::c_int;
-
 use crate::{sys::v2::*, BlasDatatype};
 use num_complex::{Complex32, Complex64};
+use std::os::raw::c_int;
 
 pub trait Level1: BlasDatatype {
     unsafe fn amax(
diff --git a/crates/blastoff/src/raw/level3.rs b/crates/blastoff/src/raw/level3.rs
diff --git a/crates/blastoff/src/raw/mod.rs b/crates/blastoff/src/raw/mod.rs