use crate::error::Failed;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
use serde::{Deserialize, Serialize};

/// Distribution of categorical features, used by the categorical naive Bayes classifier.
struct CategoricalNBDistribution<T: RealNumber> {
    /// Class labels, sorted in ascending order.
    class_labels: Vec<T>,
    /// Prior probability of each class.
    class_probabilities: Vec<T>,
    /// Smoothed conditional probabilities, indexed by [class][feature][category].
    coef: Vec<Vec<Vec<T>>>,
    /// Distinct values observed for each feature during training.
    feature_categories: Vec<Vec<T>>,
}

impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
    fn prior(&self, class_index: usize) -> T {
        if class_index >= self.class_labels.len() {
            T::zero()
        } else {
            self.class_probabilities[class_index]
        }
    }

    fn conditional_probability(&self, class_index: usize, x: &M::RowVector) -> T {
        if class_index < self.class_labels.len() {
            let mut prob = T::one();
            for feature in 0..x.len() {
                let value = x.get(feature);
                match self.feature_categories[feature]
                    .iter()
                    .position(|&t| t == value)
                {
                    Some(i) => prob *= self.coef[class_index][feature][i],
                    // A category never seen during training has zero probability.
                    None => return T::zero(),
                }
            }
            prob
        } else {
            T::zero()
        }
    }

    fn classes(&self) -> &Vec<T> {
        &self.class_labels
    }
}

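// A sketch of how these pieces fit together (the exact implementation lives in
// `BaseNaiveBayes`): for a row x, each class c is scored as
//     score(c) = prior(c) * conditional_probability(c, x)
// and the class with the highest score is predicted.
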
impl<T: RealNumber> CategoricalNBDistribution<T> {
    /// Fits the distribution to an NxM matrix where N is the number of samples and M is the number of features.
    /// * `x` - training data.
    /// * `y` - vector with target values (classes) of length N.
    /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
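    ///
    /// For class c, feature j and category t, the conditional probability is estimated as
    /// `(count(t, j, c) + alpha) / (count(c) + alpha * n_categories_j)`, where `n_categories_j`
    /// is the number of distinct values observed for feature j; these estimates are the
    /// coefficients stored in `coef`.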
    pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
        if alpha < T::zero() {
            return Err(Failed::fit(&format!(
                "alpha should be >= 0, alpha=[{}]",
                alpha
            )));
        }

        let (n_samples, n_features) = x.shape();
        let y_samples = y.len();
        if y_samples != n_samples {
            return Err(Failed::fit(&format!(
                "Size of x should equal size of y; |x|=[{}], |y|=[{}]",
                n_samples, y_samples
            )));
        }

        if n_samples == 0 {
            return Err(Failed::fit(&format!(
                "Size of x and y should be greater than 0; |x|=[{}]",
                n_samples
            )));
        }

        // Sort y so that equal labels are adjacent, then collect the distinct
        // class labels and their counts in a single pass.
        let mut y_sorted = y.to_vec();
        y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
        let mut class_labels = Vec::with_capacity(y.len());
        class_labels.push(y_sorted[0]);
        let mut classes_count = Vec::with_capacity(y.len());
        let mut current_count = T::one();
        for idx in 1..y_samples {
            if y_sorted[idx] == y_sorted[idx - 1] {
                current_count += T::one();
            } else {
                classes_count.push(current_count);
                class_labels.push(y_sorted[idx]);
                current_count = T::one();
            }
        }
        // Push the count of the last class; the loop only pushes a count when
        // it encounters the next, different label.
        classes_count.push(current_count);

        // Collect the distinct categories observed for each feature.
        let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);

        for feature in 0..n_features {
            let feature_types = x.get_col_as_vec(feature).unique();
            feature_categories.push(feature_types);
        }
        // For every (class, feature, category) triple, estimate the smoothed
        // conditional probability of the category given the class.
        let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
        for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
            let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
            for (feature_index, feature_options) in
                feature_categories.iter().enumerate().take(n_features)
            {
                // Values of this feature restricted to samples of the current class.
                let col = x
                    .get_col_as_vec(feature_index)
                    .iter()
                    .enumerate()
                    .filter(|(i, _)| y.get(*i) == *label)
                    .map(|(_, v)| *v)
                    .collect::<Vec<T>>();
                let mut feat_count: Vec<usize> = Vec::with_capacity(feature_options.len());
                for k in feature_options.iter() {
                    let feat_k_count = col.iter().filter(|&v| v == k).count();
                    feat_count.push(feat_k_count);
                }

                let coef_i_j = feat_count
                    .iter()
                    .map(|c| {
                        (T::from(*c).unwrap() + alpha)
                            / (*label_count + T::from(feature_options.len()).unwrap() * alpha)
                    })
                    .collect::<Vec<T>>();
                coef_i.push(coef_i_j);
            }
            coef.push(coef_i);
        }
        // Priors are the relative frequencies of the classes in y.
        let class_probabilities = classes_count
            .into_iter()
            .map(|count| count / T::from(n_samples).unwrap())
            .collect::<Vec<T>>();

        Ok(Self {
            class_labels,
            class_probabilities,
            coef,
            feature_categories,
        })
    }
}

/// `CategoricalNB` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)]
pub struct CategoricalNBParameters<T: RealNumber> {
    /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    pub alpha: T,
}

impl<T: RealNumber> CategoricalNBParameters<T> {
    /// Create CategoricalNBParameters with specific parameters.
    pub fn new(alpha: T) -> Result<Self, Failed> {
        if alpha >= T::zero() {
            Ok(Self { alpha })
        } else {
            Err(Failed::fit(&format!(
                "alpha should be >= 0, alpha=[{}]",
                alpha
            )))
        }
    }
}

impl<T: RealNumber> Default for CategoricalNBParameters<T> {
    fn default() -> Self {
        Self { alpha: T::one() }
    }
}

/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
    inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
}

impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
    /// Fits CategoricalNB with given data.
    /// * `x` - training data of size NxM where N is the number of samples and M is the number of
    ///   features.
    /// * `y` - vector with target values (classes) of length N.
    /// * `parameters` - additional parameters, e.g. the smoothing constant `alpha`.
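    ///
    /// # Example
    ///
    /// A minimal usage sketch (the module path of this file is assumed to be
    /// `smartcore::naive_bayes::categorical`):
    ///
    /// ```ignore
    /// use smartcore::linalg::naive::dense_matrix::DenseMatrix;
    /// use smartcore::naive_bayes::categorical::{CategoricalNB, CategoricalNBParameters};
    ///
    /// let x = DenseMatrix::from_2d_array(&[&[0., 0.], &[0., 1.], &[1., 1.]]);
    /// let y = vec![0., 0., 1.];
    /// let model = CategoricalNB::fit(&x, &y, CategoricalNBParameters::default()).unwrap();
    /// let y_hat = model.predict(&x).unwrap();
    /// ```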
    pub fn fit(
        x: &M,
        y: &M::RowVector,
        parameters: CategoricalNBParameters<T>,
    ) -> Result<Self, Failed> {
        let alpha = parameters.alpha;
        let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
        let inner = BaseNaiveBayes::fit(distribution)?;
        Ok(Self { inner })
    }

    /// Estimates the class labels for the provided data.
    /// * `x` - data of shape NxM where N is the number of data points to estimate and M is the number of features.
    /// Returns a vector of size N with class estimates.
    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
        self.inner.predict(x)
    }
}


#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::DenseMatrix;

    #[test]
    fn run_categorical_naive_bayes() {
        let x = DenseMatrix::from_2d_array(&[
            &[0., 2., 1., 0.],
            &[0., 2., 1., 1.],
            &[1., 2., 1., 0.],
            &[2., 1., 1., 0.],
            &[2., 0., 0., 0.],
            &[2., 0., 0., 1.],
            &[1., 0., 0., 1.],
            &[0., 1., 1., 0.],
            &[0., 0., 0., 0.],
            &[2., 1., 0., 0.],
            &[0., 1., 0., 1.],
            &[1., 1., 1., 1.],
            &[1., 2., 0., 0.],
            &[2., 1., 1., 1.],
        ]);
        let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];

        let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
        let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
        let y_hat = cnb.predict(&x_test).unwrap();
        assert_eq!(y_hat, vec![0., 1.]);
    }
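
    // A small additional check, grounded in the validation above: negative
    // smoothing values are invalid, so both the distribution's `fit` and the
    // parameter constructor should reject them.
    #[test]
    fn fit_with_negative_alpha_fails() {
        let x = DenseMatrix::from_2d_array(&[&[0., 1.], &[1., 0.]]);
        let y = vec![0., 1.];
        assert!(CategoricalNBDistribution::fit(&x, &y, -1.0).is_err());
        assert!(CategoricalNBParameters::new(-1.0).is_err());
    }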
}