Commit 3d4d5f6

feat: add Naive Bayes and CategoricalNB (#15)
* feat: Implement Naive Bayes classifier
* Implement CategoricalNB
1 parent 4efad85

3 files changed: +303 −0 lines changed
src/lib.rs

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@ pub mod math;
/// Functions for assessing prediction error.
pub mod metrics;
pub mod model_selection;
+/// Supervised learning algorithms based on applying Bayes' theorem with the assumption of independence between predictors
+pub mod naive_bayes;
/// Supervised neighbors-based learning methods
pub mod neighbors;
pub(crate) mod optimization;
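
For reference, a minimal sketch of the surface this new export opens up, written from an external caller's point of view. The crate name `smartcore` is an assumption inferred from the module paths in this diff; the hunk itself does not show it.

// Hypothetical external usage of the new module (crate name assumed).
use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::naive_bayes::CategoricalNB;

fn main() {
    // Two categorical features, three training samples.
    let x = DenseMatrix::from_2d_array(&[&[0., 1.], &[1., 0.], &[1., 1.]]);
    let y = vec![0., 1., 1.];
    // Default parameters use alpha = 1 (Laplace smoothing).
    let model = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
    let y_hat = model.predict(&x).unwrap();
    println!("{:?}", y_hat);
}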

src/naive_bayes/categorical.rs

Lines changed: 232 additions & 0 deletions

@@ -0,0 +1,232 @@
use crate::error::Failed;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
use serde::{Deserialize, Serialize};

/// Naive Bayes classifier for categorical features
struct CategoricalNBDistribution<T: RealNumber> {
    class_labels: Vec<T>,
    class_probabilities: Vec<T>,
    coef: Vec<Vec<Vec<T>>>,
    feature_categories: Vec<Vec<T>>,
}

impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for CategoricalNBDistribution<T> {
    fn prior(&self, class_index: usize) -> T {
        if class_index >= self.class_labels.len() {
            T::zero()
        } else {
            self.class_probabilities[class_index]
        }
    }

    fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T {
        if class_index < self.class_labels.len() {
            let mut prob = T::one();
            for feature in 0..j.len() {
                let value = j.get(feature);
                match self.feature_categories[feature]
                    .iter()
                    .position(|&t| t == value)
                {
                    Some(i) => prob *= self.coef[class_index][feature][i],
                    // A category never seen in training gets probability zero.
                    None => return T::zero(),
                }
            }
            prob
        } else {
            T::zero()
        }
    }

    fn classes(&self) -> &Vec<T> {
        &self.class_labels
    }
}

impl<T: RealNumber> CategoricalNBDistribution<T> {
    /// Fits the distribution to an NxM matrix where N is the number of samples and M is the number of features.
    /// * `x` - training data.
    /// * `y` - vector with target values (classes) of length N.
    /// * `alpha` - additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    pub fn fit<M: Matrix<T>>(x: &M, y: &M::RowVector, alpha: T) -> Result<Self, Failed> {
        if alpha < T::zero() {
            return Err(Failed::fit(&format!(
                "alpha should be >= 0, alpha=[{}]",
                alpha
            )));
        }

        let (n_samples, n_features) = x.shape();
        let y_samples = y.len();
        if y_samples != n_samples {
            return Err(Failed::fit(&format!(
                "Size of x should equal size of y; |x|=[{}], |y|=[{}]",
                n_samples, y_samples
            )));
        }

        if n_samples == 0 {
            return Err(Failed::fit(&format!(
                "Size of x and y should be greater than 0; |x|=[{}]",
                n_samples
            )));
        }

        // Count occurrences of each distinct class in the sorted targets.
        let mut y_sorted = y.to_vec();
        y_sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
        let mut class_labels = Vec::with_capacity(y.len());
        class_labels.push(y_sorted[0]);
        let mut classes_count = Vec::with_capacity(y.len());
        let mut current_count = T::one();
        for idx in 1..y_samples {
            if y_sorted[idx] == y_sorted[idx - 1] {
                current_count += T::one();
            } else {
                classes_count.push(current_count);
                class_labels.push(y_sorted[idx]);
                current_count = T::one();
            }
        }
        // Push the count of the final class after the loop, so each class is
        // counted exactly once.
        classes_count.push(current_count);

        let mut feature_categories: Vec<Vec<T>> = Vec::with_capacity(n_features);

        for feature in 0..n_features {
            let feature_types = x.get_col_as_vec(feature).unique();
            feature_categories.push(feature_types);
        }

        // coef[c][j][k] = smoothed P(feature j takes its k-th category | class c).
        let mut coef: Vec<Vec<Vec<T>>> = Vec::with_capacity(class_labels.len());
        for (label, label_count) in class_labels.iter().zip(classes_count.iter()) {
            let mut coef_i: Vec<Vec<T>> = Vec::with_capacity(n_features);
            for (feature_index, feature_options) in
                feature_categories.iter().enumerate().take(n_features)
            {
                // Values of this feature restricted to samples of the current class.
                let col = x
                    .get_col_as_vec(feature_index)
                    .iter()
                    .enumerate()
                    .filter(|(i, _)| y.get(*i) == *label)
                    .map(|(_, j)| *j)
                    .collect::<Vec<T>>();
                let mut feat_count: Vec<usize> = Vec::with_capacity(feature_options.len());
                for k in feature_options.iter() {
                    let feat_k_count = col.iter().filter(|&v| v == k).count();
                    feat_count.push(feat_k_count);
                }

                let coef_i_j = feat_count
                    .iter()
                    .map(|c| {
                        (T::from(*c).unwrap() + alpha)
                            / (T::from(*label_count).unwrap()
                                + T::from(feature_options.len()).unwrap() * alpha)
                    })
                    .collect::<Vec<T>>();
                coef_i.push(coef_i_j);
            }
            coef.push(coef_i);
        }
        let class_probabilities = classes_count
            .into_iter()
            .map(|count| count / T::from(n_samples).unwrap())
            .collect::<Vec<T>>();

        Ok(Self {
            class_labels,
            class_probabilities,
            coef,
            feature_categories,
        })
    }
}

/// `CategoricalNB` parameters. Use `Default::default()` for default values.
#[derive(Serialize, Deserialize, Debug)]
pub struct CategoricalNBParameters<T: RealNumber> {
    /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    pub alpha: T,
}

impl<T: RealNumber> CategoricalNBParameters<T> {
    /// Creates CategoricalNBParameters with specific parameters.
    pub fn new(alpha: T) -> Result<Self, Failed> {
        if alpha >= T::zero() {
            Ok(Self { alpha })
        } else {
            Err(Failed::fit(&format!(
                "alpha should be >= 0, alpha=[{}]",
                alpha
            )))
        }
    }
}

impl<T: RealNumber> Default for CategoricalNBParameters<T> {
    fn default() -> Self {
        Self { alpha: T::one() }
    }
}

/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
pub struct CategoricalNB<T: RealNumber, M: Matrix<T>> {
    inner: BaseNaiveBayes<T, M, CategoricalNBDistribution<T>>,
}

impl<T: RealNumber, M: Matrix<T>> CategoricalNB<T, M> {
    /// Fits CategoricalNB with given data.
    /// * `x` - training data of size NxM where N is the number of samples and M is the number of features.
    /// * `y` - vector with target values (classes) of length N.
    /// * `parameters` - additional parameters like alpha for smoothing
    pub fn fit(
        x: &M,
        y: &M::RowVector,
        parameters: CategoricalNBParameters<T>,
    ) -> Result<Self, Failed> {
        let alpha = parameters.alpha;
        let distribution = CategoricalNBDistribution::fit(x, y, alpha)?;
        let inner = BaseNaiveBayes::fit(distribution)?;
        Ok(Self { inner })
    }

    /// Estimates the class labels for the provided data.
    /// * `x` - data of shape NxM where N is the number of data points to estimate and M is the number of features.
    /// Returns a vector of size N with class estimates.
    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
        self.inner.predict(x)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::DenseMatrix;

    #[test]
    fn run_categorical_naive_bayes() {
        let x = DenseMatrix::from_2d_array(&[
            &[0., 2., 1., 0.],
            &[0., 2., 1., 1.],
            &[1., 2., 1., 0.],
            &[2., 1., 1., 0.],
            &[2., 0., 0., 0.],
            &[2., 0., 0., 1.],
            &[1., 0., 0., 1.],
            &[0., 1., 1., 0.],
            &[0., 0., 0., 0.],
            &[2., 1., 0., 0.],
            &[0., 1., 0., 1.],
            &[1., 1., 1., 1.],
            &[1., 2., 0., 0.],
            &[2., 1., 1., 1.],
        ]);
        let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];

        let cnb = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
        let x_test = DenseMatrix::from_2d_array(&[&[0., 2., 1., 0.], &[2., 2., 0., 0.]]);
        let y_hat = cnb.predict(&x_test).unwrap();
        assert_eq!(y_hat, vec![0., 1.]);
    }
}
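
The `coef` computation above is additive (Lidstone) smoothing: each conditional probability is estimated as (count of category k within class c + alpha) / (count of class c + alpha * number of categories of the feature). A standalone sketch of that arithmetic, in plain f64 rather than the generic T: RealNumber:

// Smoothed estimate of P(feature j = category k | class c), mirroring the
// expression used for `coef` in CategoricalNBDistribution::fit above.
fn lidstone(count_k_in_class: usize, class_count: usize, n_categories: usize, alpha: f64) -> f64 {
    (count_k_in_class as f64 + alpha) / (class_count as f64 + alpha * n_categories as f64)
}

fn main() {
    // With alpha = 1, a category seen 3 times among 5 class samples, for a
    // feature with 3 categories, gets (3 + 1) / (5 + 3) = 0.5.
    assert!((lidstone(3, 5, 3, 1.0) - 0.5).abs() < 1e-12);
    // alpha = 0 recovers the raw relative frequency 3 / 5 = 0.6.
    assert!((lidstone(3, 5, 3, 0.0) - 0.6).abs() < 1e-12);
}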

src/naive_bayes/mod.rs

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
use crate::error::Failed;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use std::marker::PhantomData;

/// Distribution used in the Naive Bayes classifier.
pub(crate) trait NBDistribution<T: RealNumber, M: Matrix<T>> {
    /// Prior of the class at the given index.
    fn prior(&self, class_index: usize) -> T;

    /// Conditional probability of sample j given the class at the specified index.
    fn conditional_probability(&self, class_index: usize, j: &M::RowVector) -> T;

    /// Possible classes of the distribution.
    fn classes(&self) -> &Vec<T>;
}

/// Base struct for the Naive Bayes classifier.
pub(crate) struct BaseNaiveBayes<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> {
    distribution: D,
    _phantom_t: PhantomData<T>,
    _phantom_m: PhantomData<M>,
}

impl<T: RealNumber, M: Matrix<T>, D: NBDistribution<T, M>> BaseNaiveBayes<T, M, D> {
    /// Fits the NB classifier to a given NBDistribution.
    /// * `distribution` - NBDistribution of the training data
    pub fn fit(distribution: D) -> Result<Self, Failed> {
        Ok(Self {
            distribution,
            _phantom_t: PhantomData,
            _phantom_m: PhantomData,
        })
    }

    /// Estimates the class labels for the provided data.
    /// * `x` - data of shape NxM where N is the number of data points to estimate and M is the number of features.
    /// Returns a vector of size N with class estimates.
    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
        let y_classes = self.distribution.classes();
        let (rows, _) = x.shape();
        let predictions = (0..rows)
            .map(|row_index| {
                let row = x.get_row(row_index);
                // Pick the class maximizing the unnormalized posterior:
                // P(x | class) * P(class).
                let (prediction, _probability) = y_classes
                    .iter()
                    .enumerate()
                    .map(|(class_index, class)| {
                        (
                            class,
                            self.distribution.conditional_probability(class_index, &row)
                                * self.distribution.prior(class_index),
                        )
                    })
                    .max_by(|(_, p1), (_, p2)| p1.partial_cmp(p2).unwrap())
                    .unwrap();
                *prediction
            })
            .collect::<Vec<T>>();
        let mut y_hat = M::RowVector::zeros(rows);
        for (i, prediction) in predictions.iter().enumerate().take(rows) {
            y_hat.set(i, *prediction);
        }
        Ok(y_hat)
    }
}

mod categorical;
pub use categorical::{CategoricalNB, CategoricalNBParameters};
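
`predict` implements the usual maximum a posteriori decision rule: for each row it scores every class by conditional_probability * prior and keeps the argmax. A self-contained sketch of that rule over plain f64 slices (the names here are illustrative, not part of the committed API):

// MAP decision over precomputed likelihoods and priors; mirrors the
// max_by over p1.partial_cmp(p2) in BaseNaiveBayes::predict above.
fn map_class(likelihoods: &[f64], priors: &[f64], classes: &[f64]) -> f64 {
    let (best, _) = likelihoods
        .iter()
        .zip(priors)
        .map(|(l, p)| l * p)
        .enumerate()
        .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
        .unwrap();
    classes[best]
}

fn main() {
    // Class 1 wins: 0.02 * 0.6 = 0.012 beats 0.05 * 0.2 = 0.010.
    assert_eq!(map_class(&[0.05, 0.02], &[0.2, 0.6], &[0., 1.]), 1.0);
}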
