use crate::error::Failed;
use crate::linalg::row_iter;
use crate::linalg::BaseVector;
use crate::linalg::Matrix;
use crate::math::num::RealNumber;
use crate::math::vector::RealNumberVector;
use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};

use serde::{Deserialize, Serialize};

/// Distribution used by the Naive Bayes classifier for multinomially distributed (count) features
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct MultinomialNBDistribution<T: RealNumber> {
    /// class labels known to the classifier
    class_labels: Vec<T>,
    /// prior probability of each class
    class_priors: Vec<T>,
    /// smoothed per-class probability of each feature
    feature_prob: Vec<Vec<T>>,
}

impl<T: RealNumber, M: Matrix<T>> NBDistribution<T, M> for MultinomialNBDistribution<T> {
    fn prior(&self, class_index: usize) -> T {
        self.class_priors[class_index]
    }

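    // For multinomial naive Bayes the class-conditional log-likelihood of a sample x is
    // log P(x | c) = sum_j x_j * ln(theta_{c,j}), up to the multinomial coefficient,
    // which is constant across classes and therefore omitted.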
    fn log_likelihood(&self, class_index: usize, j: &M::RowVector) -> T {
        let mut likelihood = T::zero();
        for feature in 0..j.len() {
            let value = j.get(feature);
            likelihood += value * self.feature_prob[class_index][feature].ln();
        }
        likelihood
    }

    fn classes(&self) -> &Vec<T> {
        &self.class_labels
    }
}

/// `MultinomialNB` parameters. Use `Default::default()` for default values.
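///
/// A minimal usage sketch (the re-export path `smartcore::naive_bayes::multinomial` is an
/// assumption here, not guaranteed by this file):
/// ```
/// use smartcore::naive_bayes::multinomial::MultinomialNBParameters;
///
/// // Lidstone smoothing with alpha = 0.5; priors are estimated from the data (None)
/// let params = MultinomialNBParameters::new(0.5, None);
/// ```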
#[derive(Serialize, Deserialize, Debug)]
pub struct MultinomialNBParameters<T: RealNumber> {
    /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
    pub alpha: T,
    /// Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.
    pub priors: Option<Vec<T>>,
}

impl<T: RealNumber> MultinomialNBParameters<T> {
    /// Create MultinomialNBParameters with specific parameters.
    pub fn new(alpha: T, priors: Option<Vec<T>>) -> Self {
        Self { alpha, priors }
    }
}

impl<T: RealNumber> Default for MultinomialNBParameters<T> {
    fn default() -> Self {
        Self {
            alpha: T::one(),
            priors: None,
        }
    }
}

impl<T: RealNumber> MultinomialNBDistribution<T> {
    /// Fits the distribution to an NxM matrix where N is the number of samples and M is the number of features.
    /// * `x` - training data.
    /// * `y` - vector with target values (classes) of length N.
    /// * `priors` - Optional vector with prior probabilities of the classes. If not defined,
    /// priors are adjusted according to the data.
    /// * `alpha` - Additive (Laplace/Lidstone) smoothing parameter.
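    ///
    /// Per-class feature probabilities are estimated with additive smoothing as
    /// `theta_cj = (N_cj + alpha) / (N_c + alpha * M)`, where `N_cj` is the total count of
    /// feature `j` in class `c` and `N_c` is the total count of all features in class `c`.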
    pub fn fit<M: Matrix<T>>(
        x: &M,
        y: &M::RowVector,
        alpha: T,
        priors: Option<Vec<T>>,
    ) -> Result<Self, Failed> {
        let (n_samples, n_features) = x.shape();
        let y_samples = y.len();
        if y_samples != n_samples {
            return Err(Failed::fit(&format!(
                "Size of x should equal size of y; |x|=[{}], |y|=[{}]",
                n_samples, y_samples
            )));
        }

        if n_samples == 0 {
            return Err(Failed::fit(&format!(
                "Size of x and y should be greater than 0; |x|=[{}]",
                n_samples
            )));
        }
        if alpha < T::zero() {
            return Err(Failed::fit(&format!(
                "Alpha should not be negative; |alpha|=[{}]",
                alpha
            )));
        }

        let y = y.to_vec();

        let (class_labels, indices) = <Vec<T> as RealNumberVector<T>>::unique_with_indices(&y);
        let mut class_count = vec![T::zero(); class_labels.len()];

        // count the number of training samples in each class
        for class_index in indices.iter() {
            class_count[*class_index] += T::one();
        }

        // use the supplied priors, or estimate them as relative class frequencies
        let class_priors = if let Some(class_priors) = priors {
            if class_priors.len() != class_labels.len() {
                return Err(Failed::fit(
                    "Size of priors provided does not match the number of classes of the data.",
                ));
            }
            class_priors
        } else {
            class_count
                .iter()
                .map(|&c| c / T::from(n_samples).unwrap())
                .collect()
        };

        // accumulate the total count of each feature within each class
        let mut feature_in_class_counter = vec![vec![T::zero(); n_features]; class_labels.len()];

        for (row, class_index) in row_iter(x).zip(indices) {
            for idx in 0..n_features {
                feature_in_class_counter[class_index][idx] += row[idx];
            }
        }

        // smoothed feature probabilities: (N_cj + alpha) / (N_c + alpha * n_features)
        let feature_prob = feature_in_class_counter
            .iter()
            .map(|feature_count| {
                let n_c = feature_count.sum();
                feature_count
                    .iter()
                    .map(|&count| (count + alpha) / (n_c + alpha * T::from(n_features).unwrap()))
                    .collect()
            })
            .collect();

        Ok(Self {
            class_labels,
            class_priors,
            feature_prob,
        })
    }
}

/// MultinomialNB implements the naive Bayes algorithm for multinomially distributed data,
/// such as word counts used in text classification.
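///
/// The example below is a minimal sketch based on the toy data from the tests in this module;
/// it assumes the public paths `smartcore::naive_bayes::multinomial::MultinomialNB` and
/// `smartcore::linalg::naive::dense_matrix::DenseMatrix`.
///
/// ```
/// use smartcore::linalg::naive::dense_matrix::DenseMatrix;
/// use smartcore::naive_bayes::multinomial::MultinomialNB;
///
/// // Word-count features for four short documents, classes 0 (China) and 1 (Japan).
/// let x = DenseMatrix::<f64>::from_2d_array(&[
///     &[1., 2., 0., 0., 0., 0.],
///     &[0., 2., 0., 0., 1., 0.],
///     &[0., 1., 0., 1., 0., 0.],
///     &[0., 1., 1., 0., 0., 1.],
/// ]);
/// let y = vec![0., 0., 0., 1.];
///
/// let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();
/// let y_hat = nb.predict(&x).unwrap();
/// // y_hat should reproduce the training labels [0., 0., 0., 1.]
/// ```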
#[derive(Serialize, Deserialize, Debug, PartialEq)]
pub struct MultinomialNB<T: RealNumber, M: Matrix<T>> {
    inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
}

impl<T: RealNumber, M: Matrix<T>> MultinomialNB<T, M> {
    /// Fits MultinomialNB with given data
    /// * `x` - training data of size NxM where N is the number of samples and M is the number of
    /// features.
    /// * `y` - vector with target values (classes) of length N.
    /// * `parameters` - additional parameters like class priors and alpha for smoothing.
    pub fn fit(
        x: &M,
        y: &M::RowVector,
        parameters: MultinomialNBParameters<T>,
    ) -> Result<Self, Failed> {
        let distribution =
            MultinomialNBDistribution::fit(x, y, parameters.alpha, parameters.priors)?;
        let inner = BaseNaiveBayes::fit(distribution)?;
        Ok(Self { inner })
    }

    /// Estimates the class labels for the provided data.
    /// * `x` - data of shape NxM where N is the number of data points to estimate and M is the number of features.
    ///
    /// Returns a vector of size N with class estimates.
    pub fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
        self.inner.predict(x)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::naive::dense_matrix::DenseMatrix;

    #[test]
    fn run_multinomial_naive_bayes() {
        // Tests that MultinomialNB when alpha=1.0 gives the same values as
        // those given for the toy example in Manning, Raghavan, and
        // Schuetze's "Introduction to Information Retrieval" book:
        // https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html

        // Training data points are:
        // Chinese Beijing Chinese (class: China)
        // Chinese Chinese Shanghai (class: China)
        // Chinese Macao (class: China)
        // Tokyo Japan Chinese (class: Japan)
        let x = DenseMatrix::<f64>::from_2d_array(&[
            &[1., 2., 0., 0., 0., 0.],
            &[0., 2., 0., 0., 1., 0.],
            &[0., 1., 0., 1., 0., 0.],
            &[0., 1., 1., 0., 0., 1.],
        ]);
        let y = vec![0., 0., 0., 1.];
        let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();

        assert_eq!(mnb.inner.distribution.class_priors, &[0.75, 0.25]);
        assert_eq!(
            mnb.inner.distribution.feature_prob,
            &[
                &[1. / 7., 3. / 7., 1. / 14., 1. / 7., 1. / 7., 1. / 14.],
                &[1. / 9., 2. / 9., 2. / 9., 1. / 9., 1. / 9., 2. / 9.]
            ]
        );

        // Testing data point is:
        // Chinese Chinese Chinese Tokyo Japan
        let x_test = DenseMatrix::<f64>::from_2d_array(&[&[0., 3., 1., 0., 0., 1.]]);
        let y_hat = mnb.predict(&x_test).unwrap();

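        // With the probabilities asserted above, the unnormalized posteriors are
        // P(China|d) ∝ 3/4 * (3/7)^3 * 1/14 * 1/14 ≈ 0.0003 and
        // P(Japan|d) ∝ 1/4 * (2/9)^3 * 2/9 * 2/9 ≈ 0.0001, so China (label 0.) is expected.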
        assert_eq!(y_hat, &[0.]);
    }

    #[test]
    fn multinomial_nb_scikit_parity() {
        let x = DenseMatrix::<f64>::from_2d_array(&[
            &[2., 4., 0., 0., 2., 1., 2., 4., 2., 0.],
            &[3., 4., 0., 2., 1., 0., 1., 4., 0., 3.],
            &[1., 4., 2., 4., 1., 0., 1., 2., 3., 2.],
            &[0., 3., 3., 4., 1., 0., 3., 1., 1., 1.],
            &[0., 2., 1., 4., 3., 4., 1., 2., 3., 1.],
            &[3., 2., 4., 1., 3., 0., 2., 4., 0., 2.],
            &[3., 1., 3., 0., 2., 0., 4., 4., 3., 4.],
            &[2., 2., 2., 0., 1., 1., 2., 1., 0., 1.],
            &[3., 3., 2., 2., 0., 2., 3., 2., 2., 3.],
            &[4., 3., 4., 4., 4., 2., 2., 0., 1., 4.],
            &[3., 4., 2., 2., 1., 4., 4., 4., 1., 3.],
            &[3., 0., 1., 4., 4., 0., 0., 3., 2., 4.],
            &[2., 0., 3., 3., 1., 2., 0., 2., 4., 1.],
            &[2., 4., 0., 4., 2., 4., 1., 3., 1., 4.],
            &[0., 2., 2., 3., 4., 0., 4., 4., 4., 4.],
        ]);
        let y = vec![2., 2., 0., 0., 0., 2., 1., 1., 0., 1., 0., 0., 2., 0., 2.];
        let nb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();

        let y_hat = nb.predict(&x).unwrap();

        assert!(nb
            .inner
            .distribution
            .class_priors
            .approximate_eq(&vec!(0.46, 0.2, 0.33), 1e-2));
        assert!(nb.inner.distribution.feature_prob[1].approximate_eq(
            &vec!(0.07, 0.12, 0.07, 0.15, 0.07, 0.09, 0.08, 0.10, 0.08, 0.11),
            1e-1
        ));
        assert!(y_hat.approximate_eq(
            &vec!(2.0, 2.0, 0.0, 0.0, 0.0, 2.0, 2.0, 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 0.0, 2.0),
            1e-5
        ));
    }

    #[test]
    fn serde() {
        let x = DenseMatrix::<f64>::from_2d_array(&[
            &[1., 1., 0., 0., 0., 0.],
            &[0., 1., 0., 0., 1., 0.],
            &[0., 1., 0., 1., 0., 0.],
            &[0., 1., 1., 0., 0., 1.],
        ]);
        let y = vec![0., 0., 0., 1.];

        let mnb = MultinomialNB::fit(&x, &y, Default::default()).unwrap();
        let deserialized_mnb: MultinomialNB<f64, DenseMatrix<f64>> =
            serde_json::from_str(&serde_json::to_string(&mnb).unwrap()).unwrap();

        assert_eq!(mnb, deserialized_mnb);
    }
}