@@ -6,36 +6,34 @@ use crate::naive_bayes::{BaseNaiveBayes, NBDistribution};
6
6
use serde:: { Deserialize , Serialize } ;
7
7
8
8
/// Naive Bayes classifier for categorical features
9
+ #[ derive( Debug ) ]
9
10
struct CategoricalNBDistribution < T : RealNumber > {
10
11
class_labels : Vec < T > ,
11
- class_probabilities : Vec < T > ,
12
- coef : Vec < Vec < Vec < T > > > ,
13
- feature_categories : Vec < Vec < T > > ,
12
+ class_priors : Vec < T > ,
13
+ coefficients : Vec < Vec < Vec < T > > > ,
14
14
}
15
15
16
16
impl < T : RealNumber , M : Matrix < T > > NBDistribution < T , M > for CategoricalNBDistribution < T > {
17
17
fn prior ( & self , class_index : usize ) -> T {
18
18
if class_index >= self . class_labels . len ( ) {
19
19
T :: zero ( )
20
20
} else {
21
- self . class_probabilities [ class_index]
21
+ self . class_priors [ class_index]
22
22
}
23
23
}
24
24
25
- fn conditional_probability ( & self , class_index : usize , j : & M :: RowVector ) -> T {
25
+ fn log_likelihood ( & self , class_index : usize , j : & M :: RowVector ) -> T {
26
26
if class_index < self . class_labels . len ( ) {
27
- let mut prob = T :: one ( ) ;
27
+ let mut likelihood = T :: zero ( ) ;
28
28
for feature in 0 ..j. len ( ) {
29
- let value = j. get ( feature) ;
30
- match self . feature_categories [ feature]
31
- . iter ( )
32
- . position ( |& t| t == value)
33
- {
34
- Some ( _i) => prob *= self . coef [ class_index] [ feature] [ _i] ,
35
- None => return T :: zero ( ) ,
29
+ let value = j. get ( feature) . floor ( ) . to_usize ( ) . unwrap ( ) ;
30
+ if self . coefficients [ class_index] [ feature] . len ( ) > value {
31
+ likelihood += self . coefficients [ class_index] [ feature] [ value] ;
32
+ } else {
33
+ return T :: zero ( ) ;
36
34
}
37
35
}
38
- prob
36
+ likelihood
39
37
} else {
40
38
T :: zero ( )
41
39
}
@@ -74,31 +72,45 @@ impl<T: RealNumber> CategoricalNBDistribution<T> {
74
72
n_samples
75
73
) ) ) ;
76
74
}
75
+ let y: Vec < usize > = y
76
+ . to_vec ( )
77
+ . iter ( )
78
+ . map ( |y_i| y_i. floor ( ) . to_usize ( ) . unwrap ( ) )
79
+ . collect ( ) ;
77
80
78
- let mut y_sorted = y. to_vec ( ) ;
79
- y_sorted. sort_by ( |a, b| a. partial_cmp ( b) . unwrap ( ) ) ;
80
- let mut class_labels = Vec :: with_capacity ( y. len ( ) ) ;
81
- class_labels. push ( y_sorted[ 0 ] ) ;
82
- let mut classes_count = Vec :: with_capacity ( y. len ( ) ) ;
83
- let mut current_count = T :: one ( ) ;
84
- for idx in 1 ..y_samples {
85
- if y_sorted[ idx] == y_sorted[ idx - 1 ] {
86
- current_count += T :: one ( ) ;
87
- } else {
88
- classes_count. push ( current_count) ;
89
- class_labels. push ( y_sorted[ idx] ) ;
90
- current_count = T :: one ( )
91
- }
92
- classes_count. push ( current_count) ;
81
+ let y_max = y
82
+ . iter ( )
83
+ . max ( )
84
+ . ok_or_else ( || Failed :: fit ( & "Failed to get the labels of y." . to_string ( ) ) ) ?;
85
+
86
+ let class_labels: Vec < T > = ( 0 ..* y_max + 1 )
87
+ . map ( |label| T :: from ( label) . unwrap ( ) )
88
+ . collect ( ) ;
89
+ let mut classes_count: Vec < T > = vec ! [ T :: zero( ) ; class_labels. len( ) ] ;
90
+ for elem in y. iter ( ) {
91
+ classes_count[ * elem] += T :: one ( ) ;
93
92
}
94
93
95
94
let mut feature_categories: Vec < Vec < T > > = Vec :: with_capacity ( n_features) ;
96
-
97
95
for feature in 0 ..n_features {
98
- let feature_types = x. get_col_as_vec ( feature) . unique ( ) ;
96
+ let feature_max = x
97
+ . get_col_as_vec ( feature)
98
+ . iter ( )
99
+ . map ( |f_i| f_i. floor ( ) . to_usize ( ) . unwrap ( ) )
100
+ . max ( )
101
+ . ok_or_else ( || {
102
+ Failed :: fit ( & format ! (
103
+ "Failed to get the categories for feature = {}" ,
104
+ feature
105
+ ) )
106
+ } ) ?;
107
+ let feature_types = ( 0 ..feature_max + 1 )
108
+ . map ( |feat| T :: from ( feat) . unwrap ( ) )
109
+ . collect ( ) ;
99
110
feature_categories. push ( feature_types) ;
100
111
}
101
- let mut coef: Vec < Vec < Vec < T > > > = Vec :: with_capacity ( class_labels. len ( ) ) ;
112
+
113
+ let mut coefficients: Vec < Vec < Vec < T > > > = Vec :: with_capacity ( class_labels. len ( ) ) ;
102
114
for ( label, label_count) in class_labels. iter ( ) . zip ( classes_count. iter ( ) ) {
103
115
let mut coef_i: Vec < Vec < T > > = Vec :: with_capacity ( n_features) ;
104
116
for ( feature_index, feature_options) in
@@ -108,37 +120,36 @@ impl<T: RealNumber> CategoricalNBDistribution<T> {
108
120
. get_col_as_vec ( feature_index)
109
121
. iter ( )
110
122
. enumerate ( )
111
- . filter ( |( i, _j) | y . get ( * i ) == * label)
123
+ . filter ( |( i, _j) | T :: from ( y [ * i ] ) . unwrap ( ) == * label)
112
124
. map ( |( _, j) | * j)
113
125
. collect :: < Vec < T > > ( ) ;
114
- let mut feat_count: Vec < usize > = Vec :: with_capacity ( feature_options. len ( ) ) ;
115
- for k in feature_options . iter ( ) {
116
- let feat_k_count = col . iter ( ) . filter ( | & v| v == k ) . count ( ) ;
117
- feat_count. push ( feat_k_count ) ;
126
+ let mut feat_count: Vec < T > = vec ! [ T :: zero ( ) ; feature_options. len( ) ] ;
127
+ for row in col . iter ( ) {
128
+ let index = row . floor ( ) . to_usize ( ) . unwrap ( ) ;
129
+ feat_count[ index ] += T :: one ( ) ;
118
130
}
119
-
120
131
let coef_i_j = feat_count
121
132
. iter ( )
122
133
. map ( |c| {
123
- ( T :: from ( * c) . unwrap ( ) + alpha)
124
- / ( T :: from ( * label_count ) . unwrap ( )
125
- + T :: from ( feature_options . len ( ) ) . unwrap ( ) * alpha )
134
+ ( ( * c + alpha)
135
+ / ( * label_count + T :: from ( feature_options . len ( ) ) . unwrap ( ) * alpha ) )
136
+ . ln ( )
126
137
} )
127
138
. collect :: < Vec < T > > ( ) ;
128
139
coef_i. push ( coef_i_j) ;
129
140
}
130
- coef . push ( coef_i) ;
141
+ coefficients . push ( coef_i) ;
131
142
}
132
- let class_probabilities = classes_count
143
+
144
+ let class_priors = classes_count
133
145
. into_iter ( )
134
146
. map ( |count| count / T :: from ( n_samples) . unwrap ( ) )
135
147
. collect :: < Vec < T > > ( ) ;
136
148
137
149
Ok ( Self {
138
150
class_labels,
139
- class_probabilities,
140
- coef,
141
- feature_categories,
151
+ class_priors,
152
+ coefficients,
142
153
} )
143
154
}
144
155
}
@@ -170,6 +181,7 @@ impl<T: RealNumber> Default for CategoricalNBParameters<T> {
170
181
}
171
182
172
183
/// CategoricalNB implements the categorical naive Bayes algorithm for categorically distributed data.
184
+ #[ derive( Debug ) ]
173
185
pub struct CategoricalNB < T : RealNumber , M : Matrix < T > > {
174
186
inner : BaseNaiveBayes < T , M , CategoricalNBDistribution < T > > ,
175
187
}
@@ -205,7 +217,7 @@ mod tests {
205
217
use crate :: linalg:: naive:: dense_matrix:: DenseMatrix ;
206
218
207
219
#[ test]
208
- fn run_base_naive_bayes ( ) {
220
+ fn run_categorical_naive_bayes ( ) {
209
221
let x = DenseMatrix :: from_2d_array ( & [
210
222
& [ 0. , 2. , 1. , 0. ] ,
211
223
& [ 0. , 2. , 1. , 1. ] ,
@@ -229,4 +241,32 @@ mod tests {
229
241
let y_hat = cnb. predict ( & x_test) . unwrap ( ) ;
230
242
assert_eq ! ( y_hat, vec![ 0. , 1. ] ) ;
231
243
}
244
+
245
#[test]
fn run_categorical_naive_bayes2() {
    // 14 samples with 4 categorical features each.
    let x = DenseMatrix::from_2d_array(&[
        &[3., 4., 0., 1.],
        &[3., 0., 0., 1.],
        &[4., 4., 1., 2.],
        &[4., 2., 4., 3.],
        &[4., 2., 4., 2.],
        &[4., 1., 1., 0.],
        &[1., 1., 1., 1.],
        &[0., 4., 1., 0.],
        &[0., 3., 2., 1.],
        &[0., 3., 1., 1.],
        &[3., 4., 0., 1.],
        &[3., 4., 2., 4.],
        &[0., 3., 1., 2.],
        &[0., 4., 1., 2.],
    ]);
    let y = vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 0.];

    let model = CategoricalNB::fit(&x, &y, Default::default()).unwrap();
    let predictions = model.predict(&x).unwrap();
    // Note: the fitted model disagrees with the training labels at
    // samples 10 and 13 — the expected vector pins that behavior.
    assert_eq!(
        predictions,
        vec![0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1.]
    );
}
232
272
}
0 commit comments