
Commit 9b22197

Volodymyr Orlov authored and committed
fix: clippy, documentation and formatting
1 parent a2be9e1 commit 9b22197

7 files changed: 80 additions, 62 deletions


src/linalg/mod.rs

Lines changed: 5 additions & 5 deletions
@@ -281,8 +281,8 @@ pub trait BaseVector<T: RealNumber>: Clone + Debug {
 
         let mut result = Self::zeros(n);
 
-        for i in 0..n {
-            result.set(i, self.get(index[i]));
+        for (i, idx) in index.iter().enumerate() {
+            result.set(i, self.get(*idx));
         }
 
         result
@@ -639,11 +639,11 @@ pub trait BaseMatrix<T: RealNumber>: Clone + Debug {
             _ => Self::zeros(n, index.len()),
         };
 
-        for i in 0..index.len() {
+        for (i, idx) in index.iter().enumerate() {
             for j in 0..k {
                 match axis {
-                    0 => result.set(i, j, self.get(index[i], j)),
-                    _ => result.set(j, i, self.get(j, index[i])),
+                    0 => result.set(i, j, self.get(*idx, j)),
+                    _ => result.set(j, i, self.get(j, *idx)),
                 };
             }
         }
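Both hunks fix clippy's `needless_range_loop` lint: indexing a slice with a manual `0..n` counter is replaced by iterating the slice itself via `iter().enumerate()`. A minimal standalone sketch of the same transformation (hypothetical `gather` helper, not part of the crate):

fn gather(values: &[f64], index: &[usize]) -> Vec<f64> {
    let mut result = vec![0.0; index.len()];
    // `for i in 0..index.len() { result[i] = values[index[i]]; }` would
    // trigger clippy::needless_range_loop; iterating `index` directly
    // removes the redundant bounds-checked lookup of `index[i]`.
    for (i, idx) in index.iter().enumerate() {
        result[i] = values[*idx];
    }
    result
}

fn main() {
    let values = [10.0, 20.0, 30.0, 40.0];
    assert_eq!(gather(&values, &[3, 0]), vec![40.0, 10.0]);
}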

src/linear/logistic_regression.rs

Lines changed: 7 additions & 5 deletions
@@ -69,8 +69,7 @@ use crate::optimization::FunctionOrder;
 
 /// Logistic Regression parameters
 #[derive(Serialize, Deserialize, Debug, Clone)]
-pub struct LogisticRegressionParameters {
-}
+pub struct LogisticRegressionParameters {}
 
 /// Logistic Regression
 #[derive(Serialize, Deserialize, Debug)]
@@ -105,8 +104,7 @@ struct BinaryObjectiveFunction<'a, T: RealNumber, M: Matrix<T>> {
 
 impl Default for LogisticRegressionParameters {
     fn default() -> Self {
-        LogisticRegressionParameters {
-        }
+        LogisticRegressionParameters {}
     }
 }
 
@@ -231,7 +229,11 @@ impl<T: RealNumber, M: Matrix<T>> LogisticRegression<T, M> {
     /// * `x` - _NxM_ matrix with _N_ observations and _M_ features in each observation.
     /// * `y` - target class values
    /// * `parameters` - other parameters, use `Default::default()` to set parameters to default values.
-    pub fn fit(x: &M, y: &M::RowVector, _parameters: LogisticRegressionParameters) -> Result<LogisticRegression<T, M>, Failed> {
+    pub fn fit(
+        x: &M,
+        y: &M::RowVector,
+        _parameters: LogisticRegressionParameters,
+    ) -> Result<LogisticRegression<T, M>, Failed> {
         let y_m = M::from_row_vector(y.clone());
         let (x_nrows, num_attributes) = x.shape();
         let (_, y_nrows) = y_m.shape();
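These are pure formatting fixes: rustfmt collapses the empty parameter struct onto one line and splits the over-long `fit` signature one argument per line. Call sites are unchanged; a sketch of a typical call (toy data, assuming the `DenseMatrix` import path the crate uses at this commit):

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::linear::logistic_regression::LogisticRegression;

fn main() {
    let x = DenseMatrix::from_2d_array(&[
        &[1.0, 2.0],
        &[1.1, 2.1],
        &[9.0, 9.5],
        &[9.1, 9.4],
    ]);
    let y = vec![0.0, 0.0, 1.0, 1.0];
    // `LogisticRegressionParameters` has no fields yet, so
    // `Default::default()` is the natural argument.
    let lr = LogisticRegression::fit(&x, &y, Default::default()).unwrap();
    let y_hat = lr.predict(&x).unwrap();
    assert_eq!(y_hat.len(), 4);
}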

src/model_selection/kfold.rs

Lines changed: 6 additions & 23 deletions
@@ -1,30 +1,13 @@
 //! # KFold
 //!
-//! In statistics and machine learning we usually split our data into multiple subsets: training data and testing data (and sometimes to validate),
-//! and fit our model on the train data, in order to make predictions on the test data. We do that to avoid overfitting or underfitting model to our data.
-//! Overfitting is bad because the model we trained fits trained data too well and can’t make any inferences on new data.
-//! Underfitted is bad because the model is undetrained and does not fit the training data well.
-//! Splitting data into multiple subsets helps to find the right combination of hyperparameters, estimate model performance and choose the right model for
-//! your data.
-//!
-//! In SmartCore you can split your data into training and test datasets using `train_test_split` function.
+//! Defines k-fold cross validator.
 
 use crate::linalg::Matrix;
 use crate::math::num::RealNumber;
+use crate::model_selection::BaseKFold;
 use rand::seq::SliceRandom;
 use rand::thread_rng;
 
-/// An interface for the K-Folds cross-validator
-pub trait BaseKFold {
-    /// An iterator over indices that split data into training and test set.
-    type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
-    /// Return a tuple containing the the training set indices for that split and
-    /// the testing set indices for that split.
-    fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
-    /// Returns the number of splits
-    fn n_splits(&self) -> usize;
-}
-
 /// K-Folds cross-validator
 pub struct KFold {
     /// Number of folds. Must be at least 2.
@@ -101,12 +84,12 @@ impl KFold {
 }
 
 /// An iterator over indices that split data into training and test set.
-pub struct BaseKFoldIter {
+pub struct KFoldIter {
     indices: Vec<usize>,
     test_indices: Vec<Vec<bool>>,
 }
 
-impl Iterator for BaseKFoldIter {
+impl Iterator for KFoldIter {
     type Item = (Vec<usize>, Vec<usize>);
 
     fn next(&mut self) -> Option<(Vec<usize>, Vec<usize>)> {
@@ -133,7 +116,7 @@ impl Iterator for BaseKFoldIter {
 
 /// Abstract class for all KFold functionalities
 impl BaseKFold for KFold {
-    type Output = BaseKFoldIter;
+    type Output = KFoldIter;
 
     fn n_splits(&self) -> usize {
         self.n_splits
@@ -148,7 +131,7 @@ impl BaseKFold for KFold {
         let mut test_indices = self.test_masks(x);
         test_indices.reverse();
 
-        BaseKFoldIter {
+        KFoldIter {
             indices,
             test_indices,
         }
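With `BaseKFold` re-homed in `model_selection` and the iterator renamed from `BaseKFoldIter` to `KFoldIter`, nothing changes from the caller's side. A usage sketch (toy matrix; assumes `KFold`'s fields are public, as the struct-update syntax in the crate's own test below suggests):

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::model_selection::{BaseKFold, KFold};

fn main() {
    let x = DenseMatrix::from_2d_array(&[
        &[1.0, 2.0],
        &[3.0, 4.0],
        &[5.0, 6.0],
        &[7.0, 8.0],
    ]);
    let cv = KFold {
        n_splits: 2,
        ..KFold::default()
    };
    // `split` comes from the `BaseKFold` trait and yields one
    // (train_indices, test_indices) tuple per fold.
    for (train_idx, test_idx) in cv.split(&x) {
        println!("train: {:?}, test: {:?}", train_idx, test_idx);
    }
}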

src/model_selection/mod.rs

Lines changed: 36 additions & 7 deletions
@@ -14,15 +14,27 @@ use crate::error::Failed;
 use crate::linalg::BaseVector;
 use crate::linalg::Matrix;
 use crate::math::num::RealNumber;
-use crate::model_selection::kfold::BaseKFold;
 use rand::seq::SliceRandom;
 use rand::thread_rng;
 
-pub mod kfold;
+pub(crate) mod kfold;
+
+pub use kfold::{KFold, KFoldIter};
+
+/// An interface for the K-Folds cross-validator
+pub trait BaseKFold {
+    /// An iterator over indices that split data into training and test set.
+    type Output: Iterator<Item = (Vec<usize>, Vec<usize>)>;
+    /// Return a tuple containing the the training set indices for that split and
+    /// the testing set indices for that split.
+    fn split<T: RealNumber, M: Matrix<T>>(&self, x: &M) -> Self::Output;
+    /// Returns the number of splits
+    fn n_splits(&self) -> usize;
+}
 
 /// Splits data into 2 disjoint datasets.
 /// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
-/// * `y` - target values, should be of size _M_
+/// * `y` - target values, should be of size _N_
 /// * `test_size`, (0, 1] - the proportion of the dataset to include in the test split.
 /// * `shuffle`, - whether or not to shuffle the data before splitting
 pub fn train_test_split<T: RealNumber, M: Matrix<T>>(
@@ -65,22 +77,33 @@
     (x_train, x_test, y_train, y_test)
 }
 
+/// Cross validation results.
 #[derive(Clone, Debug)]
 pub struct CrossValidationResult<T: RealNumber> {
+    /// Vector with test scores on each cv split
     pub test_score: Vec<T>,
+    /// Vector with training scores on each cv split
     pub train_score: Vec<T>,
 }
 
 impl<T: RealNumber> CrossValidationResult<T> {
+    /// Average test score
     pub fn mean_test_score(&self) -> T {
         self.test_score.sum() / T::from_usize(self.test_score.len()).unwrap()
     }
-
+    /// Average training score
     pub fn mean_train_score(&self) -> T {
         self.train_score.sum() / T::from_usize(self.train_score.len()).unwrap()
     }
 }
 
+/// Evaluate an estimator by cross-validation using given metric.
+/// * `fit_estimator` - a `fit` function of an estimator
+/// * `x` - features, matrix of size _NxM_ where _N_ is number of samples and _M_ is number of attributes.
+/// * `y` - target values, should be of size _N_
+/// * `parameters` - parameters of selected estimator. Use `Default::default()` for default parameters.
+/// * `cv` - the cross-validation splitting strategy, should be an instance of [`BaseKFold`](./trait.BaseKFold.html)
+/// * `score` - a metric to use for evaluation, see [metrics](../metrics/index.html)
 pub fn cross_validate<T, M, H, E, K, F, S>(
     fit_estimator: F,
     x: &M,
@@ -302,7 +325,6 @@ mod tests {
 
     #[test]
     fn test_some_classifier() {
-
         let x = DenseMatrix::from_2d_array(&[
             &[5.1, 3.5, 1.4, 0.2],
             &[4.9, 3.0, 1.4, 0.2],
@@ -334,8 +356,15 @@
             ..KFold::default()
         };
 
-        let results =
-            cross_validate(DecisionTreeClassifier::fit, &x, &y, Default::default(), cv, &accuracy).unwrap();
+        let results = cross_validate(
+            DecisionTreeClassifier::fit,
+            &x,
+            &y,
+            Default::default(),
+            cv,
+            &accuracy,
+        )
+        .unwrap();
 
         println!("{}", results.mean_test_score());
         println!("{}", results.mean_train_score());
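The corrected doc line matters: `y` must hold one target per sample (_N_ entries), not one per attribute. A `train_test_split` usage sketch (toy data; assumes the `(x, y, test_size, shuffle)` argument order documented above and the 4-tuple return shown in the hunk):

use smartcore::linalg::naive::dense_matrix::DenseMatrix;
use smartcore::model_selection::train_test_split;

fn main() {
    // N = 5 samples, M = 2 attributes; y therefore has 5 entries.
    let x = DenseMatrix::from_2d_array(&[
        &[1.0, 2.0],
        &[3.0, 4.0],
        &[5.0, 6.0],
        &[7.0, 8.0],
        &[9.0, 10.0],
    ]);
    let y = vec![0.0, 0.0, 1.0, 1.0, 1.0];
    // Hold out roughly 20% of the rows, shuffling before the split.
    let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true);
    assert_eq!(x_train.shape().0 + x_test.shape().0, 5);
    assert_eq!(y_train.len() + y_test.len(), 5);
}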

src/naive_bayes/multinomial.rs

Lines changed: 1 addition & 1 deletion
@@ -188,7 +188,7 @@ pub struct MultinomialNB<T: RealNumber, M: Matrix<T>> {
     inner: BaseNaiveBayes<T, M, MultinomialNBDistribution<T>>,
 }
 
-impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB <T, M> {
+impl<T: RealNumber, M: Matrix<T>> Predictor<M, M::RowVector> for MultinomialNB<T, M> {
     fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.predict(x)
     }

src/svm/svc.rs

Lines changed: 13 additions & 7 deletions
@@ -167,8 +167,8 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVCParameters<T, M
             epoch: self.epoch,
             c: self.c,
             tol: self.tol,
-            kernel: kernel,
-            m: PhantomData
+            kernel,
+            m: PhantomData,
         }
     }
 }
@@ -180,12 +180,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVCParameters<T, M, LinearKernel>
             c: T::one(),
             tol: T::from_f64(1e-3).unwrap(),
             kernel: Kernels::linear(),
-            m: PhantomData
+            m: PhantomData,
         }
     }
 }
 
-impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVC<T, M, K> {
+impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
+    for SVC<T, M, K>
+{
     fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.predict(x)
     }
@@ -743,10 +745,12 @@ mod tests {
         let y_hat = SVC::fit(
             &x,
             &y,
-            SVCParameters::default().with_c(200.0).with_kernel(Kernels::linear()),
+            SVCParameters::default()
+                .with_c(200.0)
+                .with_kernel(Kernels::linear()),
         )
         .and_then(|lr| lr.predict(&x))
-            .unwrap();
+        .unwrap();
 
         assert!(accuracy(&y_hat, &y) >= 0.9);
     }
@@ -784,7 +788,9 @@ mod tests {
         let y_hat = SVC::fit(
             &x,
             &y,
-            SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)),
+            SVCParameters::default()
+                .with_c(1.0)
+                .with_kernel(Kernels::rbf(0.7)),
        )
         .and_then(|lr| lr.predict(&x))
         .unwrap();
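Besides the rustfmt line splits, `kernel: kernel,` becomes the field-init shorthand `kernel,`, which is what clippy's `redundant_field_names` lint asks for. A standalone illustration of the idiom (hypothetical `Config` type, not from the crate):

struct Config {
    c: f64,
    kernel: String,
}

fn make_config(kernel: String) -> Config {
    // When a local binding and a struct field share a name,
    // `kernel: kernel` is flagged by clippy::redundant_field_names;
    // the shorthand below is the idiomatic form.
    Config { c: 1.0, kernel }
}

fn main() {
    let cfg = make_config(String::from("linear"));
    println!("c = {}, kernel = {}", cfg.c, cfg.kernel);
}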
@@ -784,7 +788,9 @@ mod tests {
784788
let y_hat = SVC::fit(
785789
&x,
786790
&y,
787-
SVCParameters::default().with_c(1.0).with_kernel(Kernels::rbf(0.7)),
791+
SVCParameters::default()
792+
.with_c(1.0)
793+
.with_kernel(Kernels::rbf(0.7)),
788794
)
789795
.and_then(|lr| lr.predict(&x))
790796
.unwrap();

src/svm/svr.rs

Lines changed: 12 additions & 14 deletions
@@ -134,7 +134,7 @@ struct Cache<T: Clone> {
     data: Vec<RefCell<Option<Vec<T>>>>,
 }
 
-impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M, K> { 
+impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M, K> {
     /// Epsilon in the epsilon-SVR model.
     pub fn with_eps(mut self, eps: T) -> Self {
         self.eps = eps;
@@ -153,11 +153,11 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVRParameters<T, M
     /// The kernel function.
     pub fn with_kernel<KK: Kernel<T, M::RowVector>>(&self, kernel: KK) -> SVRParameters<T, M, KK> {
         SVRParameters {
-            eps: self.eps, 
+            eps: self.eps,
             c: self.c,
             tol: self.tol,
-            kernel: kernel,
-            m: PhantomData
+            kernel,
+            m: PhantomData,
         }
     }
 }
@@ -169,12 +169,14 @@ impl<T: RealNumber, M: Matrix<T>> Default for SVRParameters<T, M, LinearKernel>
             c: T::one(),
             tol: T::from_f64(1e-3).unwrap(),
             kernel: Kernels::linear(),
-            m: PhantomData
+            m: PhantomData,
         }
     }
 }
 
-impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector> for SVR<T, M, K> {
+impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> Predictor<M, M::RowVector>
+    for SVR<T, M, K>
+{
     fn predict(&self, x: &M) -> Result<M::RowVector, Failed> {
         self.predict(x)
     }
@@ -188,7 +190,7 @@ impl<T: RealNumber, M: Matrix<T>, K: Kernel<T, M::RowVector>> SVR<T, M, K> {
     /// * `parameters` - optional parameters, use `Default::default()` to set parameters to default values.
     pub fn fit(
         x: &M,
-        y: &M::RowVector, 
+        y: &M::RowVector,
         parameters: SVRParameters<T, M, K>,
     ) -> Result<SVR<T, M, K>, Failed> {
         let (n, _) = x.shape();
@@ -544,13 +546,9 @@ mod tests {
             114.2, 115.7, 116.9,
         ];
 
-        let y_hat = SVR::fit(
-            &x,
-            &y,
-            SVRParameters::default().with_eps(2.0).with_c(10.0),
-        )
-        .and_then(|lr| lr.predict(&x))
-        .unwrap();
+        let y_hat = SVR::fit(&x, &y, SVRParameters::default().with_eps(2.0).with_c(10.0))
+            .and_then(|lr| lr.predict(&x))
+            .unwrap();
 
         assert!(mean_squared_error(&y_hat, &y) < 2.5);
     }
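One design detail worth noting in `with_kernel`: it is a type-changing builder. It takes `SVRParameters<T, M, K>` and returns `SVRParameters<T, M, KK>`, so the kernel type is tracked in the parameter struct's type itself. A minimal sketch of the pattern (hypothetical `Params`, `Linear`, and `Rbf` types, simplified from the crate's):

struct Linear;
struct Rbf {
    gamma: f64,
}

struct Params<K> {
    c: f64,
    kernel: K,
}

impl<K> Params<K> {
    // Swapping the kernel changes the *type* of the parameter struct,
    // so the builder returns Params<KK> rather than Self.
    fn with_kernel<KK>(self, kernel: KK) -> Params<KK> {
        Params { c: self.c, kernel }
    }
}

impl Default for Params<Linear> {
    fn default() -> Self {
        Params {
            c: 1.0,
            kernel: Linear,
        }
    }
}

fn main() {
    // Start from the linear default, then switch kernel types.
    let p = Params::<Linear>::default().with_kernel(Rbf { gamma: 0.7 });
    println!("c = {}, gamma = {}", p.c, p.kernel.gamma);
}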
