//! # Model Selection methods
//!
//! In statistics and machine learning we usually split our data into two sets: one for training and the other for testing.
//! We fit our model to the training data in order to make predictions on the test data. We do this to avoid overfitting or underfitting the model to our data.
//! Overfitting is bad because the model we trained fits the training data too closely and cannot make reliable predictions on new data.
//! Underfitting is bad because the model is undertrained and does not fit the training data well.
//! Splitting data into multiple subsets helps us find the right combination of hyperparameters,
//! estimate model performance and choose the right model for the data.
//!
//! In SmartCore a random split into training and test sets can be quickly computed with the [train_test_split](./fn.train_test_split.html) helper function.
//!
//! ```
//! use smartcore::linalg::BaseMatrix;
//! use smartcore::linalg::naive::dense_matrix::DenseMatrix;
//! use smartcore::model_selection::train_test_split;
//!
//! // Iris data
//! let x = DenseMatrix::from_2d_array(&[
//!     &[5.1, 3.5, 1.4, 0.2],
//!     &[4.9, 3.0, 1.4, 0.2],
//!     &[4.7, 3.2, 1.3, 0.2],
//!     &[4.6, 3.1, 1.5, 0.2],
//!     &[5.0, 3.6, 1.4, 0.2],
//!     &[5.4, 3.9, 1.7, 0.4],
//!     &[4.6, 3.4, 1.4, 0.3],
//!     &[5.0, 3.4, 1.5, 0.2],
//!     &[4.4, 2.9, 1.4, 0.2],
//!     &[4.9, 3.1, 1.5, 0.1],
//!     &[7.0, 3.2, 4.7, 1.4],
//!     &[6.4, 3.2, 4.5, 1.5],
//!     &[6.9, 3.1, 4.9, 1.5],
//!     &[5.5, 2.3, 4.0, 1.3],
//!     &[6.5, 2.8, 4.6, 1.5],
//!     &[5.7, 2.8, 4.5, 1.3],
//!     &[6.3, 3.3, 4.7, 1.6],
//!     &[4.9, 2.4, 3.3, 1.0],
//!     &[6.6, 2.9, 4.6, 1.3],
//!     &[5.2, 2.7, 3.9, 1.4],
//! ]);
//! let y: Vec<f64> = vec![
//!     0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
//! ];
//!
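//! // Hold out 20% of the samples for the test set; the boolean enables shuffling before the split.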
//! let (x_train, x_test, y_train, y_test) = train_test_split(&x, &y, 0.2, true);
//!
//! println!("X train: {:?}, y train: {}, X test: {:?}, y test: {}",
//!     x_train.shape(), y_train.len(), x_test.shape(), y_test.len());
//! ```
//!
//! When we partition the available data into two disjoint sets, we drastically reduce the number of samples that can be used for training.
//!
//! One way to solve this problem is to use k-fold cross-validation. With k-fold cross-validation, the dataset is split into k disjoint folds.
//! A model is trained on k - 1 of the folds, and the resulting model is validated on the remaining fold.
//!
//! The simplest way to run cross-validation is to use the [cross_validate](./fn.cross_validate.html) helper function on your estimator and the dataset.
//!
//! ```
//! use smartcore::linalg::naive::dense_matrix::DenseMatrix;
//! use smartcore::model_selection::{KFold, cross_validate};
//! use smartcore::metrics::accuracy;
//! use smartcore::linear::logistic_regression::LogisticRegression;
//!
//! // Iris data
//! let x = DenseMatrix::from_2d_array(&[
//!     &[5.1, 3.5, 1.4, 0.2],
//!     &[4.9, 3.0, 1.4, 0.2],
//!     &[4.7, 3.2, 1.3, 0.2],
//!     &[4.6, 3.1, 1.5, 0.2],
//!     &[5.0, 3.6, 1.4, 0.2],
//!     &[5.4, 3.9, 1.7, 0.4],
//!     &[4.6, 3.4, 1.4, 0.3],
//!     &[5.0, 3.4, 1.5, 0.2],
//!     &[4.4, 2.9, 1.4, 0.2],
//!     &[4.9, 3.1, 1.5, 0.1],
//!     &[7.0, 3.2, 4.7, 1.4],
//!     &[6.4, 3.2, 4.5, 1.5],
//!     &[6.9, 3.1, 4.9, 1.5],
//!     &[5.5, 2.3, 4.0, 1.3],
//!     &[6.5, 2.8, 4.6, 1.5],
//!     &[5.7, 2.8, 4.5, 1.3],
//!     &[6.3, 3.3, 4.7, 1.6],
//!     &[4.9, 2.4, 3.3, 1.0],
//!     &[6.6, 2.9, 4.6, 1.3],
//!     &[5.2, 2.7, 3.9, 1.4],
//! ]);
//! let y: Vec<f64> = vec![
//!     0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
//! ];
//!
//! let cv = KFold::default().with_n_splits(3);
//!
//! let results = cross_validate(LogisticRegression::fit, // estimator
//!     &x, &y,                  // data
//!     Default::default(),      // hyperparameters
//!     cv,                      // cross validation split
//!     &accuracy).unwrap();     // metric
//!
//! println!("Training accuracy: {}, test accuracy: {}",
//!     results.mean_train_score(), results.mean_test_score());
//! ```
//!
//! The function [cross_val_predict](./fn.cross_val_predict.html) has a similar interface to `cross_validate`,
//! but instead of a test score it calculates a prediction for every sample, produced while that sample was held out in the test fold.
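//!
//! As a rough illustration, here is a minimal sketch; it assumes `cross_val_predict` takes the same arguments
//! as `cross_validate` except for the metric, and returns one out-of-fold prediction per sample.
//!
//! ```
//! use smartcore::linalg::naive::dense_matrix::DenseMatrix;
//! use smartcore::model_selection::{KFold, cross_val_predict};
//! use smartcore::linear::logistic_regression::LogisticRegression;
//!
//! // Iris data (same subset as above)
//! let x = DenseMatrix::from_2d_array(&[
//!     &[5.1, 3.5, 1.4, 0.2],
//!     &[4.9, 3.0, 1.4, 0.2],
//!     &[4.7, 3.2, 1.3, 0.2],
//!     &[4.6, 3.1, 1.5, 0.2],
//!     &[5.0, 3.6, 1.4, 0.2],
//!     &[5.4, 3.9, 1.7, 0.4],
//!     &[4.6, 3.4, 1.4, 0.3],
//!     &[5.0, 3.4, 1.5, 0.2],
//!     &[4.4, 2.9, 1.4, 0.2],
//!     &[4.9, 3.1, 1.5, 0.1],
//!     &[7.0, 3.2, 4.7, 1.4],
//!     &[6.4, 3.2, 4.5, 1.5],
//!     &[6.9, 3.1, 4.9, 1.5],
//!     &[5.5, 2.3, 4.0, 1.3],
//!     &[6.5, 2.8, 4.6, 1.5],
//!     &[5.7, 2.8, 4.5, 1.3],
//!     &[6.3, 3.3, 4.7, 1.6],
//!     &[4.9, 2.4, 3.3, 1.0],
//!     &[6.6, 2.9, 4.6, 1.3],
//!     &[5.2, 2.7, 3.9, 1.4],
//! ]);
//! let y: Vec<f64> = vec![
//!     0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
//! ];
//!
//! let cv = KFold::default().with_n_splits(3);
//!
//! // Each sample is predicted by a model that did not see it during training.
//! let y_hat = cross_val_predict(LogisticRegression::fit, // estimator
//!     &x, &y,                  // data
//!     Default::default(),      // hyperparameters
//!     cv).unwrap();            // cross validation split
//!
//! println!("Out-of-fold predictions: {:?}", y_hat);
//! ```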

use crate::api::Predictor;
use crate::error::Failed;