
Commit dd341f4

Volodymyr Orlov authored and committed
feat: + builders for algorithm parameters
1 parent 74f0d9e commit dd341f4

17 files changed, 276 additions & 8 deletions

src/cluster/dbscan.rs

Lines changed: 20 additions & 2 deletions
@@ -53,14 +53,32 @@ pub struct DBSCAN<T: RealNumber, D: Distance<Vec<T>, T>> {
 #[derive(Debug, Clone)]
 /// DBSCAN clustering algorithm parameters
 pub struct DBSCANParameters<T: RealNumber> {
-    /// Maximum number of iterations of the k-means algorithm for a single run.
+    /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
     pub min_samples: usize,
-    /// The number of samples in a neighborhood for a point to be considered as a core point.
+    /// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
    pub eps: T,
     /// KNN algorithm to use.
     pub algorithm: KNNAlgorithmName,
 }
 
+impl<T: RealNumber> DBSCANParameters<T> {
+    /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
+    pub fn with_min_samples(mut self, min_samples: usize) -> Self {
+        self.min_samples = min_samples;
+        self
+    }
+    /// The maximum distance between two samples for one to be considered as in the neighborhood of the other.
+    pub fn with_eps(mut self, eps: T) -> Self {
+        self.eps = eps;
+        self
+    }
+    /// KNN algorithm to use.
+    pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self {
+        self.algorithm = algorithm;
+        self
+    }
+}
+
 impl<T: RealNumber, D: Distance<Vec<T>, T>> PartialEq for DBSCAN<T, D> {
     fn eq(&self, other: &Self) -> bool {
         self.cluster_labels.len() == other.cluster_labels.len()
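
With these setters, a DBSCANParameters value can be configured as a method chain. A minimal usage sketch (not part of this commit), assuming the type is in scope, that DBSCANParameters also provides a Default implementation, and that KNNAlgorithmName::CoverTree is an available variant:

    let params = DBSCANParameters::default()          // Default impl assumed, not shown in this hunk
        .with_min_samples(5)                           // core-point neighborhood size
        .with_eps(0.5)                                 // neighborhood radius; pins T to f64
        .with_algorithm(KNNAlgorithmName::CoverTree);  // variant name assumed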

src/cluster/kmeans.rs

Lines changed: 8 additions & 0 deletions
@@ -105,6 +105,14 @@ pub struct KMeansParameters {
     pub max_iter: usize,
 }
 
+impl KMeansParameters {
+    /// Maximum number of iterations of the k-means algorithm for a single run.
+    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
+        self.max_iter = max_iter;
+        self
+    }
+}
+
 impl Default for KMeansParameters {
     fn default() -> Self {
         KMeansParameters { max_iter: 100 }
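
For KMeansParameters the builder reduces to a one-liner on top of the Default shown in the context lines (max_iter: 100). A usage sketch, not part of the commit:

    // Raise the iteration cap from the default of 100.
    let params = KMeansParameters::default().with_max_iter(300);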

src/decomposition/pca.rs

Lines changed: 9 additions & 0 deletions
@@ -88,6 +88,15 @@ pub struct PCAParameters {
     pub use_correlation_matrix: bool,
 }
 
+impl PCAParameters {
+    /// By default, covariance matrix is used to compute principal components.
+    /// Enable this flag if you want to use correlation matrix instead.
+    pub fn with_use_correlation_matrix(mut self, use_correlation_matrix: bool) -> Self {
+        self.use_correlation_matrix = use_correlation_matrix;
+        self
+    }
+}
+
 impl Default for PCAParameters {
     fn default() -> Self {
         PCAParameters {
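
Usage follows the same pattern; a sketch (not from the commit), relying on the Default impl visible in the context lines:

    // Compute principal components from the correlation matrix instead of the covariance matrix.
    let params = PCAParameters::default().with_use_correlation_matrix(true);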

src/ensemble/random_forest_classifier.rs

Lines changed: 33 additions & 0 deletions
@@ -85,6 +85,39 @@ pub struct RandomForestClassifier<T: RealNumber> {
     classes: Vec<T>,
 }
 
+impl RandomForestClassifierParameters {
+    /// Split criteria to use when building a tree. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_criterion(mut self, criterion: SplitCriterion) -> Self {
+        self.criterion = criterion;
+        self
+    }
+    /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_max_depth(mut self, max_depth: u16) -> Self {
+        self.max_depth = Some(max_depth);
+        self
+    }
+    /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
+        self.min_samples_leaf = min_samples_leaf;
+        self
+    }
+    /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
+        self.min_samples_split = min_samples_split;
+        self
+    }
+    /// The number of trees in the forest.
+    pub fn with_n_trees(mut self, n_trees: u16) -> Self {
+        self.n_trees = n_trees;
+        self
+    }
+    /// Number of random sample of predictors to use as split candidates.
+    pub fn with_m(mut self, m: usize) -> Self {
+        self.m = Some(m);
+        self
+    }
+}
+
 impl<T: RealNumber> PartialEq for RandomForestClassifier<T> {
     fn eq(&self, other: &Self) -> bool {
         if self.classes.len() != other.classes.len() || self.trees.len() != other.trees.len() {
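
A sketch of how the chained setters might be combined (illustration only; the Default impl for RandomForestClassifierParameters and the SplitCriterion::Gini variant are assumed here, not shown in this hunk):

    let params = RandomForestClassifierParameters::default()  // Default impl assumed
        .with_criterion(SplitCriterion::Gini)                  // variant name assumed
        .with_n_trees(200)
        .with_max_depth(8)
        .with_min_samples_leaf(2)
        .with_min_samples_split(4)
        .with_m(3);                                            // predictors sampled per split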

src/ensemble/random_forest_regressor.rs

Lines changed: 28 additions & 0 deletions
@@ -80,6 +80,34 @@ pub struct RandomForestRegressor<T: RealNumber> {
     trees: Vec<DecisionTreeRegressor<T>>,
 }
 
+impl RandomForestRegressorParameters {
+    /// Tree max depth. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_max_depth(mut self, max_depth: u16) -> Self {
+        self.max_depth = Some(max_depth);
+        self
+    }
+    /// The minimum number of samples required to be at a leaf node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_min_samples_leaf(mut self, min_samples_leaf: usize) -> Self {
+        self.min_samples_leaf = min_samples_leaf;
+        self
+    }
+    /// The minimum number of samples required to split an internal node. See [Decision Tree Classifier](../../tree/decision_tree_classifier/index.html)
+    pub fn with_min_samples_split(mut self, min_samples_split: usize) -> Self {
+        self.min_samples_split = min_samples_split;
+        self
+    }
+    /// The number of trees in the forest.
+    pub fn with_n_trees(mut self, n_trees: usize) -> Self {
+        self.n_trees = n_trees;
+        self
+    }
+    /// Number of random sample of predictors to use as split candidates.
+    pub fn with_m(mut self, m: usize) -> Self {
+        self.m = Some(m);
+        self
+    }
+}
+
 impl Default for RandomForestRegressorParameters {
     fn default() -> Self {
         RandomForestRegressorParameters {
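
The regressor exposes the same chain, except that with_n_trees takes a usize and there is no with_criterion. An illustrative sketch (placeholder values), relying on the Default impl from the context lines:

    let params = RandomForestRegressorParameters::default()
        .with_n_trees(500)          // usize here, u16 in the classifier
        .with_max_depth(10)
        .with_min_samples_leaf(2)
        .with_min_samples_split(4)
        .with_m(3);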

src/linear/elastic_net.rs

Lines changed: 30 additions & 0 deletions
@@ -90,6 +90,36 @@ pub struct ElasticNet<T: RealNumber, M: Matrix<T>> {
     intercept: T,
 }
 
+impl<T: RealNumber> ElasticNetParameters<T> {
+    /// Regularization parameter.
+    pub fn with_alpha(mut self, alpha: T) -> Self {
+        self.alpha = alpha;
+        self
+    }
+    /// The elastic net mixing parameter, with 0 <= l1_ratio <= 1.
+    /// For l1_ratio = 0 the penalty is an L2 penalty.
+    /// For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2.
+    pub fn with_l1_ratio(mut self, l1_ratio: T) -> Self {
+        self.l1_ratio = l1_ratio;
+        self
+    }
+    /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
+    pub fn with_normalize(mut self, normalize: bool) -> Self {
+        self.normalize = normalize;
+        self
+    }
+    /// The tolerance for the optimization
+    pub fn with_tol(mut self, tol: T) -> Self {
+        self.tol = tol;
+        self
+    }
+    /// The maximum number of iterations
+    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
+        self.max_iter = max_iter;
+        self
+    }
+}
+
 impl<T: RealNumber> Default for ElasticNetParameters<T> {
     fn default() -> Self {
         ElasticNetParameters {
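
A hypothetical configuration chain (values are placeholders; the Default impl in the context lines supplies the remaining fields):

    let params = ElasticNetParameters::default()
        .with_alpha(0.5)        // regularization strength
        .with_l1_ratio(0.5)     // 0 = pure L2, 1 = pure L1
        .with_normalize(true)
        .with_tol(1e-4)
        .with_max_iter(1000);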

src/linear/lasso.rs

Lines changed: 23 additions & 0 deletions
@@ -54,6 +54,29 @@ pub struct Lasso<T: RealNumber, M: Matrix<T>> {
     intercept: T,
 }
 
+impl<T: RealNumber> LassoParameters<T> {
+    /// Regularization parameter.
+    pub fn with_alpha(mut self, alpha: T) -> Self {
+        self.alpha = alpha;
+        self
+    }
+    /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
+    pub fn with_normalize(mut self, normalize: bool) -> Self {
+        self.normalize = normalize;
+        self
+    }
+    /// The tolerance for the optimization
+    pub fn with_tol(mut self, tol: T) -> Self {
+        self.tol = tol;
+        self
+    }
+    /// The maximum number of iterations
+    pub fn with_max_iter(mut self, max_iter: usize) -> Self {
+        self.max_iter = max_iter;
+        self
+    }
+}
+
 impl<T: RealNumber> Default for LassoParameters<T> {
     fn default() -> Self {
         LassoParameters {
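
The same pattern applies to LassoParameters; again an illustration with placeholder values rather than code from the commit:

    let params = LassoParameters::default()
        .with_alpha(0.1)
        .with_normalize(true)
        .with_tol(1e-4)
        .with_max_iter(1000);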

src/linear/linear_regression.rs

Lines changed: 8 additions & 0 deletions
@@ -93,6 +93,14 @@ pub struct LinearRegression<T: RealNumber, M: Matrix<T>> {
     solver: LinearRegressionSolverName,
 }
 
+impl LinearRegressionParameters {
+    /// Solver to use for estimation of regression coefficients.
+    pub fn with_solver(mut self, solver: LinearRegressionSolverName) -> Self {
+        self.solver = solver;
+        self
+    }
+}
+
 impl Default for LinearRegressionParameters {
     fn default() -> Self {
         LinearRegressionParameters {
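
A sketch of the single setter in use; the LinearRegressionSolverName::SVD variant is assumed here and is not part of this diff:

    let params = LinearRegressionParameters::default()
        .with_solver(LinearRegressionSolverName::SVD);  // variant name assumed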

src/linear/ridge_regression.rs

Lines changed: 18 additions & 0 deletions
@@ -98,6 +98,24 @@ pub struct RidgeRegression<T: RealNumber, M: Matrix<T>> {
     solver: RidgeRegressionSolverName,
 }
 
+impl<T: RealNumber> RidgeRegressionParameters<T> {
+    /// Regularization parameter.
+    pub fn with_alpha(mut self, alpha: T) -> Self {
+        self.alpha = alpha;
+        self
+    }
+    /// Solver to use for estimation of regression coefficients.
+    pub fn with_solver(mut self, solver: RidgeRegressionSolverName) -> Self {
+        self.solver = solver;
+        self
+    }
+    /// If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the standard deviation.
+    pub fn with_normalize(mut self, normalize: bool) -> Self {
+        self.normalize = normalize;
+        self
+    }
+}
+
 impl<T: RealNumber> Default for RidgeRegressionParameters<T> {
     fn default() -> Self {
         RidgeRegressionParameters {
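
An illustrative chain (placeholder values; the RidgeRegressionSolverName::Cholesky variant is assumed, not shown in this hunk):

    let params = RidgeRegressionParameters::default()
        .with_alpha(1.0)
        .with_solver(RidgeRegressionSolverName::Cholesky)  // variant name assumed
        .with_normalize(true);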

src/naive_bayes/bernoulli.rs

Lines changed: 15 additions & 0 deletions
@@ -96,6 +96,21 @@ impl<T: RealNumber> BernoulliNBParameters<T> {
             binarize,
         }
     }
+    /// Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
+    pub fn with_alpha(mut self, alpha: T) -> Self {
+        self.alpha = alpha;
+        self
+    }
+    /// Prior probabilities of the classes. If specified the priors are not adjusted according to the data
+    pub fn with_priors(mut self, priors: Vec<T>) -> Self {
+        self.priors = Some(priors);
+        self
+    }
+    /// Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.
+    pub fn with_binarize(mut self, binarize: T) -> Self {
+        self.binarize = Some(binarize);
+        self
+    }
 }
 
 impl<T: RealNumber> Default for BernoulliNBParameters<T> {
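
Since the new setters wrap priors and binarize in Some, a caller only names what it overrides. A usage sketch with placeholder values, relying on the Default impl in the context lines:

    let params = BernoulliNBParameters::default()
        .with_alpha(1.0)               // Laplace smoothing
        .with_priors(vec![0.3, 0.7])   // fixed class priors
        .with_binarize(0.0);           // binarization threshold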
