From ea330a78830455530a5879c1c51840370aef4c69 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 8 Sep 2023 14:35:55 -0400 Subject: [PATCH 01/54] Fix merge main Signed-off-by: Adam Li --- Makefile | 3 + setup.py | 27 +- sklearn/ensemble/_forest.py | 682 +++++++++- sklearn/ensemble/_gb.py | 1 + sklearn/ensemble/tests/test_forest.py | 222 ++++ .../tests/test_from_model.py | 8 +- sklearn/tree/_classes.py | 573 ++++++-- sklearn/tree/_criterion.pxd | 79 +- sklearn/tree/_criterion.pyx | 311 +++-- sklearn/tree/_export.py | 12 +- sklearn/tree/_splitter.pxd | 84 +- sklearn/tree/_splitter.pyx | 280 +++- sklearn/tree/_tree.pxd | 127 +- sklearn/tree/_tree.pyx | 1176 ++++++++++++----- sklearn/tree/_utils.pxd | 4 +- sklearn/tree/_utils.pyx | 12 +- sklearn/tree/tests/test_tree.py | 183 ++- 17 files changed, 3034 insertions(+), 750 deletions(-) diff --git a/Makefile b/Makefile index e2ae6aa75ca94..99e3665460a83 100644 --- a/Makefile +++ b/Makefile @@ -62,3 +62,6 @@ doc-noplot: inplace code-analysis: build_tools/linting.sh + +build-dev: + pip install --verbose --no-build-isolation --editable . \ No newline at end of file diff --git a/setup.py b/setup.py index f9ae13c94502b..e033395f3dbd8 100755 --- a/setup.py +++ b/setup.py @@ -225,10 +225,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -310,7 +310,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -378,9 +378,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index eecd13d403744..3ca1a2d347623 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -43,13 +43,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from 
scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -57,9 +58,28 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import _safe_tags +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -68,18 +88,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import _safe_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -161,6 +169,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -193,6 +202,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -201,6 +211,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -227,6 +281,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 
1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -245,6 +304,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -261,6 +322,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -280,6 +343,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -329,7 +401,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -351,6 +423,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -418,7 +493,7 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +530,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. 
+ # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -501,6 +608,7 @@ def fit(self, X, y, sample_weight=None): class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) for i, t in enumerate(trees) ) @@ -620,7 +728,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -679,6 +787,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _more_tags(self): # Only the criterion is required to determine if the tree supports # missing values @@ -702,6 +978,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -726,6 +1013,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -740,6 +1029,8 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -794,7 +1085,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -807,12 +1098,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -848,6 +1155,228 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. 
+ + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = self._validate_data( + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. 
+ n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -913,6 +1442,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -994,6 +1531,8 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1007,6 +1546,8 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1032,6 +1573,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1322,6 +1871,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1481,6 +2040,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1497,6 +2058,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1507,6 +2069,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1698,6 +2262,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -1841,6 +2416,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1857,6 +2434,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1866,6 +2444,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2065,6 +2645,16 @@ class ExtraTreesClassifier(ForestClassifier): .. 
versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2213,6 +2803,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2229,6 +2821,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2239,6 +2832,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2426,6 +3021,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2554,6 +3159,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2570,6 +3177,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2579,6 +3187,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2702,6 +3312,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`gradient_boosting_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2805,6 +3418,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2819,6 +3433,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2827,6 +3442,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2840,7 +3456,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2861,17 +3477,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. 
+ classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. """ # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2891,6 +3510,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2898,7 +3520,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 73bb9e08ae619..990dac614f45c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -351,6 +351,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") _parameter_constraints.pop("monotonic_cst") diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 31e9859076c92..efc5d7d5ee5a4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -115,6 +115,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. 
+ y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1811,6 +1925,114 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
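+    # With bootstrap=False and store_leaf_values=True, each leaf keeps the
+    # training targets routed to it. The targets below are constant within each
+    # quadrant of the plane, so any quantile of a leaf's stored samples recovers
+    # that quadrant's target and the quantile predictions match y_test exactly.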
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ + + @pytest.mark.parametrize( "make_data, Forest", [ diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index aa802136c2f39..0e207d2334761 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,11 @@ from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import make_friedman1 from sklearn.decomposition import PCA -from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomForestRegressor, +) from sklearn.exceptions import NotFittedError from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import ( @@ -402,7 +406,7 @@ def test_partial_fit(): assert_array_almost_equal(X_transform, transformer.transform(data)) # check that if est doesn't have partial_fit, neither does SelectFromModel - transformer = SelectFromModel(estimator=RandomForestClassifier()) + transformer = SelectFromModel(estimator=RandomForestRegressor()) assert not hasattr(transformer, "partial_fit") diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 03ba2f108bbdd..41b4c55b9820a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -11,12 +11,12 @@ # Joly Arnaud # Fares Hedayati # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause import copy import numbers -import warnings from abc import ABCMeta, abstractmethod from math import ceil from numbers import Integral, Real @@ -24,7 +24,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -33,18 +33,22 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_sample_weight, assert_all_finite, check_is_fitted, ) + from . 
import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -106,6 +110,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_samples_split": [ Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), + StrOptions({"sqrt", "log2"}), ], "min_samples_leaf": [ Interval(Integral, 1, None, closed="left"), @@ -122,6 +127,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], "monotonic_cst": ["array-like", None], } @@ -141,6 +147,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): self.criterion = criterion @@ -155,6 +162,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values self.monotonic_cst = monotonic_cst def get_depth(self): @@ -236,6 +244,7 @@ def _fit( sample_weight=None, check_input=True, missing_values_in_feature_mask=None, + classes=None, ): random_state = check_random_state(self.random_state) @@ -250,9 +259,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", force_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -265,7 +277,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -279,45 +291,73 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + if self.class_weight is not None: + y_original = np.copy(y) + + y_encoded = np.zeros(y.shape, dtype=int) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(n_samples): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(self.classes_[j] == y[i, j])[0][ + 0 + ] + else: + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.classes_ = [] - self.n_classes_ = [] + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if self.class_weight is not None: - y_original = np.copy(y) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -325,36 +365,25 @@ def _fit( else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) - if isinstance(self.min_samples_split, numbers.Integral): + if isinstance(self.min_samples_split, str): + if self.min_samples_split == "sqrt": + min_samples_split = max(1, int(np.sqrt(self.n_features_in_))) + elif self.min_samples_split == "log2": + min_samples_split = max(1, int(np.log2(self.n_features_in_))) + elif isinstance(self.min_samples_split, numbers.Integral): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) - min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + self.min_samples_split_ = min_samples_split if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_in_))) - warnings.warn( - ( - "`max_features='auto'` has been deprecated in 1.1 " - "and will be removed in 1.3. To keep the past behaviour, " - "explicitly set `max_features='sqrt'`." - ), - FutureWarning, - ) else: max_features = self.n_features_in_ - warnings.warn( - ( - "`max_features='auto'` has been deprecated in 1.1 " - "and will be removed in 1.3. 
To keep the past behaviour, " - "explicitly set `max_features=1.0'`." - ), - FutureWarning, - ) elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_in_))) elif self.max_features == "log2": @@ -373,16 +402,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -394,10 +417,65 @@ def _fit( else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X=X, + y=y, + sample_weight=sample_weight, + missing_values_in_feature_mask=missing_values_in_feature_mask, + min_samples_leaf=min_samples_leaf, + min_weight_leaf=min_weight_leaf, + max_leaf_nodes=max_leaf_nodes, + min_samples_split=min_samples_split, + max_depth=max_depth, + random_state=random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. + """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -410,7 +488,6 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter if self.monotonic_cst is None: monotonic_cst = None else: @@ -450,7 +527,7 @@ def _fit( # *positive class*, all signs must be flipped. 
monotonic_cst *= -1 - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -472,16 +549,17 @@ def _fit( # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: - builder = DepthFirstTreeBuilder( + self.builder_ = DepthFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: - builder = BestFirstTreeBuilder( + self.builder_ = BestFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -489,9 +567,11 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) + self.builder_.build( + self.tree_, X, y, sample_weight, missing_values_in_feature_mask + ) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -499,8 +579,6 @@ def _fit( self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -549,6 +627,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -575,6 +656,134 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like of length (n_samples,) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -849,6 +1058,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -913,6 +1132,12 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- DecisionTreeRegressor : A decision tree regressor. 
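A minimal usage sketch of the leaf-sample / quantile API added above, assuming this patched fork is installed in place of scikit-learn (load_iris is standard example data, used only for illustration):

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # store_leaf_values=True makes the tree keep the training samples that fall
    # into each leaf; get_leaf_node_samples and predict_quantiles read from them.
    clf = DecisionTreeClassifier(store_leaf_values=True, random_state=0).fit(X, y)

    # list of per-query arrays, each of shape (n_leaf_node_samples, n_outputs)
    leaf_samples = clf.get_leaf_node_samples(X[:3])

    # class predictions at the requested quantiles, shape (n_samples, n_quantiles)
    y_q = clf.predict_quantiles(X[:3], quantiles=[0.25, 0.5, 0.75])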
@@ -960,7 +1185,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -979,6 +1207,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -995,10 +1224,18 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree classifier from the training set (X, y). Parameters @@ -1022,20 +1259,127 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self + def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : DecisionTreeClassifier + Fitted estimator. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + check_input=check_input, + classes=classes, + ) + return self + + if check_input: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = self._validate_data( + X, y, reset=False, validate_separately=(check_X_params, check_y_params) + ) + if issparse(X): + X.sort_indices() + + if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: + raise ValueError( + "No support for np.int64 index based sparse matrices" + ) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but {self.__class__.__name__} " + f"is expecting {self.n_features_in_} features as input." + ) + + y = np.atleast_1d(y) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + check_classification_targets(y) + y = np.copy(y) + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y_encoded = np.zeros(y.shape, dtype=int) + for i in range(X.shape[0]): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(classes[j] == y[i, j])[0][0] + y = y_encoded + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + # Update tree + self.builder_.initialize_node_queue(self.tree_, X, y, sample_weight) + self.builder_.build(self.tree_, X, y, sample_weight) + + self._prune_tree() + + return self + def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1246,6 +1590,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1298,6 +1652,12 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- DecisionTreeClassifier : A decision tree classifier. 
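A hedged sketch of the streaming update path added above: the first ``partial_fit`` call must receive ``classes`` and builds a fresh tree, while later calls re-enter the stored ``builder_`` to grow the existing leaves. This only illustrates the calling pattern; the update semantics are still evolving in this patch.

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X1, y1 = rng.normal(size=(50, 4)), rng.randint(0, 3, size=50)
    X2, y2 = rng.normal(size=(50, 4)), rng.randint(0, 3, size=50)

    clf = DecisionTreeClassifier(random_state=0)
    clf.partial_fit(X1, y1, classes=np.arange(3))  # first call: fits a new tree
    clf.partial_fit(X2, y2)                        # later calls: update existing leaves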
@@ -1342,7 +1702,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1360,6 +1720,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1374,11 +1735,19 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree regressor from the training set (X, y). Parameters @@ -1401,6 +1770,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeRegressor @@ -1412,6 +1784,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self @@ -1589,6 +1962,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1653,6 +2036,12 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -1708,6 +2097,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1723,6 +2113,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1854,6 +2245,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. 
+ One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1903,6 +2304,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. @@ -1953,6 +2360,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1967,5 +2375,6 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index b765d324bebb9..690f4d0c54c64 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,33 +4,33 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _criterion.pyx for implementation details. cimport numpy as cnp -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer +from libcpp.vector cimport vector -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end - cdef SIZE_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef SIZE_t n_outputs # Number of outputs cdef SIZE_t n_samples # Number of samples @@ -41,21 +41,11 @@ cdef class Criterion: cdef double weighted_n_right # Weighted number of samples in the right node cdef double weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
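    # Illustrative example (not part of the interface): with
    # sample_indices = [7, 2, 9, 4, 1], start = 0, end = 5 and pos = 2, the
    # left-child statistics are computed over y rows {7, 2} and the
    # right-child statistics over y rows {9, 4, 1}; update(new_pos) then moves
    # samples from the right side to the left as the split point advances.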
# Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -69,13 +59,6 @@ cdef class Criterion: self, double* dest ) noexcept nogil - cdef void clip_node_value( - self, - double* dest, - double lower_bound, - double upper_bound - ) noexcept nogil - cdef double middle_value(self) noexcept nogil cdef double impurity_improvement( self, double impurity_parent, @@ -83,6 +66,35 @@ cdef class Criterion: double impurity_right ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef SIZE_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -97,6 +109,13 @@ cdef class Criterion: double sum_left, double sum_right, ) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil + cdef double middle_value(self) noexcept nogil cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ed8a12065554e..f47feb9c9f59d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,30 +12,47 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. +cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. 
+ + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -40,53 +60,6 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: SIZE_t - Number of missing values for specific feature. - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. @@ -157,16 +130,6 @@ cdef class Criterion: """ pass - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: - pass - - cdef double middle_value(self) noexcept nogil: - """Compute the middle value of a split for monotonicity constraints - - This method is implemented in ClassificationCriterion and RegressionCriterion. - """ - pass - cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. @@ -221,6 +184,90 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. 
+ + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: SIZE_t + Number of missing values for specific feature. + """ + pass + + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + pass + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -254,6 +301,33 @@ cdef class Criterion: cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. + """ + cdef SIZE_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, @@ -352,15 +426,10 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. 
- Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -375,18 +444,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -399,12 +474,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -415,7 +490,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -695,13 +769,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -819,7 +890,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -831,7 +901,6 @@ cdef class RegressionCriterion(Criterion): ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -862,23 +931,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. 
- """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -887,14 +962,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -906,7 +981,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -1074,7 +1148,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. - MSE = var_left + var_right """ @@ -1222,26 +1295,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. cdef void** left_child = self.left_child_ptr @@ -1252,10 +1329,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1270,7 +1347,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1561,6 +1637,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index ff0d6db5c25a5..9cd6ad4b71387 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,9 +17,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2547e14b324df..4c67c35ebbdb0 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,19 +4,23 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. cimport numpy as cnp -from ._criterion cimport Criterion +from libcpp.vector cimport vector + +from ._criterion cimport BaseCriterion, Criterion +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef struct SplitRecord: # Data to track sample split @@ -33,14 +37,15 @@ cdef struct SplitRecord: unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -59,14 +64,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y - # Monotonicity constraints for each feature. - # The encoding is as follows: - # -1: monotonic decrease - # 0: no constraint - # +1: monotonic increase - cdef const cnp.int8_t[:] monotonic_cst - cdef bint with_monotonic_cst cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -86,21 +83,12 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
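    # Illustrative call sequence (assumed from the builder code in this patch,
    # not part of this header): a tree builder first calls
    #   node_reset(start, end, &weighted_n_node_samples)
    # to focus the splitter on samples[start:end], then
    #   node_split(impurity, split_ptr, &n_constant_features, lower, upper)
    # with a SplitRecord pointer allocated via pointer_size(), which is filled
    # with the chosen feature, threshold, split position ``pos`` and child
    # impurities, and finally node_value(dest) to store the node's prediction.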
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node @@ -109,9 +97,49 @@ cdef class Splitter: double lower_bound, double upper_bound, ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil + cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil +cdef class Splitter(BaseSplitter): + """Base class for supervised splitters.""" + + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y - cdef double node_impurity(self) noexcept nogil + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const cnp.int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord* current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil + + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5c30ba315a90a..982c68455040d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -8,26 +11,26 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause +from cython cimport final +from libc.math cimport isnan +from libc.stdlib cimport qsort +from libc.string cimport memcpy cimport numpy as cnp from ._criterion cimport Criterion -from libc.stdlib cimport qsort -from libc.string cimport memcpy -from libc.math cimport isnan -from cython cimport final - import numpy as np from scipy.sparse import issparse -from ._utils cimport log -from ._utils cimport rand_int -from ._utils cimport rand_uniform -from ._utils cimport RAND_R_MAX +from ._utils cimport RAND_R_MAX, log, rand_int, rand_uniform + cdef double INFINITY = np.inf @@ -48,13 +51,96 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. 
+ + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + + Parameters + ---------- + impurity : double + The impurity of the current node. + split : SplitRecord pointer + A pointer to a memory-allocated SplitRecord object which will be filled with the + split chosen. + n_constant_features : SIZE_t pointer + A pointer to a memory-allocated SIZE_t object which will be filled with the + number of constant features. Optional to use. + lower_bound : double + The lower bound of the monotonic constraint if used. + upper_bound : double + The upper bound of the monotonic constraint if used. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__( self, Criterion criterion, @@ -63,6 +149,7 @@ cdef class Splitter: double min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + *argv ): """ Parameters @@ -90,7 +177,6 @@ cdef class Splitter: Monotonicity constraints """ - self.criterion = criterion self.n_samples = 0 @@ -103,19 +189,13 @@ cdef class Splitter: self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, self.random_state, - self.monotonic_cst), self.__getstate__()) + self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) cdef int init( self, @@ -149,7 +229,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. 
""" - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -187,8 +266,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -211,37 +303,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, - ) except -1 nogil: - - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -252,11 +318,62 @@ cdef class Splitter: self.criterion.clip_node_value(dest, lower_bound, upper_bound) + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef inline bint check_presplit_conditions( + self, + SplitRecord* current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef SIZE_t end_non_missing = self.end - n_missing + cdef SIZE_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef inline bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -275,6 +392,7 @@ cdef inline void shift_missing_values_to_left_if_required( samples[i], samples[current_end] = samples[current_end], samples[i] best.pos += best.n_missing + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -412,7 +530,6 @@ cdef inline int node_split_best( if has_missing: criterion.init_missing(n_missing) # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. @@ -433,18 +550,30 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -460,8 +589,7 @@ cdef inline int node_split_best( continue # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -691,8 +819,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -788,8 +914,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(¤t_split, 0, 0) == 1: continue # Evaluate split @@ -799,8 +924,19 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: + continue + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): continue # Reject if monotonicity constraints are not satisfied @@ -1501,12 +1637,12 @@ cdef class BestSplitter(Splitter): ) cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double 
upper_bound ) except -1 nogil: return node_split_best( self, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index b99f44c0472a2..886770bfabc15 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -5,13 +5,17 @@ # Arnaud Joly # Jacob Schreiber # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause # See _tree.pyx for details. import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -19,8 +23,8 @@ ctypedef cnp.npy_intp SIZE_t # Type for indices and counters ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -35,40 +39,45 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + cdef SIZE_t _update_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -80,6 +89,60 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. 
+ # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -100,6 +163,18 @@ cdef class TreeBuilder: cdef double min_weight_leaf # Minimum weight in a leaf cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef cnp.ndarray initial_roots # Leaf nodes for streaming updates + + cdef unsigned char store_leaf_values # Whether to store leaf values + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=*, + const unsigned char[::1] missing_values_in_feature_mask=*, + ) cpdef build( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ce998e80a9d0a..35c64e6265f3a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,25 +12,26 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan -from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.sparse import issparse @@ -36,6 +40,7 @@ from scipy.sparse import csr_matrix from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, int nd, cnp.npy_intp* dims, @@ -87,6 +92,17 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + const unsigned char[::1] missing_values_in_feature_mask=None, + ): + """Build a decision tree from the training set (X, y).""" + pass + cpdef build( self, Tree tree, @@ -153,15 +169,100 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + cnp.ndarray 
initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(DepthFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + const unsigned char[::1] missing_values_in_feature_mask=None, + ): + """Initialize a list of roots""" + X, y, sample_weight = self._check_input(X, y, sample_weight) + + # organize samples by decision paths + paths = tree.decision_path(X) + cdef int PARENT + cdef int CHILD + cdef int i + false_roots = {} + X_copy = {} + y_copy = {} + for i in range(X.shape[0]): + # collect depths from the node paths + depth_i = paths[i].indices.shape[0] - 1 + PARENT = depth_i - 1 + CHILD = depth_i + + # find leaf node's & their parent node's IDs + if PARENT < 0: + parent_i = 0 + else: + parent_i = paths[i].indices[PARENT] + child_i = paths[i].indices[CHILD] + left = 0 + if tree.children_left[parent_i] == child_i: + left = 1 # leaf node is left child + + # organize samples by the leaf they fall into (false root) + # leaf nodes are marked by parent node and + # their relative position (left or right child) + if (parent_i, left) in false_roots: + false_roots[(parent_i, left)][0] += 1 + X_copy[(parent_i, left)].append(X[i]) + y_copy[(parent_i, left)].append(y[i]) + else: + false_roots[(parent_i, left)] = [1, depth_i] + X_copy[(parent_i, left)] = [X[i]] + y_copy[(parent_i, left)] = [y[i]] + + X_list = [] + y_list = [] + + # reorder the samples according to parent node IDs + for key, value in reversed(sorted(X_copy.items())): + X_list = X_list + value + y_list = y_list + y_copy[key] + cdef object X_new = np.array(X_list) + cdef cnp.ndarray y_new = np.array(y_list) + + # initialize the splitter using sorted samples + cdef Splitter splitter = self.splitter + splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask) + + # convert dict to numpy array and store value + self.initial_roots = np.array(list(false_roots.items())) cpdef build( self, @@ -176,16 +277,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Initial capacity - cdef int init_capacity - - if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 - else: - init_capacity = 2047 - - tree._resize(init_capacity) - # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_depth = self.max_depth @@ -193,34 +284,74 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_weight_leaf = self.min_weight_leaf cdef SIZE_t min_samples_split = self.min_samples_split cdef double min_impurity_decrease = self.min_impurity_decrease + cdef unsigned char store_leaf_values = self.store_leaf_values + cdef cnp.ndarray initial_roots = self.initial_roots - # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + # Initial capacity + cdef int init_capacity + cdef bint first = 0 + if initial_roots is None: + # Recursive partition (without 
actual recursion) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + first = 1 + else: + # convert numpy array back to dict + false_roots = {} + for key_value_pair in initial_roots: + false_roots[tuple(key_value_pair[0])] = key_value_pair[1] - cdef SIZE_t start - cdef SIZE_t end + # reset the root array + self.initial_roots = None + + cdef SIZE_t start = 0 + cdef SIZE_t end = 0 cdef SIZE_t depth cdef SIZE_t parent cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef double lower_bound cdef double upper_bound cdef double middle_value cdef SIZE_t n_constant_features cdef bint is_leaf - cdef bint first = 1 - cdef SIZE_t max_depth_seen = -1 + cdef SIZE_t max_depth_seen = -1 if first else tree.max_depth cdef int rc = 0 cdef stack[StackRecord] builder_stack + cdef stack[StackRecord] update_stack cdef StackRecord stack_record - with nogil: + if not first: + # push reached leaf nodes onto stack + for key, value in reversed(sorted(false_roots.items())): + end += value[0] + update_stack.push({ + "start": start, + "end": end, + "depth": value[1], + "parent": key[0], + "is_left": key[1], + "impurity": tree.impurity[key[0]], + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) + start += value[0] + else: # push root node onto stack builder_stack.push({ "start": 0, @@ -234,6 +365,132 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "upper_bound": INFINITY, }) + with nogil: + while not update_stack.empty(): + stack_record = update_stack.top() + update_stack.pop() + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + impurity = stack_record.impurity + n_constant_features = stack_record.n_constant_features + lower_bound = stack_record.lower_bound + upper_bound = stack_record.upper_bound + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or impurity <= EPSILON + + if not is_leaf: + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._update_node(parent, is_left, is_leaf, + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, + split.missing_go_to_left) + + if node_id == INTPTR_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + + if 
not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = lower_bound + left_child_max = right_child_max = upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = lower_bound + right_child_max = upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = lower_bound + left_child_max = upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + + # Push right child on stack + builder_stack.push({ + "start": split.pos, + "end": end, + "depth": depth + 1, + "parent": node_id, + "is_left": 0, + "impurity": split.impurity_right, + "n_constant_features": n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) + + # Push left child on stack + builder_stack.push({ + "start": start, + "end": split.pos, + "depth": depth + 1, + "parent": node_id, + "is_left": 1, + "impurity": split.impurity_left, + "n_constant_features": n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) + + if depth > max_depth_seen: + max_depth_seen = depth + while not builder_stack.empty(): stack_record = builder_stack.top() builder_stack.pop() @@ -258,7 +515,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if first: impurity = splitter.node_impurity() - first = 0 + first=0 # impurity == 0 with tolerance due to rounding errors is_leaf = is_leaf or impurity <= EPSILON @@ -266,11 +523,16 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -278,10 +540,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -351,6 +612,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": left_child_min, "upper_bound": left_child_max, }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -360,10 +624,13 @@ cdef 
class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() - # Best first builder ---------------------------------------------------------- cdef struct FrontierRecord: # Record of information of a Node, the frontier for a split. Those records are @@ -406,10 +673,18 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -417,6 +692,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(BestFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) cpdef build( self, @@ -434,6 +723,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes + cdef unsigned char store_leaf_values = self.store_leaf_values # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) @@ -492,6 +782,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -600,6 +893,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -623,11 +918,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -637,9 +936,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -673,6 +971,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = impurity res.impurity_right = impurity + free(split_ptr) return 0 @@ -680,252 +979,153 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. +cdef class BaseTree: + """Base class for Cython tree models. - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! + Downstream classes must implement methods to actually traverse the tree. + """ + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: + """Guts of _resize - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. + if capacity == INTPTR_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. 
+ safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) - n_leaves : int - Number of leaves in the tree. + # value memory is initialised to 0 to enable classifier argmax + if capacity > self.capacity: + memset((self.value + self.capacity * self.value_stride), 0, + (capacity - self.capacity) * self.value_stride * + sizeof(double)) - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. + # if capacity smaller than node_count, adjust the counter + if capacity < self.node_count: + self.node_count = capacity - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. + self.capacity = capacity + return 0 - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. - - missing_go_to_left : array of bool, shape [node_count] - missing_go_to_left[i] holds a bool indicating whether or not there were - missing values at node i. - """ - # Wrap for outside world. - # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # 
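The _resize_c logic hoisted into BaseTree above keeps the usual amortized-doubling growth rule; roughly, in Python terms (next_capacity is an illustrative name, and INTPTR_MAX stands in for the C sentinel meaning "no explicit capacity requested"):

def next_capacity(requested, current_capacity, INTPTR_MAX=(1 << 63) - 1):
    # Growth rule from BaseTree._resize_c: an unspecified request doubles the
    # current capacity, starting from a small default of 3 nodes.
    if requested == INTPTR_MAX:
        return 3 if current_capacity == 0 else 2 * current_capacity
    return requested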
Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: - """Resize all inner arrays to `capacity`, if `capacity` == -1, then - double the size of the inner arrays. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil: + """Set split node data. - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + node_id : SIZE_t + The index of the node. """ - if self._resize_c(capacity) != 0: - # Acquire gil only if we need to raise - with gil: - raise MemoryError() + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: - """Guts of _resize + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil: + """Set leaf node data. - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + node_id : SIZE_t + The index of the node. 
""" - if capacity == self.capacity and self.nodes != NULL: - return 0 - - if capacity == INTPTR_MAX: - if self.capacity == 0: - capacity = 3 # default initial value - else: - capacity = 2 * self.capacity - - safe_realloc(&self.nodes, capacity) - safe_realloc(&self.value, capacity * self.value_stride) - - # value memory is initialised to 0 to enable classifier argmax - if capacity > self.capacity: - memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * - sizeof(double)) - - # if capacity smaller than node_count, adjust the counter - if capacity < self.node_count: - self.node_count = capacity + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 - self.capacity = capacity - return 0 + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil: + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -946,28 +1146,61 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out + cdef inline SIZE_t _update_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. 
+ + The updated node remains on the same position. + Returns (size_t)(-1) on error. + """ + cdef SIZE_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" @@ -1002,9 +1235,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -1072,7 +1306,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -1121,6 +1354,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -1132,7 +1368,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1261,13 +1499,12 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) @@ -1275,13 +1512,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1295,47 +1528,30 @@ cdef class Tree: for i in range(self.n_features): importances[i] /= normalizer - return np.asarray(importances) - - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. 
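Where _add_node appends a fresh node, the _update_node method above overwrites an existing child slot in place, which is what lets a previously fitted leaf be refreshed or re-split when more data arrives. The bookkeeping amounts to the following sketch, with nodes modelled as dicts purely for illustration:

def update_node(nodes, parent, is_left, is_leaf, split, stats):
    # Locate the existing child slot instead of appending a new node.
    node_id = nodes[parent]["left"] if is_left else nodes[parent]["right"]
    node = nodes[node_id]
    node.update(stats)                        # impurity, sample counts, weights
    if is_leaf:
        node["left"] = node["right"] = None   # mark the node as a leaf again
        node["feature"] = node["threshold"] = None
    else:
        node["feature"] = split["feature"]    # children get attached later
        node["threshold"] = split["threshold"]
    return node_id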
- """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. - - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - + return np.asarray(importances) + + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. + + Wrapped in a private function to allow subclassing that + computes feature importances. + """ + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right + + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, double[::1] out): @@ -1443,6 +1659,286 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. 
+ + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + + leaf_node_samples : dict of node id to numpy array of shapes (n_samples_node, n_features) + A dictionary mapping leaf nodes to the samples of data that are used + to fit the prediction at each leaf. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def 
__getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
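Since value_samples now travels through __getstate__/__setstate__ above, the per-leaf training samples survive pickling. A small usage sketch against the patched estimator (store_leaf_values and leaf_nodes_samples are features of this fork, not upstream scikit-learn):

import pickle
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.arange(20, dtype=np.float64).reshape(-1, 1)
y = np.sin(X).ravel()
reg = DecisionTreeRegressor(random_state=0, store_leaf_values=True).fit(X, y)
restored = pickle.loads(pickle.dumps(reg))
# The leaf ids and their stored samples survive the round trip.
assert set(restored.tree_.leaf_nodes_samples) == set(reg.tree_.leaf_nodes_samples)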
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1927,6 +2423,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1943,8 +2441,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4b953af2d9b2b..61ba8af197c2e 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -9,8 +9,10 @@ # See _utils.pyx for details. cimport numpy as cnp + +from sklearn.neighbors._quad_tree cimport Cell + from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..02dc7cf426efc 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly @@ -7,16 +10,17 @@ # # License: BSD 3 clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 034ee5fc39917..c14c50f24a516 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -882,7 +882,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -901,6 +901,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -915,14 +916,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for 
attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2401,8 +2413,8 @@ def test_min_sample_split_1_error(Tree): # min_samples_split=1 is invalid tree = Tree(min_samples_split=1) msg = ( - r"'min_samples_split' .* must be an int in the range \[2, inf\) " - r"or a float in the range \(0.0, 1.0\]" + r"'min_samples_split' .* must be an int in the range \[2, inf\)" + r".* a float in the range \(0.0, 1.0\]" ) with pytest.raises(ValueError, match=msg): tree.fit(X, y) @@ -2414,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points @@ -2626,3 +2640,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + 
assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) From e2fee00aa461c21b8cfa59eb907d27972415c99b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Sep 2023 17:55:39 -0400 Subject: [PATCH 02/54] Fix lint Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 4c67c35ebbdb0..5c82bbe193c18 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -103,7 +103,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): """Base class for supervised splitters.""" - + cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y From 45b9e33da93f2c71cf550761179ae95eaecb0fbc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 22:29:15 -0400 Subject: [PATCH 03/54] Fix utils.pyx typing Signed-off-by: Adam Li --- sklearn/tree/_utils.pxd | 24 ++++++++++++------------ sklearn/tree/_utils.pyx | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 
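Returning to the quantile-prediction tests above: once leaf samples are stored, per-sample quantiles can be read straight off the values collected in the reached leaf. A rough sketch of the underlying idea (this is not the fork's actual predict_quantiles implementation):

import numpy as np

def predict_quantiles_from_leaves(tree_, X, quantiles):
    # `tree_` is a low-level Tree carrying the patched leaf_nodes_samples map.
    leaf_ids = tree_.apply(np.asarray(X, dtype=np.float32))
    out = np.empty((len(leaf_ids), len(quantiles)))
    for i, leaf in enumerate(leaf_ids):
        samples = np.asarray(tree_.leaf_nodes_samples[leaf]).ravel()
        out[i] = np.quantile(samples, quantiles)
    return out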
b4dc9360e1f8f..918cf39846821 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -42,7 +42,7 @@ ctypedef fused realloc_ptr: (Cell*) (Node**) -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) @@ -73,12 +73,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef int reset(self) except -1 nogil + cdef intp_t reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -94,14 +94,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int reset(self) except -1 nogil - cdef int update_median_parameters_post_push( + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int update_median_parameters_post_remove( + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 4747ce3a339f4..1185967e24e8c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -26,7 +26,7 @@ from sklearn.utils._random cimport our_rand_r # Helper functions # ============================================================================= -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython # 0.20.1 to crash. 
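The WeightedPQueue / WeightedMedianCalculator signatures retyped above maintain a running weighted median (used by the MAE criterion). Conceptually, the quantity tracked is the smallest value at which the cumulative weight reaches half of the total; a simplified sketch that ignores the exact tie handling of the Cython class:

def weighted_median(values, weights):
    # Simplified view of WeightedMedianCalculator.get_median(): scan sorted
    # values until the cumulative weight reaches half of the total weight.
    pairs = sorted(zip(values, weights))
    total = float(sum(weights))
    cumulative = 0.0
    for value, weight in pairs:
        cumulative += weight
        if cumulative >= total / 2.0:
            return value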
cdef size_t nbytes = nelems * sizeof(p[0][0]) From 01d26303ae77ddb8d25cef14feb4be7cd03111f6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 22:45:21 -0400 Subject: [PATCH 04/54] Try absolute import Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_utils.pxd | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index e464ab02005c3..46ca9102e67a8 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -13,7 +13,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from ..utils._typedefs cimport float64_t, intp_t +from sklearn.utils._typedefs cimport float64_t, intp_t cdef class BaseCriterion: diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a8820ee3c94ed..88025fbfde502 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,8 +14,8 @@ cimport numpy as cnp from libcpp.vector cimport vector +from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._criterion cimport BaseCriterion, Criterion -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t cdef struct SplitRecord: diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 918cf39846821..a74b0f1ed1b76 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -11,10 +11,9 @@ cimport numpy as cnp from sklearn.neighbors._quad_tree cimport Cell +from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t cdef enum: # Max value for our rand_r replacement (near the bottom). 
From 5715cfcd7aafc3041459bf894a41a5560dfe977a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:37:26 -0400 Subject: [PATCH 05/54] Try again Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_utils.pxd | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 46ca9102e67a8..e464ab02005c3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -13,7 +13,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from sklearn.utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float64_t, intp_t cdef class BaseCriterion: diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 88025fbfde502..2420c94ee6557 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,7 +14,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._criterion cimport BaseCriterion, Criterion diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index a74b0f1ed1b76..bb51d5a039357 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,8 +10,8 @@ cimport numpy as cnp -from sklearn.neighbors._quad_tree cimport Cell -from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..neighbors._quad_tree cimport Cell +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._tree cimport Node From 5336b1f31e50892bdbcc12e5cbdbca4e166f2027 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:38:19 -0400 Subject: [PATCH 06/54] Update import path Signed-off-by: Adam Li --- sklearn/tree/_utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 1185967e24e8c..23c358ce4bd8b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ cimport numpy as cnp cnp.import_array() -from sklearn.utils._random cimport our_rand_r +from ..utils._random cimport our_rand_r # ============================================================================= # Helper functions From d49572ab11a81299acca4e56885908089efdb9b4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:58:48 -0400 Subject: [PATCH 07/54] Make submodule install easier Signed-off-by: Adam Li --- sklearn/utils/_random.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 6f9c3bdb487cc..277474f15d0db 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -14,7 +14,9 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from . 
import check_random_state +# XXX: added instead of relative import to make scikit-tree easier +# from .utils import check_random_state +from sklearn.utils import check_random_state cdef UINT32_t DEFAULT_SEED = 1 From 99a9f9161347e2b70a419b6507a10409f11a53bb Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 13:59:18 -0400 Subject: [PATCH 08/54] Change ctypedef in random Signed-off-by: Adam Li --- sklearn/utils/_random.pxd | 17 +++++++++-------- sklearn/utils/_random.pyx | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/_random.pxd b/sklearn/utils/_random.pxd index b5199fc506f4e..4b291489716fc 100644 --- a/sklearn/utils/_random.pxd +++ b/sklearn/utils/_random.pxd @@ -4,9 +4,10 @@ cimport numpy as cnp -ctypedef cnp.npy_uint32 UINT32_t -cdef inline UINT32_t DEFAULT_SEED = 1 +from ._typedefs cimport uint32_t + +cdef inline uint32_t DEFAULT_SEED = 1 cdef enum: # Max value for our rand_r replacement (near the bottom). @@ -23,18 +24,18 @@ cpdef sample_without_replacement(cnp.int_t n_population, # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details -cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: +cdef inline uint32_t our_rand_r(uint32_t* seed) nogil: """Generate a pseudo-random np.uint32 from a np.uint32 seed""" # seed shouldn't ever be 0. if (seed[0] == 0): seed[0] = DEFAULT_SEED - seed[0] ^= (seed[0] << 13) - seed[0] ^= (seed[0] >> 17) - seed[0] ^= (seed[0] << 5) + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) # Use the modulo to make sure that we don't return a values greater than the # maximum representable value for signed 32bit integers (i.e. 2^31 - 1). # Note that the parenthesis are needed to avoid overflow: here - # RAND_R_MAX is cast to UINT32_t before 1 is added. - return seed[0] % ((RAND_R_MAX) + 1) + # RAND_R_MAX is cast to uint32_t before 1 is added. 
+ return seed[0] % ((RAND_R_MAX) + 1) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 277474f15d0db..9c3d93ffd3bd8 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -18,7 +18,7 @@ cnp.import_array() # from .utils import check_random_state from sklearn.utils import check_random_state -cdef UINT32_t DEFAULT_SEED = 1 +cdef uint32_t DEFAULT_SEED = 1 cpdef _sample_without_replacement_check_input(cnp.int_t n_population, @@ -307,5 +307,5 @@ cpdef sample_without_replacement(cnp.int_t n_population, def _our_rand_r_py(seed): """Python utils to test the our_rand_r function""" - cdef UINT32_t my_seed = seed + cdef uint32_t my_seed = seed return our_rand_r(&my_seed) From 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 14:53:02 -0400 Subject: [PATCH 09/54] Revert UINT32_t Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 5 +++-- sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pxd | 1 + sklearn/tree/_utils.pxd | 10 +++++++--- sklearn/tree/_utils.pyx | 4 ++-- sklearn/utils/_random.pxd | 17 ++++++++--------- sklearn/utils/_random.pyx | 4 ++-- 7 files changed, 25 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2420c94ee6557..29554103a6b70 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,7 +14,8 @@ cimport numpy as cnp from libcpp.vector cimport vector -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t +from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion @@ -47,7 +48,7 @@ cdef class BaseSplitter: cdef public float64_t min_weight_leaf # Minimum weight in a leaf cdef object random_state # Random state - cdef uint32_t rand_r_state # sklearn_rand_r random number state + cdef UINT32_t rand_r_state # sklearn_rand_r random number state cdef intp_t[::1] samples # Sample indices in X, y cdef intp_t n_samples # X.shape[0] diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b9635a4930974..2a44be8d1ce2b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -467,7 +467,7 @@ cdef inline intp_t node_split_best( cdef intp_t max_features = splitter.max_features cdef intp_t min_samples_leaf = splitter.min_samples_leaf cdef float64_t min_weight_leaf = splitter.min_weight_leaf - cdef uint32_t* random_state = &splitter.rand_r_state + cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split cdef float64_t current_proxy_improvement = -INFINITY @@ -848,7 +848,7 @@ cdef inline intp_t node_split_random( cdef intp_t n_features = splitter.n_features cdef intp_t max_features = splitter.max_features - cdef uint32_t* random_state = &splitter.rand_r_state + cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split cdef float64_t current_proxy_improvement = - INFINITY diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index bd087a48d3b24..ff69b7c6df819 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -19,6 +19,7 @@ from libcpp.vector cimport vector from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ._utils cimport UINT32_t from ._splitter cimport SplitRecord, Splitter diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index bb51d5a039357..03a1d48c94cb4 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ 
-8,13 +8,17 @@ # See _utils.pyx for details. +import numpy as np cimport numpy as cnp +cnp.import_array() +ctypedef cnp.npy_uint32 UINT32_t from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t from ._tree cimport Node + cdef enum: # Max value for our rand_r replacement (near the bottom). # We don't use RAND_MAX because it's different across platforms and @@ -48,11 +52,11 @@ cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) cdef intp_t rand_int(intp_t low, intp_t high, - uint32_t* random_state) noexcept nogil + UINT32_t* random_state) noexcept nogil cdef float64_t rand_uniform(float64_t low, float64_t high, - uint32_t* random_state) noexcept nogil + UINT32_t* random_state) noexcept nogil cdef float64_t log(float64_t x) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 23c358ce4bd8b..cc4cb7cf02533 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -60,13 +60,13 @@ cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size): cdef inline intp_t rand_int(intp_t low, intp_t high, - uint32_t* random_state) noexcept nogil: + UINT32_t* random_state) noexcept nogil: """Generate a random integer in [low; end).""" return low + our_rand_r(random_state) % (high - low) cdef inline float64_t rand_uniform(float64_t low, float64_t high, - uint32_t* random_state) noexcept nogil: + UINT32_t* random_state) noexcept nogil: """Generate a random float64_t in [low; high).""" return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low diff --git a/sklearn/utils/_random.pxd b/sklearn/utils/_random.pxd index 4b291489716fc..b5199fc506f4e 100644 --- a/sklearn/utils/_random.pxd +++ b/sklearn/utils/_random.pxd @@ -4,10 +4,9 @@ cimport numpy as cnp +ctypedef cnp.npy_uint32 UINT32_t -from ._typedefs cimport uint32_t - -cdef inline uint32_t DEFAULT_SEED = 1 +cdef inline UINT32_t DEFAULT_SEED = 1 cdef enum: # Max value for our rand_r replacement (near the bottom). @@ -24,18 +23,18 @@ cpdef sample_without_replacement(cnp.int_t n_population, # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details -cdef inline uint32_t our_rand_r(uint32_t* seed) nogil: +cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: """Generate a pseudo-random np.uint32 from a np.uint32 seed""" # seed shouldn't ever be 0. if (seed[0] == 0): seed[0] = DEFAULT_SEED - seed[0] ^= (seed[0] << 13) - seed[0] ^= (seed[0] >> 17) - seed[0] ^= (seed[0] << 5) + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) # Use the modulo to make sure that we don't return a values greater than the # maximum representable value for signed 32bit integers (i.e. 2^31 - 1). # Note that the parenthesis are needed to avoid overflow: here - # RAND_R_MAX is cast to uint32_t before 1 is added. - return seed[0] % ((RAND_R_MAX) + 1) + # RAND_R_MAX is cast to UINT32_t before 1 is added. 
+ return seed[0] % ((RAND_R_MAX) + 1) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 9c3d93ffd3bd8..277474f15d0db 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -18,7 +18,7 @@ cnp.import_array() # from .utils import check_random_state from sklearn.utils import check_random_state -cdef uint32_t DEFAULT_SEED = 1 +cdef UINT32_t DEFAULT_SEED = 1 cpdef _sample_without_replacement_check_input(cnp.int_t n_population, @@ -307,5 +307,5 @@ cpdef sample_without_replacement(cnp.int_t n_population, def _our_rand_r_py(seed): """Python utils to test the our_rand_r function""" - cdef uint32_t my_seed = seed + cdef UINT32_t my_seed = seed return our_rand_r(&my_seed) From 09f77851bd06bef8674cdadfadaa38469f96ead6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 17 Oct 2023 09:57:03 -0400 Subject: [PATCH 10/54] Change cnp.float64 to float64_t Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index ff69b7c6df819..9a6f2f0914095 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -107,7 +107,7 @@ cdef class BaseTree: ) noexcept nogil cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node, ) noexcept nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 00b9c289b1feb..5a8a200ed9680 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1506,7 +1506,7 @@ cdef class BaseTree: cdef float64_t normalizer = 0. cdef intp_t i = 0 - cdef cnp.float64_t[:] importances = np.zeros(self.n_features) + cdef float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: @@ -1532,7 +1532,7 @@ cdef class BaseTree: cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. 
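The `our_rand_r` helper shuffled between the `UINT32_t` and `uint32_t` spellings in the patches above is a plain 32-bit XorShift generator. A minimal pure-Python sketch of the same update rule, assuming `RAND_R_MAX` is `2**31 - 1` as the signed-32-bit comment in `_random.pxd` implies (the function and constant names below are illustrative, not part of the patches):

```python
# Pure-Python sketch of the XorShift update in sklearn/utils/_random.pxd.
# The shipped helper is the inline Cython our_rand_r; this is for exposition.
RAND_R_MAX = 2**31 - 1   # assumed: matches the "signed 32-bit" comment above
DEFAULT_SEED = 1
MASK32 = 0xFFFFFFFF      # emulate uint32_t wrap-around with Python ints


def xorshift_rand_r(state: int) -> tuple[int, int]:
    """Return (value in [0, RAND_R_MAX], new state) from one XorShift step."""
    if state == 0:                        # an all-zero state would stay at zero
        state = DEFAULT_SEED
    state = (state ^ (state << 13)) & MASK32
    state ^= state >> 17
    state = (state ^ (state << 5)) & MASK32
    return state % (RAND_R_MAX + 1), state


# One draw, threading the state explicitly the way the Cython code does
# through a uint32_t pointer:
value, rng_state = xorshift_rand_r(42)
```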
From 4ffa0936153a54b11ec0c3a488e2f2b331b2e2f7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 21 Feb 2024 22:18:10 -0500 Subject: [PATCH 11/54] Make sure build_tree returns self Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 84a41aff1174c..6511c8192889e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -413,7 +413,7 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) # build the actual tree now with the parameters - self._build_tree( + self = self._build_tree( X=X, y=y, sample_weight=sample_weight, @@ -573,6 +573,7 @@ def _build_tree( self.classes_ = self.classes_[0] self._prune_tree() + return self def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" From d48716a6b2cc6373b9e66bf959f2b43b89f10c5d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 12:53:22 -0500 Subject: [PATCH 12/54] Allow max samples to be higher Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 23 ++++++++++++++++++----- sklearn/ensemble/tests/test_forest.py | 5 ++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 50e9bef4f55f1..3827359b9162e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -104,14 +104,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -124,12 +128,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. 
It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index a51d240c87d4e..7914823d48ccf 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1660,7 +1660,10 @@ def test_max_samples_bootstrap(name): def test_large_max_samples_exception(name): # Check invalid `max_samples` est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) - match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + # TODO: remove the following line when the issue is fixed + # https://github.com/scikit-learn/scikit-learn/issues/28507 + # match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + match = "The expected number of unique samples" with pytest.raises(ValueError, match=match): est.fit(X, y) From 33039e22c600cbd0929d0b22995c08535b1fede4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 10:33:59 -0500 Subject: [PATCH 13/54] Factor out construct trees API Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 91 +++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 3827359b9162e..b5ee64b6e708c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -595,42 +595,18 @@ def fit(self, X, y, sample_weight=None, classes=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -664,6 +640,53 @@ def fit(self, X, y, sample_weight=None, classes=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. 
+ trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. From 94fc4327d1fe8526a40465f5cf5b28ce68f468e9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 2 Mar 2024 11:42:49 -0500 Subject: [PATCH 14/54] Allow extra args in cinit Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2c34139484012..eda0368eed222 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1779,7 +1779,7 @@ cdef class Tree(BaseTree): # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, *args): """Constructor.""" cdef intp_t dummy = 0 size_t_dtype = np.array(dummy).dtype From 5ccd00fc9367f501d6ddebfe94c84c0aa90f7bc4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 9 Mar 2024 20:49:00 -0500 Subject: [PATCH 15/54] Migrate n_constant_features within SplitRecord Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_splitter.pyx | 24 +++++------------------- sklearn/tree/_tree.pyx | 21 ++++++++++----------- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index f1434f5d05cc9..601e6ac8f3202 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -33,6 +33,7 @@ cdef struct SplitRecord: float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. intp_t n_missing # Number of missing values for the feature being split on + intp_t n_constant_features # Number of constant features in the split cdef class BaseSplitter: """Abstract interface for splitter.""" @@ -90,7 +91,6 @@ cdef class BaseSplitter: self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d940368804a94..ac84ea60efef3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -52,6 +52,7 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 + self.n_constant_features = 0 cdef class BaseSplitter: """This is an abstract interface for splitters. @@ -100,7 +101,6 @@ cdef class BaseSplitter: self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -118,9 +118,6 @@ cdef class BaseSplitter: split : SplitRecord pointer A pointer to a memory-allocated SplitRecord object which will be filled with the split chosen. 
- n_constant_features : intp_t pointer - A pointer to a memory-allocated intp_t object which will be filled with the - number of constant features. Optional to use. lower_bound : float64_t The lower bound of the monotonic constraint if used. upper_bound : float64_t @@ -322,7 +319,6 @@ cdef class Splitter(BaseSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -444,7 +440,6 @@ cdef inline intp_t node_split_best( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -490,7 +485,7 @@ cdef inline intp_t node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -711,7 +706,7 @@ cdef inline intp_t node_split_best( # Return values split[0] = best_split - n_constant_features[0] = n_total_constants + split.n_constant_features = n_total_constants return 0 @@ -834,7 +829,6 @@ cdef inline int node_split_random( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -866,7 +860,7 @@ cdef inline int node_split_random( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -1021,7 +1015,7 @@ cdef inline int node_split_random( # Return values split[0] = best_split - n_constant_features[0] = n_total_constants + split.n_constant_features = n_total_constants return 0 @@ -1679,7 +1673,6 @@ cdef class BestSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1689,7 +1682,6 @@ cdef class BestSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1715,7 +1707,6 @@ cdef class BestSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1725,7 +1716,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1751,7 +1741,6 @@ cdef class RandomSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1761,7 +1750,6 @@ cdef class RandomSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1786,7 +1774,6 @@ cdef class RandomSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1796,7 +1783,6 @@ cdef 
class RandomSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index eda0368eed222..4ecd644fbe27e 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -153,6 +153,7 @@ cdef class TreeBuilder: return X, y, sample_weight + # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing cdef struct StackRecord: @@ -166,6 +167,7 @@ cdef struct StackRecord: float64_t lower_bound float64_t upper_bound + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -328,7 +330,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t lower_bound cdef float64_t upper_bound cdef float64_t middle_value - cdef intp_t n_constant_features cdef bint is_leaf cdef intp_t max_depth_seen = -1 if first else tree.max_depth @@ -379,7 +380,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split_ptr.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -398,7 +399,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) @@ -470,7 +470,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -483,7 +483,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -504,7 +504,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split_ptr.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -527,7 +527,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) @@ -598,7 +597,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -611,7 +610,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -901,11 +900,12 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t node_id cdef intp_t n_node_samples - cdef intp_t n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf + # there are no constant features in best first 
splits + split_ptr.n_constant_features = 0 splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: @@ -923,7 +923,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) From b61ae3d546ba4199dc3badf4bd89971d2d75e9df Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 9 Mar 2024 22:47:53 -0500 Subject: [PATCH 16/54] Export shift_missing_values_to_left_if_required Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 601e6ac8f3202..041e9965a904b 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -140,3 +140,9 @@ cdef class Splitter(BaseSplitter): float64_t lower_bound, float64_t upper_bound ) noexcept nogil + +cdef void shift_missing_values_to_left_if_required( + SplitRecord* best, + intp_t[::1] samples, + intp_t end, +) noexcept nogil From 02e7765a44e013b513732aef049e1f68e69db894 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Mar 2024 18:01:51 -0400 Subject: [PATCH 17/54] demo pr Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 20 +++++--------------- sklearn/tree/_tree.pyx | 11 ++++------- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index adc14011cb7a2..97fae3aea9e0a 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -28,7 +28,8 @@ cdef struct SplitRecord: float64_t lower_bound # Lower bound on value of both children for monotonicity float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. - intp_t n_missing # Number of missing values for the feature being split on + intp_t n_missing # Number of missing values for the feature being split on + intp_t n_constant_features # Number of constant features in the split from parent cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -102,7 +103,6 @@ cdef class Splitter: self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 6ef392685e594..52ecbbc3dfa6b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,6 +49,7 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 + self.n_constant_features = 0 cdef class Splitter: """Abstract splitter class. 
@@ -233,7 +234,6 @@ cdef class Splitter: self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -303,7 +303,6 @@ cdef inline int node_split_best( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -349,7 +348,7 @@ cdef inline int node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -559,8 +558,8 @@ cdef inline int node_split_best( sizeof(intp_t) * n_found_constants) # Return values + best_split.n_constant_features = n_total_constants split[0] = best_split - n_constant_features[0] = n_total_constants return 0 @@ -683,7 +682,6 @@ cdef inline int node_split_random( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -717,7 +715,7 @@ cdef inline int node_split_random( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -861,8 +859,8 @@ cdef inline int node_split_random( sizeof(intp_t) * n_found_constants) # Return values + best_split.n_constant_features = n_total_constants split[0] = best_split - n_constant_features[0] = n_total_constants return 0 @@ -1520,7 +1518,6 @@ cdef class BestSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1530,7 +1527,6 @@ cdef class BestSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1556,7 +1552,6 @@ cdef class BestSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1566,7 +1561,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1592,7 +1586,6 @@ cdef class RandomSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1602,7 +1595,6 @@ cdef class RandomSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1627,7 +1619,6 @@ cdef class RandomSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1637,7 +1628,6 @@ cdef class RandomSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, diff --git a/sklearn/tree/_tree.pyx 
b/sklearn/tree/_tree.pyx index ea873764069f6..60849fba6561f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -215,7 +215,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t left_child_max cdef float64_t right_child_min cdef float64_t right_child_max - cdef intp_t n_constant_features cdef bint is_leaf cdef bint first = 1 cdef intp_t max_depth_seen = -1 @@ -248,7 +247,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -271,7 +270,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, &split, - &n_constant_features, lower_bound, upper_bound ) @@ -338,7 +336,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -351,7 +349,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -606,7 +604,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef intp_t node_id cdef intp_t n_node_samples - cdef intp_t n_constant_features = 0 + split.n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf @@ -628,7 +626,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, &split, - &n_constant_features, lower_bound, upper_bound ) From 3e2cfc701624b201b8b805384e380f10ad6746a2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Mar 2024 22:36:19 -0400 Subject: [PATCH 18/54] Demo Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 6 +-- sklearn/tree/_splitter.pyx | 61 +++++++++++---------------- sklearn/tree/_tree.pxd | 9 ++++ sklearn/tree/_tree.pyx | 85 ++++++++++++++++++++------------------ 4 files changed, 78 insertions(+), 83 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 97fae3aea9e0a..554422fc595d3 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -11,6 +11,7 @@ cimport numpy as cnp from ._criterion cimport Criterion +from ._tree cimport ParentInfo from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t @@ -29,7 +30,6 @@ cdef struct SplitRecord: float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. 
intp_t n_missing # Number of missing values for the feature being split on - intp_t n_constant_features # Number of constant features in the split from parent cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -101,10 +101,8 @@ cdef class Splitter: cdef int node_split( self, - float64_t impurity, # Impurity of the node + ParentInfo* parent, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil cdef void node_value(self, float64_t* dest) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 52ecbbc3dfa6b..a861b73642be6 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,7 +49,6 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 - self.n_constant_features = 0 cdef class Splitter: """Abstract splitter class. @@ -232,10 +231,8 @@ cdef class Splitter: cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end]. @@ -301,12 +298,10 @@ cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, - float64_t impurity, SplitRecord* split, + ParentInfo* parent_record, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -338,6 +333,10 @@ cdef inline int node_split_best( cdef float64_t current_proxy_improvement = -INFINITY cdef float64_t best_proxy_improvement = -INFINITY + cdef float64_t impurity = parent_record.impurity + cdef float64_t lower_bound = parent_record.lower_bound + cdef float64_t upper_bound = parent_record.upper_bound + cdef intp_t f_i = n_features cdef intp_t f_j cdef intp_t p @@ -348,7 +347,7 @@ cdef inline int node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = split.n_constant_features + cdef intp_t n_known_constants = parent_record.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -558,7 +557,7 @@ cdef inline int node_split_best( sizeof(intp_t) * n_found_constants) # Return values - best_split.n_constant_features = n_total_constants + parent_record.n_constant_features = n_total_constants split[0] = best_split return 0 @@ -680,12 +679,10 @@ cdef inline int node_split_random( Splitter splitter, Partitioner partitioner, Criterion criterion, - float64_t impurity, SplitRecord* split, + ParentInfo* parent_record, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best random split on node samples[start:end] @@ -709,13 +706,17 @@ cdef inline int node_split_random( cdef float64_t current_proxy_improvement = - INFINITY cdef float64_t best_proxy_improvement = - INFINITY + cdef float64_t impurity = parent_record.impurity + cdef float64_t lower_bound = parent_record.lower_bound + cdef float64_t upper_bound = parent_record.upper_bound + cdef intp_t f_i = n_features cdef intp_t f_j # Number of features discovered to be constant during the split search cdef intp_t n_found_constants = 0 # Number of features known to be 
constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = split.n_constant_features + cdef intp_t n_known_constants = parent_record.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -859,7 +860,7 @@ cdef inline int node_split_random( sizeof(intp_t) * n_found_constants) # Return values - best_split.n_constant_features = n_total_constants + parent_record.n_constant_features = n_total_constants split[0] = best_split return 0 @@ -1516,21 +1517,17 @@ cdef class BestSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_best( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class BestSparseSplitter(Splitter): @@ -1550,21 +1547,17 @@ cdef class BestSparseSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_best( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class RandomSplitter(Splitter): @@ -1584,21 +1577,17 @@ cdef class RandomSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_random( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class RandomSparseSplitter(Splitter): @@ -1617,19 +1606,15 @@ cdef class RandomSparseSplitter(Splitter): ) cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_random( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e4081921f40f9..1bca2d57cb489 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -31,6 +31,15 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values +cdef struct ParentInfo: + # Structure to store information about the parent of a node + # This is passed to the splitter, to provide information about the previous split + + intp_t n_constant_features # the number of constant features found in parent + float64_t lower_bound # the lower bound of the parent's impurity + float64_t upper_bound # the upper bound of the parent's impurity + float64_t impurity # the impurity of the parent + cdef class Tree: # The Tree object is a binary tree structure constructed by the # TreeBuilder. 
The tree structure is used for predictions and diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 60849fba6561f..92e2e1daedd29 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -80,6 +80,12 @@ cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED cdef Node dummy NODE_DTYPE = np.asarray((&dummy)).dtype +cdef inline void _init_parent_record(ParentInfo* self) noexcept nogil: + self.n_constant_features = 0 + self.impurity = INFINITY + self.lower_bound = -INFINITY + self.upper_bound = INFINITY + # ============================================================================= # TreeBuilder # ============================================================================= @@ -207,7 +213,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef intp_t node_id - cdef float64_t impurity = INFINITY cdef float64_t lower_bound cdef float64_t upper_bound cdef float64_t middle_value @@ -223,6 +228,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef stack[StackRecord] builder_stack cdef StackRecord stack_record + cdef ParentInfo parent_record + with nogil: # push root node onto stack builder_stack.push({ @@ -246,10 +253,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): depth = stack_record.depth parent = stack_record.parent is_left = stack_record.is_left - impurity = stack_record.impurity - split.n_constant_features = stack_record.n_constant_features - lower_bound = stack_record.lower_bound - upper_bound = stack_record.upper_bound + parent_record.impurity = stack_record.impurity + parent_record.n_constant_features = stack_record.n_constant_features + parent_record.lower_bound = stack_record.lower_bound + parent_record.upper_bound = stack_record.upper_bound n_node_samples = end - start splitter.node_reset(start, end, &weighted_n_node_samples) @@ -260,18 +267,16 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * min_weight_leaf) if first: - impurity = splitter.node_impurity() + parent_record.impurity = splitter.node_impurity() first = 0 # impurity == 0 with tolerance due to rounding errors - is_leaf = is_leaf or impurity <= EPSILON + is_leaf = is_leaf or parent_record.impurity <= EPSILON if not is_leaf: splitter.node_split( - impurity, + &parent_record, &split, - lower_bound, - upper_bound ) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are @@ -281,8 +286,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): min_impurity_decrease)) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, + split.threshold, parent_record.impurity, + n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -293,7 +298,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) if not is_leaf: if ( @@ -336,7 +341,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": split.n_constant_features, + "n_constant_features": parent_record.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -349,7 +354,7 @@ cdef class 
DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": split.n_constant_features, + "n_constant_features": parent_record.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -456,6 +461,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef int rc = 0 cdef Node* node + cdef ParentInfo parent_record + parent_record.n_constant_features = 0 + # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes tree._resize(init_capacity) @@ -467,13 +475,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): tree=tree, start=0, end=n_node_samples, - impurity=INFINITY, is_first=IS_FIRST, is_left=IS_LEFT, parent=NULL, depth=0, - lower_bound=-INFINITY, - upper_bound=INFINITY, + parent_record=&parent_record, res=&split_node_left, ) if rc >= 0: @@ -531,18 +537,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): max_split_nodes -= 1 # Compute left split node + parent_record.lower_bound = left_child_min + parent_record.upper_bound = left_child_max + parent_record.impurity = record.impurity_left rc = self._add_split_node( splitter=splitter, tree=tree, start=record.start, end=record.pos, - impurity=record.impurity_left, is_first=IS_NOT_FIRST, is_left=IS_LEFT, parent=node, depth=record.depth + 1, - lower_bound=left_child_min, - upper_bound=left_child_max, + parent_record=&parent_record, res=&split_node_left, ) if rc == -1: @@ -552,18 +559,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node = &tree.nodes[record.node_id] # Compute right split node + parent_record.lower_bound = right_child_min + parent_record.upper_bound = right_child_max + parent_record.impurity = record.impurity_right rc = self._add_split_node( splitter=splitter, tree=tree, start=record.pos, end=record.end, - impurity=record.impurity_right, is_first=IS_NOT_FIRST, is_left=IS_NOT_LEFT, parent=node, depth=record.depth + 1, - lower_bound=right_child_min, - upper_bound=right_child_max, + parent_record=&parent_record, res=&split_node_right, ) if rc == -1: @@ -591,20 +599,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): Tree tree, intp_t start, intp_t end, - float64_t impurity, bint is_first, bint is_left, Node* parent, intp_t depth, - float64_t lower_bound, - float64_t upper_bound, + ParentInfo* parent_record, FrontierRecord* res ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split cdef intp_t node_id cdef intp_t n_node_samples - split.n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf @@ -612,22 +617,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: - impurity = splitter.node_impurity() + parent_record.impurity = splitter.node_impurity() n_node_samples = end - start is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or n_node_samples < 2 * self.min_samples_leaf or weighted_n_node_samples < 2 * self.min_weight_leaf or - impurity <= EPSILON # impurity == 0 with tolerance + parent_record.impurity <= EPSILON # impurity == 0 with tolerance ) if not is_leaf: splitter.node_split( - impurity, + parent_record, &split, - lower_bound, - upper_bound ) # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 @@ -638,8 +641,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, + split.feature, split.threshold, parent_record.impurity, + n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -647,15 +650,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # compute values also for split nodes (might become leafs later). splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) res.node_id = node_id res.start = start res.end = end res.depth = depth - res.impurity = impurity - res.lower_bound = lower_bound - res.upper_bound = upper_bound + res.impurity = parent_record.impurity + res.lower_bound = parent_record.lower_bound + res.upper_bound = parent_record.upper_bound res.middle_value = splitter.criterion.middle_value() if not is_leaf: @@ -671,8 +674,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.pos = end res.is_leaf = 1 res.improvement = 0.0 - res.impurity_left = impurity - res.impurity_right = impurity + res.impurity_left = parent_record.impurity + res.impurity_right = parent_record.impurity return 0 From 9acdf1b830f5dc61d4348804311f87308cf8ca1b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:09:42 -0400 Subject: [PATCH 19/54] Benchmarks Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 198 +++++++++++++++++++++++++++++++ sklearn/tree/_tree.pxd | 4 +- sklearn/tree/_tree.pyx | 12 +- 3 files changed, 206 insertions(+), 8 deletions(-) create mode 100644 benchmarks/bench_randomforest.py diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py new file mode 100644 index 0000000000000..68b3399924255 --- /dev/null +++ b/benchmarks/bench_randomforest.py @@ -0,0 +1,198 @@ +"""Instructions +1. Build this PR and run: + +```bash +python bench_randomforest.py bench ~/bench_results_forest pr +``` + +2. On main run: + +```bash +python bench_randomforest.py bench ~/bench_results_forest main +``` + +3. 
Plotting + +```bash +python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png +``` +""" + +from functools import partial +import argparse +from time import perf_counter +from statistics import mean, stdev +from itertools import product +import csv +from pathlib import Path + +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.datasets import make_classification, make_regression, make_low_rank_matrix +import numpy as np + +N_REPEATS = 10 +n_jobs = -3 + +benchmark_config = [ + ( + RandomForestRegressor, + list( + product( + ["squared_error"], + [ + make_regression, + ], + [10_000], + ["dense"], + ["best"], + ) + ), + ), + ( + RandomForestClassifier, + list( + product( + ["gini", "entropy"], + [ + partial(make_classification, n_informative=10, n_classes=5), + ], + [10_000], + ["dense"], + ["best"], + ) + ), + ), +] + +def bench(args): + bench_results, branch = args.bench_results, args.branch + results_dir = Path(bench_results) + results_dir.mkdir(exist_ok=True) + + results_path = results_dir / f"{branch}.csv" + + with results_path.open("w") as f: + writer = csv.DictWriter( + f, + fieldnames=[ + "criterion", + "n_samples", + "make_data", + "container", + "splitter", + "n_repeat", + "duration", + ], + ) + writer.writeheader() + + for Klass, items in benchmark_config: + + for config in items: + ( + criterion, + make_data, + n_samples, + container, + splitter, + ) = config + if isinstance(make_data, partial): + make_data_str = make_data.func.__name__ + else: + make_data_str = make_data.__name__ + + default_config = { + "criterion": criterion, + "n_samples": n_samples, + "make_data": make_data_str, + "container": container, + "splitter": splitter, + } + combine_config = " ".join(f"{k}={v}" for k, v in default_config.items()) + + klass_results = [] + for n_repeat in range(N_REPEATS): + print(f"Running {combine_config} with {n_repeat + 1}/{N_REPEATS}") + X, y = make_data( + n_samples=n_samples, + n_features=20, + random_state=n_repeat, + ) + forest = Klass(random_state=n_repeat, criterion=criterion, n_jobs=n_jobs) + + start = perf_counter() + forest.fit(X, y) + duration = perf_counter() - start + klass_results.append(duration) + writer.writerow( + { + **default_config, + **{ + "n_repeat": n_repeat, + "duration": duration, + }, + } + ) + results_mean, results_stdev = mean(klass_results), stdev(klass_results) + print( + f"{combine_config} with {results_mean:.3f} +/- {results_stdev:.3f}" + ) + +def plot(args): + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + results_path = Path(args.bench_results) + pr_path = results_path / f"{args.pr_name}.csv" + main_path = results_path / f"{args.main_name}.csv" + image_path = results_path / args.image_path + + df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) + df_main = pd.read_csv(main_path).assign(branch=args.main_name) + df_all = pd.concat((df_pr, df_main), ignore_index=True) + + df_all = df_all.assign( + make_data=df_all["make_data"] + .str.replace("_custom", "") + .str.replace("make_", "") + .str.replace("_data", "") + ) + + gb = df_all.groupby(["criterion", "make_data"]) + groups = gb.groups + + n_rows, n_cols = 2, 4 + fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) + axes_flat = axes.ravel() + for i, (keys, idx) in enumerate(groups.items()): + ax = axes_flat[i] + ax.set_title(" | ".join(keys)) + sns.boxplot(data=df_all.loc[idx], y="duration", x="branch", ax=ax) + if i % n_cols != 0: + ax.set_ylabel("") + + 
axes_flat[-1].set_visible(False) + + fig.savefig(image_path) + print(f"Saved image to {image_path}") + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + subparsers = parser.add_subparsers() + bench_parser = subparsers.add_parser("bench") + bench_parser.add_argument("bench_results") + bench_parser.add_argument("branch") + bench_parser.set_defaults(func=bench) + + plot_parser = subparsers.add_parser("plot") + plot_parser.add_argument("bench_results") + plot_parser.add_argument("pr_name") + plot_parser.add_argument("main_name") + plot_parser.add_argument("image_path") + plot_parser.set_defaults(func=plot) + + args = parser.parse_args() + args.func(args) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1bca2d57cb489..870f7fe875b0c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -35,10 +35,10 @@ cdef struct ParentInfo: # Structure to store information about the parent of a node # This is passed to the splitter, to provide information about the previous split - intp_t n_constant_features # the number of constant features found in parent float64_t lower_bound # the lower bound of the parent's impurity float64_t upper_bound # the upper bound of the parent's impurity float64_t impurity # the impurity of the parent + intp_t n_constant_features # the number of constant features found in parent cdef class Tree: # The Tree object is a binary tree structure constructed by the @@ -57,7 +57,7 @@ cdef class Tree: cdef public intp_t node_count # Counter for node IDs cdef public intp_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef float64_t* value # (capacity, n_outputs, max_n_classes) array of values + cdef float64_t* value # (capacity, n_outputs, max_n_classes) array of values cdef intp_t value_stride # = n_outputs * max_n_classes # Methods diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 92e2e1daedd29..224da2c14e5ec 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -310,12 +310,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Current bounds must always be propagated to both children. # If a monotonic constraint is active, bounds are used in # node value clipping. - left_child_min = right_child_min = lower_bound - left_child_max = right_child_max = upper_bound + left_child_min = right_child_min = parent_record.lower_bound + left_child_max = right_child_max = parent_record.upper_bound elif splitter.monotonic_cst[split.feature] == 1: # Split on a feature with monotonic increase constraint - left_child_min = lower_bound - right_child_max = upper_bound + left_child_min = parent_record.lower_bound + right_child_max = parent_record.upper_bound # Lower bound for right child and upper bound for left child # are set to the same value. @@ -324,8 +324,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): left_child_max = middle_value else: # i.e. splitter.monotonic_cst[split.feature] == -1 # Split on a feature with monotonic decrease constraint - right_child_min = lower_bound - left_child_max = upper_bound + right_child_min = parent_record.lower_bound + left_child_max = parent_record.upper_bound # Lower bound for left child and upper bound for right child # are set to the same value. 
From 9fc7847d168fe20bf5c8156cd1a9c7a4d7d80d42 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:21:49 -0400 Subject: [PATCH 20/54] Merge main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 2719b0b01aea7..5872683f416d5 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -297,13 +297,7 @@ cdef inline int node_split_best( SplitRecord* split, ParentInfo* parent_record, bint with_monotonic_cst, -<<<<<<< HEAD - const cnp.int8_t[:] monotonic_cst, -======= const int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ->>>>>>> main ) except -1 nogil: """Find the best split on node samples[start:end] From 13a3f89aa21541ed93d75ab4241f76a90bb2041e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:27:54 -0400 Subject: [PATCH 21/54] Bench size Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 62 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py index 68b3399924255..3dc245d65f012 100644 --- a/benchmarks/bench_randomforest.py +++ b/benchmarks/bench_randomforest.py @@ -15,9 +15,15 @@ ```bash python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png + +# or plot size +python bench_randomforest.py plot_size ~/bench_results_forest pr main results_image.png ``` """ - +import os +import tempfile +import sys +import pickle from functools import partial import argparse from time import perf_counter @@ -82,6 +88,8 @@ def bench(args): "splitter", "n_repeat", "duration", + "ram_size", + "file_size", ], ) writer.writeheader() @@ -124,12 +132,21 @@ def bench(args): forest.fit(X, y) duration = perf_counter() - start klass_results.append(duration) + + # benchmark size of object + ram_size = sys.getsizeof(forest) + with tempfile.TemporaryFile() as f: + pickle.dump(forest, f, -1) + file_size = os.path.getsize(f.name) + writer.writerow( { **default_config, **{ "n_repeat": n_repeat, "duration": duration, + "ram_size": ram_size, + "file_size": file_size, }, } ) @@ -177,6 +194,49 @@ def plot(args): fig.savefig(image_path) print(f"Saved image to {image_path}") + +def plot_size(args): + size_id = 'file_size' + + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + results_path = Path(args.bench_results) + pr_path = results_path / f"{args.pr_name}.csv" + main_path = results_path / f"{args.main_name}.csv" + image_path = results_path / args.image_path + + df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) + df_main = pd.read_csv(main_path).assign(branch=args.main_name) + df_all = pd.concat((df_pr, df_main), ignore_index=True) + + df_all = df_all.assign( + make_data=df_all["make_data"] + .str.replace("_custom", "") + .str.replace("make_", "") + .str.replace("_data", "") + ) + + gb = df_all.groupby(["criterion", "make_data"]) + groups = gb.groups + + n_rows, n_cols = 2, 4 + fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) + axes_flat = axes.ravel() + for i, (keys, idx) in enumerate(groups.items()): + ax = axes_flat[i] + ax.set_title(" | ".join(keys)) + sns.boxplot(data=df_all.loc[idx], y=size_id, x="branch", ax=ax) + if i % n_cols != 0: + ax.set_ylabel("") + + axes_flat[-1].set_visible(False) + + fig.savefig(image_path) + print(f"Saved image to {image_path}") + + if __name__ == "__main__": parser = argparse.ArgumentParser() 
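Note on PATCH 21 above: it extends the benchmark to record model size as well as fit time, via a shallow `sys.getsizeof` measurement and the size of the pickled forest. A standalone sketch of the same measurement follows; the dataset and estimator settings are arbitrary, and `f.tell()` is used here instead of `os.path.getsize` so the snippet does not rely on the temporary file exposing a path.

```python
import pickle
import sys
import tempfile

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1_000, n_features=20, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)

# Shallow size of the estimator object itself; referenced numpy arrays and
# the underlying Cython tree buffers are not included in this number.
ram_size = sys.getsizeof(forest)

# Size of the pickled estimator, read back from the write position.
with tempfile.TemporaryFile() as f:
    pickle.dump(forest, f, protocol=pickle.HIGHEST_PROTOCOL)
    file_size = f.tell()

print(f"getsizeof: {ram_size} bytes, pickled: {file_size} bytes")
```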
From e8214dfb0293c06d796231581b6c8a70a5768ed8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:56:18 -0400 Subject: [PATCH 22/54] Adding parentrecord Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 258 ------------------------------- 1 file changed, 258 deletions(-) delete mode 100644 benchmarks/bench_randomforest.py diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py deleted file mode 100644 index 3dc245d65f012..0000000000000 --- a/benchmarks/bench_randomforest.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Instructions -1. Build this PR and run: - -```bash -python bench_randomforest.py bench ~/bench_results_forest pr -``` - -2. On main run: - -```bash -python bench_randomforest.py bench ~/bench_results_forest main -``` - -3. Plotting - -```bash -python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png - -# or plot size -python bench_randomforest.py plot_size ~/bench_results_forest pr main results_image.png -``` -""" -import os -import tempfile -import sys -import pickle -from functools import partial -import argparse -from time import perf_counter -from statistics import mean, stdev -from itertools import product -import csv -from pathlib import Path - -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.datasets import make_classification, make_regression, make_low_rank_matrix -import numpy as np - -N_REPEATS = 10 -n_jobs = -3 - -benchmark_config = [ - ( - RandomForestRegressor, - list( - product( - ["squared_error"], - [ - make_regression, - ], - [10_000], - ["dense"], - ["best"], - ) - ), - ), - ( - RandomForestClassifier, - list( - product( - ["gini", "entropy"], - [ - partial(make_classification, n_informative=10, n_classes=5), - ], - [10_000], - ["dense"], - ["best"], - ) - ), - ), -] - -def bench(args): - bench_results, branch = args.bench_results, args.branch - results_dir = Path(bench_results) - results_dir.mkdir(exist_ok=True) - - results_path = results_dir / f"{branch}.csv" - - with results_path.open("w") as f: - writer = csv.DictWriter( - f, - fieldnames=[ - "criterion", - "n_samples", - "make_data", - "container", - "splitter", - "n_repeat", - "duration", - "ram_size", - "file_size", - ], - ) - writer.writeheader() - - for Klass, items in benchmark_config: - - for config in items: - ( - criterion, - make_data, - n_samples, - container, - splitter, - ) = config - if isinstance(make_data, partial): - make_data_str = make_data.func.__name__ - else: - make_data_str = make_data.__name__ - - default_config = { - "criterion": criterion, - "n_samples": n_samples, - "make_data": make_data_str, - "container": container, - "splitter": splitter, - } - combine_config = " ".join(f"{k}={v}" for k, v in default_config.items()) - - klass_results = [] - for n_repeat in range(N_REPEATS): - print(f"Running {combine_config} with {n_repeat + 1}/{N_REPEATS}") - X, y = make_data( - n_samples=n_samples, - n_features=20, - random_state=n_repeat, - ) - forest = Klass(random_state=n_repeat, criterion=criterion, n_jobs=n_jobs) - - start = perf_counter() - forest.fit(X, y) - duration = perf_counter() - start - klass_results.append(duration) - - # benchmark size of object - ram_size = sys.getsizeof(forest) - with tempfile.TemporaryFile() as f: - pickle.dump(forest, f, -1) - file_size = os.path.getsize(f.name) - - writer.writerow( - { - **default_config, - **{ - "n_repeat": n_repeat, - "duration": duration, - "ram_size": ram_size, - "file_size": file_size, - }, - } - ) - results_mean, 
results_stdev = mean(klass_results), stdev(klass_results) - print( - f"{combine_config} with {results_mean:.3f} +/- {results_stdev:.3f}" - ) - -def plot(args): - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - results_path = Path(args.bench_results) - pr_path = results_path / f"{args.pr_name}.csv" - main_path = results_path / f"{args.main_name}.csv" - image_path = results_path / args.image_path - - df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) - df_main = pd.read_csv(main_path).assign(branch=args.main_name) - df_all = pd.concat((df_pr, df_main), ignore_index=True) - - df_all = df_all.assign( - make_data=df_all["make_data"] - .str.replace("_custom", "") - .str.replace("make_", "") - .str.replace("_data", "") - ) - - gb = df_all.groupby(["criterion", "make_data"]) - groups = gb.groups - - n_rows, n_cols = 2, 4 - fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) - axes_flat = axes.ravel() - for i, (keys, idx) in enumerate(groups.items()): - ax = axes_flat[i] - ax.set_title(" | ".join(keys)) - sns.boxplot(data=df_all.loc[idx], y="duration", x="branch", ax=ax) - if i % n_cols != 0: - ax.set_ylabel("") - - axes_flat[-1].set_visible(False) - - fig.savefig(image_path) - print(f"Saved image to {image_path}") - - -def plot_size(args): - size_id = 'file_size' - - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - results_path = Path(args.bench_results) - pr_path = results_path / f"{args.pr_name}.csv" - main_path = results_path / f"{args.main_name}.csv" - image_path = results_path / args.image_path - - df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) - df_main = pd.read_csv(main_path).assign(branch=args.main_name) - df_all = pd.concat((df_pr, df_main), ignore_index=True) - - df_all = df_all.assign( - make_data=df_all["make_data"] - .str.replace("_custom", "") - .str.replace("make_", "") - .str.replace("_data", "") - ) - - gb = df_all.groupby(["criterion", "make_data"]) - groups = gb.groups - - n_rows, n_cols = 2, 4 - fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) - axes_flat = axes.ravel() - for i, (keys, idx) in enumerate(groups.items()): - ax = axes_flat[i] - ax.set_title(" | ".join(keys)) - sns.boxplot(data=df_all.loc[idx], y=size_id, x="branch", ax=ax) - if i % n_cols != 0: - ax.set_ylabel("") - - axes_flat[-1].set_visible(False) - - fig.savefig(image_path) - print(f"Saved image to {image_path}") - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - - subparsers = parser.add_subparsers() - bench_parser = subparsers.add_parser("bench") - bench_parser.add_argument("bench_results") - bench_parser.add_argument("branch") - bench_parser.set_defaults(func=bench) - - plot_parser = subparsers.add_parser("plot") - plot_parser.add_argument("bench_results") - plot_parser.add_argument("pr_name") - plot_parser.add_argument("main_name") - plot_parser.add_argument("image_path") - plot_parser.set_defaults(func=plot) - - args = parser.parse_args() - args.func(args) From 1d3299bc5d301087c426e683965e77c3457bb00b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 11:03:35 -0400 Subject: [PATCH 23/54] Fix lint Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 224da2c14e5ec..8b382a11791ec 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -213,8 +213,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord 
split cdef intp_t node_id - cdef float64_t lower_bound - cdef float64_t upper_bound cdef float64_t middle_value cdef float64_t left_child_min cdef float64_t left_child_max From 4fccb2a3380c0f4e56dd804841d0ea5c1c510396 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 22 Mar 2024 12:24:29 -0400 Subject: [PATCH 24/54] Fix bestfirst Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8b382a11791ec..117978b7722c2 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -460,7 +460,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Node* node cdef ParentInfo parent_record - parent_record.n_constant_features = 0 # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes @@ -614,6 +613,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) + # best-first splits do not track the number of constants when adding a split node + parent_record.n_constant_features = 0 + if is_first: parent_record.impurity = splitter.node_impurity() From d0a8d2fa5506f3a776e3e70af0c74dd272e8edd2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 22 Mar 2024 13:29:18 -0400 Subject: [PATCH 25/54] Init parent record Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 117978b7722c2..422b419fe923a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -227,6 +227,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef StackRecord stack_record cdef ParentInfo parent_record + _init_parent_record(&parent_record) with nogil: # push root node onto stack @@ -460,6 +461,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Node* node cdef ParentInfo parent_record + _init_parent_record(&parent_record) # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes From 82c94287b96d69acc92c94f2fbe6bd3418d455e7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 1 Apr 2024 17:05:09 -0400 Subject: [PATCH 26/54] Address thomas comments' -s Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 422b419fe923a..80ab7e7369407 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -615,7 +615,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) - # best-first splits do not track the number of constants when adding a split node + # reset n_constant_features for this specific split before beginning split search parent_record.n_constant_features = 0 if is_first: From b2dfe8f0305dca9ec69c8454d63370b47b9c45b5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 15:12:12 -0400 Subject: [PATCH 27/54] Revert to unit32_t Signed-off-by: Adam Li --- sklearn/tree/_utils.pxd | 7 +++---- sklearn/tree/_utils.pyx | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 03a1d48c94cb4..5ee16fa3f628c 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -11,10 +11,9 @@ import numpy as np cimport numpy as cnp cnp.import_array() -ctypedef cnp.npy_uint32 UINT32_t from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t 
from ._tree cimport Node @@ -52,11 +51,11 @@ cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) cdef intp_t rand_int(intp_t low, intp_t high, - UINT32_t* random_state) noexcept nogil + uint32_t* random_state) noexcept nogil cdef float64_t rand_uniform(float64_t low, float64_t high, - UINT32_t* random_state) noexcept nogil + uint32_t* random_state) noexcept nogil cdef float64_t log(float64_t x) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index cc4cb7cf02533..23c358ce4bd8b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -60,13 +60,13 @@ cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size): cdef inline intp_t rand_int(intp_t low, intp_t high, - UINT32_t* random_state) noexcept nogil: + uint32_t* random_state) noexcept nogil: """Generate a random integer in [low; end).""" return low + our_rand_r(random_state) % (high - low) cdef inline float64_t rand_uniform(float64_t low, float64_t high, - UINT32_t* random_state) noexcept nogil: + uint32_t* random_state) noexcept nogil: """Generate a random float64_t in [low; high).""" return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low From d90befced7ac83424e0cdbf81ea7932eede501d4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 16:24:38 -0400 Subject: [PATCH 28/54] UPdate submodule commit and remove extraneous code Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 12 ------------ sklearn/tree/_tree.pyx | 43 ------------------------------------------ 2 files changed, 55 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2267b4306e261..1a13730e76e6b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -69,18 +69,6 @@ cdef class BaseTree: cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil - cdef int _update_node( - self, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left - ) except -1 nogil - # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 673e2c5654ce1..d03a6e5a9f380 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1168,49 +1168,6 @@ cdef class BaseTree: return node_id - cdef inline int _update_node( - self, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left - ) except -1 nogil: - """Update a node on the tree. - - The updated node remains on the same position. - Returns (intp_t)(-1) on error. 
- """ - cdef intp_t node_id - if is_left: - node_id = self.nodes[parent].left_child - else: - node_id = self.nodes[parent].right_child - - if node_id >= self.capacity: - if self._resize_c() != 0: - return INTPTR_MAX - - cdef Node* node = &self.nodes[node_id] - node.impurity = impurity - node.n_node_samples = n_node_samples - node.weighted_n_node_samples = weighted_n_node_samples - - if is_leaf: - if self._set_leaf_node(split_node, node, node_id) != 1: - with gil: - raise RuntimeError - else: - if self._set_split_node(split_node, node, node_id) != 1: - with gil: - raise RuntimeError - node.missing_go_to_left = missing_go_to_left - - return node_id - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): From a52ec7442cb6600212f65f49fc3cef079c2ce019 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 17:12:01 -0400 Subject: [PATCH 29/54] Fixing LOC Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d03a6e5a9f380..688eb6d4d3982 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -435,7 +435,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( @@ -560,8 +564,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, - parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( From 750573c4a8d5620f25213ab4808008c8ecf5b5aa Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:02:38 -0400 Subject: [PATCH 30/54] Fix update tree node Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 12 ++++++++++ sklearn/tree/_tree.pyx | 52 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1a13730e76e6b..2267b4306e261 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -69,6 +69,18 @@ cdef class BaseTree: cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 688eb6d4d3982..b5c14f19f7982 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -422,10 +422,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, 
is_leaf, split_ptr, - parent_record.impurity, - n_node_samples, weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._update_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, + n_node_samples, weighted_n_node_samples, + split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -1175,6 +1175,50 @@ cdef class BaseTree: return node_id + cdef inline int _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. + + The updated node remains on the same position. + + Returns (intp_t)(-1) on error. + """ + cdef intp_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id + cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): From ec66190c2d696eab8078705ab4a68f9a178f6fb3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:10:08 -0400 Subject: [PATCH 31/54] Fix ci Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b5c14f19f7982..56809f7c4ee71 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -408,7 +408,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( &parent_record, - &split, + split_ptr, ) # assign local copy of SplitRecord to assign From f3607494ce509f1ff63255b0cb3bc5562de981a1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:10:25 -0400 Subject: [PATCH 32/54] Fix ci Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 56809f7c4ee71..486bad115787c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1189,7 +1189,7 @@ cdef class BaseTree: """Update a node on the tree. The updated node remains on the same position. - + Returns (intp_t)(-1) on error. """ cdef intp_t node_id From e0202533a542386b4da050d456e87f6508f3f477 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:53:02 -0400 Subject: [PATCH 33/54] MAINT Fix builder partial fit (#62) #### Reference Issues/PRs Fixes state of builder_ to not need to be maintained. Prolly needs unit-tests to determine if this "functions as desired". I.e. - changing datatype of X over multiple partial fits should fail nicely, - changing datatype of y - classification and regression #### What does this implement/fix? Explain your changes. #### Any other comments? 
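A sketch of the kind of unit test mentioned above, for the case where the dtype of `X` changes between `partial_fit` calls. This is a hypothetical sketch, not part of this patch; whether the second call should coerce the input or raise is exactly the open question.

```python
# Hypothetical test sketch -- not included in this patch.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier


def test_partial_fit_changing_X_dtype():
    X, y = make_classification(n_samples=100, n_features=5, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    clf.partial_fit(X.astype(np.float32), y, classes=np.unique(y))
    # The second call uses a different dtype; it should either be coerced
    # consistently with the first call or fail with a clear error, rather
    # than silently corrupting the tree state.
    clf.partial_fit(X.astype(np.float64), y)
    assert clf.n_features_in_ == X.shape[1]
```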
--------- Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 125 ++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6511c8192889e..e6949b293185d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -347,7 +347,7 @@ def _fit( ) self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - + self._n_classes_ = self.n_classes_ if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -377,6 +377,7 @@ def _fit( min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) self.min_samples_split_ = min_samples_split + self.min_samples_leaf_ = min_samples_leaf if isinstance(self.max_features, str): if self.max_features == "sqrt": @@ -411,6 +412,7 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * n_samples else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.min_weight_leaf_ = min_weight_leaf # build the actual tree now with the parameters self = self._build_tree( @@ -521,6 +523,7 @@ def _build_tree( # Since self.monotonic_cst encodes constraints on probabilities of the # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.monotonic_cst_ = monotonic_cst if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( @@ -544,7 +547,7 @@ def _build_tree( # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: - self.builder_ = DepthFirstTreeBuilder( + builder = DepthFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -554,7 +557,7 @@ def _build_tree( self.store_leaf_values, ) else: - self.builder_ = BestFirstTreeBuilder( + builder = BestFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -564,9 +567,7 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - self.builder_.build( - self.tree_, X, y, sample_weight, missing_values_in_feature_mask - ) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -1128,12 +1129,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + See Also -------- DecisionTreeRegressor : A decision tree regressor. 
@@ -1369,8 +1376,68 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): y = np.ascontiguousarray(y, dtype=DOUBLE) # Update tree - self.builder_.initialize_node_queue(self.tree_, X, y, sample_weight) - self.builder_.build(self.tree_, X, y, sample_weight) + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. + n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) self._prune_tree() @@ -1637,12 +1704,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- DecisionTreeClassifier : A decision tree classifier. @@ -2022,12 +2095,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. 
+ + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -2290,12 +2369,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. From 775f0b7de497fde5206101f82316e874ffb5545e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 20:25:32 -0400 Subject: [PATCH 34/54] Try again for partial fit Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 136 ++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e6949b293185d..2124cd76c69c8 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -466,7 +466,6 @@ def _build_tree( random_state : int Random seed. """ - n_samples = X.shape[0] # Build tree @@ -576,6 +575,75 @@ def _build_tree( self._prune_tree() return self + def _update_tree(self, X, y, sample_weight): + # Update tree + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. 
+ n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) + + self._prune_tree() + return self + def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -1375,71 +1443,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) - # Update tree - max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - min_samples_split = self.min_samples_split_ - min_samples_leaf = self.min_samples_leaf_ - min_weight_leaf = self.min_weight_leaf_ - # set decision-tree model parameters - max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - - monotonic_cst = self.monotonic_cst_ - - # Build tree - # Note: this reconstructs the builder with the same state it had during the - # initial fit. This is necessary because the builder is not saved as part - # of the class, and thus the state may be lost if pickled/unpickled. 
- n_samples = X.shape[0] - criterion = self.criterion - if not isinstance(criterion, BaseCriterion): - if is_classifier(self): - criterion = CRITERIA_CLF[self.criterion]( - self.n_outputs_, self._n_classes_ - ) - else: - criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) - else: - # Make a deepcopy in case the criterion has mutable attributes that - # might be shared and modified concurrently during parallel fitting - criterion = copy.deepcopy(criterion) - - random_state = check_random_state(self.random_state) - SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = SPLITTERS[self.splitter]( - criterion, - self.max_features_, - min_samples_leaf, - min_weight_leaf, - random_state, - monotonic_cst, - ) - - # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise - if max_leaf_nodes < 0: - builder = DepthFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - self.min_impurity_decrease, - self.store_leaf_values, - ) - else: - builder = BestFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - max_leaf_nodes, - self.min_impurity_decrease, - self.store_leaf_values, - ) - builder.initialize_node_queue(self.tree_, X, y, sample_weight) - builder.build(self.tree_, X, y, sample_weight) - - self._prune_tree() + self._update_tree(X, y, sample_weight) return self From e03a15b2b607b3e3b35105d056d0834e946e6527 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 24 May 2024 14:48:43 -0400 Subject: [PATCH 35/54] Adding inline comment Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 418eae57e4995..1d5a40e6c3f33 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -911,6 +911,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split + + # Note: we create a <*SplitRecord> pointer here in order to allow subclasses + # to know what kind of SplitRecord to use. In some cases, ObliqueSplitRecord + # might be used. The split pointer here knows the size of the underlying Record + # because the subclassed splitter will define "pointer_size" accordingly. cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) cdef intp_t node_id From 08658c62a442fdb597ff00b7f85fb3f490cf212c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jun 2024 18:54:03 -0400 Subject: [PATCH 36/54] Simplify cython partition api Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5d63f75781a42..d6d060a3b2d50 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -418,14 +418,17 @@ cdef inline intp_t node_split_best( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - bint with_monotonic_cst, - const int8_t[:] monotonic_cst, + # bint with_monotonic_cst, + # const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best split on node samples[start:end] Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
""" + cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst + cdef bint with_monotonic_cst = splitter.with_monotonic_cst + # Find the best split cdef intp_t start = splitter.start cdef intp_t end = splitter.end @@ -809,14 +812,15 @@ cdef inline int node_split_random( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - bint with_monotonic_cst, - const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best random split on node samples[start:end] Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ + cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst + cdef bint with_monotonic_cst = splitter.with_monotonic_cst + # Draw random splits and pick the best cdef intp_t start = splitter.start cdef intp_t end = splitter.end @@ -1662,8 +1666,6 @@ cdef class BestSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class BestSparseSplitter(Splitter): @@ -1692,8 +1694,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class RandomSplitter(Splitter): @@ -1722,8 +1722,6 @@ cdef class RandomSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class RandomSparseSplitter(Splitter): @@ -1751,6 +1749,4 @@ cdef class RandomSparseSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) From f0f69bec36d233434d14b72b2cb827ffdac3b97c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jun 2024 18:54:25 -0400 Subject: [PATCH 37/54] Simplify cython partition api Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d6d060a3b2d50..b88574c089bf7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -820,7 +820,7 @@ cdef inline int node_split_random( """ cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst cdef bint with_monotonic_cst = splitter.with_monotonic_cst - + # Draw random splits and pick the best cdef intp_t start = splitter.start cdef intp_t end = splitter.end From d455aa16ee9cc42ce342dd07d9b94db117783fcc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 21 Jun 2024 08:43:12 -0400 Subject: [PATCH 38/54] cimport _build pruned tree Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 45 ++++++++++---------------------------- sklearn/tree/_tree.pxd | 8 +++++++ 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b88574c089bf7..8bf71765355b3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -341,6 +341,8 @@ cdef class Splitter(BaseSplitter): This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. + + Returns 1 if not a valid split, and 0 if it is. 
""" cdef intp_t min_samples_leaf = self.min_samples_leaf cdef intp_t end_non_missing = self.end - n_missing @@ -418,8 +420,6 @@ cdef inline intp_t node_split_best( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - # bint with_monotonic_cst, - # const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -566,25 +566,7 @@ cdef inline intp_t node_split_best( current_split.pos = p - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): - continue - # Reject if min_samples_leaf is not guaranteed - if missing_go_to_left: - n_left = current_split.pos - splitter.start + n_missing - n_right = end_non_missing - current_split.pos - else: - n_left = current_split.pos - splitter.start - n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue @@ -624,6 +606,13 @@ cdef inline intp_t node_split_best( current_split.n_missing = n_missing if n_missing == 0: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + current_split.missing_go_to_left = n_left > n_right else: current_split.missing_go_to_left = missing_go_to_left @@ -938,10 +927,6 @@ cdef inline int node_split_random( criterion.reset() criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if splitter.check_postsplit_conditions() == 1: - continue - # Reject if monotonicity constraints are not satisfied if ( with_monotonic_cst and @@ -954,16 +939,8 @@ cdef inline int node_split_random( ): continue - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 45953a8e093a5..248f7b4e5f6c1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -191,3 +191,11 @@ cdef class TreeBuilder: const float64_t[:, ::1] y, const float64_t[:] sample_weight, ) + + +cdef _build_pruned_tree( + Tree tree, # OUT + Tree orig_tree, + const unsigned char[:] leaves_in_subtree, + intp_t capacity +) From ae2604ba53d092eaaec64eba0136a76460586cb0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 11 Jul 2024 08:44:43 -0400 Subject: [PATCH 39/54] Make sure its defined Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index e85f6ef5b2257..82835910fb800 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -959,6 +959,13 @@ cdef inline int node_split_random( current_split.threshold ) + if missing_go_to_left: + n_left = current_split.pos - start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - start + n_right = end_non_missing - current_split.pos + n_missing + # Reject if min_samples_leaf is not 
guaranteed if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue From b201fcb945fa54979a93bf8cc11f88c17d4b8fc9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Aug 2024 09:37:53 -0400 Subject: [PATCH 40/54] Update wrt main Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 042d4126ef770..aecb9a4d95009 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -58,8 +58,8 @@ cdef class BaseTree: float64_t weighted_n_node_samples, uint8_t missing_go_to_left ) except -1 nogil - cdef intp_t _resize(self, intp_t capacity) except -1 nogil - cdef intp_t _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil cdef intp_t _update_node( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 653d3f8c08892..18d9275115786 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -984,7 +984,7 @@ cdef class BaseTree: Downstream classes must implement methods to actually traverse the tree. """ - cdef intp_t _resize( + cdef int _resize( self, intp_t capacity ) except -1 nogil: @@ -999,7 +999,7 @@ cdef class BaseTree: with gil: raise MemoryError() - cdef intp_t _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) From 600187a53a8c1bee0b7092d69adda9064e3c0dbc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 10:20:07 -0400 Subject: [PATCH 41/54] Merging main Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7ffcc5454ba6f..ccc9d46393f4e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -67,6 +67,7 @@ class calls the ``fit`` method of each sub-estimator on random samples check_classification_targets, type_of_target, ) +from sklearn.utils._tags import get_tags from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( _check_feature_names_in, @@ -83,18 +84,7 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators + __all__ = [ "RandomForestClassifier", From 473f7bcf89fd058999323e8be470c3cda71da762 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 14:03:52 -0400 Subject: [PATCH 42/54] Reverting Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 766 +++--------------------------------- 1 file changed, 56 insertions(+), 710 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d6c26ae282068..ae729f4dfebdf 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ 
-39,14 +39,13 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real -from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import ( +from ..base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -54,12 +53,9 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.exceptions import DataConversionWarning -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder - +from ..exceptions import DataConversionWarning +from ..metrics import accuracy_score, r2_score +from ..preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -97,18 +93,14 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. - The expected total number of unique samples in a bootstrap sample is - required to be at most ``n_samples - 1``. - This is equivalent to the expected number of out-of-bag samples being at - least 1. - Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total; + - if float, this indicates a fraction of the total and should be + the interval `(0.0, 1.0]`; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -121,21 +113,12 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples - if expected_oob_samples >= n_samples - 1: - raise ValueError( - "The expected number of unique samples in the bootstrap sample" - f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" - ) + if max_samples > n_samples: + msg = "`max_samples` must be <= n_samples={} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) return max_samples if isinstance(max_samples, Real): - expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples - if expected_oob_samples >= n_samples - 1: - raise ValueError( - "The expected number of unique samples in the bootstrap sample" - f" must be at most {n_samples - 1}. 
It is: {expected_oob_samples}" - ) return max(round(n_samples * max_samples), 1) @@ -177,7 +160,6 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, - classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -210,7 +192,6 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, ) else: tree._fit( @@ -219,50 +200,6 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - - return tree - - -def _parallel_update_trees( - tree, - bootstrap, - X, - y, - sample_weight, - tree_idx, - n_trees, - verbose=0, - class_weight=None, - n_samples_bootstrap=None, - classes=None, -): - """ - Private function used to fit a single tree in parallel.""" - if verbose > 1: - print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) - - if bootstrap: - n_samples = X.shape[0] - indices = _generate_sample_indices( - tree.random_state, n_samples, n_samples_bootstrap - ) - - tree.partial_fit( - X[indices, :], - y[indices], - sample_weight=sample_weight, - check_input=False, - classes=classes, - ) - else: - tree.partial_fit( - X, - y, - sample_weight=sample_weight, - check_input=False, - classes=classes, ) return tree @@ -289,11 +226,6 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], - "max_bins": [ - None, - Interval(Integral, 1, None, closed="left"), - ], - "store_leaf_values": ["boolean"], } @abstractmethod @@ -311,8 +243,6 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -328,8 +258,6 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples - self.max_bins = max_bins - self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -349,15 +277,6 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -407,7 +326,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, classes=None): + def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -429,9 +348,6 @@ def fit(self, X, y, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. 
- Returns ------- self : object @@ -500,7 +416,7 @@ def fit(self, X, y, sample_weight=None, classes=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) + y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -539,38 +455,6 @@ def fit(self, X, y, sample_weight=None, classes=None): n_more_estimators = self.n_estimators - len(self.estimators_) - if self.max_bins is not None: - # `_openmp_effective_n_threads` is used to take cgroups CPU quotes - # into account when determine the maximum number of threads to use. - n_threads = _openmp_effective_n_threads() - - # Bin the data - # For ease of use of the API, the user-facing GBDT classes accept the - # parameter max_bins, which doesn't take into account the bin for - # missing values (which is always allocated). However, since max_bins - # isn't the true maximal number of bins, all other private classes - # (binmapper, histbuilder...) accept n_bins instead, which is the - # actual total number of bins. Everywhere in the code, the - # convention is that n_bins == max_bins + 1 - n_bins = self.max_bins + 1 # + 1 for missing values - self._bin_mapper = _BinMapper( - n_bins=n_bins, - # is_categorical=self.is_categorical_, - known_categories=None, - random_state=random_state, - n_threads=n_threads, - ) - - # XXX: in order for this to work with the underlying tree submodule's Cython - # code, we need to convert this into the original data's DTYPE because - # the Cython code assumes that `DTYPE` is used. - # The proper implementation will be a lot more complicated and should be - # tackled once scikit-learn has finalized their inclusion of missing data - # and categorical support for decision trees - X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) - else: - self._bin_mapper = None - if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -589,18 +473,41 @@ def fit(self, X, y, sample_weight=None, classes=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - # construct the trees in parallel - self._construct_trees( - X, - y, - sample_weight, - random_state, - n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. 
+ trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + for i, t in enumerate(trees) ) + # Collect newly grown trees + self.estimators_.extend(trees) + if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -634,53 +541,6 @@ def fit(self, X, y, sample_weight=None, classes=None): return self - def _construct_trees( - self, - X, - y, - sample_weight, - random_state, - n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, - ): - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - for i, t in enumerate(trees) - ) - - # Collect newly grown trees - self.estimators_.extend(trees) - @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -763,7 +623,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y, classes=None): + def _validate_y_class_weight(self, y): # Default implementation return y, None @@ -823,174 +683,6 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) - def _bin_data(self, X, is_training_data): - """Bin data X. - - If is_training_data, then fit the _bin_mapper attribute. - Else, the binned data is converted to a C-contiguous array. - """ - description = "training" if is_training_data else "validation" - if self.verbose: - print( - "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), - end="", - flush=True, - ) - tic = time() - if is_training_data: - X_binned = self._bin_mapper.fit_transform(X) # F-aligned array - else: - X_binned = self._bin_mapper.transform(X) # F-aligned array - # We convert the array to C-contiguous since predicting is faster - # with this layout (training is faster on F-arrays though) - X_binned = np.ascontiguousarray(X_binned) - toc = time() - if self.verbose: - duration = toc - tic - print("{:.3f} s".format(duration)) - - return X_binned - - def predict_quantiles(self, X, quantiles=0.5, method="nearest"): - """Predict class or regression value for X at given quantiles. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Input data. - quantiles : float, optional - The quantiles at which to evaluate, by default 0.5 (median). - method : str, optional - The method to interpolate, by default 'linear'. 
Can be any keyword - argument accepted by :func:`~np.quantile`. - - Returns - ------- - y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) - The predicted values. The ``n_outputs`` dimension is present only - for multi-output regressors. - """ - if not self.store_leaf_values: - raise RuntimeError( - "Quantile prediction is not available when store_leaf_values=False" - ) - check_is_fitted(self) - # Check data - X = self._validate_X_predict(X) - - if not isinstance(quantiles, (np.ndarray, list)): - quantiles = np.array([quantiles]) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) - - # avoid storing the output of every estimator by summing them here - if self.n_outputs_ > 1: - y_hat = np.zeros( - (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 - ) - else: - y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) - - # get (n_samples, n_estimators) indicator of leaf nodes - X_leaves = self.apply(X) - - # we now want to aggregate all leaf samples across all trees for each sample - for idx in range(X.shape[0]): - # get leaf nodes for this sample - leaf_nodes = X_leaves[idx, :] - - # (n_total_leaf_samples, n_outputs) - leaf_node_samples = np.vstack( - [ - est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] - for jdx, est in enumerate(self.estimators_) - ] - ) - - # get quantiles across all leaf node samples - try: - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, method=method - ) - except TypeError: - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, interpolation=method - ) - - if is_classifier(self): - if self.n_outputs_ == 1: - for i in range(len(quantiles)): - class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) - y_hat[idx, ...] = self.classes_.take( - class_pred_per_sample, axis=0 - ) - else: - for k in range(self.n_outputs_): - for i in range(len(quantiles)): - class_pred_per_sample = ( - y_hat[idx, i, k].squeeze().astype(int) - ) - y_hat[idx, i, k] = self.classes_[k].take( - class_pred_per_sample, axis=0 - ) - return y_hat - - def get_leaf_node_samples(self, X): - """For each datapoint x in X, get the training samples in the leaf node. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Dataset to apply the forest to. - - Returns - ------- - leaf_node_samples : a list of array-like - Each sample is represented by the indices of the training samples that - reached the leaf node. The ``n_leaf_node_samples`` may vary between - samples, since the number of samples that fall in a leaf node is - variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
- """ - if not self.store_leaf_values: - raise RuntimeError( - "Leaf node samples are not available when store_leaf_values=False" - ) - - check_is_fitted(self) - # Check data - X = self._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) - - # avoid storing the output of every estimator by summing them here - result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( - delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) - for e in self.estimators_ - ) - leaf_nodes_samples = result[0] - for result_ in result[1:]: - for i, node_samples in enumerate(result_): - leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) - return leaf_nodes_samples - def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -1046,17 +738,6 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] -def _accumulate_leaf_nodes_samples(func, X): - """ - This is a utility function for joblib's Parallel. - - It can't go locally in ForestClassifier or ForestRegressor, because joblib - complains that it cannot pickle it when placed there. - """ - leaf_nodes_samples = func(X, check_input=False) - return leaf_nodes_samples - - class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
@@ -1080,8 +761,6 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -1095,8 +774,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) @staticmethod @@ -1151,7 +828,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y, classes=None): + def _validate_y_class_weight(self, y): check_classification_targets(y) y = np.copy(y) @@ -1164,28 +841,12 @@ def _validate_y_class_weight(self, y, classes=None): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - if classes is not None: - classes = np.atleast_1d(classes) - if classes.ndim == 1: - classes = np.array([classes]) - - for k in classes: - self.classes_.append(np.array(k)) - self.n_classes_.append(np.array(k).shape[0]) - - for i in range(y.shape[0]): - for j in range(self.n_outputs_): - y_store_unique_indices[i, j] = np.where( - self.classes_[j] == y[i, j] - )[0][0] - else: - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices if self.class_weight is not None: @@ -1220,228 +881,6 @@ def _validate_y_class_weight(self, y, classes=None): return y, expanded_class_weight - def partial_fit(self, X, y, sample_weight=None, classes=None): - """Update a decision tree classifier from the training set (X, y). - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, it will be converted to - ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csc_matrix``. - - y : array-like of shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels) as integers or strings. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If None, then samples are equally weighted. Splits - that would create child nodes with net zero or negative weight are - ignored while searching for a split in each node. Splits are also - ignored if they would result in any single class carrying a - negative weight in either child node. - - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Must be provided at the first call to partial_fit, can be omitted - in subsequent calls. - - Returns - ------- - self : object - Returns the instance itself. 
- """ - self._validate_params() - - # validate input parameters - first_call = _check_partial_fit_first_call(self, classes=classes) - - # Fit if no tree exists yet - if first_call: - self.fit( - X, - y, - sample_weight=sample_weight, - classes=classes, - ) - return self - - X, y = self._validate_data( - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - force_all_finite=False, - reset=first_call, - ) - - if issparse(y): - raise ValueError("sparse multilabel-indicator for y is not supported.") - - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - - if issparse(X): - # Pre-sort indices to avoid that each individual tree of the - # ensemble sorts the indices. - X.sort_indices() - - y = np.atleast_1d(y) - if y.ndim == 2 and y.shape[1] == 1: - warn( - ( - "A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel()." - ), - DataConversionWarning, - stacklevel=2, - ) - - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) - - if self.criterion == "poisson": - if np.any(y < 0): - raise ValueError( - "Some value(s) of y are negative which is " - "not allowed for Poisson regression." - ) - if np.sum(y) <= 0: - raise ValueError( - "Sum of y is not strictly positive which " - "is necessary for Poisson regression." - ) - - self.n_outputs_ = y.shape[1] - - classes = self.classes_ - if self.n_outputs_ == 1: - classes = [classes] - - y, expanded_class_weight = self._validate_y_class_weight(y, classes) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - - if expanded_class_weight is not None: - if sample_weight is not None: - sample_weight = sample_weight * expanded_class_weight - else: - sample_weight = expanded_class_weight - - if not self.bootstrap and self.max_samples is not None: - raise ValueError( - "`max_sample` cannot be set if `bootstrap=False`. " - "Either switch to `bootstrap=True` or set " - "`max_sample=None`." - ) - elif self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X.shape[0], max_samples=self.max_samples - ) - else: - n_samples_bootstrap = None - - self._validate_estimator() - - if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available if bootstrap=True") - - random_state = check_random_state(self.random_state) - - if self.max_bins is not None: - # `_openmp_effective_n_threads` is used to take cgroups CPU quotes - # into account when determine the maximum number of threads to use. - n_threads = _openmp_effective_n_threads() - - # Bin the data - # For ease of use of the API, the user-facing GBDT classes accept the - # parameter max_bins, which doesn't take into account the bin for - # missing values (which is always allocated). However, since max_bins - # isn't the true maximal number of bins, all other private classes - # (binmapper, histbuilder...) accept n_bins instead, which is the - # actual total number of bins. 
Everywhere in the code, the - # convention is that n_bins == max_bins + 1 - n_bins = self.max_bins + 1 # + 1 for missing values - self._bin_mapper = _BinMapper( - n_bins=n_bins, - # is_categorical=self.is_categorical_, - known_categories=None, - random_state=random_state, - n_threads=n_threads, - ) - - # XXX: in order for this to work with the underlying tree submodule's Cython - # code, we need to convert this into the original data's DTYPE because - # the Cython code assumes that `DTYPE` is used. - # The proper implementation will be a lot more complicated and should be - # tackled once scikit-learn has finalized their inclusion of missing data - # and categorical support for decision trees - X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) - else: - self._bin_mapper = None - - # We draw from the random state to get the random state we - # would have got if we hadn't used a warm_start. - random_state.randint(MAX_INT, size=len(self.estimators_)) - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_update_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(self.estimators_), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - classes=classes[0], - ) - for i, t in enumerate(self.estimators_) - ) - - if self.oob_score: - y_type = type_of_target(y) - if y_type in ("multiclass-multioutput", "unknown"): - # FIXME: we could consider to support multiclass-multioutput if - # we introduce or reuse a constructor parameter (e.g. - # oob_score) allowing our user to pass a callable defining the - # scoring strategy on OOB sample. - raise ValueError( - "The type of target cannot be used to compute OOB " - f"estimates. Got {y_type} while only the following are " - "supported: continuous, continuous-multioutput, binary, " - "multiclass, multilabel-indicator." - ) - - if callable(self.oob_score): - self._set_oob_score_and_attributes( - X, y, scoring_function=self.oob_score - ) - else: - self._set_oob_score_and_attributes(X, y) - - # Decapsulate classes_ attributes - if hasattr(self, "classes_") and self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] - return self - def predict(self, X): """ Predict class for X. @@ -1507,14 +946,6 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. 
- if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1597,8 +1028,6 @@ def __init__( verbose=0, warm_start=False, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator, @@ -1611,8 +1040,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1638,14 +1065,6 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1941,16 +1360,6 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -2108,8 +1517,6 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2126,7 +1533,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2137,8 +1543,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2332,17 +1736,6 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. Used for - speeding up training time. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2484,8 +1877,6 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2502,7 +1893,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2512,8 +1902,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2713,16 +2101,6 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. 
- - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2869,8 +2247,6 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2887,7 +2263,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2898,8 +2273,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -3087,16 +2460,6 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -3223,8 +2586,6 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -3241,7 +2602,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -3251,8 +2611,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -3376,9 +2734,6 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -3480,7 +2835,6 @@ def __init__( random_state=None, verbose=0, warm_start=False, - store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -3495,7 +2849,6 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", - "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -3504,7 +2857,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, - store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -3518,7 +2870,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None, classes=None): + def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -3539,20 +2891,17 @@ def fit(self, X, y=None, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Returns ------- self : object Returns the instance itself. 
""" # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) + self.fit_transform(X, y, sample_weight=sample_weight) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None, classes=None): + def fit_transform(self, X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -3572,9 +2921,6 @@ def fit_transform(self, X, y=None, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -3582,7 +2928,7 @@ def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight, classes=classes) + super().fit(X, y, sample_weight=sample_weight) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) From ee4b9b777600a1c4da322c4f703b665037d97a3c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 15:43:47 -0400 Subject: [PATCH 43/54] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 759 +++++++++++++++++++++++++++++++++--- sklearn/tree/_classes.py | 14 +- 2 files changed, 712 insertions(+), 61 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ae729f4dfebdf..ab063a72057de 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -36,6 +36,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # SPDX-License-Identifier: BSD-3-Clause +from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real @@ -65,6 +66,7 @@ class calls the ``fit`` method of each sub-estimator on random samples ) from ..tree._tree import DOUBLE, DTYPE from ..utils import check_random_state, compute_sample_weight +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils._tags import get_tags from ..utils.multiclass import check_classification_targets, type_of_target @@ -76,8 +78,10 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, validate_data, ) +from ._hist_gradient_boosting.binning import _BinMapper from ._base import BaseEnsemble, _partition_estimators + __all__ = [ "RandomForestClassifier", "RandomForestRegressor", @@ -93,14 +97,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. 
@@ -113,12 +121,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) @@ -160,6 +177,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -192,6 +210,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -200,6 +219,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -226,6 +289,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -243,6 +311,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -258,6 +328,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -277,6 +349,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -326,7 +407,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -348,6 +429,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -416,7 +500,7 @@ def fit(self, X, y, sample_weight=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +539,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -473,41 +589,18 @@ def fit(self, X, y, sample_weight=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. 
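Aside, before the parallel loop that follows: the `max_bins` handling added above reuses the histogram-gradient-boosting convention that `n_bins == max_bins + 1`, with the extra bin reserved for missing values. A rough numpy-only sketch of the per-feature quantile binning that `_BinMapper` performs; the real mapper also handles the missing-value bin, categorical features, and subsampling, which this sketch deliberately skips:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 3)).astype(np.float32)

max_bins = 255
# Quantile-based cut points per feature: max_bins - 1 thresholds give
# max_bins bins for the non-missing values, fitted on the training data.
thresholds = [
    np.quantile(X[:, j], np.linspace(0, 1, max_bins + 1)[1:-1])
    for j in range(X.shape[1])
]
X_binned = np.column_stack(
    [np.searchsorted(thresholds[j], X[:, j], side="right") for j in range(X.shape[1])]
).astype(np.uint8)

print(X_binned.min(), X_binned.max())  # bin indices fall in [0, max_bins - 1]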
- trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -541,6 +634,53 @@ def fit(self, X, y, sample_weight=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -623,7 +763,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -683,6 +823,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. 
+ quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -738,6 +1046,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
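Aside: `get_leaf_node_samples` and the `_accumulate_leaf_nodes_samples` helper above, like `predict_quantiles` earlier, all build on the same idea: `apply(X)` yields one leaf id per (sample, tree), the training targets that landed in those leaves are stacked across trees, and a quantile of the pooled values becomes the prediction. A rough sketch using only the upstream scikit-learn API (no `store_leaf_values`), so the per-leaf targets are rebuilt here from the full training set rather than read out of the stored leaf samples:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)

# (n_samples, n_estimators) leaf ids for the training data and a query point.
train_leaves = forest.apply(X)
query_leaves = forest.apply(X[:1])

# Pool every training target that shares a leaf with the query, per tree,
# then take quantiles over the pooled targets.
pooled = np.concatenate(
    [y[train_leaves[:, t] == query_leaves[0, t]] for t in range(train_leaves.shape[1])]
)
print(np.quantile(pooled, [0.1, 0.5, 0.9]))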
@@ -761,6 +1080,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +1095,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -828,7 +1151,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -841,12 +1164,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -881,6 +1220,229 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. 
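Aside on the explicit `classes` path added to `_validate_y_class_weight` above: when the full class list is supplied up front (as `partial_fit` requires), labels are encoded against that fixed ordering instead of whatever `np.unique` finds in the current batch. A small illustrative sketch:

import numpy as np

classes = np.array(["ham", "spam"])          # fixed, user-supplied class list
y_batch = np.array(["spam", "ham", "spam"])  # a mini-batch may miss some classes

# Same effect as the np.where lookup in the hunk above, vectorised;
# np.searchsorted assumes `classes` is sorted, which np.unique guarantees.
encoded = np.searchsorted(classes, y_batch)
print(encoded)  # [1 0 1]

# Encoding with np.unique on the batch alone would give a batch-dependent
# coding whenever a class is absent, which is why the fixed list matters.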
+ """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. 
Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -946,6 +1508,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1028,6 +1598,8 @@ def __init__( verbose=0, warm_start=False, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1040,6 +1612,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1065,6 +1639,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1360,6 +1942,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1517,6 +2109,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1533,6 +2127,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1543,6 +2138,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1736,6 +2333,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -1877,6 +2485,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1893,6 +2503,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1902,6 +2513,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2101,6 +2714,16 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. 
+ + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2247,6 +2870,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2263,6 +2888,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2273,6 +2899,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2460,6 +3088,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2586,6 +3224,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2602,6 +3242,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2611,6 +3252,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2734,6 +3377,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2835,6 +3481,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2849,6 +3496,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2857,6 +3505,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2870,7 +3519,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2891,17 +3540,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. 
""" # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2921,6 +3573,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2928,7 +3583,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e792e768c17d..2e13800cd09bf 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,7 @@ import numpy as np from scipy.sparse import issparse -from sklearn.base import ( +from ..base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -24,13 +24,10 @@ clone, is_classifier, ) -from sklearn.utils import Bunch, check_random_state, compute_sample_weight -from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from sklearn.utils.multiclass import ( - _check_partial_fit_first_call, - check_classification_targets, -) -from sklearn.utils.validation import ( +from ..utils import Bunch, check_random_state, compute_sample_weight +from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, @@ -38,7 +35,6 @@ check_is_fitted, validate_data, ) - from . import _criterion, _splitter, _tree from ._criterion import BaseCriterion from ._splitter import BaseSplitter From d3788bfa41df61ecba8d1281ae175e74f9558dda Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:07:17 -0400 Subject: [PATCH 44/54] Fixed Signed-off-by: Adam Li --- meson.build | 2 ++ sklearn/meson.build | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/meson.build b/meson.build index 3f14108f77998..0f7f0262677a2 100644 --- a/meson.build +++ b/meson.build @@ -44,6 +44,8 @@ endif tempita = files('sklearn/_build_utils/tempita.py') +option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') + py = import('python').find_installation(pure: false) # Copy all the .py files to the install dir, rather than using diff --git a/sklearn/meson.build b/sklearn/meson.build index 4bf896fcdeaef..a31ff09a1170a 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,14 +193,16 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' -) +if get_option('enable_custom_target') + custom_target('write_built_with_meson_file', + output: '_built_with_meson.py', + command: [ + py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' + ], + install: true, + install_dir: py.get_install_dir() / 'sklearn' + ) +endif extensions = ['_isotonic'] From 8f32f299ba28d276bc031f6b185006bd0a52a9cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:10:23 -0400 Subject: [PATCH 45/54] Fixed Signed-off-by: Adam Li --- meson.build | 2 -- meson_options.txt | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 meson_options.txt diff --git a/meson.build b/meson.build index 0f7f0262677a2..3f14108f77998 100644 --- a/meson.build +++ b/meson.build @@ -44,8 +44,6 @@ endif tempita = files('sklearn/_build_utils/tempita.py') -option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') - py = import('python').find_installation(pure: false) # Copy all the .py files to the install dir, rather than using diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000000000..a6cf17b45a8c4 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') From 8c6be9f3024f3519cfa2159e2db2c5125c3e9e56 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:16:28 -0400 Subject: [PATCH 46/54] Fixed Signed-off-by: Adam Li --- meson_options.txt | 1 - sklearn/meson.build | 18 ++++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) delete mode 100644 meson_options.txt diff --git a/meson_options.txt b/meson_options.txt deleted file mode 100644 index a6cf17b45a8c4..0000000000000 --- a/meson_options.txt +++ /dev/null @@ -1 +0,0 @@ -option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') diff --git a/sklearn/meson.build b/sklearn/meson.build index a31ff09a1170a..4099346f554ca 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,16 +193,14 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-if get_option('enable_custom_target') - custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' - ) -endif +# custom_target('write_built_with_meson_file', +# output: '_built_with_meson.py', +# command: [ +# py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' +# ], +# install: true, +# install_dir: py.get_install_dir() / 'sklearn' +# ) extensions = ['_isotonic'] From 5b074dd386af2791c57c556c89a65528e62a3c15 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:01:01 -0400 Subject: [PATCH 47/54] Reverting back to imports Signed-off-by: Adam Li --- sklearn/ensemble/_base.py | 18 ++++++++++++------ sklearn/ensemble/_forest.py | 24 ++++++++++++------------ sklearn/ensemble/tests/test_forest.py | 2 ++ sklearn/tree/_classes.py | 10 +++++----- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 2789dd234294e..9adae766cebad 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -8,12 +8,18 @@ import numpy as np from joblib import effective_n_jobs -from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor -from ..utils import Bunch, check_random_state -from ..utils._tags import get_tags -from ..utils._user_interface import _print_elapsed_time -from ..utils.metadata_routing import _routing_enabled -from ..utils.metaestimators import _BaseComposition +from sklearn.base import ( + BaseEstimator, + MetaEstimatorMixin, + clone, + is_classifier, + is_regressor, +) +from sklearn.utils import Bunch, check_random_state +from sklearn.utils._tags import get_tags +from sklearn.utils._user_interface import _print_elapsed_time +from sklearn.utils.metadata_routing import _routing_enabled +from sklearn.utils.metaestimators import _BaseComposition def _fit_single_estimator( diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ab063a72057de..3f6ed9d5040ab 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -46,7 +46,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -54,9 +54,9 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -65,20 +65,20 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( +from sklearn.utils import check_random_state, 
compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import get_tags +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( _check_feature_names_in, _check_sample_weight, _num_samples, check_is_fitted, validate_data, ) -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from ._base import BaseEnsemble, _partition_estimators diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b9579c2135572..51fbb3e823726 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1821,6 +1821,7 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.skip() @pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classification_toy_withbins(name): """Check classification on a toy dataset.""" @@ -1843,6 +1844,7 @@ def test_classification_toy_withbins(name): assert leaf_indices.shape == (len(X), clf.n_estimators) +@pytest.mark.skip() @pytest.mark.parametrize("name", FOREST_REGRESSORS) @pytest.mark.parametrize( "criterion", ("squared_error", "absolute_error", "friedman_mse") diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e13800cd09bf..e0f30bf864010 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -24,10 +24,10 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, From 80959211c228bc50e928ffefe30ff2457d7814e9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:06:19 -0400 Subject: [PATCH 48/54] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 28 ++++++++++++++++------------ sklearn/tree/_classes.py | 6 +++++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 3f6ed9d5040ab..b01a27f14462d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -36,10 +36,10 @@ class calls the ``fit`` method of each sub-estimator on random samples # SPDX-License-Identifier: BSD-3-Clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -54,22 +54,20 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from 
sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from ..tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils._tags import get_tags -from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( _check_feature_names_in, @@ -78,9 +76,15 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, validate_data, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from ._base import BaseEnsemble, _partition_estimators +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e0f30bf864010..2e792e768c17d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -26,7 +26,10 @@ ) from sklearn.utils import Bunch, check_random_state, compute_sample_weight from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, @@ -35,6 +38,7 @@ check_is_fitted, validate_data, ) + from . import _criterion, _splitter, _tree from ._criterion import BaseCriterion from ._splitter import BaseSplitter From 960b589554982b2d08404186bf57a4de83862e80 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:28:36 -0400 Subject: [PATCH 49/54] Fix validate Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e792e768c17d..32bb14e7827b4 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1397,8 +1397,8 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): # csr. 
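The "Fix validate" change in this commit (and the follow-up flip-flops below) appears to track an upstream API move: the private `self._validate_data(...)` method was replaced by the module-level helper `sklearn.utils.validation.validate_data(estimator, ...)`, which takes the estimator as its first argument but otherwise accepts the same keywords (`reset`, `validate_separately`, and the usual `check_array` options such as `dtype` and `accept_sparse`). A minimal sketch of the new calling convention, using a hypothetical estimator rather than the patched tree classes:

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data

class TinyClassifier(ClassifierMixin, BaseEstimator):
    """Hypothetical estimator, only here to show the validate_data call."""

    def fit(self, X, y):
        # old style: X, y = self._validate_data(X, y, dtype=np.float64)
        # new style: pass the estimator explicitly as the first argument
        X, y = validate_data(self, X, y, dtype=np.float64)
        self.classes_ = np.unique(y)
        return self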
check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, reset=False, validate_separately=(check_X_params, check_y_params) + X, y = validate_data( + self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) ) if issparse(X): X.sort_indices() From 4551602a68b5410dbf67b13f5acbdc64705b0c62 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:37:39 -0400 Subject: [PATCH 50/54] Fix partial fit Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 10 +++++++++- sklearn/tree/_classes.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b01a27f14462d..1cba7ddbb20b5 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1254,7 +1254,15 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self : object Returns the instance itself. """ - self._validate_params() + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 32bb14e7827b4..292c0e1e8a063 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -252,7 +252,7 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - if y is not None or self.__sklearn_tags__().requires_y: + if y is not None or self.__sklearn_tags__().required: X, y = validate_data( self, X, y, validate_separately=(check_X_params, check_y_params) ) @@ -1375,7 +1375,15 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): self : DecisionTreeClassifier Fitted estimator. """ - self._validate_params() + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) @@ -1398,7 +1406,11 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), ) if issparse(X): X.sort_indices() From dd58597a04ed339654b801669f9aa13e87555b18 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:40:51 -0400 Subject: [PATCH 51/54] Fix partial fit Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 10 +--------- sklearn/tree/_classes.py | 16 ++-------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 1cba7ddbb20b5..b01a27f14462d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1254,15 +1254,7 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self : object Returns the instance itself. 
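The streaming interface these partial_fit commits are wiring up is easiest to see from the caller's side. A minimal usage sketch, assuming the `classes=` keyword added by this patch series (it is not part of upstream scikit-learn): the first call has to announce every class that may ever appear, and later calls only update the already-fitted trees.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X1, y1 = rng.normal(size=(200, 4)), rng.integers(0, 3, size=200)
X2, y2 = rng.normal(size=(200, 4)), rng.integers(0, 3, size=200)

forest = RandomForestClassifier(n_estimators=10, random_state=0)
forest.partial_fit(X1, y1, classes=np.arange(3))  # first batch: declare all classes
forest.partial_fit(X2, y2)                        # later batches update each tree
print(forest.predict(X2[:5]))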
""" - X, y = validate_data( - self, - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - ensure_all_finite=False, - ) + self._validate_params() # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 292c0e1e8a063..206005fad8e1b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1375,15 +1375,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): self : DecisionTreeClassifier Fitted estimator. """ - X, y = validate_data( - self, - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - ensure_all_finite=False, - ) + self._validate_params() # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) @@ -1406,11 +1398,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, - X, - y, - reset=False, - validate_separately=(check_X_params, check_y_params), + self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) ) if issparse(X): X.sort_indices() From e4b9728cb8667d0a40ed0c6c45f0414811f5f1f8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:48:05 -0400 Subject: [PATCH 52/54] Adding unit test for partial fit Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 8 ++++++-- sklearn/tree/tests/test_tree.py | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 206005fad8e1b..4b89ea8e87513 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -252,7 +252,7 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - if y is not None or self.__sklearn_tags__().required: + if y is not None or self.__sklearn_tags__().target_tags.required: X, y = validate_data( self, X, y, validate_separately=(check_X_params, check_y_params) ) @@ -1398,7 +1398,11 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), ) if issparse(X): X.sort_indices() diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index fee65b96cc865..6a199211743ee 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -54,7 +54,10 @@ ignore_warnings, skip_if_32bit, ) -from sklearn.utils.estimator_checks import check_sample_weights_invariance +from sklearn.utils.estimator_checks import ( + check_sample_weights_invariance, + parametrize_with_checks, +) from sklearn.utils.fixes import ( _IS_32BIT, COO_CONTAINERS, @@ -235,6 +238,18 @@ def assert_tree_equal(d, s, message): ) +@parametrize_with_checks( + [ + DecisionTreeClassifier(), + DecisionTreeRegressor(), + ExtraTreeClassifier(), + ExtraTreeRegressor(), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + def test_classification_toy(): # Check classification on a toy dataset. 
for name, Tree in CLF_TREES.items(): From dda0df612b8a46e0d87a5d600e4fa696e54978d1 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 5 Mar 2025 13:42:55 -0500 Subject: [PATCH 53/54] FIX remove regressor multi_label tag (#71) #### Reference Issues/PRs https://github.com/neurodata/treeple/pull/339 #### What does this implement/fix? Explain your changes. #### Any other comments? --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 57a4750c612bd..99aa86157d6e9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1753,7 +1753,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.regressor_tags.multi_label = True + # tags.regressor_tags.multi_label = True TODO: add regression support return tags diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7a49c6dc93485..2ce58759d8253 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,6 @@ import numpy as np from scipy.sparse import issparse -from sklearn.utils import metadata_routing from sklearn.base import ( BaseEstimator, ClassifierMixin, @@ -25,7 +24,12 @@ clone, is_classifier, ) -from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils import ( + Bunch, + check_random_state, + compute_sample_weight, + metadata_routing, +) from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import ( _check_partial_fit_first_call, From 0e43e917a6734fc61a8c9999bc4b4a563476ec58 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 10 Jul 2025 14:18:48 -0400 Subject: [PATCH 54/54] FIX remove xfail_checks (#74) According to errors in https://github.com/neurodata/treeple/pull/361, `xfail_checks` is not available in the tags. --- sklearn/ensemble/_forest.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 99aa86157d6e9..3a8e4d86a66e0 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2161,11 +2161,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -2548,11 +2548,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -3675,9 +3675,9 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." 
- ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags
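The last two commits touch the same mechanism: after the tags refactor, `__sklearn_tags__()` returns a dataclass obtained from `super().__sklearn_tags__()` whose fields are mutated directly, and the old `_xfail_checks` dictionary is no longer an attribute of it (hence the commented-out blocks above). A minimal sketch of that pattern in a hypothetical custom estimator, restricted to fields that already appear in this patch series:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class TinyRegressor(RegressorMixin, BaseEstimator):
    """Hypothetical estimator, only here to illustrate the tags override."""

    def fit(self, X, y):
        self.mean_ = float(np.mean(y))
        return self

    def predict(self, X):
        return np.full(len(X), self.mean_)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.target_tags.required = True  # same field the patched _fit checks
        # tags.regressor_tags.multi_label is left at its default: multi-output y
        # support is not advertised, mirroring PATCH 53 above
        return tags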