diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 386c4875a1804..45c4fda13999b 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -8,12 +8,18 @@ import numpy as np from joblib import effective_n_jobs -from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor -from ..utils import Bunch, check_random_state -from ..utils._tags import get_tags -from ..utils._user_interface import _print_elapsed_time -from ..utils.metadata_routing import _routing_enabled -from ..utils.metaestimators import _BaseComposition +from sklearn.base import ( + BaseEstimator, + MetaEstimatorMixin, + clone, + is_classifier, + is_regressor, +) +from sklearn.utils import Bunch, check_random_state +from sklearn.utils._tags import get_tags +from sklearn.utils._user_interface import _print_elapsed_time +from sklearn.utils.metadata_routing import _routing_enabled +from sklearn.utils.metaestimators import _BaseComposition def _fit_single_estimator( diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 92713eecec9dd..3a8e4d86a66e0 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -39,13 +39,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -53,9 +54,29 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import get_tags +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, + validate_data, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -64,19 +85,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, - validate_data, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -93,14 
+101,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -113,12 +125,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) @@ -160,6 +181,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -192,6 +214,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -200,6 +223,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -226,6 +293,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -243,6 +315,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -258,6 +332,8 @@ def __init__( self.warm_start = warm_start 
self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -277,6 +353,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -326,7 +411,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -348,6 +433,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -416,7 +504,7 @@ def fit(self, X, y, sample_weight=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +543,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -473,41 +593,18 @@ def fit(self, X, y, sample_weight=None): # would have got if we hadn't used a warm_start. 
random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -540,6 +637,53 @@ def fit(self, X, y, sample_weight=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -622,7 +766,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -682,6 +826,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. 
+ """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] 
= self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -737,6 +1049,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
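Illustrative usage of the leaf-sample / quantile API added by the hunks above (a sketch based on the new ``store_leaf_values`` parameter and the ``predict_quantiles`` / ``get_leaf_node_samples`` forest methods introduced in this patch, mirroring the new ``test_multioutput_quantiles`` test; the synthetic data below is hypothetical and these methods exist only on the patched branch, not in released scikit-learn):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, size=(200, 4))
    y = X[:, 0] + 0.1 * rng.randn(200)

    # store_leaf_values=True is required; otherwise both methods raise RuntimeError
    est = RandomForestRegressor(n_estimators=50, store_leaf_values=True, random_state=0)
    est.fit(X, y)

    # (n_samples, n_quantiles) array of per-sample quantile predictions,
    # computed from the training samples stored in the reached leaves
    y_quant = est.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])

    # list of per-sample arrays of shape (n_leaf_node_samples, n_outputs),
    # stacked over the leaves reached in every tree of the forest
    leaf_samples = est.get_leaf_node_samples(X[:5])
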
@@ -760,6 +1083,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -773,6 +1098,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -827,7 +1154,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -840,12 +1167,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -880,6 +1223,229 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. 
Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -945,6 +1511,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1027,6 +1601,8 @@ def __init__( verbose=0, warm_start=False, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1039,6 +1615,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1064,6 +1642,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1167,7 +1753,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.regressor_tags.multi_label = True + # tags.regressor_tags.multi_label = True TODO: add regression support return tags @@ -1361,6 +1947,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1518,6 +2114,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1534,6 +2132,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1544,6 +2143,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1560,11 +2161,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -1749,6 +2350,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonically increasing @@ -1890,6 +2502,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1906,6 +2520,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1915,6 +2530,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1931,11 +2548,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -2126,6 +2743,16 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2272,6 +2899,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2288,6 +2917,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2298,6 +2928,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2487,6 +3119,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2613,6 +3255,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2629,6 +3273,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2638,6 +3283,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2761,6 +3408,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
+ Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2862,6 +3512,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2876,6 +3527,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2884,6 +3536,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2897,7 +3550,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2918,17 +3571,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. """ # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2948,6 +3604,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2955,7 +3614,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) @@ -3016,9 +3675,9 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." 
+ # ), + # } return tags diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 8f85f2f7aa3cd..1f48d6eaf8ccd 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -365,6 +365,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") _parameter_constraints.pop("monotonic_cst") diff --git a/sklearn/ensemble/meson.build b/sklearn/ensemble/meson.build index bc5868b3a0104..e595757136a2d 100644 --- a/sklearn/ensemble/meson.build +++ b/sklearn/ensemble/meson.build @@ -2,6 +2,7 @@ py.extension_module( '_gradient_boosting', ['_gradient_boosting.pyx'] + utils_cython_tree, dependencies: [np_dep], + override_options : ['optimization=3', 'cython_language=cpp'], cython_args: cython_args, subdir: 'sklearn/ensemble', install: true diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index aadf230fd751e..51fbb3e823726 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. 
IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + @pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classification_toy(name): """Check classification on a toy dataset.""" @@ -1542,7 +1656,10 @@ def test_max_samples_bootstrap(name): def test_large_max_samples_exception(name): # Check invalid `max_samples` est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) - match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + # TODO: remove the following line when the issue is fixed + # https://github.com/scikit-learn/scikit-learn/issues/28507 + # match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + match = "The expected number of unique samples" with pytest.raises(ValueError, match=match): est.fit(X, y) @@ -1704,6 +1821,116 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.skip() +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.skip() +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ + + @pytest.mark.parametrize("seed", [None, 1]) @pytest.mark.parametrize("bootstrap", [True, False]) @pytest.mark.parametrize("ForestClass", FOREST_CLASSIFIERS_REGRESSORS.values()) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 8008b8c028085..9b6c8382c1fd5 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,11 @@ from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import make_friedman1 from sklearn.decomposition import PCA -from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomForestRegressor, +) from sklearn.exceptions import NotFittedError from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import ( @@ -409,7 +413,7 @@ def test_partial_fit(): assert_array_almost_equal(X_transform, transformer.transform(data)) # check that if est doesn't have partial_fit, neither does SelectFromModel - transformer = SelectFromModel(estimator=RandomForestClassifier()) + transformer = SelectFromModel(estimator=RandomForestRegressor()) assert not hasattr(transformer, "partial_fit") diff --git a/sklearn/meson.build b/sklearn/meson.build index eaf1b98e60cc2..7c01ad73f9ce4 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,14 +193,14 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' -) +# custom_target('write_built_with_meson_file', +# output: '_built_with_meson.py', +# command: [ +# py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' +# ], +# install: true, +# install_dir: py.get_install_dir() / 'sklearn' +# ) extensions = ['_isotonic'] diff --git a/sklearn/neighbors/meson.build b/sklearn/neighbors/meson.build index 22f81d597948b..0d850bc4fd76e 100644 --- a/sklearn/neighbors/meson.build +++ b/sklearn/neighbors/meson.build @@ -40,7 +40,8 @@ neighbors_extension_metadata = { '_partition_nodes': {'sources': ['_partition_nodes.pyx'], 'override_options': ['cython_language=cpp'], 'dependencies': [np_dep]}, - '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep]}, + '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep], + 'override_options': ['cython_language=cpp']}, } foreach ext_name, ext_dict : neighbors_extension_metadata diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 93246a1376e85..2ce58759d8253 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,9 +15,7 @@ import numpy as np from scipy.sparse import issparse -from sklearn.utils import metadata_routing - -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -26,10 +24,18 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import ( + Bunch, + check_random_state, + compute_sample_weight, + metadata_routing, +) +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, @@ -37,9 +43,10 @@ check_is_fitted, validate_data, ) + from . 
import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -105,6 +112,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_samples_split": [ Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), + StrOptions({"sqrt", "log2"}), ], "min_samples_leaf": [ Interval(Integral, 1, None, closed="left"), @@ -121,6 +129,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], "monotonic_cst": ["array-like", None], } @@ -140,6 +149,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): self.criterion = criterion @@ -154,6 +164,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values self.monotonic_cst = monotonic_cst def get_depth(self): @@ -235,6 +246,7 @@ def _fit( sample_weight=None, check_input=True, missing_values_in_feature_mask=None, + classes=None, ): random_state = check_random_state(self.random_state) @@ -249,9 +261,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = validate_data( - self, X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self.__sklearn_tags__().target_tags.required: + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = validate_data(self, X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -264,7 +279,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -278,45 +293,73 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None - - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + y = np.atleast_1d(y) + expanded_class_weight = None - self.n_outputs_ = y.shape[1] + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - if is_classification: - check_classification_targets(y) - y = np.copy(y) + self.n_outputs_ = y.shape[1] - self.classes_ = [] - self.n_classes_ = [] + if is_classification: + check_classification_targets(y) + y = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + if self.class_weight is not None: + y_original = np.copy(y) + + y_encoded = np.zeros(y.shape, dtype=int) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(n_samples): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(self.classes_[j] == y[i, j])[0][ + 0 + ] + else: + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - if self.class_weight is not None: - y_original = np.copy(y) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self._n_classes_ = self.n_classes_ + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -324,13 +367,19 @@ def _fit( else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) - if isinstance(self.min_samples_split, numbers.Integral): + if isinstance(self.min_samples_split, str): + if self.min_samples_split == "sqrt": + min_samples_split = max(1, int(np.sqrt(self.n_features_in_))) + elif self.min_samples_split == "log2": + min_samples_split = max(1, int(np.log2(self.n_features_in_))) + elif isinstance(self.min_samples_split, numbers.Integral): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) - min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + self.min_samples_split_ = min_samples_split + self.min_samples_leaf_ = min_samples_leaf if isinstance(self.max_features, str): if self.max_features == "sqrt": @@ -351,16 +400,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if 
sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -371,11 +414,66 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * n_samples else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.min_weight_leaf_ = min_weight_leaf + + # build the actual tree now with the parameters + self = self._build_tree( + X=X, + y=y, + sample_weight=sample_weight, + missing_values_in_feature_mask=missing_values_in_feature_mask, + min_samples_leaf=min_samples_leaf, + min_weight_leaf=min_weight_leaf, + max_leaf_nodes=max_leaf_nodes, + min_samples_split=min_samples_split, + max_depth=max_depth, + random_state=random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. + """ + n_samples = X.shape[0] # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -388,7 +486,6 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter if self.monotonic_cst is None: monotonic_cst = None else: @@ -427,8 +524,9 @@ def _fit( # Since self.monotonic_cst encodes constraints on probabilities of the # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.monotonic_cst_ = monotonic_cst - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -457,6 +555,7 @@ def _fit( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -467,8 +566,8 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): @@ -476,7 +575,75 @@ def _fit( self.classes_ = self.classes_[0] self._prune_tree() + return self + + def _update_tree(self, X, y, sample_weight): + # Update tree + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. 
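An illustrative usage sketch (not part of the patch) for the string options added to ``min_samples_split`` in the hunks above, assuming a build of scikit-learn that includes this diff; ``"sqrt"`` and ``"log2"`` are resolved against ``n_features_in_`` during fitting and the resolved value is exposed as the new ``min_samples_split_`` attribute:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 16))
    y = (X[:, 0] > 0.0).astype(int)

    # "sqrt" resolves to max(1, int(sqrt(n_features_in_))) and "log2" to
    # max(1, int(log2(n_features_in_))); the resolved value is stored on the
    # fitted estimator as ``min_samples_split_``.
    clf = DecisionTreeClassifier(min_samples_split="sqrt", random_state=0).fit(X, y)
    print(clf.min_samples_split_)  # 4 for the 16 features above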
+ n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) + + self._prune_tree() return self def _validate_X_predict(self, X, check_input): @@ -528,6 +695,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -554,6 +724,134 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like of length (n_samples,) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. 
+ check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -832,6 +1130,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -896,6 +1204,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. 
+ + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -948,7 +1268,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -967,6 +1290,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -983,10 +1307,18 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree classifier from the training set (X, y). Parameters @@ -1010,20 +1342,127 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self + def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : DecisionTreeClassifier + Fitted estimator. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + check_input=check_input, + classes=classes, + ) + return self + + if check_input: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = validate_data( + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), + ) + if issparse(X): + X.sort_indices() + + if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: + raise ValueError( + "No support for np.int64 index based sparse matrices" + ) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but {self.__class__.__name__} " + f"is expecting {self.n_features_in_} features as input." + ) + + y = np.atleast_1d(y) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + check_classification_targets(y) + y = np.copy(y) + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y_encoded = np.zeros(y.shape, dtype=int) + for i in range(X.shape[0]): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(classes[j] == y[i, j])[0][0] + y = y_encoded + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + self._update_tree(X, y, sample_weight) + + return self + def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1231,6 +1670,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1283,6 +1732,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- DecisionTreeClassifier : A decision tree classifier. 
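A hedged sketch of the new ``partial_fit`` on ``DecisionTreeClassifier`` defined above (again assuming this patched build): the first call must receive ``classes`` so the label encoding is fixed once, and later calls update the existing tree through ``_update_tree``:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X1, X2 = rng.normal(size=(100, 4)), rng.normal(size=(100, 4))
    y1, y2 = (X1[:, 0] > 0).astype(int), (X2[:, 0] > 0).astype(int)

    clf = DecisionTreeClassifier(random_state=0)
    # First call: ``classes`` must list every label that can ever appear,
    # even if some of them only show up in a later batch.
    clf.partial_fit(X1, y1, classes=[0, 1, 2])
    # Later calls may omit ``classes``; the existing tree is updated in place.
    clf.partial_fit(X2, y2)
    print(clf.classes_)       # [0 1 2]
    print(clf.predict(X2[:5]))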
@@ -1331,7 +1792,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1349,6 +1810,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1363,11 +1825,19 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree regressor from the training set (X, y). Parameters @@ -1390,6 +1860,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeRegressor @@ -1401,6 +1874,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self @@ -1583,6 +2057,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1647,6 +2131,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -1702,6 +2198,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1717,6 +2214,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1863,6 +2361,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1912,6 +2420,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. @@ -1962,6 +2482,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1976,6 +2497,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 84d2e800d6a87..ff2a80f939b9f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -2,24 +2,20 @@ # SPDX-License-Identifier: BSD-3-Clause # See _criterion.pyx for implementation details. +from libcpp.vector cimport vector from ..utils._typedefs cimport float64_t, int8_t, intp_t -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const float64_t[:, ::1] y # Values of y cdef const float64_t[:] sample_weight # Sample weights cdef const intp_t[:] sample_indices # Sample indices in X, y cdef intp_t start # samples[start:pos] are the samples in the left node cdef intp_t pos # samples[pos:end] are the samples in the right node cdef intp_t end - cdef intp_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef intp_t n_outputs # Number of outputs cdef intp_t n_samples # Number of samples @@ -30,24 +26,14 @@ cdef class Criterion: cdef float64_t weighted_n_right # Weighted number of samples in the right node cdef float64_t weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
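Because the estimator-level ``criterion`` constraints earlier in this diff are widened from ``Hidden(Criterion)`` to ``Hidden(BaseCriterion)``, a pre-built criterion object can still be passed directly. A small sketch relying on the private ``sklearn.tree._criterion`` API, for illustration only and assuming the patched extensions are compiled:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree._criterion import Gini

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])

    # Gini subclasses ClassificationCriterion -> Criterion -> BaseCriterion,
    # so an instance satisfies the widened Hidden(BaseCriterion) constraint.
    crit = Gini(1, np.array([2], dtype=np.intp))  # n_outputs=1, n_classes=[2]
    clf = DecisionTreeClassifier(criterion=crit, random_state=0).fit(X, y)
    print(clf.predict(X))  # [0 0 1 1]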
# Methods - cdef int init( - self, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - float64_t weighted_n_samples, - const intp_t[:] sample_indices, - intp_t start, - intp_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, intp_t n_missing) noexcept nogil - cdef int reset(self) except -1 nogil - cdef int reverse_reset(self) except -1 nogil - cdef int update(self, intp_t new_pos) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t reverse_reset(self) except -1 nogil + cdef intp_t update(self, intp_t new_pos) except -1 nogil cdef float64_t node_impurity(self) noexcept nogil cdef void children_impurity( self, @@ -58,13 +44,6 @@ cdef class Criterion: self, float64_t* dest ) noexcept nogil - cdef void clip_node_value( - self, - float64_t* dest, - float64_t lower_bound, - float64_t upper_bound - ) noexcept nogil - cdef float64_t middle_value(self) noexcept nogil cdef float64_t impurity_improvement( self, float64_t impurity_parent, @@ -72,19 +51,55 @@ cdef class Criterion: float64_t impurity_right ) noexcept nogil cdef float64_t proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const float64_t[:, ::1] y # Values of y + cdef intp_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef intp_t init( + self, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] sample_indices, + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, intp_t n_missing) noexcept nogil + cdef bint check_monotonicity( - self, - int8_t monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, + self, + int8_t monotonic_cst, + float64_t lower_bound, + float64_t upper_bound, ) noexcept nogil cdef inline bint _check_monotonicity( - self, - int8_t monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, - float64_t sum_left, - float64_t sum_right, + self, + int8_t monotonic_cst, + float64_t lower_bound, + float64_t upper_bound, + float64_t sum_left, + float64_t sum_right, + ) noexcept nogil + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil + cdef float64_t middle_value(self) noexcept nogil + + cdef void node_samples( + self, + vector[vector[float64_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9f3db83399569..adb13b8929a59 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,27 +1,42 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef float64_t EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. 
+cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -29,68 +44,21 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - float64_t weighted_n_samples, - const intp_t[:] sample_indices, - intp_t start, - intp_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=float64_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=float64_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : float64_t - The total weight of the samples being considered - sample_indices : ndarray, dtype=intp_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : intp_t - The first sample to be used on this node - end : intp_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, intp_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: intp_t - Number of missing values for specific feature. - """ - pass - - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. This method must be implemented by the subclass. """ pass - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. This method must be implemented by the subclass. """ pass - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. This updates the collected statistics by moving sample_indices[pos:new_pos] @@ -146,16 +114,6 @@ cdef class Criterion: """ pass - cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil: - pass - - cdef float64_t middle_value(self) noexcept nogil: - """Compute the middle value of a split for monotonicity constraints - - This method is implemented in ClassificationCriterion and RegressionCriterion. - """ - pass - cdef float64_t proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. 
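The partition invariant described in the ``BaseCriterion`` docstring above can be illustrated with a small pure-Python mock; this is illustration only, since the real bookkeeping lives in the nogil Cython methods:

    import numpy as np

    # Node-local view kept by the criterion: sample_indices[start:end] are the
    # samples of the current node, partitioned around ``pos`` into the two
    # children, with weighted statistics accumulated per side.
    sample_indices = np.array([7, 2, 5, 9, 0, 4])
    sample_weight = np.ones(10)
    start, pos, end = 0, 4, 6

    left = sample_indices[start:pos]    # samples[start:pos] -> left child
    right = sample_indices[pos:end]     # samples[pos:end]  -> right child
    weighted_n_left = sample_weight[left].sum()
    weighted_n_right = sample_weight[right].sum()
    print(weighted_n_left, weighted_n_right)  # 4.0 2.0 with unit weights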
@@ -210,6 +168,95 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : intp_t + The index of the first sample to be used on computation of criteria of the current node. + end : intp_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef intp_t init( + self, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] sample_indices, + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=float64_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=float64_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : float64_t + The total weight of the samples being considered + sample_indices : ndarray, dtype=intp_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, intp_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: intp_t + Number of missing values for specific feature. + """ + pass + + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil: + pass + + cdef float64_t middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -243,6 +290,33 @@ cdef class Criterion: cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[float64_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[float64_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef intp_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, float64_t[:, ::1] sum_1, @@ -336,20 +410,15 @@ cdef class ClassificationCriterion(Criterion): return (type(self), (self.n_outputs, np.asarray(self.n_classes)), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -364,18 +433,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=intp_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. - start : intp_t - The first sample to use in the mask - end : intp_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef intp_t i @@ -388,12 +463,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(float64_t)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -404,7 +479,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -439,7 +513,7 @@ cdef class ClassificationCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -456,7 +530,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -473,7 +547,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -687,13 +761,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -811,7 +882,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -849,28 +919,33 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end, ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. cdef intp_t i @@ -883,10 +958,10 @@ cdef class RegressionCriterion(Criterion): memset(&self.sum_total[0], 0, self.n_outputs * sizeof(float64_t)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -898,7 +973,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -936,7 +1010,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start.""" self.pos = self.start _move_sums_regression( @@ -949,7 +1023,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end.""" self.pos = self.end _move_sums_regression( @@ -962,7 +1036,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left.""" cdef const float64_t[:] sample_weight = self.sample_weight cdef const intp_t[:] sample_indices = self.sample_indices @@ -1066,7 +1140,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
- MSE = var_left + var_right """ @@ -1227,31 +1300,39 @@ cdef class MAE(RegressionCriterion): self.left_child_ptr = cnp.PyArray_DATA(self.left_child) self.right_child_ptr = cnp.PyArray_DATA(self.right_child) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end, ) except -1 nogil: """Initialize the criterion. This initializes the criterion at node sample_indices[start:end] and children sample_indices[start:start] and sample_indices[start:end]. """ - cdef intp_t i, p, k - cdef float64_t w = 1.0 - # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef intp_t i, p, k + cdef float64_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. cdef void** left_child = self.left_child_ptr @@ -1262,10 +1343,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1280,7 +1361,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, intp_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1289,7 +1369,7 @@ cdef class MAE(RegressionCriterion): with gil: raise ValueError("missing values is not supported for MAE.") - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1320,7 +1400,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1348,7 +1428,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1571,6 +1651,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 6726d0c67bfb1..e8e45af79fb28 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -11,9 +11,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 42c6c6d935a9c..393c81fd8b9a9 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -2,12 +2,11 @@ # SPDX-License-Identifier: BSD-3-Clause # See _splitter.pyx for details. +from libcpp.vector cimport vector -from ..utils._typedefs cimport ( - float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t -) -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport ParentInfo +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint8_t, uint32_t cdef struct SplitRecord: @@ -25,16 +24,17 @@ cdef struct SplitRecord: uint8_t missing_go_to_left # Controls if missing values go to the left node. intp_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion - cdef public intp_t max_features # Number of features to test - cdef public intp_t min_samples_leaf # Min samples in a leaf + cdef public intp_t max_features # Number of features to test + cdef public intp_t min_samples_leaf # Min samples in a leaf cdef public float64_t min_weight_leaf # Minimum weight in a leaf cdef object random_state # Random state @@ -51,14 +51,6 @@ cdef class Splitter: cdef intp_t start # Start position for the current node cdef intp_t end # End position for the current node - cdef const float64_t[:, ::1] y - # Monotonicity constraints for each feature. - # The encoding is as follows: - # -1: monotonic decrease - # 0: no constraint - # +1: monotonic increase - cdef const int8_t[:] monotonic_cst - cdef bint with_monotonic_cst cdef const float64_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -78,29 +70,60 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
# Methods - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const uint8_t[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, intp_t start, intp_t end, float64_t* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, ParentInfo* parent, SplitRecord* split, ) except -1 nogil - cdef void node_value(self, float64_t* dest) noexcept nogil + cdef float64_t node_impurity(self) noexcept nogil + cdef intp_t pointer_size(self) noexcept nogil - cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil +cdef class Splitter(BaseSplitter): + """Base class for supervised splitters.""" - cdef float64_t node_impurity(self) noexcept nogil + cdef public Criterion criterion # Impurity criterion + cdef const float64_t[:, ::1] y + + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + + cdef int init( + self, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + const uint8_t[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[float64_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil + + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b557a4d1c6300..d69bfa067c92c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -55,13 +55,94 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. + + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ + Parameters + ---------- + start : intp_t + The index of the first sample to consider + end : intp_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=float64_t pointer + The total weight of those samples + """ + pass + + cdef int node_split( + self, + ParentInfo* parent, + SplitRecord* split, + ) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + + Parameters + ---------- + impurity : float64_t + The impurity of the current node. + split : SplitRecord pointer + A pointer to a memory-allocated SplitRecord object which will be filled with the + split chosen. + lower_bound : float64_t + The lower bound of the monotonic constraint if used. + upper_bound : float64_t + The upper bound of the monotonic constraint if used. + """ + pass + + cdef void node_value(self, float64_t* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef float64_t node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef intp_t pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__( self, Criterion criterion, @@ -70,6 +151,7 @@ cdef class Splitter: float64_t min_weight_leaf, object random_state, const int8_t[:] monotonic_cst, + *argv ): """ Parameters @@ -97,7 +179,6 @@ cdef class Splitter: Monotonicity constraints """ - self.criterion = criterion self.n_samples = 0 @@ -110,19 +191,13 @@ cdef class Splitter: self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, self.random_state, - self.monotonic_cst), self.__getstate__()) + self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) cdef int init( self, @@ -156,7 +231,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. 
""" - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef intp_t n_samples = X.shape[0] @@ -194,8 +268,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset( @@ -222,14 +309,7 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 @@ -260,13 +340,67 @@ cdef class Splitter: self.criterion.clip_node_value(dest, lower_bound, upper_bound) + cdef void node_samples(self, vector[vector[float64_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef float64_t node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef inline bint check_presplit_conditions( + self, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + + Returns 1 if not a valid split, and 0 if it is. + """ + cdef intp_t min_samples_leaf = self.min_samples_leaf + cdef intp_t end_non_missing = self.end - n_missing + cdef intp_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef inline bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + + Returns 1 if condition is not met, and 0 if it is. + """ + cdef float64_t min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + -cdef inline int node_split_best( +cdef inline intp_t node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -396,7 +530,6 @@ cdef inline int node_split_best( criterion.init_missing(n_missing) # initialize even when n_missing == 0 # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. 
@@ -417,18 +550,12 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -444,8 +571,7 @@ cdef inline int node_split_best( continue # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -470,6 +596,13 @@ cdef inline int node_split_best( # test time, we send missing values to the branch that contains # the most samples during training time. if n_missing == 0: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + current_split.missing_go_to_left = n_left > n_right else: current_split.missing_go_to_left = missing_go_to_left @@ -569,8 +702,6 @@ cdef inline int node_split_random( cdef intp_t n_features = splitter.n_features cdef intp_t max_features = splitter.max_features - cdef intp_t min_samples_leaf = splitter.min_samples_leaf - cdef float64_t min_weight_leaf = splitter.min_weight_leaf cdef uint32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -706,7 +837,7 @@ cdef inline int node_split_random( n_right = end_non_missing - current_split.pos + n_missing # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue # Evaluate split @@ -715,23 +846,22 @@ cdef inline int node_split_random( criterion.reset() criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): - continue - # Reject if monotonicity constraints are not satisfied if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) ): continue + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: + continue + current_proxy_improvement = criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: @@ -805,9 +935,9 @@ cdef class BestSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + SplitRecord* split, ) except -1 nogil: return node_split_best( self, @@ -833,9 +963,9 @@ cdef class BestSparseSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + 
SplitRecord* split, ) except -1 nogil: return node_split_best( self, @@ -861,9 +991,9 @@ cdef class RandomSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + SplitRecord* split, ) except -1 nogil: return node_split_random( self, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2cadca4564a87..47a3cd2bd5c9d 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -4,12 +4,15 @@ # See _tree.pyx for details. import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -23,7 +26,6 @@ cdef struct Node: float64_t weighted_n_node_samples # Weighted number of samples at the node uint8_t missing_go_to_left # Whether features have missing values - cdef struct ParentInfo: # Structure to store information about the parent of a node # This is passed to the splitter, to provide information about the previous split @@ -33,16 +35,7 @@ cdef struct ParentInfo: float64_t impurity # the impurity of the parent intp_t n_constant_features # the number of constant features found in parent -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public intp_t n_features # Number of features in X - cdef intp_t* n_classes # Number of classes in y[:, k] - cdef public intp_t n_outputs # Number of outputs in y - cdef public intp_t max_n_classes # max(n_classes) +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. 
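
The pxd refactor starting here splits the former monolithic `Tree` into a `BaseTree` that owns the node/value buffers and traversal plumbing, leaving feature evaluation as an overridable hook (`_compute_feature`, declared further down in this pxd). A loose Python analogy of that pattern, purely illustrative and not part of the diff; the class and variable names below are made up, and the "oblique" motivation is only implied by later comments in the diff:

import numpy as np

class BaseTreeSketch:
    """Owns the traversal logic; how a node reads a feature is left to subclasses."""
    def _compute_feature(self, X, sample_index, node):
        raise NotImplementedError

    def goes_left(self, X, sample_index, node):
        # Shared by all tree flavours: compare the computed feature to the threshold.
        return self._compute_feature(X, sample_index, node) <= node["threshold"]

class AxisAlignedTreeSketch(BaseTreeSketch):
    def _compute_feature(self, X, sample_index, node):
        # Mirrors the axis-aligned case in the diff: read a single column of X.
        return X[sample_index, node["feature"]]

X = np.array([[0.5, 2.0]])
node = {"feature": 1, "threshold": 1.0}
print(AxisAlignedTreeSketch().goes_left(X, 0, node))  # False -> sample goes right
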
@@ -54,19 +47,33 @@ cdef class Tree: cdef intp_t value_stride # = n_outputs * max_n_classes # Methods - cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - uint8_t missing_go_to_left) except -1 nogil + cdef intp_t _add_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef intp_t _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +85,64 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil + cdef float32_t _compute_feature( + self, + const float32_t[:, :] X_ndarray, + intp_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout + cdef public intp_t n_features # Number of features in X + cdef intp_t* n_classes # Number of classes in y[:, k] + cdef public intp_t n_outputs # Number of outputs in y + cdef public intp_t max_n_classes # max(n_classes) + + # Enables the use of the tree to store distributions of the output to allow + # arbitrary usage of the leaves. This is used in the quantile + # estimators for example.
+ # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[intp_t, vector[vector[float64_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, intp_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -93,11 +158,23 @@ cdef class TreeBuilder: cdef Splitter splitter # Splitting algorithm - cdef intp_t min_samples_split # Minimum number of samples in an internal node - cdef intp_t min_samples_leaf # Minimum number of samples in a leaf - cdef float64_t min_weight_leaf # Minimum weight in a leaf - cdef intp_t max_depth # Maximal tree depth - cdef float64_t min_impurity_decrease # Impurity threshold for early stopping + cdef intp_t min_samples_split # Minimum number of samples in an internal node + cdef intp_t min_samples_leaf # Minimum number of samples in a leaf + cdef float64_t min_weight_leaf # Minimum weight in a leaf + cdef intp_t max_depth # Maximal tree depth + cdef float64_t min_impurity_decrease # Impurity threshold for early stopping + cdef cnp.ndarray initial_roots # Leaf nodes for streaming updates + + cdef uint8_t store_leaf_values # Whether to store leaf values + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=*, + const uint8_t[::1] missing_values_in_feature_mask=*, + ) cpdef build( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 7e6946a718a81..943d5e6148538 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -2,22 +2,23 @@ # SPDX-License-Identifier: BSD-3-Clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap from libcpp.stack cimport stack from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.sparse import issparse @@ -26,12 +27,13 @@ from scipy.sparse import csr_matrix from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, - int nd, cnp.npy_intp* dims, + intp_t nd, cnp.npy_intp* dims, cnp.npy_intp* strides, - void* data, int flags, object obj) - int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + void* data, intp_t flags, object obj) + intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) # ============================================================================= # Types and constants @@ -74,6 +76,17 @@ cdef inline void _init_parent_record(ParentInfo* record) noexcept nogil: cdef class TreeBuilder: """Interface for different tree building strategies.""" + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=None, 
+ const uint8_t[::1] missing_values_in_feature_mask=None, + ): + """Build a decision tree from the training set (X, y).""" + pass + cpdef build( self, Tree tree, @@ -124,6 +137,7 @@ cdef class TreeBuilder: return X, y, sample_weight + # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing cdef struct StackRecord: @@ -137,18 +151,104 @@ cdef struct StackRecord: float64_t lower_bound float64_t upper_bound + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, intp_t min_samples_split, - intp_t min_samples_leaf, float64_t min_weight_leaf, - intp_t max_depth, float64_t min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + float64_t min_impurity_decrease, + uint8_t store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(DepthFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=None, + const uint8_t[::1] missing_values_in_feature_mask=None, + ): + """Initialize a list of roots""" + X, y, sample_weight = self._check_input(X, y, sample_weight) + + # organize samples by decision paths + paths = tree.decision_path(X) + cdef intp_t PARENT + cdef intp_t CHILD + cdef intp_t i + false_roots = {} + X_copy = {} + y_copy = {} + for i in range(X.shape[0]): + # collect depths from the node paths + depth_i = paths[i].indices.shape[0] - 1 + PARENT = depth_i - 1 + CHILD = depth_i + + # find leaf node's & their parent node's IDs + if PARENT < 0: + parent_i = 0 + else: + parent_i = paths[i].indices[PARENT] + child_i = paths[i].indices[CHILD] + left = 0 + if tree.children_left[parent_i] == child_i: + left = 1 # leaf node is left child + + # organize samples by the leaf they fall into (false root) + # leaf nodes are marked by parent node and + # their relative position (left or right child) + if (parent_i, left) in false_roots: + false_roots[(parent_i, left)][0] += 1 + X_copy[(parent_i, left)].append(X[i]) + y_copy[(parent_i, left)].append(y[i]) + else: + false_roots[(parent_i, left)] = [1, depth_i] + X_copy[(parent_i, left)] = [X[i]] + y_copy[(parent_i, left)] = [y[i]] + + X_list = [] + y_list = [] + + # reorder the samples according to parent node IDs + for key, value in reversed(sorted(X_copy.items())): + X_list = X_list + value + y_list = y_list + y_copy[key] + cdef object X_new = np.array(X_list) + cdef cnp.ndarray y_new = np.array(y_list) + + # initialize the splitter using sorted samples + cdef Splitter splitter = self.splitter + splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask) + + # convert dict to numpy array and store value + self.initial_roots = np.array(list(false_roots.items())) cpdef build( self, @@ -163,16 +263,6 @@ cdef 
class DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Initial capacity - cdef intp_t init_capacity - - if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 - else: - init_capacity = 2047 - - tree._resize(init_capacity) - # Parameters cdef Splitter splitter = self.splitter cdef intp_t max_depth = self.max_depth @@ -181,36 +271,75 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef intp_t min_samples_split = self.min_samples_split cdef float64_t min_impurity_decrease = self.min_impurity_decrease - # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + cdef uint8_t store_leaf_values = self.store_leaf_values + cdef cnp.ndarray initial_roots = self.initial_roots + + # Initial capacity + cdef intp_t init_capacity + cdef bint first = 0 + if initial_roots is None: + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + first = 1 + else: + # convert numpy array back to dict + false_roots = {} + for key_value_pair in initial_roots: + false_roots[tuple(key_value_pair[0])] = key_value_pair[1] + + # reset the root array + self.initial_roots = None - cdef intp_t start - cdef intp_t end + cdef intp_t start = 0 + cdef intp_t end = 0 cdef intp_t depth cdef intp_t parent cdef bint is_left cdef intp_t n_node_samples = splitter.n_samples cdef float64_t weighted_n_node_samples - cdef SplitRecord split cdef intp_t node_id + cdef float64_t right_child_min, left_child_min, right_child_max, left_child_max + + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) cdef float64_t middle_value - cdef float64_t left_child_min - cdef float64_t left_child_max - cdef float64_t right_child_min - cdef float64_t right_child_max cdef bint is_leaf - cdef bint first = 1 - cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t max_depth_seen = -1 if first else tree.max_depth + + cdef intp_t rc = 0 cdef stack[StackRecord] builder_stack + cdef stack[StackRecord] update_stack cdef StackRecord stack_record cdef ParentInfo parent_record _init_parent_record(&parent_record) - with nogil: + if not first: + # push reached leaf nodes onto stack + for key, value in reversed(sorted(false_roots.items())): + end += value[0] + update_stack.push({ + "start": start, + "end": end, + "depth": value[1], + "parent": key[0], + "is_left": key[1], + "impurity": tree.impurity[key[0]], + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) + start += value[0] + else: # push root node onto stack builder_stack.push({ "start": 0, @@ -224,6 +353,137 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "upper_bound": INFINITY, }) + with nogil: + while not update_stack.empty(): + stack_record = update_stack.top() + update_stack.pop() + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + parent_record.impurity = stack_record.impurity + parent_record.n_constant_features = stack_record.n_constant_features + parent_record.lower_bound = stack_record.lower_bound + parent_record.upper_bound = stack_record.upper_bound + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth 
or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + if first: + parent_record.impurity = splitter.node_impurity() + first = 0 + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or parent_record.impurity <= EPSILON + + if not is_leaf: + splitter.node_split( + &parent_record, + split_ptr, + ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._update_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, + n_node_samples, weighted_n_node_samples, + split.missing_go_to_left) + + if node_id == INTPTR_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) + + if not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = parent_record.lower_bound + left_child_max = right_child_max = parent_record.upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = parent_record.lower_bound + right_child_max = parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = parent_record.lower_bound + left_child_max = parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + + # Push right child on stack + builder_stack.push({ + "start": split.pos, + "end": end, + "depth": depth + 1, + "parent": node_id, + "is_left": 0, + "impurity": split.impurity_right, + "n_constant_features": parent_record.n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) + + # Push left child on stack + builder_stack.push({ + "start": start, + "end": split.pos, + "depth": depth + 1, + "parent": node_id, + "is_left": 1, + "impurity": split.impurity_left, + "n_constant_features": parent_record.n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) + + if depth > max_depth_seen: + max_depth_seen = depth + while not builder_stack.empty(): stack_record = builder_stack.top() builder_stack.pop() @@ -248,7 +508,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if first: parent_record.impurity = splitter.node_impurity() - first = 0 + first=0 # impurity == 0 with tolerance due to rounding errors is_leaf = is_leaf or parent_record.impurity <= EPSILON @@ -256,8 +516,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( &parent_record, - &split, + split_ptr, ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -265,10 +530,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, parent_record.impurity, - n_node_samples, weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -278,7 +542,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( @@ -338,6 +606,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": left_child_min, "upper_bound": left_child_max, }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -347,10 +618,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() - # Best first builder ---------------------------------------------------------- cdef struct FrontierRecord: # Record of information of a Node, the frontier for a split. 
Those records are @@ -393,10 +667,18 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef intp_t max_leaf_nodes - def __cinit__(self, Splitter splitter, intp_t min_samples_split, - intp_t min_samples_leaf, min_weight_leaf, - intp_t max_depth, intp_t max_leaf_nodes, - float64_t min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + intp_t max_leaf_nodes, + float64_t min_impurity_decrease, + uint8_t store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -404,6 +686,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(BestFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) cpdef build( self, @@ -422,6 +718,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Splitter splitter = self.splitter cdef intp_t max_leaf_nodes = self.max_leaf_nodes + cdef uint8_t store_leaf_values = self.store_leaf_values + # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) @@ -438,7 +736,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_split_nodes = max_leaf_nodes - 1 cdef bint is_leaf cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t rc = 0 cdef Node* node cdef ParentInfo parent_record @@ -480,6 +778,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -573,7 +874,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node( + cdef inline intp_t _add_split_node( self, Splitter splitter, Tree tree, @@ -588,6 +889,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split + + # Note: we create a <*SplitRecord> pointer here in order to allow subclasses + # to know what kind of SplitRecord to use. In some cases, ObliqueSplitRecord + # might be used. The split pointer here knows the size of the underlying Record + # because the subclassed splitter will define "pointer_size" accordingly. 
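
The comment just above is the crux of the `SplitRecord*` changes in this file: callers allocate `splitter.pointer_size()` bytes (see the `malloc(splitter.pointer_size())` call that follows) so a subclass splitter can return a larger record through the same `node_split` interface. A rough ctypes illustration of why the allocation size must come from the splitter rather than a fixed `sizeof(SplitRecord)`; the field layouts below are hypothetical and only sketch the idea:

import ctypes

class SplitRecord(ctypes.Structure):
    # Hypothetical subset of the base record's fields.
    _fields_ = [("feature", ctypes.c_ssize_t),
                ("pos", ctypes.c_ssize_t),
                ("threshold", ctypes.c_double)]

class ObliqueSplitRecord(ctypes.Structure):
    # Hypothetical extension: same leading fields plus extra payload.
    _fields_ = [("feature", ctypes.c_ssize_t),
                ("pos", ctypes.c_ssize_t),
                ("threshold", ctypes.c_double),
                ("n_proj", ctypes.c_ssize_t)]

# A builder that always allocated sizeof(SplitRecord) would truncate the
# subclass's record; asking the splitter for its own record size avoids that.
print(ctypes.sizeof(SplitRecord), ctypes.sizeof(ObliqueSplitRecord))
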
+ cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef intp_t node_id cdef intp_t n_node_samples cdef float64_t min_impurity_decrease = self.min_impurity_decrease @@ -613,8 +921,12 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( parent_record, - &split, + split_ptr, ) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -624,7 +936,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, parent_record.impurity, + split_ptr, parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -660,236 +972,50 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = parent_record.impurity res.impurity_right = parent_record.impurity + free(split_ptr) return 0 # ============================================================================= # Tree # ============================================================================= +cdef class BaseTree: + """Base class for Cython tree models. -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : intp_t - The number of nodes (internal nodes + leaves) in the tree. - - capacity : intp_t - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. + Downstream classes must implement methods to actually traverse the tree. + """ + cdef int _resize( + self, + intp_t capacity + ) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. - max_depth : intp_t - The depth of the tree, i.e. the maximum depth of its leaves. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() - children_left : array of intp_t, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. + cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + """Guts of _resize - children_right : array of intp_t, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 - n_leaves : intp_t - Number of leaves in the tree. 
+ if capacity == INTPTR_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity - feature : array of intp_t, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of float64_t, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of float64_t, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of float64_t, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of intp_t, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of float64_t, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. - - missing_go_to_left : array of bool, shape [node_count] - missing_go_to_left[i] holds a bool indicating whether or not there were - missing values at node i. - """ - # Wrap for outside world. - # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): - """Constructor.""" - cdef intp_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef intp_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - 
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(float64_t)) - - cdef int _resize(self, intp_t capacity) except -1 nogil: - """Resize all inner arrays to `capacity`, if `capacity` == -1, then - double the size of the inner arrays. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if self._resize_c(capacity) != 0: - # Acquire gil only if we need to raise - with gil: - raise MemoryError() - - cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: - """Guts of _resize - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if capacity == self.capacity and self.nodes != NULL: - return 0 - - if capacity == INTPTR_MAX: - if self.capacity == 0: - capacity = 3 # default initial value - else: - capacity = 2 * self.capacity - - safe_realloc(&self.nodes, capacity) - safe_realloc(&self.value, capacity * self.value_stride) + safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) if capacity > self.capacity: # value memory is initialised to 0 to enable classifier argmax @@ -906,16 +1032,100 @@ cdef class Tree: self.capacity = capacity return 0 - cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - uint8_t missing_go_to_left) except -1 nogil: + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + node_id : intp_t + The index of the node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. 
+ node : Node* + The pointer to the node that will hold the leaf node. + node_id : intp_t + The index of the node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef float32_t _compute_feature( + self, + const float32_t[:, :] X_ndarray, + intp_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef float32_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef intp_t _add_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. - Returns (size_t)(-1) on error. + Parameters + ---------- + parent : intp_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : float64_t + The impurity of the node to be added. + n_node_samples : intp_t + The number of samples in the node. + weighted_n_node_samples : float64_t + The weight of the samples in the node. + + Returns (intp_t)(-1) on error. """ cdef intp_t node_id = self.node_count @@ -935,28 +1145,62 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out + cdef inline intp_t _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. + + The updated node remains on the same position. + + Returns (intp_t)(-1) on error. 
+ """ + cdef intp_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" @@ -991,9 +1235,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -1061,7 +1306,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -1110,6 +1354,9 @@ cdef class Tree: cdef Node* node = NULL cdef intp_t i = 0 + # the feature index + cdef float64_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -1121,7 +1368,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1250,27 +1499,22 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef float64_t normalizer = 0. + cdef intp_t i = 0 - cdef cnp.float64_t[:] importances = np.zeros(self.n_features) + cdef float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1284,46 +1528,29 @@ cdef class Tree: for i in range(self.n_features): importances[i] /= normalizer - return np.asarray(importances) - - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. - """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. 
- - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + return np.asarray(importances) + + cdef void _compute_feature_importances( + self, + float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. + + Wrapped in a private function to allow subclassing that + computes feature importances. + """ + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right + + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, float32_t[:, ::1] X, const intp_t[::1] target_features, @@ -1432,6 +1659,286 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : intp_t + The number of nodes (internal nodes + leaves) in the tree. + + capacity : intp_t + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : intp_t + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of intp_t, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of intp_t, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of intp_t, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of float64_t, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of float64_t, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of intp_t, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of float64_t, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. 
+ + leaf_node_samples : dict of node id to numpy array of shapes (n_samples_node, n_features) + A dictionary mapping leaf nodes to the samples of data that are used + to fit the prediction at each leaf. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, *args): + """Constructor.""" + cdef intp_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef intp_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[intp_t, vector[vector[float64_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for 
unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(float64_t)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, intp_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef intp_t i, j + cdef intp_t n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[float64_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[intp_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef intp_t i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1610,7 +2117,7 @@ cdef class _PathFinder(_CCPPruneController): cdef float64_t[:] impurities cdef uint32_t count - def __cinit__(self, intp_t node_count): + def __cinit__(self, intp_t node_count): self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64) self.impurities = np.zeros(shape=(node_count), dtype=np.float64) self.count = 0 @@ -1907,7 +2414,7 @@ cdef void _build_pruned_tree( # value_stride for original tree and new tree are the same intp_t value_stride = orig_tree.value_stride intp_t max_depth_seen = -1 - int rc = 0 + intp_t rc = 0 Node* node float64_t* orig_value_ptr float64_t* new_value_ptr @@ -1915,6 +2422,8 @@ cdef void _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1940,8 +2449,12 @@ cdef void _build_pruned_tree( rc = -2 break + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index bc1d7668187d7..8094537f885d4 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -3,10 +3,13 @@ # See _utils.pyx for details. 
+import numpy as np cimport numpy as cnp -from ._tree cimport Node +cnp.import_array() + from ..neighbors._quad_tree cimport Cell from ..utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t +from ._tree cimport Node cdef enum: @@ -35,7 +38,7 @@ ctypedef fused realloc_ptr: (Cell*) (Node**) -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) @@ -66,12 +69,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef int reset(self) except -1 nogil + cdef intp_t reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -87,14 +90,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int reset(self) except -1 nogil - cdef int update_median_parameters_post_push( + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int update_median_parameters_post_remove( + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c5e936ae48eb1..113c61dcf9f3b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,13 +1,14 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() from ..utils._random cimport our_rand_r @@ -16,7 +17,7 @@ from ..utils._random cimport our_rand_r # Helper functions # ============================================================================= -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: # sizeof(realloc_ptr[0]) would be more like idiomatic C, 
but causes Cython # 0.20.1 to crash. cdef size_t nbytes = nelems * sizeof(p[0][0]) @@ -96,7 +97,7 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedPQueue to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -113,7 +114,7 @@ cdef class WeightedPQueue: cdef intp_t size(self) noexcept nogil: return self.array_ptr - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push record on the array. Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -145,7 +146,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a specific value/weight record from the array. Returns 0 if successful, -1 if record not found.""" cdef intp_t array_ptr = self.array_ptr @@ -173,7 +174,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Remove the top (minimum) element from array. Returns 0 if successful, -1 if nothing to remove.""" cdef intp_t array_ptr = self.array_ptr @@ -194,7 +195,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" cdef WeightedPQueueRecord* array = self.array_ @@ -271,7 +272,7 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedMedianCalculator to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -285,13 +286,13 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k = 0 return 0 - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push a value and its associated weight to the WeightedMedianCalculator Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
""" - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -302,7 +303,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_push( + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, @@ -344,11 +345,11 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -359,11 +360,11 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -379,7 +380,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_remove( + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, diff --git a/sklearn/tree/meson.build b/sklearn/tree/meson.build index 3e16af150b7ae..04d1d5f353d02 100644 --- a/sklearn/tree/meson.build +++ b/sklearn/tree/meson.build @@ -4,16 +4,16 @@ tree_extension_metadata = { 'override_options': ['cython_language=cpp', 'optimization=3']}, '_splitter': {'sources': ['_splitter.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_partitioner': {'sources': ['_partitioner.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_criterion': {'sources': ['_criterion.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_utils': {'sources': ['_utils.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, } foreach ext_name, ext_dict : tree_extension_metadata diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index fb5af073fc8c6..656a7257521ce 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -54,6 +54,9 @@ ignore_warnings, skip_if_32bit, ) +from sklearn.utils.estimator_checks import ( + parametrize_with_checks, +) from sklearn.utils.fixes import ( _IS_32BIT, COO_CONTAINERS, @@ -234,6 +237,18 @@ def assert_tree_equal(d, s, message): ) +@parametrize_with_checks( + [ + DecisionTreeClassifier(), + DecisionTreeRegressor(), + ExtraTreeClassifier(), + ExtraTreeRegressor(), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + def test_classification_toy(): # Check classification on a toy dataset. 
     for name, Tree in CLF_TREES.items():
@@ -892,7 +907,7 @@ def test_pickle():
     else:
         X, y = diabetes.data, diabetes.target

-    est = TreeEstimator(random_state=0)
+    est = TreeEstimator(random_state=0, store_leaf_values=True)
     est.fit(X, y)
     score = est.score(X, y)

@@ -911,6 +926,7 @@ def test_pickle():
         "n_node_samples",
         "weighted_n_node_samples",
         "value",
+        "leaf_nodes_samples",
     ]
     fitted_attribute = {
         attribute: getattr(est.tree_, attribute) for attribute in attributes
@@ -925,14 +941,25 @@ def test_pickle():
         score == score2
     ), "Failed to generate same score after pickling with {0}".format(name)
     for attribute in fitted_attribute:
-        assert_array_equal(
-            getattr(est2.tree_, attribute),
-            fitted_attribute[attribute],
-            err_msg=(
-                f"Failed to generate same attribute {attribute} after pickling with"
-                f" {name}"
-            ),
-        )
+        if attribute == "leaf_nodes_samples":
+            for key in fitted_attribute[attribute].keys():
+                assert_array_equal(
+                    getattr(est2.tree_, attribute)[key],
+                    fitted_attribute[attribute][key],
+                    err_msg=(
+                        f"Failed to generate same attribute {attribute} after"
+                        f" pickling with {name}"
+                    ),
+                )
+        else:
+            assert_array_equal(
+                getattr(est2.tree_, attribute),
+                fitted_attribute[attribute],
+                err_msg=(
+                    f"Failed to generate same attribute {attribute} after pickling"
+                    f" with {name}"
+                ),
+            )


 def test_multioutput():
@@ -2345,8 +2372,8 @@ def test_min_sample_split_1_error(Tree):
     # min_samples_split=1 is invalid
     tree = Tree(min_samples_split=1)
     msg = (
-        r"'min_samples_split' .* must be an int in the range \[2, inf\) "
-        r"or a float in the range \(0.0, 1.0\]"
+        r"'min_samples_split' .* must be an int in the range \[2, inf\)"
+        r".* a float in the range \(0.0, 1.0\]"
    )
     with pytest.raises(ValueError, match=msg):
         tree.fit(X, y)
@@ -2358,7 +2385,9 @@ def test_missing_values_best_splitter_on_equal_nodes_no_missing(criterion):
     X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
     y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

-    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
+    dtc = DecisionTreeRegressor(
+        random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True
+    )
     dtc.fit(X, y)

     # Goes to right node because it has the most data points
@@ -2640,6 +2669,151 @@ def test_sample_weight_non_uniform(make_data, Tree):
     assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X))


+@pytest.mark.parametrize(
+    "tree_name",
+    ALL_TREES,
+)
+def test_leaf_node_samples(tree_name):
+    """Test getting leaf node samples from fitted tree."""
+    tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False)
+    tree.fit(X_small, y_small)
+
+    # Check that the leaf node samples are not stored by default
+    assert tree.tree_.leaf_nodes_samples == dict()
+
+    # error should be raised if trying to predict quantiles
+    assert hasattr(tree, "predict_quantiles")
+    for meth in ["predict_quantiles", "get_leaf_node_samples"]:
+        if hasattr(tree, meth):
+            with pytest.raises(
+                RuntimeError,
+                match="leaf node samples",
+            ):
+                getattr(tree, meth)(X_small)
+
+    quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True)
+    quantile_tree.fit(X_small, y_small)
+
+    score = tree.score(X_small, y_small)
+    new_score = quantile_tree.score(X_small, y_small)
+    assert np.isclose(score, new_score)
+
+    # Check that the leaf node samples are what they should be
+    X_leaves = quantile_tree.apply(X_small)
+    for idx in range(X_leaves.shape[0]):
+        leaf_idx = X_leaves[idx]
+        assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx]
+    assert set(np.unique(X_leaves)) == set(
+        quantile_tree.tree_.leaf_nodes_samples.keys()
+    )
+
+
+@pytest.mark.parametrize(
+    "name",
+    ALL_TREES,
+)
+def test_quantile_tree_predict(name):
+    TreeEstimator = ALL_TREES[name]
+
+    # test quantile prediction
+    est = TreeEstimator(store_leaf_values=True, random_state=0)
+
+    # fit on binary results in perfect leaves, so all quantiles are the same
+    est.fit(X_small, y_small)
+    pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(est.predict(X_small), pred[:, 0])
+    assert_array_equal(est.predict(X_small), pred[:, 1])
+    assert_array_equal(est.predict(X_small), pred[:, 2])
+    assert_array_equal(pred[:, 0], y_small)
+    assert np.unique(pred, axis=1).shape[1] == 1
+
+    est.fit(X_small[:-5], y_small[:-5])
+    held_out_X = X_small[-5:, :]
+    pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(est.predict(held_out_X), pred[:, 0])
+    assert_array_equal(est.predict(held_out_X), pred[:, 1])
+    assert_array_equal(est.predict(held_out_X), pred[:, 2])
+
+    # fit on real data
+    est.fit(iris.data, iris.target)
+    pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(pred[:, 0], iris.target)
+    assert_array_equal(pred[:, 1], iris.target)
+    assert_array_equal(pred[:, 2], iris.target)
+
+
+@pytest.mark.parametrize(
+    "name",
+    ALL_TREES,
+)
+def test_quantile_tree_predict_impure_leaves(name):
+    TreeEstimator = ALL_TREES[name]
+
+    # test quantile prediction
+    est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4)
+    # fit on binary results with constrained depth will result in impure leaves
+    est.fit(X_small, y_small)
+    pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9])
+    assert np.unique(pred, axis=1).shape[1] > 1
+
+
+def test_multioutput_quantiles():
+    # Check estimators on multi-output problems.
+    X = [
+        [-2, -1],
+        [-1, -1],
+        [-1, -2],
+        [1, 1],
+        [1, 2],
+        [2, 1],
+        [-2, 1],
+        [-1, 1],
+        [-1, 2],
+        [2, -1],
+        [1, -1],
+        [1, -2],
+    ]
+
+    y = [
+        [-1, 0],
+        [-1, 0],
+        [-1, 0],
+        [1, 1],
+        [1, 1],
+        [1, 1],
+        [-1, 2],
+        [-1, 2],
+        [-1, 2],
+        [1, 3],
+        [1, 3],
+        [1, 3],
+    ]
+
+    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
+    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
+
+    # toy classification problem
+    for name, TreeClassifier in CLF_TREES.items():
+        clf = TreeClassifier(random_state=0, store_leaf_values=True)
+        clf.fit(X, y)
+
+        y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75])
+        y_hat = y_hat.squeeze()
+        assert_array_equal(y_hat[:, 0], y_true)
+        assert_array_equal(y_hat[:, 1], y_true)
+        assert_array_equal(y_hat[:, 2], y_true)
+        assert y_hat.shape == (4, 3, 2)
+
+    # toy regression problem
+    for name, TreeRegressor in REG_TREES.items():
+        reg = TreeRegressor(random_state=0, store_leaf_values=True)
+        y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75])
+        assert_array_equal(y_hat[:, 0], y_true)
+        assert_array_equal(y_hat[:, 1], y_true)
+        assert_array_equal(y_hat[:, 2], y_true)
+        assert y_hat.shape == (4, 3, 2)
+
+
 def test_deterministic_pickle():
     # Non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/27268
diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx
index f0e649e60fe7c..3a2bff3368d3e 100644
--- a/sklearn/utils/_random.pyx
+++ b/sklearn/utils/_random.pyx
@@ -11,7 +11,10 @@ The module contains:
     * Fast rand_r alternative based on xor shifts
 """
 import numpy as np
-from . import check_random_state
+
+# XXX: use an absolute import instead of the relative import to make reuse in scikit-tree easier
+# from .utils import check_random_state
+from sklearn.utils import check_random_state
 from ._typedefs cimport intp_t