From ea330a78830455530a5879c1c51840370aef4c69 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 8 Sep 2023 14:35:55 -0400 Subject: [PATCH 01/54] Fix merge main Signed-off-by: Adam Li --- Makefile | 3 + setup.py | 27 +- sklearn/ensemble/_forest.py | 682 +++++++++- sklearn/ensemble/_gb.py | 1 + sklearn/ensemble/tests/test_forest.py | 222 ++++ .../tests/test_from_model.py | 8 +- sklearn/tree/_classes.py | 573 ++++++-- sklearn/tree/_criterion.pxd | 79 +- sklearn/tree/_criterion.pyx | 311 +++-- sklearn/tree/_export.py | 12 +- sklearn/tree/_splitter.pxd | 84 +- sklearn/tree/_splitter.pyx | 280 +++- sklearn/tree/_tree.pxd | 127 +- sklearn/tree/_tree.pyx | 1176 ++++++++++++----- sklearn/tree/_utils.pxd | 4 +- sklearn/tree/_utils.pyx | 12 +- sklearn/tree/tests/test_tree.py | 183 ++- 17 files changed, 3034 insertions(+), 750 deletions(-) diff --git a/Makefile b/Makefile index e2ae6aa75ca94..99e3665460a83 100644 --- a/Makefile +++ b/Makefile @@ -62,3 +62,6 @@ doc-noplot: inplace code-analysis: build_tools/linting.sh + +build-dev: + pip install --verbose --no-build-isolation --editable . \ No newline at end of file diff --git a/setup.py b/setup.py index f9ae13c94502b..e033395f3dbd8 100755 --- a/setup.py +++ b/setup.py @@ -225,10 +225,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -310,7 +310,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx.tp"], "include_np": True}, {"sources": ["_kd_tree.pyx.tp"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -378,9 +378,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index eecd13d403744..3ca1a2d347623 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -43,13 +43,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from 
scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -57,9 +58,28 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import _safe_tags +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -68,18 +88,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import _safe_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -161,6 +169,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -193,6 +202,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -201,6 +211,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -227,6 +281,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 
1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -245,6 +304,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -261,6 +322,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -280,6 +343,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -329,7 +401,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -351,6 +423,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -418,7 +493,7 @@ def fit(self, X, y, sample_weight=None): self.n_outputs_ = y.shape[1] - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +530,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. 
+ # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -501,6 +608,7 @@ def fit(self, X, y, sample_weight=None): class_weight=self.class_weight, n_samples_bootstrap=n_samples_bootstrap, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) for i, t in enumerate(trees) ) @@ -620,7 +728,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -679,6 +787,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _more_tags(self): # Only the criterion is required to determine if the tree supports # missing values @@ -702,6 +978,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -726,6 +1013,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -740,6 +1029,8 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -794,7 +1085,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -807,12 +1098,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -848,6 +1155,228 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. 
+ + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = self._validate_data( + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. 
+ n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -913,6 +1442,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -994,6 +1531,8 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1007,6 +1546,8 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1032,6 +1573,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1322,6 +1871,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1481,6 +2040,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1497,6 +2058,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1507,6 +2069,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1698,6 +2262,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -1841,6 +2416,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1857,6 +2434,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1866,6 +2444,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2065,6 +2645,16 @@ class ExtraTreesClassifier(ForestClassifier): .. 
versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2213,6 +2803,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2229,6 +2821,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2239,6 +2832,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2426,6 +3021,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2554,6 +3159,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2570,6 +3177,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2579,6 +3187,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2702,6 +3312,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`gradient_boosting_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2805,6 +3418,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2819,6 +3433,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2827,6 +3442,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2840,7 +3456,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2861,17 +3477,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. 
+ classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. """ # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2891,6 +3510,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2898,7 +3520,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 73bb9e08ae619..990dac614f45c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -351,6 +351,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") _parameter_constraints.pop("monotonic_cst") diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 31e9859076c92..efc5d7d5ee5a4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -115,6 +115,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. 
+ y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1811,6 +1925,114 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
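+    # With bootstrap=False and store_leaf_values=True, each leaf keeps the
+    # training targets routed to it. The targets below are constant within each
+    # quadrant of the plane, so any quantile of a leaf's stored samples recovers
+    # that quadrant's target and the quantile predictions match y_test exactly.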
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ + + @pytest.mark.parametrize( "make_data, Forest", [ diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index aa802136c2f39..0e207d2334761 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,11 @@ from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import make_friedman1 from sklearn.decomposition import PCA -from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomForestRegressor, +) from sklearn.exceptions import NotFittedError from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import ( @@ -402,7 +406,7 @@ def test_partial_fit(): assert_array_almost_equal(X_transform, transformer.transform(data)) # check that if est doesn't have partial_fit, neither does SelectFromModel - transformer = SelectFromModel(estimator=RandomForestClassifier()) + transformer = SelectFromModel(estimator=RandomForestRegressor()) assert not hasattr(transformer, "partial_fit") diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 03ba2f108bbdd..41b4c55b9820a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -11,12 +11,12 @@ # Joly Arnaud # Fares Hedayati # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause import copy import numbers -import warnings from abc import ABCMeta, abstractmethod from math import ceil from numbers import Integral, Real @@ -24,7 +24,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -33,18 +33,22 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_sample_weight, assert_all_finite, check_is_fitted, ) + from . 
import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -106,6 +110,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_samples_split": [ Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), + StrOptions({"sqrt", "log2"}), ], "min_samples_leaf": [ Interval(Integral, 1, None, closed="left"), @@ -122,6 +127,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], "monotonic_cst": ["array-like", None], } @@ -141,6 +147,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): self.criterion = criterion @@ -155,6 +162,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values self.monotonic_cst = monotonic_cst def get_depth(self): @@ -236,6 +244,7 @@ def _fit( sample_weight=None, check_input=True, missing_values_in_feature_mask=None, + classes=None, ): random_state = check_random_state(self.random_state) @@ -250,9 +259,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", force_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -265,7 +277,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -279,45 +291,73 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + if self.class_weight is not None: + y_original = np.copy(y) + + y_encoded = np.zeros(y.shape, dtype=int) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(n_samples): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(self.classes_[j] == y[i, j])[0][ + 0 + ] + else: + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.classes_ = [] - self.n_classes_ = [] + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if self.class_weight is not None: - y_original = np.copy(y) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -325,36 +365,25 @@ def _fit( else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) - if isinstance(self.min_samples_split, numbers.Integral): + if isinstance(self.min_samples_split, str): + if self.min_samples_split == "sqrt": + min_samples_split = max(1, int(np.sqrt(self.n_features_in_))) + elif self.min_samples_split == "log2": + min_samples_split = max(1, int(np.log2(self.n_features_in_))) + elif isinstance(self.min_samples_split, numbers.Integral): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) - min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + self.min_samples_split_ = min_samples_split if isinstance(self.max_features, str): if self.max_features == "auto": if is_classification: max_features = max(1, int(np.sqrt(self.n_features_in_))) - warnings.warn( - ( - "`max_features='auto'` has been deprecated in 1.1 " - "and will be removed in 1.3. To keep the past behaviour, " - "explicitly set `max_features='sqrt'`." - ), - FutureWarning, - ) else: max_features = self.n_features_in_ - warnings.warn( - ( - "`max_features='auto'` has been deprecated in 1.1 " - "and will be removed in 1.3. 
To keep the past behaviour, " - "explicitly set `max_features=1.0'`." - ), - FutureWarning, - ) elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_in_))) elif self.max_features == "log2": @@ -373,16 +402,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -394,10 +417,65 @@ def _fit( else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X=X, + y=y, + sample_weight=sample_weight, + missing_values_in_feature_mask=missing_values_in_feature_mask, + min_samples_leaf=min_samples_leaf, + min_weight_leaf=min_weight_leaf, + max_leaf_nodes=max_leaf_nodes, + min_samples_split=min_samples_split, + max_depth=max_depth, + random_state=random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. + """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -410,7 +488,6 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter if self.monotonic_cst is None: monotonic_cst = None else: @@ -450,7 +527,7 @@ def _fit( # *positive class*, all signs must be flipped. 
monotonic_cst *= -1 - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -472,16 +549,17 @@ def _fit( # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: - builder = DepthFirstTreeBuilder( + self.builder_ = DepthFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: - builder = BestFirstTreeBuilder( + self.builder_ = BestFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -489,9 +567,11 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) + self.builder_.build( + self.tree_, X, y, sample_weight, missing_values_in_feature_mask + ) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -499,8 +579,6 @@ def _fit( self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -549,6 +627,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -575,6 +656,134 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like of length (n_samples,) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -849,6 +1058,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -913,6 +1132,12 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- DecisionTreeRegressor : A decision tree regressor. 
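A minimal usage sketch of the leaf-sample / quantile API added above, assuming this patched fork is installed in place of scikit-learn (load_iris is standard example data, used only for illustration):

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)

    # store_leaf_values=True makes the tree keep the training samples that fall
    # into each leaf; get_leaf_node_samples and predict_quantiles read from them.
    clf = DecisionTreeClassifier(store_leaf_values=True, random_state=0).fit(X, y)

    # list of per-query arrays, each of shape (n_leaf_node_samples, n_outputs)
    leaf_samples = clf.get_leaf_node_samples(X[:3])

    # class predictions at the requested quantiles, shape (n_samples, n_quantiles)
    y_q = clf.predict_quantiles(X[:3], quantiles=[0.25, 0.5, 0.75])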
@@ -960,7 +1185,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -979,6 +1207,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -995,10 +1224,18 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree classifier from the training set (X, y). Parameters @@ -1022,20 +1259,127 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self + def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : DecisionTreeClassifier + Fitted estimator. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + check_input=check_input, + classes=classes, + ) + return self + + if check_input: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = self._validate_data( + X, y, reset=False, validate_separately=(check_X_params, check_y_params) + ) + if issparse(X): + X.sort_indices() + + if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: + raise ValueError( + "No support for np.int64 index based sparse matrices" + ) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but {self.__class__.__name__} " + f"is expecting {self.n_features_in_} features as input." + ) + + y = np.atleast_1d(y) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + check_classification_targets(y) + y = np.copy(y) + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y_encoded = np.zeros(y.shape, dtype=int) + for i in range(X.shape[0]): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(classes[j] == y[i, j])[0][0] + y = y_encoded + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + # Update tree + self.builder_.initialize_node_queue(self.tree_, X, y, sample_weight) + self.builder_.build(self.tree_, X, y, sample_weight) + + self._prune_tree() + + return self + def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1246,6 +1590,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1298,6 +1652,12 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- DecisionTreeClassifier : A decision tree classifier. 
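A hedged sketch of the streaming update path added above: the first ``partial_fit`` call must receive ``classes`` and builds a fresh tree, while later calls re-enter the stored ``builder_`` to grow the existing leaves. This only illustrates the calling pattern; the update semantics are still evolving in this patch.

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X1, y1 = rng.normal(size=(50, 4)), rng.randint(0, 3, size=50)
    X2, y2 = rng.normal(size=(50, 4)), rng.randint(0, 3, size=50)

    clf = DecisionTreeClassifier(random_state=0)
    clf.partial_fit(X1, y1, classes=np.arange(3))  # first call: fits a new tree
    clf.partial_fit(X2, y2)                        # later calls: update existing leaves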
@@ -1342,7 +1702,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1360,6 +1720,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1374,11 +1735,19 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree regressor from the training set (X, y). Parameters @@ -1401,6 +1770,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeRegressor @@ -1412,6 +1784,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self @@ -1589,6 +1962,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1653,6 +2036,12 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -1708,6 +2097,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1723,6 +2113,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1854,6 +2245,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. 
+ One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1903,6 +2304,12 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + builder_ : TreeBuilder instance + The underlying TreeBuilder object. + + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. @@ -1953,6 +2360,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1967,5 +2375,6 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index b765d324bebb9..690f4d0c54c64 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,33 +4,33 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _criterion.pyx for implementation details. cimport numpy as cnp -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer +from libcpp.vector cimport vector -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end - cdef SIZE_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef SIZE_t n_outputs # Number of outputs cdef SIZE_t n_samples # Number of samples @@ -41,21 +41,11 @@ cdef class Criterion: cdef double weighted_n_right # Weighted number of samples in the right node cdef double weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
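    # Illustrative example (not part of the interface): with
    # sample_indices = [7, 2, 9, 4, 1], start = 0, end = 5 and pos = 2, the
    # left-child statistics are computed over y rows {7, 2} and the
    # right-child statistics over y rows {9, 4, 1}; update(new_pos) then moves
    # samples from the right side to the left as the split point advances.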
# Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -69,13 +59,6 @@ cdef class Criterion: self, double* dest ) noexcept nogil - cdef void clip_node_value( - self, - double* dest, - double lower_bound, - double upper_bound - ) noexcept nogil - cdef double middle_value(self) noexcept nogil cdef double impurity_improvement( self, double impurity_parent, @@ -83,6 +66,35 @@ cdef class Criterion: double impurity_right ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef SIZE_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -97,6 +109,13 @@ cdef class Criterion: double sum_left, double sum_right, ) noexcept nogil + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil + cdef double middle_value(self) noexcept nogil cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index ed8a12065554e..f47feb9c9f59d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,30 +12,47 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. +cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. 
+ + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -40,53 +60,6 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: SIZE_t - Number of missing values for specific feature. - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. @@ -157,16 +130,6 @@ cdef class Criterion: """ pass - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: - pass - - cdef double middle_value(self) noexcept nogil: - """Compute the middle value of a split for monotonicity constraints - - This method is implemented in ClassificationCriterion and RegressionCriterion. - """ - pass - cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. @@ -221,6 +184,90 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. 
+ + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: SIZE_t + Number of missing values for specific feature. + """ + pass + + cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil: + pass + + cdef double middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -254,6 +301,33 @@ cdef class Criterion: cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. + """ + cdef SIZE_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, @@ -352,15 +426,10 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. 
- Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -375,18 +444,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -399,12 +474,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -415,7 +490,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -695,13 +769,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -819,7 +890,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -831,7 +901,6 @@ cdef class RegressionCriterion(Criterion): ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -862,23 +931,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. 
- """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -887,14 +962,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -906,7 +981,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -1074,7 +1148,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. - MSE = var_left + var_right """ @@ -1222,26 +1295,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. cdef void** left_child = self.left_child_ptr @@ -1252,10 +1329,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1270,7 +1347,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1561,6 +1637,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index ff0d6db5c25a5..9cd6ad4b71387 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,9 +17,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2547e14b324df..4c67c35ebbdb0 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,19 +4,23 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. cimport numpy as cnp -from ._criterion cimport Criterion +from libcpp.vector cimport vector + +from ._criterion cimport BaseCriterion, Criterion +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef struct SplitRecord: # Data to track sample split @@ -33,14 +37,15 @@ cdef struct SplitRecord: unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -59,14 +64,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y - # Monotonicity constraints for each feature. - # The encoding is as follows: - # -1: monotonic decrease - # 0: no constraint - # +1: monotonic increase - cdef const cnp.int8_t[:] monotonic_cst - cdef bint with_monotonic_cst cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -86,21 +83,12 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
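    # Illustrative call sequence (assumed from the builder code in this patch,
    # not part of this header): a tree builder first calls
    #   node_reset(start, end, &weighted_n_node_samples)
    # to focus the splitter on samples[start:end], then
    #   node_split(impurity, split_ptr, &n_constant_features, lower, upper)
    # with a SplitRecord pointer allocated via pointer_size(), which is filled
    # with the chosen feature, threshold, split position ``pos`` and child
    # impurities, and finally node_value(dest) to store the node's prediction.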
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node @@ -109,9 +97,49 @@ cdef class Splitter: double lower_bound, double upper_bound, ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil + cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil - cdef void clip_node_value(self, double* dest, double lower_bound, double upper_bound) noexcept nogil +cdef class Splitter(BaseSplitter): + """Base class for supervised splitters.""" + + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y - cdef double node_impurity(self) noexcept nogil + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const cnp.int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord* current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil + + cdef void clip_node_value( + self, + double* dest, + double lower_bound, + double upper_bound + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5c30ba315a90a..982c68455040d 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -8,26 +11,26 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause +from cython cimport final +from libc.math cimport isnan +from libc.stdlib cimport qsort +from libc.string cimport memcpy cimport numpy as cnp from ._criterion cimport Criterion -from libc.stdlib cimport qsort -from libc.string cimport memcpy -from libc.math cimport isnan -from cython cimport final - import numpy as np from scipy.sparse import issparse -from ._utils cimport log -from ._utils cimport rand_int -from ._utils cimport rand_uniform -from ._utils cimport RAND_R_MAX +from ._utils cimport RAND_R_MAX, log, rand_int, rand_uniform + cdef double INFINITY = np.inf @@ -48,13 +51,96 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. 
+ + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split( + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double upper_bound + ) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + + Parameters + ---------- + impurity : double + The impurity of the current node. + split : SplitRecord pointer + A pointer to a memory-allocated SplitRecord object which will be filled with the + split chosen. + n_constant_features : SIZE_t pointer + A pointer to a memory-allocated SIZE_t object which will be filled with the + number of constant features. Optional to use. + lower_bound : double + The lower bound of the monotonic constraint if used. + upper_bound : double + The upper bound of the monotonic constraint if used. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__( self, Criterion criterion, @@ -63,6 +149,7 @@ cdef class Splitter: double min_weight_leaf, object random_state, const cnp.int8_t[:] monotonic_cst, + *argv ): """ Parameters @@ -90,7 +177,6 @@ cdef class Splitter: Monotonicity constraints """ - self.criterion = criterion self.n_samples = 0 @@ -103,19 +189,13 @@ cdef class Splitter: self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, self.random_state, - self.monotonic_cst), self.__getstate__()) + self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) cdef int init( self, @@ -149,7 +229,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. 
""" - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -187,8 +266,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -211,37 +303,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound, - ) except -1 nogil: - - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -252,11 +318,62 @@ cdef class Splitter: self.criterion.clip_node_value(dest, lower_bound, upper_bound) + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef inline bint check_presplit_conditions( + self, + SplitRecord* current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef SIZE_t end_non_missing = self.end - n_missing + cdef SIZE_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef inline bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -275,6 +392,7 @@ cdef inline void shift_missing_values_to_left_if_required( samples[i], samples[current_end] = samples[current_end], samples[i] best.pos += best.n_missing + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -412,7 +530,6 @@ cdef inline int node_split_best( if has_missing: criterion.init_missing(n_missing) # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. @@ -433,18 +550,30 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): + continue # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -460,8 +589,7 @@ cdef inline int node_split_best( continue # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -691,8 +819,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -788,8 +914,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(¤t_split, 0, 0) == 1: continue # Evaluate split @@ -799,8 +924,19 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: + continue + + # Reject if monotonicity constraints are not satisfied + if ( + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) + ): continue # Reject if monotonicity constraints are not satisfied @@ -1501,12 +1637,12 @@ cdef class BestSplitter(Splitter): ) cdef int node_split( - self, - double impurity, - SplitRecord* split, - SIZE_t* n_constant_features, - double lower_bound, - double upper_bound + self, + double impurity, + SplitRecord* split, + SIZE_t* n_constant_features, + double lower_bound, + double 
upper_bound ) except -1 nogil: return node_split_best( self, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index b99f44c0472a2..886770bfabc15 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -5,13 +5,17 @@ # Arnaud Joly # Jacob Schreiber # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause # See _tree.pyx for details. import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -19,8 +23,8 @@ ctypedef cnp.npy_intp SIZE_t # Type for indices and counters ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -35,40 +39,45 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + cdef SIZE_t _update_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -80,6 +89,60 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. 
+ # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -100,6 +163,18 @@ cdef class TreeBuilder: cdef double min_weight_leaf # Minimum weight in a leaf cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef cnp.ndarray initial_roots # Leaf nodes for streaming updates + + cdef unsigned char store_leaf_values # Whether to store leaf values + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=*, + const unsigned char[::1] missing_values_in_feature_mask=*, + ) cpdef build( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ce998e80a9d0a..35c64e6265f3a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,25 +12,26 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Haoyin Xu # # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan -from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.sparse import issparse @@ -36,6 +40,7 @@ from scipy.sparse import csr_matrix from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, int nd, cnp.npy_intp* dims, @@ -87,6 +92,17 @@ NODE_DTYPE = np.asarray((&dummy)).dtype cdef class TreeBuilder: """Interface for different tree building strategies.""" + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + const unsigned char[::1] missing_values_in_feature_mask=None, + ): + """Build a decision tree from the training set (X, y).""" + pass + cpdef build( self, Tree tree, @@ -153,15 +169,100 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + cnp.ndarray 
initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(DepthFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight=None, + const unsigned char[::1] missing_values_in_feature_mask=None, + ): + """Initialize a list of roots""" + X, y, sample_weight = self._check_input(X, y, sample_weight) + + # organize samples by decision paths + paths = tree.decision_path(X) + cdef int PARENT + cdef int CHILD + cdef int i + false_roots = {} + X_copy = {} + y_copy = {} + for i in range(X.shape[0]): + # collect depths from the node paths + depth_i = paths[i].indices.shape[0] - 1 + PARENT = depth_i - 1 + CHILD = depth_i + + # find leaf node's & their parent node's IDs + if PARENT < 0: + parent_i = 0 + else: + parent_i = paths[i].indices[PARENT] + child_i = paths[i].indices[CHILD] + left = 0 + if tree.children_left[parent_i] == child_i: + left = 1 # leaf node is left child + + # organize samples by the leaf they fall into (false root) + # leaf nodes are marked by parent node and + # their relative position (left or right child) + if (parent_i, left) in false_roots: + false_roots[(parent_i, left)][0] += 1 + X_copy[(parent_i, left)].append(X[i]) + y_copy[(parent_i, left)].append(y[i]) + else: + false_roots[(parent_i, left)] = [1, depth_i] + X_copy[(parent_i, left)] = [X[i]] + y_copy[(parent_i, left)] = [y[i]] + + X_list = [] + y_list = [] + + # reorder the samples according to parent node IDs + for key, value in reversed(sorted(X_copy.items())): + X_list = X_list + value + y_list = y_list + y_copy[key] + cdef object X_new = np.array(X_list) + cdef cnp.ndarray y_new = np.array(y_list) + + # initialize the splitter using sorted samples + cdef Splitter splitter = self.splitter + splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask) + + # convert dict to numpy array and store value + self.initial_roots = np.array(list(false_roots.items())) cpdef build( self, @@ -176,16 +277,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Initial capacity - cdef int init_capacity - - if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 - else: - init_capacity = 2047 - - tree._resize(init_capacity) - # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_depth = self.max_depth @@ -193,34 +284,74 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_weight_leaf = self.min_weight_leaf cdef SIZE_t min_samples_split = self.min_samples_split cdef double min_impurity_decrease = self.min_impurity_decrease + cdef unsigned char store_leaf_values = self.store_leaf_values + cdef cnp.ndarray initial_roots = self.initial_roots - # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + # Initial capacity + cdef int init_capacity + cdef bint first = 0 + if initial_roots is None: + # Recursive partition (without 
actual recursion) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + first = 1 + else: + # convert numpy array back to dict + false_roots = {} + for key_value_pair in initial_roots: + false_roots[tuple(key_value_pair[0])] = key_value_pair[1] - cdef SIZE_t start - cdef SIZE_t end + # reset the root array + self.initial_roots = None + + cdef SIZE_t start = 0 + cdef SIZE_t end = 0 cdef SIZE_t depth cdef SIZE_t parent cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef double lower_bound cdef double upper_bound cdef double middle_value cdef SIZE_t n_constant_features cdef bint is_leaf - cdef bint first = 1 - cdef SIZE_t max_depth_seen = -1 + cdef SIZE_t max_depth_seen = -1 if first else tree.max_depth cdef int rc = 0 cdef stack[StackRecord] builder_stack + cdef stack[StackRecord] update_stack cdef StackRecord stack_record - with nogil: + if not first: + # push reached leaf nodes onto stack + for key, value in reversed(sorted(false_roots.items())): + end += value[0] + update_stack.push({ + "start": start, + "end": end, + "depth": value[1], + "parent": key[0], + "is_left": key[1], + "impurity": tree.impurity[key[0]], + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) + start += value[0] + else: # push root node onto stack builder_stack.push({ "start": 0, @@ -234,6 +365,132 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "upper_bound": INFINITY, }) + with nogil: + while not update_stack.empty(): + stack_record = update_stack.top() + update_stack.pop() + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + impurity = stack_record.impurity + n_constant_features = stack_record.n_constant_features + lower_bound = stack_record.lower_bound + upper_bound = stack_record.upper_bound + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or impurity <= EPSILON + + if not is_leaf: + splitter.node_split( + impurity, + split_ptr, + &n_constant_features, + lower_bound, + upper_bound + ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._update_node(parent, is_left, is_leaf, + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, + split.missing_go_to_left) + + if node_id == INTPTR_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + + if 
not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = lower_bound + left_child_max = right_child_max = upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = lower_bound + right_child_max = upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = lower_bound + left_child_max = upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + + # Push right child on stack + builder_stack.push({ + "start": split.pos, + "end": end, + "depth": depth + 1, + "parent": node_id, + "is_left": 0, + "impurity": split.impurity_right, + "n_constant_features": n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) + + # Push left child on stack + builder_stack.push({ + "start": start, + "end": split.pos, + "depth": depth + 1, + "parent": node_id, + "is_left": 1, + "impurity": split.impurity_left, + "n_constant_features": n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) + + if depth > max_depth_seen: + max_depth_seen = depth + while not builder_stack.empty(): stack_record = builder_stack.top() builder_stack.pop() @@ -258,7 +515,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if first: impurity = splitter.node_impurity() - first = 0 + first=0 # impurity == 0 with tolerance due to rounding errors is_leaf = is_leaf or impurity <= EPSILON @@ -266,11 +523,16 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -278,10 +540,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -351,6 +612,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": left_child_min, "upper_bound": left_child_max, }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -360,10 +624,13 @@ cdef 
class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() - # Best first builder ---------------------------------------------------------- cdef struct FrontierRecord: # Record of information of a Node, the frontier for a split. Those records are @@ -406,10 +673,18 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -417,6 +692,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(BestFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) cpdef build( self, @@ -434,6 +723,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Parameters cdef Splitter splitter = self.splitter cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes + cdef unsigned char store_leaf_values = self.store_leaf_values # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) @@ -492,6 +782,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -600,6 +893,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -623,11 +918,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( impurity, - &split, + split_ptr, &n_constant_features, lower_bound, upper_bound ) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -637,9 +936,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -673,6 +971,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = impurity res.impurity_right = impurity + free(split_ptr) return 0 @@ -680,252 +979,153 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. +cdef class BaseTree: + """Base class for Cython tree models. - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! + Downstream classes must implement methods to actually traverse the tree. + """ + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: + """Guts of _resize - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. + if capacity == INTPTR_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. 
+ safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) - n_leaves : int - Number of leaves in the tree. + # value memory is initialised to 0 to enable classifier argmax + if capacity > self.capacity: + memset((self.value + self.capacity * self.value_stride), 0, + (capacity - self.capacity) * self.value_stride * + sizeof(double)) - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. + # if capacity smaller than node_count, adjust the counter + if capacity < self.node_count: + self.node_count = capacity - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. + self.capacity = capacity + return 0 - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. - - missing_go_to_left : array of bool, shape [node_count] - missing_go_to_left[i] holds a bool indicating whether or not there were - missing values at node i. - """ - # Wrap for outside world. - # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # 
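The _resize_c logic hoisted into BaseTree above keeps the usual amortized-doubling growth rule; roughly, in Python terms (next_capacity is an illustrative name, and INTPTR_MAX stands in for the C sentinel meaning "no explicit capacity requested"):

def next_capacity(requested, current_capacity, INTPTR_MAX=(1 << 63) - 1):
    # Growth rule from BaseTree._resize_c: an unspecified request doubles the
    # current capacity, starting from a small default of 3 nodes.
    if requested == INTPTR_MAX:
        return 3 if current_capacity == 0 else 2 * current_capacity
    return requested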
Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: - """Resize all inner arrays to `capacity`, if `capacity` == -1, then - double the size of the inner arrays. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil: + """Set split node data. - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + node_id : SIZE_t + The index of the node. """ - if self._resize_c(capacity) != 0: - # Acquire gil only if we need to raise - with gil: - raise MemoryError() + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: - """Guts of _resize + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + SIZE_t node_id, + ) except -1 nogil: + """Set leaf node data. - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + node_id : SIZE_t + The index of the node. 
""" - if capacity == self.capacity and self.nodes != NULL: - return 0 - - if capacity == INTPTR_MAX: - if self.capacity == 0: - capacity = 3 # default initial value - else: - capacity = 2 * self.capacity - - safe_realloc(&self.nodes, capacity) - safe_realloc(&self.value, capacity * self.value_stride) - - # value memory is initialised to 0 to enable classifier argmax - if capacity > self.capacity: - memset((self.value + self.capacity * self.value_stride), 0, - (capacity - self.capacity) * self.value_stride * - sizeof(double)) - - # if capacity smaller than node_count, adjust the counter - if capacity < self.node_count: - self.node_count = capacity + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 - self.capacity = capacity - return 0 + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil: + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -946,28 +1146,61 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out + cdef inline SIZE_t _update_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. 
+ + The updated node remains on the same position. + Returns (size_t)(-1) on error. + """ + cdef SIZE_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" @@ -1002,9 +1235,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -1072,7 +1306,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -1121,6 +1354,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -1132,7 +1368,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1261,13 +1499,12 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) @@ -1275,13 +1512,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1295,47 +1528,30 @@ cdef class Tree: for i in range(self.n_features): importances[i] /= normalizer - return np.asarray(importances) - - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. 
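Where _add_node appends a fresh node, the _update_node method above overwrites an existing child slot in place, which is what lets a previously fitted leaf be refreshed or re-split when more data arrives. The bookkeeping amounts to the following sketch, with nodes modelled as dicts purely for illustration:

def update_node(nodes, parent, is_left, is_leaf, split, stats):
    # Locate the existing child slot instead of appending a new node.
    node_id = nodes[parent]["left"] if is_left else nodes[parent]["right"]
    node = nodes[node_id]
    node.update(stats)                        # impurity, sample counts, weights
    if is_leaf:
        node["left"] = node["right"] = None   # mark the node as a leaf again
        node["feature"] = node["threshold"] = None
    else:
        node["feature"] = split["feature"]    # children get attached later
        node["threshold"] = split["threshold"]
    return node_id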
- """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. - - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - + return np.asarray(importances) + + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. + + Wrapped in a private function to allow subclassing that + computes feature importances. + """ + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right + + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) + def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, double[::1] out): @@ -1443,6 +1659,286 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. 
+ + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + + leaf_node_samples : dict of node id to numpy array of shapes (n_samples_node, n_features) + A dictionary mapping leaf nodes to the samples of data that are used + to fit the prediction at each leaf. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def 
__getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
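Since value_samples now travels through __getstate__/__setstate__ above, the per-leaf training samples survive pickling. A small usage sketch against the patched estimator (store_leaf_values and leaf_nodes_samples are features of this fork, not upstream scikit-learn):

import pickle
import numpy as np
from sklearn.tree import DecisionTreeRegressor

X = np.arange(20, dtype=np.float64).reshape(-1, 1)
y = np.sin(X).ravel()
reg = DecisionTreeRegressor(random_state=0, store_leaf_values=True).fit(X, y)
restored = pickle.loads(pickle.dumps(reg))
# The leaf ids and their stored samples survive the round trip.
assert set(restored.tree_.leaf_nodes_samples) == set(reg.tree_.leaf_nodes_samples)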
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1927,6 +2423,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1943,8 +2441,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4b953af2d9b2b..61ba8af197c2e 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -9,8 +9,10 @@ # See _utils.pyx for details. cimport numpy as cnp + +from sklearn.neighbors._quad_tree cimport Cell + from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..02dc7cf426efc 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly @@ -7,16 +10,17 @@ # # License: BSD 3 clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 034ee5fc39917..c14c50f24a516 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -882,7 +882,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -901,6 +901,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -915,14 +916,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for 
attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2401,8 +2413,8 @@ def test_min_sample_split_1_error(Tree): # min_samples_split=1 is invalid tree = Tree(min_samples_split=1) msg = ( - r"'min_samples_split' .* must be an int in the range \[2, inf\) " - r"or a float in the range \(0.0, 1.0\]" + r"'min_samples_split' .* must be an int in the range \[2, inf\)" + r".* a float in the range \(0.0, 1.0\]" ) with pytest.raises(ValueError, match=msg): tree.fit(X, y) @@ -2414,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points @@ -2626,3 +2640,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + 
assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) From e2fee00aa461c21b8cfa59eb907d27972415c99b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Sep 2023 17:55:39 -0400 Subject: [PATCH 02/54] Fix lint Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 4c67c35ebbdb0..5c82bbe193c18 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -103,7 +103,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): """Base class for supervised splitters.""" - + cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y From 45b9e33da93f2c71cf550761179ae95eaecb0fbc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 22:29:15 -0400 Subject: [PATCH 03/54] Fix utils.pyx typing Signed-off-by: Adam Li --- sklearn/tree/_utils.pxd | 24 ++++++++++++------------ sklearn/tree/_utils.pyx | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 
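Returning to the quantile-prediction tests above: once leaf samples are stored, per-sample quantiles can be read straight off the values collected in the reached leaf. A rough sketch of the underlying idea (this is not the fork's actual predict_quantiles implementation):

import numpy as np

def predict_quantiles_from_leaves(tree_, X, quantiles):
    # `tree_` is a low-level Tree carrying the patched leaf_nodes_samples map.
    leaf_ids = tree_.apply(np.asarray(X, dtype=np.float32))
    out = np.empty((len(leaf_ids), len(quantiles)))
    for i, leaf in enumerate(leaf_ids):
        samples = np.asarray(tree_.leaf_nodes_samples[leaf]).ravel()
        out[i] = np.quantile(samples, quantiles)
    return out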
b4dc9360e1f8f..918cf39846821 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -42,7 +42,7 @@ ctypedef fused realloc_ptr: (Cell*) (Node**) -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) @@ -73,12 +73,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef int reset(self) except -1 nogil + cdef intp_t reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -94,14 +94,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int reset(self) except -1 nogil - cdef int update_median_parameters_post_push( + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int update_median_parameters_post_remove( + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 4747ce3a339f4..1185967e24e8c 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -26,7 +26,7 @@ from sklearn.utils._random cimport our_rand_r # Helper functions # ============================================================================= -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython # 0.20.1 to crash. 
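The WeightedPQueue / WeightedMedianCalculator signatures retyped above maintain a running weighted median (used by the MAE criterion). Conceptually, the quantity tracked is the smallest value at which the cumulative weight reaches half of the total; a simplified sketch that ignores the exact tie handling of the Cython class:

def weighted_median(values, weights):
    # Simplified view of WeightedMedianCalculator.get_median(): scan sorted
    # values until the cumulative weight reaches half of the total weight.
    pairs = sorted(zip(values, weights))
    total = float(sum(weights))
    cumulative = 0.0
    for value, weight in pairs:
        cumulative += weight
        if cumulative >= total / 2.0:
            return value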
cdef size_t nbytes = nelems * sizeof(p[0][0]) From 01d26303ae77ddb8d25cef14feb4be7cd03111f6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 11 Oct 2023 22:45:21 -0400 Subject: [PATCH 04/54] Try absolute import Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_utils.pxd | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index e464ab02005c3..46ca9102e67a8 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -13,7 +13,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from ..utils._typedefs cimport float64_t, intp_t +from sklearn.utils._typedefs cimport float64_t, intp_t cdef class BaseCriterion: diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index a8820ee3c94ed..88025fbfde502 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,8 +14,8 @@ cimport numpy as cnp from libcpp.vector cimport vector +from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._criterion cimport BaseCriterion, Criterion -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t cdef struct SplitRecord: diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 918cf39846821..a74b0f1ed1b76 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -11,10 +11,9 @@ cimport numpy as cnp from sklearn.neighbors._quad_tree cimport Cell +from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t cdef enum: # Max value for our rand_r replacement (near the bottom). 
From 5715cfcd7aafc3041459bf894a41a5560dfe977a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:37:26 -0400 Subject: [PATCH 05/54] Try again Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_utils.pxd | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 46ca9102e67a8..e464ab02005c3 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -13,7 +13,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from sklearn.utils._typedefs cimport float64_t, intp_t +from ..utils._typedefs cimport float64_t, intp_t cdef class BaseCriterion: diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 88025fbfde502..2420c94ee6557 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,7 +14,7 @@ cimport numpy as cnp from libcpp.vector cimport vector -from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._criterion cimport BaseCriterion, Criterion diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index a74b0f1ed1b76..bb51d5a039357 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,8 +10,8 @@ cimport numpy as cnp -from sklearn.neighbors._quad_tree cimport Cell -from sklearn.utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..neighbors._quad_tree cimport Cell +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t from ._tree cimport Node From 5336b1f31e50892bdbcc12e5cbdbca4e166f2027 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:38:19 -0400 Subject: [PATCH 06/54] Update import path Signed-off-by: Adam Li --- sklearn/tree/_utils.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 1185967e24e8c..23c358ce4bd8b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -20,7 +20,7 @@ cimport numpy as cnp cnp.import_array() -from sklearn.utils._random cimport our_rand_r +from ..utils._random cimport our_rand_r # ============================================================================= # Helper functions From d49572ab11a81299acca4e56885908089efdb9b4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 11:58:48 -0400 Subject: [PATCH 07/54] Make submodule install easier Signed-off-by: Adam Li --- sklearn/utils/_random.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 6f9c3bdb487cc..277474f15d0db 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -14,7 +14,9 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from . 
import check_random_state +# XXX: added instead of relative import to make scikit-tree easier +# from .utils import check_random_state +from sklearn.utils import check_random_state cdef UINT32_t DEFAULT_SEED = 1 From 99a9f9161347e2b70a419b6507a10409f11a53bb Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 13:59:18 -0400 Subject: [PATCH 08/54] Change ctypedef in random Signed-off-by: Adam Li --- sklearn/utils/_random.pxd | 17 +++++++++-------- sklearn/utils/_random.pyx | 4 ++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/_random.pxd b/sklearn/utils/_random.pxd index b5199fc506f4e..4b291489716fc 100644 --- a/sklearn/utils/_random.pxd +++ b/sklearn/utils/_random.pxd @@ -4,9 +4,10 @@ cimport numpy as cnp -ctypedef cnp.npy_uint32 UINT32_t -cdef inline UINT32_t DEFAULT_SEED = 1 +from ._typedefs cimport uint32_t + +cdef inline uint32_t DEFAULT_SEED = 1 cdef enum: # Max value for our rand_r replacement (near the bottom). @@ -23,18 +24,18 @@ cpdef sample_without_replacement(cnp.int_t n_population, # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details -cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: +cdef inline uint32_t our_rand_r(uint32_t* seed) nogil: """Generate a pseudo-random np.uint32 from a np.uint32 seed""" # seed shouldn't ever be 0. if (seed[0] == 0): seed[0] = DEFAULT_SEED - seed[0] ^= (seed[0] << 13) - seed[0] ^= (seed[0] >> 17) - seed[0] ^= (seed[0] << 5) + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) # Use the modulo to make sure that we don't return a values greater than the # maximum representable value for signed 32bit integers (i.e. 2^31 - 1). # Note that the parenthesis are needed to avoid overflow: here - # RAND_R_MAX is cast to UINT32_t before 1 is added. - return seed[0] % ((RAND_R_MAX) + 1) + # RAND_R_MAX is cast to uint32_t before 1 is added. 
+ return seed[0] % ((RAND_R_MAX) + 1) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 277474f15d0db..9c3d93ffd3bd8 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -18,7 +18,7 @@ cnp.import_array() # from .utils import check_random_state from sklearn.utils import check_random_state -cdef UINT32_t DEFAULT_SEED = 1 +cdef uint32_t DEFAULT_SEED = 1 cpdef _sample_without_replacement_check_input(cnp.int_t n_population, @@ -307,5 +307,5 @@ cpdef sample_without_replacement(cnp.int_t n_population, def _our_rand_r_py(seed): """Python utils to test the our_rand_r function""" - cdef UINT32_t my_seed = seed + cdef uint32_t my_seed = seed return our_rand_r(&my_seed) From 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 12 Oct 2023 14:53:02 -0400 Subject: [PATCH 09/54] Revert UINT32_t Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 5 +++-- sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pxd | 1 + sklearn/tree/_utils.pxd | 10 +++++++--- sklearn/tree/_utils.pyx | 4 ++-- sklearn/utils/_random.pxd | 17 ++++++++--------- sklearn/utils/_random.pyx | 4 ++-- 7 files changed, 25 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 2420c94ee6557..29554103a6b70 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -14,7 +14,8 @@ cimport numpy as cnp from libcpp.vector cimport vector -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t +from ._utils cimport UINT32_t from ._criterion cimport BaseCriterion, Criterion @@ -47,7 +48,7 @@ cdef class BaseSplitter: cdef public float64_t min_weight_leaf # Minimum weight in a leaf cdef object random_state # Random state - cdef uint32_t rand_r_state # sklearn_rand_r random number state + cdef UINT32_t rand_r_state # sklearn_rand_r random number state cdef intp_t[::1] samples # Sample indices in X, y cdef intp_t n_samples # X.shape[0] diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b9635a4930974..2a44be8d1ce2b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -467,7 +467,7 @@ cdef inline intp_t node_split_best( cdef intp_t max_features = splitter.max_features cdef intp_t min_samples_leaf = splitter.min_samples_leaf cdef float64_t min_weight_leaf = splitter.min_weight_leaf - cdef uint32_t* random_state = &splitter.rand_r_state + cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split cdef float64_t current_proxy_improvement = -INFINITY @@ -848,7 +848,7 @@ cdef inline intp_t node_split_random( cdef intp_t n_features = splitter.n_features cdef intp_t max_features = splitter.max_features - cdef uint32_t* random_state = &splitter.rand_r_state + cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split cdef float64_t current_proxy_improvement = - INFINITY diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index bd087a48d3b24..ff69b7c6df819 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -19,6 +19,7 @@ from libcpp.vector cimport vector from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ._utils cimport UINT32_t from ._splitter cimport SplitRecord, Splitter diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index bb51d5a039357..03a1d48c94cb4 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ 
-8,13 +8,17 @@ # See _utils.pyx for details. +import numpy as np cimport numpy as cnp +cnp.import_array() +ctypedef cnp.npy_uint32 UINT32_t from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t from ._tree cimport Node + cdef enum: # Max value for our rand_r replacement (near the bottom). # We don't use RAND_MAX because it's different across platforms and @@ -48,11 +52,11 @@ cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) cdef intp_t rand_int(intp_t low, intp_t high, - uint32_t* random_state) noexcept nogil + UINT32_t* random_state) noexcept nogil cdef float64_t rand_uniform(float64_t low, float64_t high, - uint32_t* random_state) noexcept nogil + UINT32_t* random_state) noexcept nogil cdef float64_t log(float64_t x) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 23c358ce4bd8b..cc4cb7cf02533 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -60,13 +60,13 @@ cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size): cdef inline intp_t rand_int(intp_t low, intp_t high, - uint32_t* random_state) noexcept nogil: + UINT32_t* random_state) noexcept nogil: """Generate a random integer in [low; end).""" return low + our_rand_r(random_state) % (high - low) cdef inline float64_t rand_uniform(float64_t low, float64_t high, - uint32_t* random_state) noexcept nogil: + UINT32_t* random_state) noexcept nogil: """Generate a random float64_t in [low; high).""" return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low diff --git a/sklearn/utils/_random.pxd b/sklearn/utils/_random.pxd index 4b291489716fc..b5199fc506f4e 100644 --- a/sklearn/utils/_random.pxd +++ b/sklearn/utils/_random.pxd @@ -4,10 +4,9 @@ cimport numpy as cnp +ctypedef cnp.npy_uint32 UINT32_t -from ._typedefs cimport uint32_t - -cdef inline uint32_t DEFAULT_SEED = 1 +cdef inline UINT32_t DEFAULT_SEED = 1 cdef enum: # Max value for our rand_r replacement (near the bottom). @@ -24,18 +23,18 @@ cpdef sample_without_replacement(cnp.int_t n_population, # rand_r replacement using a 32bit XorShift generator # See http://www.jstatsoft.org/v08/i14/paper for details -cdef inline uint32_t our_rand_r(uint32_t* seed) nogil: +cdef inline UINT32_t our_rand_r(UINT32_t* seed) nogil: """Generate a pseudo-random np.uint32 from a np.uint32 seed""" # seed shouldn't ever be 0. if (seed[0] == 0): seed[0] = DEFAULT_SEED - seed[0] ^= (seed[0] << 13) - seed[0] ^= (seed[0] >> 17) - seed[0] ^= (seed[0] << 5) + seed[0] ^= (seed[0] << 13) + seed[0] ^= (seed[0] >> 17) + seed[0] ^= (seed[0] << 5) # Use the modulo to make sure that we don't return a values greater than the # maximum representable value for signed 32bit integers (i.e. 2^31 - 1). # Note that the parenthesis are needed to avoid overflow: here - # RAND_R_MAX is cast to uint32_t before 1 is added. - return seed[0] % ((RAND_R_MAX) + 1) + # RAND_R_MAX is cast to UINT32_t before 1 is added. 
+ return seed[0] % ((RAND_R_MAX) + 1) diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx index 9c3d93ffd3bd8..277474f15d0db 100644 --- a/sklearn/utils/_random.pyx +++ b/sklearn/utils/_random.pyx @@ -18,7 +18,7 @@ cnp.import_array() # from .utils import check_random_state from sklearn.utils import check_random_state -cdef uint32_t DEFAULT_SEED = 1 +cdef UINT32_t DEFAULT_SEED = 1 cpdef _sample_without_replacement_check_input(cnp.int_t n_population, @@ -307,5 +307,5 @@ cpdef sample_without_replacement(cnp.int_t n_population, def _our_rand_r_py(seed): """Python utils to test the our_rand_r function""" - cdef uint32_t my_seed = seed + cdef UINT32_t my_seed = seed return our_rand_r(&my_seed) From 09f77851bd06bef8674cdadfadaa38469f96ead6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 17 Oct 2023 09:57:03 -0400 Subject: [PATCH 10/54] Change cnp.float64 to float64_t Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 2 +- sklearn/tree/_tree.pyx | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index ff69b7c6df819..9a6f2f0914095 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -107,7 +107,7 @@ cdef class BaseTree: ) noexcept nogil cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node, ) noexcept nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 00b9c289b1feb..5a8a200ed9680 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1506,7 +1506,7 @@ cdef class BaseTree: cdef float64_t normalizer = 0. cdef intp_t i = 0 - cdef cnp.float64_t[:] importances = np.zeros(self.n_features) + cdef float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: @@ -1532,7 +1532,7 @@ cdef class BaseTree: cdef void _compute_feature_importances( self, - cnp.float64_t[:] importances, + float64_t[:] importances, Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. 
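The `our_rand_r` helper shuffled between the `UINT32_t` and `uint32_t` spellings in the patches above is a plain 32-bit XorShift generator. A minimal pure-Python sketch of the same update rule, assuming `RAND_R_MAX` is `2**31 - 1` as the signed-32-bit comment in `_random.pxd` implies (the function and constant names below are illustrative, not part of the patches):

```python
# Pure-Python sketch of the XorShift update in sklearn/utils/_random.pxd.
# The shipped helper is the inline Cython our_rand_r; this is for exposition.
RAND_R_MAX = 2**31 - 1   # assumed: matches the "signed 32-bit" comment above
DEFAULT_SEED = 1
MASK32 = 0xFFFFFFFF      # emulate uint32_t wrap-around with Python ints


def xorshift_rand_r(state: int) -> tuple[int, int]:
    """Return (value in [0, RAND_R_MAX], new state) from one XorShift step."""
    if state == 0:                        # an all-zero state would stay at zero
        state = DEFAULT_SEED
    state = (state ^ (state << 13)) & MASK32
    state ^= state >> 17
    state = (state ^ (state << 5)) & MASK32
    return state % (RAND_R_MAX + 1), state


# One draw, threading the state explicitly the way the Cython code does
# through a uint32_t pointer:
value, rng_state = xorshift_rand_r(42)
```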
From 4ffa0936153a54b11ec0c3a488e2f2b331b2e2f7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 21 Feb 2024 22:18:10 -0500 Subject: [PATCH 11/54] Make sure build_tree returns self Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 84a41aff1174c..6511c8192889e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -413,7 +413,7 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) # build the actual tree now with the parameters - self._build_tree( + self = self._build_tree( X=X, y=y, sample_weight=sample_weight, @@ -573,6 +573,7 @@ def _build_tree( self.classes_ = self.classes_[0] self._prune_tree() + return self def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" From d48716a6b2cc6373b9e66bf959f2b43b89f10c5d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 22 Feb 2024 12:53:22 -0500 Subject: [PATCH 12/54] Allow max samples to be higher Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 23 ++++++++++++++++++----- sklearn/ensemble/tests/test_forest.py | 5 ++++- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 50e9bef4f55f1..3827359b9162e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -104,14 +104,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -124,12 +128,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. 
It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index a51d240c87d4e..7914823d48ccf 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1660,7 +1660,10 @@ def test_max_samples_bootstrap(name): def test_large_max_samples_exception(name): # Check invalid `max_samples` est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) - match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + # TODO: remove the following line when the issue is fixed + # https://github.com/scikit-learn/scikit-learn/issues/28507 + # match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + match = "The expected number of unique samples" with pytest.raises(ValueError, match=match): est.fit(X, y) From 33039e22c600cbd0929d0b22995c08535b1fede4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Feb 2024 10:33:59 -0500 Subject: [PATCH 13/54] Factor out construct trees API Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 91 +++++++++++++++++++++++-------------- 1 file changed, 57 insertions(+), 34 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 3827359b9162e..b5ee64b6e708c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -595,42 +595,18 @@ def fit(self, X, y, sample_weight=None, classes=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -664,6 +640,53 @@ def fit(self, X, y, sample_weight=None, classes=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. 
+ trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. From 94fc4327d1fe8526a40465f5cf5b28ce68f468e9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 2 Mar 2024 11:42:49 -0500 Subject: [PATCH 14/54] Allow extra args in cinit Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2c34139484012..eda0368eed222 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1779,7 +1779,7 @@ cdef class Tree(BaseTree): # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, *args): """Constructor.""" cdef intp_t dummy = 0 size_t_dtype = np.array(dummy).dtype From 5ccd00fc9367f501d6ddebfe94c84c0aa90f7bc4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 9 Mar 2024 20:49:00 -0500 Subject: [PATCH 15/54] Migrate n_constant_features within SplitRecord Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 +- sklearn/tree/_splitter.pyx | 24 +++++------------------- sklearn/tree/_tree.pyx | 21 ++++++++++----------- 3 files changed, 16 insertions(+), 31 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index f1434f5d05cc9..601e6ac8f3202 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -33,6 +33,7 @@ cdef struct SplitRecord: float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. intp_t n_missing # Number of missing values for the feature being split on + intp_t n_constant_features # Number of constant features in the split cdef class BaseSplitter: """Abstract interface for splitter.""" @@ -90,7 +91,6 @@ cdef class BaseSplitter: self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d940368804a94..ac84ea60efef3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -52,6 +52,7 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 + self.n_constant_features = 0 cdef class BaseSplitter: """This is an abstract interface for splitters. @@ -100,7 +101,6 @@ cdef class BaseSplitter: self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -118,9 +118,6 @@ cdef class BaseSplitter: split : SplitRecord pointer A pointer to a memory-allocated SplitRecord object which will be filled with the split chosen. 
- n_constant_features : intp_t pointer - A pointer to a memory-allocated intp_t object which will be filled with the - number of constant features. Optional to use. lower_bound : float64_t The lower bound of the monotonic constraint if used. upper_bound : float64_t @@ -322,7 +319,6 @@ cdef class Splitter(BaseSplitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -444,7 +440,6 @@ cdef inline intp_t node_split_best( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -490,7 +485,7 @@ cdef inline intp_t node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -711,7 +706,7 @@ cdef inline intp_t node_split_best( # Return values split[0] = best_split - n_constant_features[0] = n_total_constants + split.n_constant_features = n_total_constants return 0 @@ -834,7 +829,6 @@ cdef inline int node_split_random( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -866,7 +860,7 @@ cdef inline int node_split_random( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -1021,7 +1015,7 @@ cdef inline int node_split_random( # Return values split[0] = best_split - n_constant_features[0] = n_total_constants + split.n_constant_features = n_total_constants return 0 @@ -1679,7 +1673,6 @@ cdef class BestSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1689,7 +1682,6 @@ cdef class BestSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1715,7 +1707,6 @@ cdef class BestSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1725,7 +1716,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1751,7 +1741,6 @@ cdef class RandomSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1761,7 +1750,6 @@ cdef class RandomSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1786,7 +1774,6 @@ cdef class RandomSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1796,7 +1783,6 @@ cdef 
class RandomSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index eda0368eed222..4ecd644fbe27e 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -153,6 +153,7 @@ cdef class TreeBuilder: return X, y, sample_weight + # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing cdef struct StackRecord: @@ -166,6 +167,7 @@ cdef struct StackRecord: float64_t lower_bound float64_t upper_bound + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" @@ -328,7 +330,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t lower_bound cdef float64_t upper_bound cdef float64_t middle_value - cdef intp_t n_constant_features cdef bint is_leaf cdef intp_t max_depth_seen = -1 if first else tree.max_depth @@ -379,7 +380,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split_ptr.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -398,7 +399,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) @@ -470,7 +470,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -483,7 +483,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -504,7 +504,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split_ptr.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -527,7 +527,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) @@ -598,7 +597,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -611,7 +610,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -901,11 +900,12 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t node_id cdef intp_t n_node_samples - cdef intp_t n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf + # there are no constant features in best first 
splits + split_ptr.n_constant_features = 0 splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: @@ -923,7 +923,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, split_ptr, - &n_constant_features, lower_bound, upper_bound ) From b61ae3d546ba4199dc3badf4bd89971d2d75e9df Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 9 Mar 2024 22:47:53 -0500 Subject: [PATCH 16/54] Export shift_missing_values_to_left_if_required Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 601e6ac8f3202..041e9965a904b 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -140,3 +140,9 @@ cdef class Splitter(BaseSplitter): float64_t lower_bound, float64_t upper_bound ) noexcept nogil + +cdef void shift_missing_values_to_left_if_required( + SplitRecord* best, + intp_t[::1] samples, + intp_t end, +) noexcept nogil From 02e7765a44e013b513732aef049e1f68e69db894 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Mar 2024 18:01:51 -0400 Subject: [PATCH 17/54] demo pr Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 4 ++-- sklearn/tree/_splitter.pyx | 20 +++++--------------- sklearn/tree/_tree.pyx | 11 ++++------- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index adc14011cb7a2..97fae3aea9e0a 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -28,7 +28,8 @@ cdef struct SplitRecord: float64_t lower_bound # Lower bound on value of both children for monotonicity float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. - intp_t n_missing # Number of missing values for the feature being split on + intp_t n_missing # Number of missing values for the feature being split on + intp_t n_constant_features # Number of constant features in the split from parent cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -102,7 +103,6 @@ cdef class Splitter: self, float64_t impurity, # Impurity of the node SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 6ef392685e594..52ecbbc3dfa6b 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,6 +49,7 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 + self.n_constant_features = 0 cdef class Splitter: """Abstract splitter class. 
@@ -233,7 +234,6 @@ cdef class Splitter: self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound, ) except -1 nogil: @@ -303,7 +303,6 @@ cdef inline int node_split_best( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -349,7 +348,7 @@ cdef inline int node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -559,8 +558,8 @@ cdef inline int node_split_best( sizeof(intp_t) * n_found_constants) # Return values + best_split.n_constant_features = n_total_constants split[0] = best_split - n_constant_features[0] = n_total_constants return 0 @@ -683,7 +682,6 @@ cdef inline int node_split_random( Criterion criterion, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, float64_t lower_bound, @@ -717,7 +715,7 @@ cdef inline int node_split_random( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = n_constant_features[0] + cdef intp_t n_known_constants = split.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -861,8 +859,8 @@ cdef inline int node_split_random( sizeof(intp_t) * n_found_constants) # Return values + best_split.n_constant_features = n_total_constants split[0] = best_split - n_constant_features[0] = n_total_constants return 0 @@ -1520,7 +1518,6 @@ cdef class BestSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1530,7 +1527,6 @@ cdef class BestSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1556,7 +1552,6 @@ cdef class BestSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1566,7 +1561,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1592,7 +1586,6 @@ cdef class RandomSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1602,7 +1595,6 @@ cdef class RandomSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, @@ -1627,7 +1619,6 @@ cdef class RandomSparseSplitter(Splitter): self, float64_t impurity, SplitRecord* split, - intp_t* n_constant_features, float64_t lower_bound, float64_t upper_bound ) except -1 nogil: @@ -1637,7 +1628,6 @@ cdef class RandomSparseSplitter(Splitter): self.criterion, impurity, split, - n_constant_features, self.with_monotonic_cst, self.monotonic_cst, lower_bound, diff --git a/sklearn/tree/_tree.pyx 
b/sklearn/tree/_tree.pyx index ea873764069f6..60849fba6561f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -215,7 +215,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t left_child_max cdef float64_t right_child_min cdef float64_t right_child_max - cdef intp_t n_constant_features cdef bint is_leaf cdef bint first = 1 cdef intp_t max_depth_seen = -1 @@ -248,7 +247,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): parent = stack_record.parent is_left = stack_record.is_left impurity = stack_record.impurity - n_constant_features = stack_record.n_constant_features + split.n_constant_features = stack_record.n_constant_features lower_bound = stack_record.lower_bound upper_bound = stack_record.upper_bound @@ -271,7 +270,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, &split, - &n_constant_features, lower_bound, upper_bound ) @@ -338,7 +336,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -351,7 +349,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": n_constant_features, + "n_constant_features": split.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -606,7 +604,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef intp_t node_id cdef intp_t n_node_samples - cdef intp_t n_constant_features = 0 + split.n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf @@ -628,7 +626,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_split( impurity, &split, - &n_constant_features, lower_bound, upper_bound ) From 3e2cfc701624b201b8b805384e380f10ad6746a2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 11 Mar 2024 22:36:19 -0400 Subject: [PATCH 18/54] Demo Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 6 +-- sklearn/tree/_splitter.pyx | 61 +++++++++++---------------- sklearn/tree/_tree.pxd | 9 ++++ sklearn/tree/_tree.pyx | 85 ++++++++++++++++++++------------------ 4 files changed, 78 insertions(+), 83 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 97fae3aea9e0a..554422fc595d3 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -11,6 +11,7 @@ cimport numpy as cnp from ._criterion cimport Criterion +from ._tree cimport ParentInfo from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t @@ -29,7 +30,6 @@ cdef struct SplitRecord: float64_t upper_bound # Upper bound on value of both children for monotonicity unsigned char missing_go_to_left # Controls if missing values go to the left node. 
intp_t n_missing # Number of missing values for the feature being split on - intp_t n_constant_features # Number of constant features in the split from parent cdef class Splitter: # The splitter searches in the input space for a feature and a threshold @@ -101,10 +101,8 @@ cdef class Splitter: cdef int node_split( self, - float64_t impurity, # Impurity of the node + ParentInfo* parent, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil cdef void node_value(self, float64_t* dest) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 52ecbbc3dfa6b..a861b73642be6 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,7 +49,6 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 - self.n_constant_features = 0 cdef class Splitter: """Abstract splitter class. @@ -232,10 +231,8 @@ cdef class Splitter: cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end]. @@ -301,12 +298,10 @@ cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, - float64_t impurity, SplitRecord* split, + ParentInfo* parent_record, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -338,6 +333,10 @@ cdef inline int node_split_best( cdef float64_t current_proxy_improvement = -INFINITY cdef float64_t best_proxy_improvement = -INFINITY + cdef float64_t impurity = parent_record.impurity + cdef float64_t lower_bound = parent_record.lower_bound + cdef float64_t upper_bound = parent_record.upper_bound + cdef intp_t f_i = n_features cdef intp_t f_j cdef intp_t p @@ -348,7 +347,7 @@ cdef inline int node_split_best( cdef intp_t n_found_constants = 0 # Number of features known to be constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = split.n_constant_features + cdef intp_t n_known_constants = parent_record.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants @@ -558,7 +557,7 @@ cdef inline int node_split_best( sizeof(intp_t) * n_found_constants) # Return values - best_split.n_constant_features = n_total_constants + parent_record.n_constant_features = n_total_constants split[0] = best_split return 0 @@ -680,12 +679,10 @@ cdef inline int node_split_random( Splitter splitter, Partitioner partitioner, Criterion criterion, - float64_t impurity, SplitRecord* split, + ParentInfo* parent_record, bint with_monotonic_cst, const cnp.int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ) except -1 nogil: """Find the best random split on node samples[start:end] @@ -709,13 +706,17 @@ cdef inline int node_split_random( cdef float64_t current_proxy_improvement = - INFINITY cdef float64_t best_proxy_improvement = - INFINITY + cdef float64_t impurity = parent_record.impurity + cdef float64_t lower_bound = parent_record.lower_bound + cdef float64_t upper_bound = parent_record.upper_bound + cdef intp_t f_i = n_features cdef intp_t f_j # Number of features discovered to be constant during the split search cdef intp_t n_found_constants = 0 # Number of features known to be 
constant and drawn without replacement cdef intp_t n_drawn_constants = 0 - cdef intp_t n_known_constants = split.n_constant_features + cdef intp_t n_known_constants = parent_record.n_constant_features # n_total_constants = n_known_constants + n_found_constants cdef intp_t n_total_constants = n_known_constants cdef intp_t n_visited_features = 0 @@ -859,7 +860,7 @@ cdef inline int node_split_random( sizeof(intp_t) * n_found_constants) # Return values - best_split.n_constant_features = n_total_constants + parent_record.n_constant_features = n_total_constants split[0] = best_split return 0 @@ -1516,21 +1517,17 @@ cdef class BestSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_best( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class BestSparseSplitter(Splitter): @@ -1550,21 +1547,17 @@ cdef class BestSparseSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_best( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class RandomSplitter(Splitter): @@ -1584,21 +1577,17 @@ cdef class RandomSplitter(Splitter): cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_random( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) cdef class RandomSparseSplitter(Splitter): @@ -1617,19 +1606,15 @@ cdef class RandomSparseSplitter(Splitter): ) cdef int node_split( self, - float64_t impurity, + ParentInfo* parent_record, SplitRecord* split, - float64_t lower_bound, - float64_t upper_bound ) except -1 nogil: return node_split_random( self, self.partitioner, self.criterion, - impurity, split, + parent_record, self.with_monotonic_cst, self.monotonic_cst, - lower_bound, - upper_bound ) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index e4081921f40f9..1bca2d57cb489 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -31,6 +31,15 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values +cdef struct ParentInfo: + # Structure to store information about the parent of a node + # This is passed to the splitter, to provide information about the previous split + + intp_t n_constant_features # the number of constant features found in parent + float64_t lower_bound # the lower bound of the parent's impurity + float64_t upper_bound # the upper bound of the parent's impurity + float64_t impurity # the impurity of the parent + cdef class Tree: # The Tree object is a binary tree structure constructed by the # TreeBuilder. 
The tree structure is used for predictions and diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 60849fba6561f..92e2e1daedd29 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -80,6 +80,12 @@ cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED cdef Node dummy NODE_DTYPE = np.asarray((&dummy)).dtype +cdef inline void _init_parent_record(ParentInfo* self) noexcept nogil: + self.n_constant_features = 0 + self.impurity = INFINITY + self.lower_bound = -INFINITY + self.upper_bound = INFINITY + # ============================================================================= # TreeBuilder # ============================================================================= @@ -207,7 +213,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord split cdef intp_t node_id - cdef float64_t impurity = INFINITY cdef float64_t lower_bound cdef float64_t upper_bound cdef float64_t middle_value @@ -223,6 +228,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef stack[StackRecord] builder_stack cdef StackRecord stack_record + cdef ParentInfo parent_record + with nogil: # push root node onto stack builder_stack.push({ @@ -246,10 +253,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): depth = stack_record.depth parent = stack_record.parent is_left = stack_record.is_left - impurity = stack_record.impurity - split.n_constant_features = stack_record.n_constant_features - lower_bound = stack_record.lower_bound - upper_bound = stack_record.upper_bound + parent_record.impurity = stack_record.impurity + parent_record.n_constant_features = stack_record.n_constant_features + parent_record.lower_bound = stack_record.lower_bound + parent_record.upper_bound = stack_record.upper_bound n_node_samples = end - start splitter.node_reset(start, end, &weighted_n_node_samples) @@ -260,18 +267,16 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): weighted_n_node_samples < 2 * min_weight_leaf) if first: - impurity = splitter.node_impurity() + parent_record.impurity = splitter.node_impurity() first = 0 # impurity == 0 with tolerance due to rounding errors - is_leaf = is_leaf or impurity <= EPSILON + is_leaf = is_leaf or parent_record.impurity <= EPSILON if not is_leaf: splitter.node_split( - impurity, + &parent_record, &split, - lower_bound, - upper_bound ) # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are @@ -281,8 +286,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): min_impurity_decrease)) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, + split.threshold, parent_record.impurity, + n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -293,7 +298,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) if not is_leaf: if ( @@ -336,7 +341,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 0, "impurity": split.impurity_right, - "n_constant_features": split.n_constant_features, + "n_constant_features": parent_record.n_constant_features, "lower_bound": right_child_min, "upper_bound": right_child_max, }) @@ -349,7 +354,7 @@ cdef class 
DepthFirstTreeBuilder(TreeBuilder): "parent": node_id, "is_left": 1, "impurity": split.impurity_left, - "n_constant_features": split.n_constant_features, + "n_constant_features": parent_record.n_constant_features, "lower_bound": left_child_min, "upper_bound": left_child_max, }) @@ -456,6 +461,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef int rc = 0 cdef Node* node + cdef ParentInfo parent_record + parent_record.n_constant_features = 0 + # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes tree._resize(init_capacity) @@ -467,13 +475,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): tree=tree, start=0, end=n_node_samples, - impurity=INFINITY, is_first=IS_FIRST, is_left=IS_LEFT, parent=NULL, depth=0, - lower_bound=-INFINITY, - upper_bound=INFINITY, + parent_record=&parent_record, res=&split_node_left, ) if rc >= 0: @@ -531,18 +537,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): max_split_nodes -= 1 # Compute left split node + parent_record.lower_bound = left_child_min + parent_record.upper_bound = left_child_max + parent_record.impurity = record.impurity_left rc = self._add_split_node( splitter=splitter, tree=tree, start=record.start, end=record.pos, - impurity=record.impurity_left, is_first=IS_NOT_FIRST, is_left=IS_LEFT, parent=node, depth=record.depth + 1, - lower_bound=left_child_min, - upper_bound=left_child_max, + parent_record=&parent_record, res=&split_node_left, ) if rc == -1: @@ -552,18 +559,19 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node = &tree.nodes[record.node_id] # Compute right split node + parent_record.lower_bound = right_child_min + parent_record.upper_bound = right_child_max + parent_record.impurity = record.impurity_right rc = self._add_split_node( splitter=splitter, tree=tree, start=record.pos, end=record.end, - impurity=record.impurity_right, is_first=IS_NOT_FIRST, is_left=IS_NOT_LEFT, parent=node, depth=record.depth + 1, - lower_bound=right_child_min, - upper_bound=right_child_max, + parent_record=&parent_record, res=&split_node_right, ) if rc == -1: @@ -591,20 +599,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): Tree tree, intp_t start, intp_t end, - float64_t impurity, bint is_first, bint is_left, Node* parent, intp_t depth, - float64_t lower_bound, - float64_t upper_bound, + ParentInfo* parent_record, FrontierRecord* res ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split cdef intp_t node_id cdef intp_t n_node_samples - split.n_constant_features = 0 cdef float64_t min_impurity_decrease = self.min_impurity_decrease cdef float64_t weighted_n_node_samples cdef bint is_leaf @@ -612,22 +617,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) if is_first: - impurity = splitter.node_impurity() + parent_record.impurity = splitter.node_impurity() n_node_samples = end - start is_leaf = (depth >= self.max_depth or n_node_samples < self.min_samples_split or n_node_samples < 2 * self.min_samples_leaf or weighted_n_node_samples < 2 * self.min_weight_leaf or - impurity <= EPSILON # impurity == 0 with tolerance + parent_record.impurity <= EPSILON # impurity == 0 with tolerance ) if not is_leaf: splitter.node_split( - impurity, + parent_record, &split, - lower_bound, - upper_bound ) # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 @@ -638,8 +641,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, + split.feature, split.threshold, parent_record.impurity, + n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -647,15 +650,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # compute values also for split nodes (might become leafs later). splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, lower_bound, upper_bound) + splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) res.node_id = node_id res.start = start res.end = end res.depth = depth - res.impurity = impurity - res.lower_bound = lower_bound - res.upper_bound = upper_bound + res.impurity = parent_record.impurity + res.lower_bound = parent_record.lower_bound + res.upper_bound = parent_record.upper_bound res.middle_value = splitter.criterion.middle_value() if not is_leaf: @@ -671,8 +674,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.pos = end res.is_leaf = 1 res.improvement = 0.0 - res.impurity_left = impurity - res.impurity_right = impurity + res.impurity_left = parent_record.impurity + res.impurity_right = parent_record.impurity return 0 From 9acdf1b830f5dc61d4348804311f87308cf8ca1b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:09:42 -0400 Subject: [PATCH 19/54] Benchmarks Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 198 +++++++++++++++++++++++++++++++ sklearn/tree/_tree.pxd | 4 +- sklearn/tree/_tree.pyx | 12 +- 3 files changed, 206 insertions(+), 8 deletions(-) create mode 100644 benchmarks/bench_randomforest.py diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py new file mode 100644 index 0000000000000..68b3399924255 --- /dev/null +++ b/benchmarks/bench_randomforest.py @@ -0,0 +1,198 @@ +"""Instructions +1. Build this PR and run: + +```bash +python bench_randomforest.py bench ~/bench_results_forest pr +``` + +2. On main run: + +```bash +python bench_randomforest.py bench ~/bench_results_forest main +``` + +3. 
Plotting + +```bash +python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png +``` +""" + +from functools import partial +import argparse +from time import perf_counter +from statistics import mean, stdev +from itertools import product +import csv +from pathlib import Path + +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.datasets import make_classification, make_regression, make_low_rank_matrix +import numpy as np + +N_REPEATS = 10 +n_jobs = -3 + +benchmark_config = [ + ( + RandomForestRegressor, + list( + product( + ["squared_error"], + [ + make_regression, + ], + [10_000], + ["dense"], + ["best"], + ) + ), + ), + ( + RandomForestClassifier, + list( + product( + ["gini", "entropy"], + [ + partial(make_classification, n_informative=10, n_classes=5), + ], + [10_000], + ["dense"], + ["best"], + ) + ), + ), +] + +def bench(args): + bench_results, branch = args.bench_results, args.branch + results_dir = Path(bench_results) + results_dir.mkdir(exist_ok=True) + + results_path = results_dir / f"{branch}.csv" + + with results_path.open("w") as f: + writer = csv.DictWriter( + f, + fieldnames=[ + "criterion", + "n_samples", + "make_data", + "container", + "splitter", + "n_repeat", + "duration", + ], + ) + writer.writeheader() + + for Klass, items in benchmark_config: + + for config in items: + ( + criterion, + make_data, + n_samples, + container, + splitter, + ) = config + if isinstance(make_data, partial): + make_data_str = make_data.func.__name__ + else: + make_data_str = make_data.__name__ + + default_config = { + "criterion": criterion, + "n_samples": n_samples, + "make_data": make_data_str, + "container": container, + "splitter": splitter, + } + combine_config = " ".join(f"{k}={v}" for k, v in default_config.items()) + + klass_results = [] + for n_repeat in range(N_REPEATS): + print(f"Running {combine_config} with {n_repeat + 1}/{N_REPEATS}") + X, y = make_data( + n_samples=n_samples, + n_features=20, + random_state=n_repeat, + ) + forest = Klass(random_state=n_repeat, criterion=criterion, n_jobs=n_jobs) + + start = perf_counter() + forest.fit(X, y) + duration = perf_counter() - start + klass_results.append(duration) + writer.writerow( + { + **default_config, + **{ + "n_repeat": n_repeat, + "duration": duration, + }, + } + ) + results_mean, results_stdev = mean(klass_results), stdev(klass_results) + print( + f"{combine_config} with {results_mean:.3f} +/- {results_stdev:.3f}" + ) + +def plot(args): + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + results_path = Path(args.bench_results) + pr_path = results_path / f"{args.pr_name}.csv" + main_path = results_path / f"{args.main_name}.csv" + image_path = results_path / args.image_path + + df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) + df_main = pd.read_csv(main_path).assign(branch=args.main_name) + df_all = pd.concat((df_pr, df_main), ignore_index=True) + + df_all = df_all.assign( + make_data=df_all["make_data"] + .str.replace("_custom", "") + .str.replace("make_", "") + .str.replace("_data", "") + ) + + gb = df_all.groupby(["criterion", "make_data"]) + groups = gb.groups + + n_rows, n_cols = 2, 4 + fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) + axes_flat = axes.ravel() + for i, (keys, idx) in enumerate(groups.items()): + ax = axes_flat[i] + ax.set_title(" | ".join(keys)) + sns.boxplot(data=df_all.loc[idx], y="duration", x="branch", ax=ax) + if i % n_cols != 0: + ax.set_ylabel("") + + 
axes_flat[-1].set_visible(False) + + fig.savefig(image_path) + print(f"Saved image to {image_path}") + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + subparsers = parser.add_subparsers() + bench_parser = subparsers.add_parser("bench") + bench_parser.add_argument("bench_results") + bench_parser.add_argument("branch") + bench_parser.set_defaults(func=bench) + + plot_parser = subparsers.add_parser("plot") + plot_parser.add_argument("bench_results") + plot_parser.add_argument("pr_name") + plot_parser.add_argument("main_name") + plot_parser.add_argument("image_path") + plot_parser.set_defaults(func=plot) + + args = parser.parse_args() + args.func(args) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1bca2d57cb489..870f7fe875b0c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -35,10 +35,10 @@ cdef struct ParentInfo: # Structure to store information about the parent of a node # This is passed to the splitter, to provide information about the previous split - intp_t n_constant_features # the number of constant features found in parent float64_t lower_bound # the lower bound of the parent's impurity float64_t upper_bound # the upper bound of the parent's impurity float64_t impurity # the impurity of the parent + intp_t n_constant_features # the number of constant features found in parent cdef class Tree: # The Tree object is a binary tree structure constructed by the @@ -57,7 +57,7 @@ cdef class Tree: cdef public intp_t node_count # Counter for node IDs cdef public intp_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef float64_t* value # (capacity, n_outputs, max_n_classes) array of values + cdef float64_t* value # (capacity, n_outputs, max_n_classes) array of values cdef intp_t value_stride # = n_outputs * max_n_classes # Methods diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 92e2e1daedd29..224da2c14e5ec 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -310,12 +310,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # Current bounds must always be propagated to both children. # If a monotonic constraint is active, bounds are used in # node value clipping. - left_child_min = right_child_min = lower_bound - left_child_max = right_child_max = upper_bound + left_child_min = right_child_min = parent_record.lower_bound + left_child_max = right_child_max = parent_record.upper_bound elif splitter.monotonic_cst[split.feature] == 1: # Split on a feature with monotonic increase constraint - left_child_min = lower_bound - right_child_max = upper_bound + left_child_min = parent_record.lower_bound + right_child_max = parent_record.upper_bound # Lower bound for right child and upper bound for left child # are set to the same value. @@ -324,8 +324,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): left_child_max = middle_value else: # i.e. splitter.monotonic_cst[split.feature] == -1 # Split on a feature with monotonic decrease constraint - right_child_min = lower_bound - left_child_max = upper_bound + right_child_min = parent_record.lower_bound + left_child_max = parent_record.upper_bound # Lower bound for left child and upper bound for right child # are set to the same value. 
From 9fc7847d168fe20bf5c8156cd1a9c7a4d7d80d42 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:21:49 -0400 Subject: [PATCH 20/54] Merge main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 2719b0b01aea7..5872683f416d5 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -297,13 +297,7 @@ cdef inline int node_split_best( SplitRecord* split, ParentInfo* parent_record, bint with_monotonic_cst, -<<<<<<< HEAD - const cnp.int8_t[:] monotonic_cst, -======= const int8_t[:] monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, ->>>>>>> main ) except -1 nogil: """Find the best split on node samples[start:end] From 13a3f89aa21541ed93d75ab4241f76a90bb2041e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:27:54 -0400 Subject: [PATCH 21/54] Bench size Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 62 +++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py index 68b3399924255..3dc245d65f012 100644 --- a/benchmarks/bench_randomforest.py +++ b/benchmarks/bench_randomforest.py @@ -15,9 +15,15 @@ ```bash python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png + +# or plot size +python bench_randomforest.py plot_size ~/bench_results_forest pr main results_image.png ``` """ - +import os +import tempfile +import sys +import pickle from functools import partial import argparse from time import perf_counter @@ -82,6 +88,8 @@ def bench(args): "splitter", "n_repeat", "duration", + "ram_size", + "file_size", ], ) writer.writeheader() @@ -124,12 +132,21 @@ def bench(args): forest.fit(X, y) duration = perf_counter() - start klass_results.append(duration) + + # benchmark size of object + ram_size = sys.getsizeof(forest) + with tempfile.TemporaryFile() as f: + pickle.dump(forest, f, -1) + file_size = os.path.getsize(f.name) + writer.writerow( { **default_config, **{ "n_repeat": n_repeat, "duration": duration, + "ram_size": ram_size, + "file_size": file_size, }, } ) @@ -177,6 +194,49 @@ def plot(args): fig.savefig(image_path) print(f"Saved image to {image_path}") + +def plot_size(args): + size_id = 'file_size' + + import matplotlib.pyplot as plt + import pandas as pd + import seaborn as sns + + results_path = Path(args.bench_results) + pr_path = results_path / f"{args.pr_name}.csv" + main_path = results_path / f"{args.main_name}.csv" + image_path = results_path / args.image_path + + df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) + df_main = pd.read_csv(main_path).assign(branch=args.main_name) + df_all = pd.concat((df_pr, df_main), ignore_index=True) + + df_all = df_all.assign( + make_data=df_all["make_data"] + .str.replace("_custom", "") + .str.replace("make_", "") + .str.replace("_data", "") + ) + + gb = df_all.groupby(["criterion", "make_data"]) + groups = gb.groups + + n_rows, n_cols = 2, 4 + fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) + axes_flat = axes.ravel() + for i, (keys, idx) in enumerate(groups.items()): + ax = axes_flat[i] + ax.set_title(" | ".join(keys)) + sns.boxplot(data=df_all.loc[idx], y=size_id, x="branch", ax=ax) + if i % n_cols != 0: + ax.set_ylabel("") + + axes_flat[-1].set_visible(False) + + fig.savefig(image_path) + print(f"Saved image to {image_path}") + + if __name__ == "__main__": parser = argparse.ArgumentParser() 
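Note on PATCH 21 above: it extends the benchmark to record model size as well as fit time, via a shallow `sys.getsizeof` measurement and the size of the pickled forest. A standalone sketch of the same measurement follows; the dataset and estimator settings are arbitrary, and `f.tell()` is used here instead of `os.path.getsize` so the snippet does not rely on the temporary file exposing a path.

```python
import pickle
import sys
import tempfile

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=1_000, n_features=20, random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)

# Shallow size of the estimator object itself; referenced numpy arrays and
# the underlying Cython tree buffers are not included in this number.
ram_size = sys.getsizeof(forest)

# Size of the pickled estimator, read back from the write position.
with tempfile.TemporaryFile() as f:
    pickle.dump(forest, f, protocol=pickle.HIGHEST_PROTOCOL)
    file_size = f.tell()

print(f"getsizeof: {ram_size} bytes, pickled: {file_size} bytes")
```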
From e8214dfb0293c06d796231581b6c8a70a5768ed8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 10:56:18 -0400 Subject: [PATCH 22/54] Adding parentrecord Signed-off-by: Adam Li --- benchmarks/bench_randomforest.py | 258 ------------------------------- 1 file changed, 258 deletions(-) delete mode 100644 benchmarks/bench_randomforest.py diff --git a/benchmarks/bench_randomforest.py b/benchmarks/bench_randomforest.py deleted file mode 100644 index 3dc245d65f012..0000000000000 --- a/benchmarks/bench_randomforest.py +++ /dev/null @@ -1,258 +0,0 @@ -"""Instructions -1. Build this PR and run: - -```bash -python bench_randomforest.py bench ~/bench_results_forest pr -``` - -2. On main run: - -```bash -python bench_randomforest.py bench ~/bench_results_forest main -``` - -3. Plotting - -```bash -python bench_randomforest.py plot ~/bench_results_forest pr main results_image.png - -# or plot size -python bench_randomforest.py plot_size ~/bench_results_forest pr main results_image.png -``` -""" -import os -import tempfile -import sys -import pickle -from functools import partial -import argparse -from time import perf_counter -from statistics import mean, stdev -from itertools import product -import csv -from pathlib import Path - -from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.datasets import make_classification, make_regression, make_low_rank_matrix -import numpy as np - -N_REPEATS = 10 -n_jobs = -3 - -benchmark_config = [ - ( - RandomForestRegressor, - list( - product( - ["squared_error"], - [ - make_regression, - ], - [10_000], - ["dense"], - ["best"], - ) - ), - ), - ( - RandomForestClassifier, - list( - product( - ["gini", "entropy"], - [ - partial(make_classification, n_informative=10, n_classes=5), - ], - [10_000], - ["dense"], - ["best"], - ) - ), - ), -] - -def bench(args): - bench_results, branch = args.bench_results, args.branch - results_dir = Path(bench_results) - results_dir.mkdir(exist_ok=True) - - results_path = results_dir / f"{branch}.csv" - - with results_path.open("w") as f: - writer = csv.DictWriter( - f, - fieldnames=[ - "criterion", - "n_samples", - "make_data", - "container", - "splitter", - "n_repeat", - "duration", - "ram_size", - "file_size", - ], - ) - writer.writeheader() - - for Klass, items in benchmark_config: - - for config in items: - ( - criterion, - make_data, - n_samples, - container, - splitter, - ) = config - if isinstance(make_data, partial): - make_data_str = make_data.func.__name__ - else: - make_data_str = make_data.__name__ - - default_config = { - "criterion": criterion, - "n_samples": n_samples, - "make_data": make_data_str, - "container": container, - "splitter": splitter, - } - combine_config = " ".join(f"{k}={v}" for k, v in default_config.items()) - - klass_results = [] - for n_repeat in range(N_REPEATS): - print(f"Running {combine_config} with {n_repeat + 1}/{N_REPEATS}") - X, y = make_data( - n_samples=n_samples, - n_features=20, - random_state=n_repeat, - ) - forest = Klass(random_state=n_repeat, criterion=criterion, n_jobs=n_jobs) - - start = perf_counter() - forest.fit(X, y) - duration = perf_counter() - start - klass_results.append(duration) - - # benchmark size of object - ram_size = sys.getsizeof(forest) - with tempfile.TemporaryFile() as f: - pickle.dump(forest, f, -1) - file_size = os.path.getsize(f.name) - - writer.writerow( - { - **default_config, - **{ - "n_repeat": n_repeat, - "duration": duration, - "ram_size": ram_size, - "file_size": file_size, - }, - } - ) - results_mean, 
results_stdev = mean(klass_results), stdev(klass_results) - print( - f"{combine_config} with {results_mean:.3f} +/- {results_stdev:.3f}" - ) - -def plot(args): - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - results_path = Path(args.bench_results) - pr_path = results_path / f"{args.pr_name}.csv" - main_path = results_path / f"{args.main_name}.csv" - image_path = results_path / args.image_path - - df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) - df_main = pd.read_csv(main_path).assign(branch=args.main_name) - df_all = pd.concat((df_pr, df_main), ignore_index=True) - - df_all = df_all.assign( - make_data=df_all["make_data"] - .str.replace("_custom", "") - .str.replace("make_", "") - .str.replace("_data", "") - ) - - gb = df_all.groupby(["criterion", "make_data"]) - groups = gb.groups - - n_rows, n_cols = 2, 4 - fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) - axes_flat = axes.ravel() - for i, (keys, idx) in enumerate(groups.items()): - ax = axes_flat[i] - ax.set_title(" | ".join(keys)) - sns.boxplot(data=df_all.loc[idx], y="duration", x="branch", ax=ax) - if i % n_cols != 0: - ax.set_ylabel("") - - axes_flat[-1].set_visible(False) - - fig.savefig(image_path) - print(f"Saved image to {image_path}") - - -def plot_size(args): - size_id = 'file_size' - - import matplotlib.pyplot as plt - import pandas as pd - import seaborn as sns - - results_path = Path(args.bench_results) - pr_path = results_path / f"{args.pr_name}.csv" - main_path = results_path / f"{args.main_name}.csv" - image_path = results_path / args.image_path - - df_pr = pd.read_csv(pr_path).assign(branch=args.pr_name) - df_main = pd.read_csv(main_path).assign(branch=args.main_name) - df_all = pd.concat((df_pr, df_main), ignore_index=True) - - df_all = df_all.assign( - make_data=df_all["make_data"] - .str.replace("_custom", "") - .str.replace("make_", "") - .str.replace("_data", "") - ) - - gb = df_all.groupby(["criterion", "make_data"]) - groups = gb.groups - - n_rows, n_cols = 2, 4 - fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 8), constrained_layout=True) - axes_flat = axes.ravel() - for i, (keys, idx) in enumerate(groups.items()): - ax = axes_flat[i] - ax.set_title(" | ".join(keys)) - sns.boxplot(data=df_all.loc[idx], y=size_id, x="branch", ax=ax) - if i % n_cols != 0: - ax.set_ylabel("") - - axes_flat[-1].set_visible(False) - - fig.savefig(image_path) - print(f"Saved image to {image_path}") - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - - subparsers = parser.add_subparsers() - bench_parser = subparsers.add_parser("bench") - bench_parser.add_argument("bench_results") - bench_parser.add_argument("branch") - bench_parser.set_defaults(func=bench) - - plot_parser = subparsers.add_parser("plot") - plot_parser.add_argument("bench_results") - plot_parser.add_argument("pr_name") - plot_parser.add_argument("main_name") - plot_parser.add_argument("image_path") - plot_parser.set_defaults(func=plot) - - args = parser.parse_args() - args.func(args) From 1d3299bc5d301087c426e683965e77c3457bb00b Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 21 Mar 2024 11:03:35 -0400 Subject: [PATCH 23/54] Fix lint Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 224da2c14e5ec..8b382a11791ec 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -213,8 +213,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SplitRecord 
split cdef intp_t node_id - cdef float64_t lower_bound - cdef float64_t upper_bound cdef float64_t middle_value cdef float64_t left_child_min cdef float64_t left_child_max From 4fccb2a3380c0f4e56dd804841d0ea5c1c510396 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 22 Mar 2024 12:24:29 -0400 Subject: [PATCH 24/54] Fix bestfirst Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8b382a11791ec..117978b7722c2 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -460,7 +460,6 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Node* node cdef ParentInfo parent_record - parent_record.n_constant_features = 0 # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes @@ -614,6 +613,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) + # best-first splits do not track the number of constants when adding a split node + parent_record.n_constant_features = 0 + if is_first: parent_record.impurity = splitter.node_impurity() From d0a8d2fa5506f3a776e3e70af0c74dd272e8edd2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 22 Mar 2024 13:29:18 -0400 Subject: [PATCH 25/54] Init parent record Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 117978b7722c2..422b419fe923a 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -227,6 +227,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef StackRecord stack_record cdef ParentInfo parent_record + _init_parent_record(&parent_record) with nogil: # push root node onto stack @@ -460,6 +461,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Node* node cdef ParentInfo parent_record + _init_parent_record(&parent_record) # Initial capacity cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes From 82c94287b96d69acc92c94f2fbe6bd3418d455e7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 1 Apr 2024 17:05:09 -0400 Subject: [PATCH 26/54] Address thomas comments' -s Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 422b419fe923a..80ab7e7369407 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -615,7 +615,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): splitter.node_reset(start, end, &weighted_n_node_samples) - # best-first splits do not track the number of constants when adding a split node + # reset n_constant_features for this specific split before beginning split search parent_record.n_constant_features = 0 if is_first: From b2dfe8f0305dca9ec69c8454d63370b47b9c45b5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 15:12:12 -0400 Subject: [PATCH 27/54] Revert to unit32_t Signed-off-by: Adam Li --- sklearn/tree/_utils.pxd | 7 +++---- sklearn/tree/_utils.pyx | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 03a1d48c94cb4..5ee16fa3f628c 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -11,10 +11,9 @@ import numpy as np cimport numpy as cnp cnp.import_array() -ctypedef cnp.npy_uint32 UINT32_t from ..neighbors._quad_tree cimport Cell -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint32_t 
from ._tree cimport Node @@ -52,11 +51,11 @@ cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) cdef intp_t rand_int(intp_t low, intp_t high, - UINT32_t* random_state) noexcept nogil + uint32_t* random_state) noexcept nogil cdef float64_t rand_uniform(float64_t low, float64_t high, - UINT32_t* random_state) noexcept nogil + uint32_t* random_state) noexcept nogil cdef float64_t log(float64_t x) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index cc4cb7cf02533..23c358ce4bd8b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -60,13 +60,13 @@ cdef inline cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size): cdef inline intp_t rand_int(intp_t low, intp_t high, - UINT32_t* random_state) noexcept nogil: + uint32_t* random_state) noexcept nogil: """Generate a random integer in [low; end).""" return low + our_rand_r(random_state) % (high - low) cdef inline float64_t rand_uniform(float64_t low, float64_t high, - UINT32_t* random_state) noexcept nogil: + uint32_t* random_state) noexcept nogil: """Generate a random float64_t in [low; high).""" return ((high - low) * our_rand_r(random_state) / RAND_R_MAX) + low From d90befced7ac83424e0cdbf81ea7932eede501d4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 16:24:38 -0400 Subject: [PATCH 28/54] UPdate submodule commit and remove extraneous code Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 12 ------------ sklearn/tree/_tree.pyx | 43 ------------------------------------------ 2 files changed, 55 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2267b4306e261..1a13730e76e6b 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -69,18 +69,6 @@ cdef class BaseTree: cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil - cdef int _update_node( - self, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left - ) except -1 nogil - # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 673e2c5654ce1..d03a6e5a9f380 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1168,49 +1168,6 @@ cdef class BaseTree: return node_id - cdef inline int _update_node( - self, - intp_t parent, - bint is_left, - bint is_leaf, - SplitRecord* split_node, - float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - unsigned char missing_go_to_left - ) except -1 nogil: - """Update a node on the tree. - - The updated node remains on the same position. - Returns (intp_t)(-1) on error. 
- """ - cdef intp_t node_id - if is_left: - node_id = self.nodes[parent].left_child - else: - node_id = self.nodes[parent].right_child - - if node_id >= self.capacity: - if self._resize_c() != 0: - return INTPTR_MAX - - cdef Node* node = &self.nodes[node_id] - node.impurity = impurity - node.n_node_samples = n_node_samples - node.weighted_n_node_samples = weighted_n_node_samples - - if is_leaf: - if self._set_leaf_node(split_node, node, node_id) != 1: - with gil: - raise RuntimeError - else: - if self._set_split_node(split_node, node, node_id) != 1: - with gil: - raise RuntimeError - node.missing_go_to_left = missing_go_to_left - - return node_id - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): From a52ec7442cb6600212f65f49fc3cef079c2ce019 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 17:12:01 -0400 Subject: [PATCH 29/54] Fixing LOC Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index d03a6e5a9f380..688eb6d4d3982 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -435,7 +435,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( @@ -560,8 +564,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, - parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( From 750573c4a8d5620f25213ab4808008c8ecf5b5aa Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:02:38 -0400 Subject: [PATCH 30/54] Fix update tree node Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 12 ++++++++++ sklearn/tree/_tree.pyx | 52 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1a13730e76e6b..2267b4306e261 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -69,6 +69,18 @@ cdef class BaseTree: cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 688eb6d4d3982..b5c14f19f7982 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -422,10 +422,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, 
is_leaf, split_ptr, - parent_record.impurity, - n_node_samples, weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._update_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, + n_node_samples, weighted_n_node_samples, + split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -1175,6 +1175,50 @@ cdef class BaseTree: return node_id + cdef inline int _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. + + The updated node remains on the same position. + + Returns (intp_t)(-1) on error. + """ + cdef intp_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id + cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): From ec66190c2d696eab8078705ab4a68f9a178f6fb3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:10:08 -0400 Subject: [PATCH 31/54] Fix ci Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b5c14f19f7982..56809f7c4ee71 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -408,7 +408,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( &parent_record, - &split, + split_ptr, ) # assign local copy of SplitRecord to assign From f3607494ce509f1ff63255b0cb3bc5562de981a1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:10:25 -0400 Subject: [PATCH 32/54] Fix ci Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 56809f7c4ee71..486bad115787c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1189,7 +1189,7 @@ cdef class BaseTree: """Update a node on the tree. The updated node remains on the same position. - + Returns (intp_t)(-1) on error. """ cdef intp_t node_id From e0202533a542386b4da050d456e87f6508f3f477 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 19:53:02 -0400 Subject: [PATCH 33/54] MAINT Fix builder partial fit (#62) #### Reference Issues/PRs Fixes state of builder_ to not need to be maintained. Prolly needs unit-tests to determine if this "functions as desired". I.e. - changing datatype of X over multiple partial fits should fail nicely, - changing datatype of y - classification and regression #### What does this implement/fix? Explain your changes. #### Any other comments? 
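A sketch of the kind of unit test mentioned above, for the case where the dtype of `X` changes between `partial_fit` calls. This is a hypothetical sketch, not part of this patch; whether the second call should coerce the input or raise is exactly the open question.

```python
# Hypothetical test sketch -- not included in this patch.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier


def test_partial_fit_changing_X_dtype():
    X, y = make_classification(n_samples=100, n_features=5, random_state=0)
    clf = DecisionTreeClassifier(random_state=0)
    clf.partial_fit(X.astype(np.float32), y, classes=np.unique(y))
    # The second call uses a different dtype; it should either be coerced
    # consistently with the first call or fail with a clear error, rather
    # than silently corrupting the tree state.
    clf.partial_fit(X.astype(np.float64), y)
    assert clf.n_features_in_ == X.shape[1]
```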
--------- Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 125 ++++++++++++++++++++++++++++++++------- 1 file changed, 105 insertions(+), 20 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6511c8192889e..e6949b293185d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -347,7 +347,7 @@ def _fit( ) self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - + self._n_classes_ = self.n_classes_ if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -377,6 +377,7 @@ def _fit( min_samples_split = max(2, min_samples_split) min_samples_split = max(min_samples_split, 2 * min_samples_leaf) self.min_samples_split_ = min_samples_split + self.min_samples_leaf_ = min_samples_leaf if isinstance(self.max_features, str): if self.max_features == "sqrt": @@ -411,6 +412,7 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * n_samples else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.min_weight_leaf_ = min_weight_leaf # build the actual tree now with the parameters self = self._build_tree( @@ -521,6 +523,7 @@ def _build_tree( # Since self.monotonic_cst encodes constraints on probabilities of the # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.monotonic_cst_ = monotonic_cst if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( @@ -544,7 +547,7 @@ def _build_tree( # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise if max_leaf_nodes < 0: - self.builder_ = DepthFirstTreeBuilder( + builder = DepthFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -554,7 +557,7 @@ def _build_tree( self.store_leaf_values, ) else: - self.builder_ = BestFirstTreeBuilder( + builder = BestFirstTreeBuilder( splitter, min_samples_split, min_samples_leaf, @@ -564,9 +567,7 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - self.builder_.build( - self.tree_, X, y, sample_weight, missing_values_in_feature_mask - ) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -1128,12 +1129,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + See Also -------- DecisionTreeRegressor : A decision tree regressor. 
@@ -1369,8 +1376,68 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): y = np.ascontiguousarray(y, dtype=DOUBLE) # Update tree - self.builder_.initialize_node_queue(self.tree_, X, y, sample_weight) - self.builder_.build(self.tree_, X, y, sample_weight) + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. + n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) self._prune_tree() @@ -1637,12 +1704,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- DecisionTreeClassifier : A decision tree classifier. @@ -2022,12 +2095,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. 
+ + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -2290,12 +2369,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. - builder_ : TreeBuilder instance - The underlying TreeBuilder object. - min_samples_split_ : float The minimum number of samples needed to split a node in the tree building. + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. From 775f0b7de497fde5206101f82316e874ffb5545e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 2 Apr 2024 20:25:32 -0400 Subject: [PATCH 34/54] Try again for partial fit Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 136 ++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 66 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e6949b293185d..2124cd76c69c8 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -466,7 +466,6 @@ def _build_tree( random_state : int Random seed. """ - n_samples = X.shape[0] # Build tree @@ -576,6 +575,75 @@ def _build_tree( self._prune_tree() return self + def _update_tree(self, X, y, sample_weight): + # Update tree + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. 
+ n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) + + self._prune_tree() + return self + def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -1375,71 +1443,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) - # Update tree - max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - min_samples_split = self.min_samples_split_ - min_samples_leaf = self.min_samples_leaf_ - min_weight_leaf = self.min_weight_leaf_ - # set decision-tree model parameters - max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - - monotonic_cst = self.monotonic_cst_ - - # Build tree - # Note: this reconstructs the builder with the same state it had during the - # initial fit. This is necessary because the builder is not saved as part - # of the class, and thus the state may be lost if pickled/unpickled. 
- n_samples = X.shape[0] - criterion = self.criterion - if not isinstance(criterion, BaseCriterion): - if is_classifier(self): - criterion = CRITERIA_CLF[self.criterion]( - self.n_outputs_, self._n_classes_ - ) - else: - criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) - else: - # Make a deepcopy in case the criterion has mutable attributes that - # might be shared and modified concurrently during parallel fitting - criterion = copy.deepcopy(criterion) - - random_state = check_random_state(self.random_state) - SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = SPLITTERS[self.splitter]( - criterion, - self.max_features_, - min_samples_leaf, - min_weight_leaf, - random_state, - monotonic_cst, - ) - - # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise - if max_leaf_nodes < 0: - builder = DepthFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - self.min_impurity_decrease, - self.store_leaf_values, - ) - else: - builder = BestFirstTreeBuilder( - splitter, - min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - max_leaf_nodes, - self.min_impurity_decrease, - self.store_leaf_values, - ) - builder.initialize_node_queue(self.tree_, X, y, sample_weight) - builder.build(self.tree_, X, y, sample_weight) - - self._prune_tree() + self._update_tree(X, y, sample_weight) return self From e03a15b2b607b3e3b35105d056d0834e946e6527 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 24 May 2024 14:48:43 -0400 Subject: [PATCH 35/54] Adding inline comment Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 418eae57e4995..1d5a40e6c3f33 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -911,6 +911,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split + + # Note: we create a <*SplitRecord> pointer here in order to allow subclasses + # to know what kind of SplitRecord to use. In some cases, ObliqueSplitRecord + # might be used. The split pointer here knows the size of the underlying Record + # because the subclassed splitter will define "pointer_size" accordingly. cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) cdef intp_t node_id From 08658c62a442fdb597ff00b7f85fb3f490cf212c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jun 2024 18:54:03 -0400 Subject: [PATCH 36/54] Simplify cython partition api Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 5d63f75781a42..d6d060a3b2d50 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -418,14 +418,17 @@ cdef inline intp_t node_split_best( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - bint with_monotonic_cst, - const int8_t[:] monotonic_cst, + # bint with_monotonic_cst, + # const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best split on node samples[start:end] Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
""" + cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst + cdef bint with_monotonic_cst = splitter.with_monotonic_cst + # Find the best split cdef intp_t start = splitter.start cdef intp_t end = splitter.end @@ -809,14 +812,15 @@ cdef inline int node_split_random( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - bint with_monotonic_cst, - const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best random split on node samples[start:end] Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ + cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst + cdef bint with_monotonic_cst = splitter.with_monotonic_cst + # Draw random splits and pick the best cdef intp_t start = splitter.start cdef intp_t end = splitter.end @@ -1662,8 +1666,6 @@ cdef class BestSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class BestSparseSplitter(Splitter): @@ -1692,8 +1694,6 @@ cdef class BestSparseSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class RandomSplitter(Splitter): @@ -1722,8 +1722,6 @@ cdef class RandomSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) cdef class RandomSparseSplitter(Splitter): @@ -1751,6 +1749,4 @@ cdef class RandomSparseSplitter(Splitter): self.criterion, split, parent_record, - self.with_monotonic_cst, - self.monotonic_cst, ) From f0f69bec36d233434d14b72b2cb827ffdac3b97c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 20 Jun 2024 18:54:25 -0400 Subject: [PATCH 37/54] Simplify cython partition api Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index d6d060a3b2d50..b88574c089bf7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -820,7 +820,7 @@ cdef inline int node_split_random( """ cdef const int8_t[:] monotonic_cst = splitter.monotonic_cst cdef bint with_monotonic_cst = splitter.with_monotonic_cst - + # Draw random splits and pick the best cdef intp_t start = splitter.start cdef intp_t end = splitter.end From d455aa16ee9cc42ce342dd07d9b94db117783fcc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 21 Jun 2024 08:43:12 -0400 Subject: [PATCH 38/54] cimport _build pruned tree Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 45 ++++++++++---------------------------- sklearn/tree/_tree.pxd | 8 +++++++ 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b88574c089bf7..8bf71765355b3 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -341,6 +341,8 @@ cdef class Splitter(BaseSplitter): This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. + + Returns 1 if not a valid split, and 0 if it is. 
""" cdef intp_t min_samples_leaf = self.min_samples_leaf cdef intp_t end_non_missing = self.end - n_missing @@ -418,8 +420,6 @@ cdef inline intp_t node_split_best( Criterion criterion, SplitRecord* split, ParentInfo* parent_record, - # bint with_monotonic_cst, - # const int8_t[:] monotonic_cst, ) except -1 nogil: """Find the best split on node samples[start:end] @@ -566,25 +566,7 @@ cdef inline intp_t node_split_best( current_split.pos = p - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): - continue - # Reject if min_samples_leaf is not guaranteed - if missing_go_to_left: - n_left = current_split.pos - splitter.start + n_missing - n_right = end_non_missing - current_split.pos - else: - n_left = current_split.pos - splitter.start - n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue @@ -624,6 +606,13 @@ cdef inline intp_t node_split_best( current_split.n_missing = n_missing if n_missing == 0: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + current_split.missing_go_to_left = n_left > n_right else: current_split.missing_go_to_left = missing_go_to_left @@ -938,10 +927,6 @@ cdef inline int node_split_random( criterion.reset() criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if splitter.check_postsplit_conditions() == 1: - continue - # Reject if monotonicity constraints are not satisfied if ( with_monotonic_cst and @@ -954,16 +939,8 @@ cdef inline int node_split_random( ): continue - # Reject if monotonicity constraints are not satisfied - if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) - ): + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 45953a8e093a5..248f7b4e5f6c1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -191,3 +191,11 @@ cdef class TreeBuilder: const float64_t[:, ::1] y, const float64_t[:] sample_weight, ) + + +cdef _build_pruned_tree( + Tree tree, # OUT + Tree orig_tree, + const unsigned char[:] leaves_in_subtree, + intp_t capacity +) From ae2604ba53d092eaaec64eba0136a76460586cb0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 11 Jul 2024 08:44:43 -0400 Subject: [PATCH 39/54] Make sure its defined Signed-off-by: Adam Li --- sklearn/tree/_splitter.pyx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index e85f6ef5b2257..82835910fb800 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -959,6 +959,13 @@ cdef inline int node_split_random( current_split.threshold ) + if missing_go_to_left: + n_left = current_split.pos - start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - start + n_right = end_non_missing - current_split.pos + n_missing + # Reject if min_samples_leaf is not 
guaranteed if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue From b201fcb945fa54979a93bf8cc11f88c17d4b8fc9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Aug 2024 09:37:53 -0400 Subject: [PATCH 40/54] Update wrt main Signed-off-by: Adam Li --- sklearn/tree/_tree.pxd | 4 ++-- sklearn/tree/_tree.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 042d4126ef770..aecb9a4d95009 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -58,8 +58,8 @@ cdef class BaseTree: float64_t weighted_n_node_samples, uint8_t missing_go_to_left ) except -1 nogil - cdef intp_t _resize(self, intp_t capacity) except -1 nogil - cdef intp_t _resize_c(self, intp_t capacity=*) except -1 nogil + cdef int _resize(self, intp_t capacity) except -1 nogil + cdef int _resize_c(self, intp_t capacity=*) except -1 nogil cdef intp_t _update_node( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 653d3f8c08892..18d9275115786 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -984,7 +984,7 @@ cdef class BaseTree: Downstream classes must implement methods to actually traverse the tree. """ - cdef intp_t _resize( + cdef int _resize( self, intp_t capacity ) except -1 nogil: @@ -999,7 +999,7 @@ cdef class BaseTree: with gil: raise MemoryError() - cdef intp_t _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) From 600187a53a8c1bee0b7092d69adda9064e3c0dbc Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 5 Sep 2024 10:20:07 -0400 Subject: [PATCH 41/54] Merging main Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7ffcc5454ba6f..ccc9d46393f4e 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -67,6 +67,7 @@ class calls the ``fit`` method of each sub-estimator on random samples check_classification_targets, type_of_target, ) +from sklearn.utils._tags import get_tags from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( _check_feature_names_in, @@ -83,18 +84,7 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators + __all__ = [ "RandomForestClassifier", From 473f7bcf89fd058999323e8be470c3cda71da762 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 14:03:52 -0400 Subject: [PATCH 42/54] Reverting Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 766 +++--------------------------------- 1 file changed, 56 insertions(+), 710 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d6c26ae282068..ae729f4dfebdf 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ 
-39,14 +39,13 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real -from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import ( +from ..base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -54,12 +53,9 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.exceptions import DataConversionWarning -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder - +from ..exceptions import DataConversionWarning +from ..metrics import accuracy_score, r2_score +from ..preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -97,18 +93,14 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. - The expected total number of unique samples in a bootstrap sample is - required to be at most ``n_samples - 1``. - This is equivalent to the expected number of out-of-bag samples being at - least 1. - Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total; + - if float, this indicates a fraction of the total and should be + the interval `(0.0, 1.0]`; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -121,21 +113,12 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples - if expected_oob_samples >= n_samples - 1: - raise ValueError( - "The expected number of unique samples in the bootstrap sample" - f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" - ) + if max_samples > n_samples: + msg = "`max_samples` must be <= n_samples={} but got value {}" + raise ValueError(msg.format(n_samples, max_samples)) return max_samples if isinstance(max_samples, Real): - expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples - if expected_oob_samples >= n_samples - 1: - raise ValueError( - "The expected number of unique samples in the bootstrap sample" - f" must be at most {n_samples - 1}. 
It is: {expected_oob_samples}" - ) return max(round(n_samples * max_samples), 1) @@ -177,7 +160,6 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, - classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -210,7 +192,6 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, ) else: tree._fit( @@ -219,50 +200,6 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - - return tree - - -def _parallel_update_trees( - tree, - bootstrap, - X, - y, - sample_weight, - tree_idx, - n_trees, - verbose=0, - class_weight=None, - n_samples_bootstrap=None, - classes=None, -): - """ - Private function used to fit a single tree in parallel.""" - if verbose > 1: - print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) - - if bootstrap: - n_samples = X.shape[0] - indices = _generate_sample_indices( - tree.random_state, n_samples, n_samples_bootstrap - ) - - tree.partial_fit( - X[indices, :], - y[indices], - sample_weight=sample_weight, - check_input=False, - classes=classes, - ) - else: - tree.partial_fit( - X, - y, - sample_weight=sample_weight, - check_input=False, - classes=classes, ) return tree @@ -289,11 +226,6 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], - "max_bins": [ - None, - Interval(Integral, 1, None, closed="left"), - ], - "store_leaf_values": ["boolean"], } @abstractmethod @@ -311,8 +243,6 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -328,8 +258,6 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples - self.max_bins = max_bins - self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -349,15 +277,6 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -407,7 +326,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, classes=None): + def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -429,9 +348,6 @@ def fit(self, X, y, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. 
- Returns ------- self : object @@ -500,7 +416,7 @@ def fit(self, X, y, sample_weight=None, classes=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) + y, expanded_class_weight = self._validate_y_class_weight(y) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -539,38 +455,6 @@ def fit(self, X, y, sample_weight=None, classes=None): n_more_estimators = self.n_estimators - len(self.estimators_) - if self.max_bins is not None: - # `_openmp_effective_n_threads` is used to take cgroups CPU quotes - # into account when determine the maximum number of threads to use. - n_threads = _openmp_effective_n_threads() - - # Bin the data - # For ease of use of the API, the user-facing GBDT classes accept the - # parameter max_bins, which doesn't take into account the bin for - # missing values (which is always allocated). However, since max_bins - # isn't the true maximal number of bins, all other private classes - # (binmapper, histbuilder...) accept n_bins instead, which is the - # actual total number of bins. Everywhere in the code, the - # convention is that n_bins == max_bins + 1 - n_bins = self.max_bins + 1 # + 1 for missing values - self._bin_mapper = _BinMapper( - n_bins=n_bins, - # is_categorical=self.is_categorical_, - known_categories=None, - random_state=random_state, - n_threads=n_threads, - ) - - # XXX: in order for this to work with the underlying tree submodule's Cython - # code, we need to convert this into the original data's DTYPE because - # the Cython code assumes that `DTYPE` is used. - # The proper implementation will be a lot more complicated and should be - # tackled once scikit-learn has finalized their inclusion of missing data - # and categorical support for decision trees - X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) - else: - self._bin_mapper = None - if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -589,18 +473,41 @@ def fit(self, X, y, sample_weight=None, classes=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - # construct the trees in parallel - self._construct_trees( - X, - y, - sample_weight, - random_state, - n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. 
+ trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + ) + for i, t in enumerate(trees) ) + # Collect newly grown trees + self.estimators_.extend(trees) + if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -634,53 +541,6 @@ def fit(self, X, y, sample_weight=None, classes=None): return self - def _construct_trees( - self, - X, - y, - sample_weight, - random_state, - n_samples_bootstrap, - missing_values_in_feature_mask, - classes, - n_more_estimators, - ): - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - classes=classes, - ) - for i, t in enumerate(trees) - ) - - # Collect newly grown trees - self.estimators_.extend(trees) - @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -763,7 +623,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y, classes=None): + def _validate_y_class_weight(self, y): # Default implementation return y, None @@ -823,174 +683,6 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) - def _bin_data(self, X, is_training_data): - """Bin data X. - - If is_training_data, then fit the _bin_mapper attribute. - Else, the binned data is converted to a C-contiguous array. - """ - description = "training" if is_training_data else "validation" - if self.verbose: - print( - "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), - end="", - flush=True, - ) - tic = time() - if is_training_data: - X_binned = self._bin_mapper.fit_transform(X) # F-aligned array - else: - X_binned = self._bin_mapper.transform(X) # F-aligned array - # We convert the array to C-contiguous since predicting is faster - # with this layout (training is faster on F-arrays though) - X_binned = np.ascontiguousarray(X_binned) - toc = time() - if self.verbose: - duration = toc - tic - print("{:.3f} s".format(duration)) - - return X_binned - - def predict_quantiles(self, X, quantiles=0.5, method="nearest"): - """Predict class or regression value for X at given quantiles. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - Input data. - quantiles : float, optional - The quantiles at which to evaluate, by default 0.5 (median). - method : str, optional - The method to interpolate, by default 'linear'. 
Can be any keyword - argument accepted by :func:`~np.quantile`. - - Returns - ------- - y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) - The predicted values. The ``n_outputs`` dimension is present only - for multi-output regressors. - """ - if not self.store_leaf_values: - raise RuntimeError( - "Quantile prediction is not available when store_leaf_values=False" - ) - check_is_fitted(self) - # Check data - X = self._validate_X_predict(X) - - if not isinstance(quantiles, (np.ndarray, list)): - quantiles = np.array([quantiles]) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) - - # avoid storing the output of every estimator by summing them here - if self.n_outputs_ > 1: - y_hat = np.zeros( - (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 - ) - else: - y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) - - # get (n_samples, n_estimators) indicator of leaf nodes - X_leaves = self.apply(X) - - # we now want to aggregate all leaf samples across all trees for each sample - for idx in range(X.shape[0]): - # get leaf nodes for this sample - leaf_nodes = X_leaves[idx, :] - - # (n_total_leaf_samples, n_outputs) - leaf_node_samples = np.vstack( - [ - est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] - for jdx, est in enumerate(self.estimators_) - ] - ) - - # get quantiles across all leaf node samples - try: - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, method=method - ) - except TypeError: - y_hat[idx, ...] = np.quantile( - leaf_node_samples, quantiles, axis=0, interpolation=method - ) - - if is_classifier(self): - if self.n_outputs_ == 1: - for i in range(len(quantiles)): - class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) - y_hat[idx, ...] = self.classes_.take( - class_pred_per_sample, axis=0 - ) - else: - for k in range(self.n_outputs_): - for i in range(len(quantiles)): - class_pred_per_sample = ( - y_hat[idx, i, k].squeeze().astype(int) - ) - y_hat[idx, i, k] = self.classes_[k].take( - class_pred_per_sample, axis=0 - ) - return y_hat - - def get_leaf_node_samples(self, X): - """For each datapoint x in X, get the training samples in the leaf node. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - Dataset to apply the forest to. - - Returns - ------- - leaf_node_samples : a list of array-like - Each sample is represented by the indices of the training samples that - reached the leaf node. The ``n_leaf_node_samples`` may vary between - samples, since the number of samples that fall in a leaf node is - variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
- """ - if not self.store_leaf_values: - raise RuntimeError( - "Leaf node samples are not available when store_leaf_values=False" - ) - - check_is_fitted(self) - # Check data - X = self._validate_X_predict(X) - - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - - # Assign chunk of trees to jobs - n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) - - # avoid storing the output of every estimator by summing them here - result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( - delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) - for e in self.estimators_ - ) - leaf_nodes_samples = result[0] - for result_ in result[1:]: - for i, node_samples in enumerate(result_): - leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) - return leaf_nodes_samples - def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -1046,17 +738,6 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] -def _accumulate_leaf_nodes_samples(func, X): - """ - This is a utility function for joblib's Parallel. - - It can't go locally in ForestClassifier or ForestRegressor, because joblib - complains that it cannot pickle it when placed there. - """ - leaf_nodes_samples = func(X, check_input=False) - return leaf_nodes_samples - - class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
@@ -1080,8 +761,6 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -1095,8 +774,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) @staticmethod @@ -1151,7 +828,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y, classes=None): + def _validate_y_class_weight(self, y): check_classification_targets(y) y = np.copy(y) @@ -1164,28 +841,12 @@ def _validate_y_class_weight(self, y, classes=None): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - if classes is not None: - classes = np.atleast_1d(classes) - if classes.ndim == 1: - classes = np.array([classes]) - - for k in classes: - self.classes_.append(np.array(k)) - self.n_classes_.append(np.array(k).shape[0]) - - for i in range(y.shape[0]): - for j in range(self.n_outputs_): - y_store_unique_indices[i, j] = np.where( - self.classes_[j] == y[i, j] - )[0][0] - else: - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices if self.class_weight is not None: @@ -1220,228 +881,6 @@ def _validate_y_class_weight(self, y, classes=None): return y, expanded_class_weight - def partial_fit(self, X, y, sample_weight=None, classes=None): - """Update a decision tree classifier from the training set (X, y). - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The training input samples. Internally, it will be converted to - ``dtype=np.float32`` and if a sparse matrix is provided - to a sparse ``csc_matrix``. - - y : array-like of shape (n_samples,) or (n_samples, n_outputs) - The target values (class labels) as integers or strings. - - sample_weight : array-like of shape (n_samples,), default=None - Sample weights. If None, then samples are equally weighted. Splits - that would create child nodes with net zero or negative weight are - ignored while searching for a split in each node. Splits are also - ignored if they would result in any single class carrying a - negative weight in either child node. - - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Must be provided at the first call to partial_fit, can be omitted - in subsequent calls. - - Returns - ------- - self : object - Returns the instance itself. 
- """ - self._validate_params() - - # validate input parameters - first_call = _check_partial_fit_first_call(self, classes=classes) - - # Fit if no tree exists yet - if first_call: - self.fit( - X, - y, - sample_weight=sample_weight, - classes=classes, - ) - return self - - X, y = self._validate_data( - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - force_all_finite=False, - reset=first_call, - ) - - if issparse(y): - raise ValueError("sparse multilabel-indicator for y is not supported.") - - if sample_weight is not None: - sample_weight = _check_sample_weight(sample_weight, X) - - if issparse(X): - # Pre-sort indices to avoid that each individual tree of the - # ensemble sorts the indices. - X.sort_indices() - - y = np.atleast_1d(y) - if y.ndim == 2 and y.shape[1] == 1: - warn( - ( - "A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel()." - ), - DataConversionWarning, - stacklevel=2, - ) - - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) - - if self.criterion == "poisson": - if np.any(y < 0): - raise ValueError( - "Some value(s) of y are negative which is " - "not allowed for Poisson regression." - ) - if np.sum(y) <= 0: - raise ValueError( - "Sum of y is not strictly positive which " - "is necessary for Poisson regression." - ) - - self.n_outputs_ = y.shape[1] - - classes = self.classes_ - if self.n_outputs_ == 1: - classes = [classes] - - y, expanded_class_weight = self._validate_y_class_weight(y, classes) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - - if expanded_class_weight is not None: - if sample_weight is not None: - sample_weight = sample_weight * expanded_class_weight - else: - sample_weight = expanded_class_weight - - if not self.bootstrap and self.max_samples is not None: - raise ValueError( - "`max_sample` cannot be set if `bootstrap=False`. " - "Either switch to `bootstrap=True` or set " - "`max_sample=None`." - ) - elif self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X.shape[0], max_samples=self.max_samples - ) - else: - n_samples_bootstrap = None - - self._validate_estimator() - - if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available if bootstrap=True") - - random_state = check_random_state(self.random_state) - - if self.max_bins is not None: - # `_openmp_effective_n_threads` is used to take cgroups CPU quotes - # into account when determine the maximum number of threads to use. - n_threads = _openmp_effective_n_threads() - - # Bin the data - # For ease of use of the API, the user-facing GBDT classes accept the - # parameter max_bins, which doesn't take into account the bin for - # missing values (which is always allocated). However, since max_bins - # isn't the true maximal number of bins, all other private classes - # (binmapper, histbuilder...) accept n_bins instead, which is the - # actual total number of bins. 
Everywhere in the code, the - # convention is that n_bins == max_bins + 1 - n_bins = self.max_bins + 1 # + 1 for missing values - self._bin_mapper = _BinMapper( - n_bins=n_bins, - # is_categorical=self.is_categorical_, - known_categories=None, - random_state=random_state, - n_threads=n_threads, - ) - - # XXX: in order for this to work with the underlying tree submodule's Cython - # code, we need to convert this into the original data's DTYPE because - # the Cython code assumes that `DTYPE` is used. - # The proper implementation will be a lot more complicated and should be - # tackled once scikit-learn has finalized their inclusion of missing data - # and categorical support for decision trees - X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) - else: - self._bin_mapper = None - - # We draw from the random state to get the random state we - # would have got if we hadn't used a warm_start. - random_state.randint(MAX_INT, size=len(self.estimators_)) - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_update_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(self.estimators_), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - classes=classes[0], - ) - for i, t in enumerate(self.estimators_) - ) - - if self.oob_score: - y_type = type_of_target(y) - if y_type in ("multiclass-multioutput", "unknown"): - # FIXME: we could consider to support multiclass-multioutput if - # we introduce or reuse a constructor parameter (e.g. - # oob_score) allowing our user to pass a callable defining the - # scoring strategy on OOB sample. - raise ValueError( - "The type of target cannot be used to compute OOB " - f"estimates. Got {y_type} while only the following are " - "supported: continuous, continuous-multioutput, binary, " - "multiclass, multilabel-indicator." - ) - - if callable(self.oob_score): - self._set_oob_score_and_attributes( - X, y, scoring_function=self.oob_score - ) - else: - self._set_oob_score_and_attributes(X, y) - - # Decapsulate classes_ attributes - if hasattr(self, "classes_") and self.n_outputs_ == 1: - self.n_classes_ = self.n_classes_[0] - self.classes_ = self.classes_[0] - return self - def predict(self, X): """ Predict class for X. @@ -1507,14 +946,6 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. 
- if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1597,8 +1028,6 @@ def __init__( verbose=0, warm_start=False, max_samples=None, - max_bins=None, - store_leaf_values=False, ): super().__init__( estimator, @@ -1611,8 +1040,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1638,14 +1065,6 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) - # if we trained a binning tree, then we should re-bin the data - # XXX: this is inefficient and should be improved to be in line with what - # the Histogram Gradient Boosting Tree does, where the binning thresholds - # are passed into the tree itself, thus allowing us to set the node feature - # value thresholds within the tree itself. - if self.max_bins is not None: - X = self._bin_data(X, is_training_data=False).astype(DTYPE) - # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1941,16 +1360,6 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -2108,8 +1517,6 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2126,7 +1533,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2137,8 +1543,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2332,17 +1736,6 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. Used for - speeding up training time. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2484,8 +1877,6 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2502,7 +1893,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2512,8 +1902,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2713,16 +2101,6 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. 
- - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2869,8 +2247,6 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2887,7 +2263,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2898,8 +2273,6 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -3087,16 +2460,6 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 - max_bins : int, default=255 - The maximum number of bins to use for non-missing values. - - **This is an experimental feature**. - - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - - **This is an experimental feature**. - monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -3223,8 +2586,6 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, - max_bins=None, - store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -3241,7 +2602,6 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", - "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -3251,8 +2611,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, - max_bins=max_bins, - store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -3376,9 +2734,6 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. - store_leaf_values : bool, default=False - Whether to store the leaf values in the ``get_leaf_node_samples`` function. - Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -3480,7 +2835,6 @@ def __init__( random_state=None, verbose=0, warm_start=False, - store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -3495,7 +2849,6 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", - "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -3504,7 +2857,6 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, - store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -3518,7 +2870,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None, classes=None): + def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -3539,20 +2891,17 @@ def fit(self, X, y=None, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Returns ------- self : object Returns the instance itself. 
""" # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) + self.fit_transform(X, y, sample_weight=sample_weight) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None, classes=None): + def fit_transform(self, X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -3572,9 +2921,6 @@ def fit_transform(self, X, y=None, sample_weight=None, classes=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. - classes : array-like of shape (n_classes,), default=None - List of all the classes that can possibly appear in the y vector. - Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -3582,7 +2928,7 @@ def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight, classes=classes) + super().fit(X, y, sample_weight=sample_weight) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) From ee4b9b777600a1c4da322c4f703b665037d97a3c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 15:43:47 -0400 Subject: [PATCH 43/54] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 759 +++++++++++++++++++++++++++++++++--- sklearn/tree/_classes.py | 14 +- 2 files changed, 712 insertions(+), 61 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ae729f4dfebdf..ab063a72057de 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -36,6 +36,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # SPDX-License-Identifier: BSD-3-Clause +from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real @@ -65,6 +66,7 @@ class calls the ``fit`` method of each sub-estimator on random samples ) from ..tree._tree import DOUBLE, DTYPE from ..utils import check_random_state, compute_sample_weight +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils._param_validation import Interval, RealNotInt, StrOptions from ..utils._tags import get_tags from ..utils.multiclass import check_classification_targets, type_of_target @@ -76,8 +78,10 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, validate_data, ) +from ._hist_gradient_boosting.binning import _BinMapper from ._base import BaseEnsemble, _partition_estimators + __all__ = [ "RandomForestClassifier", "RandomForestRegressor", @@ -93,14 +97,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. 
@@ -113,12 +121,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) @@ -160,6 +177,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -192,6 +210,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -200,6 +219,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -226,6 +289,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -243,6 +311,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -258,6 +328,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -277,6 +349,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -326,7 +407,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -348,6 +429,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -416,7 +500,7 @@ def fit(self, X, y, sample_weight=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +539,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -473,41 +589,18 @@ def fit(self, X, y, sample_weight=None): # would have got if we hadn't used a warm_start. random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. 
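Aside, before the parallel loop that follows: the `max_bins` handling added above reuses the histogram-gradient-boosting convention that `n_bins == max_bins + 1`, with the extra bin reserved for missing values. A rough numpy-only sketch of the per-feature quantile binning that `_BinMapper` performs; the real mapper also handles the missing-value bin, categorical features, and subsampling, which this sketch deliberately skips:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(1000, 3)).astype(np.float32)

max_bins = 255
# Quantile-based cut points per feature: max_bins - 1 thresholds give
# max_bins bins for the non-missing values, fitted on the training data.
thresholds = [
    np.quantile(X[:, j], np.linspace(0, 1, max_bins + 1)[1:-1])
    for j in range(X.shape[1])
]
X_binned = np.column_stack(
    [np.searchsorted(thresholds[j], X[:, j], side="right") for j in range(X.shape[1])]
).astype(np.uint8)

print(X_binned.min(), X_binned.max())  # bin indices fall in [0, max_bins - 1]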
- trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -541,6 +634,53 @@ def fit(self, X, y, sample_weight=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -623,7 +763,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -683,6 +823,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. 
+ quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -738,6 +1046,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
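Aside: `get_leaf_node_samples` and the `_accumulate_leaf_nodes_samples` helper above, like `predict_quantiles` earlier, all build on the same idea: `apply(X)` yields one leaf id per (sample, tree), the training targets that landed in those leaves are stacked across trees, and a quantile of the pooled values becomes the prediction. A rough sketch using only the upstream scikit-learn API (no `store_leaf_values`), so the per-leaf targets are rebuilt here from the full training set rather than read out of the stored leaf samples:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=300, n_features=5, noise=10.0, random_state=0)
forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)

# (n_samples, n_estimators) leaf ids for the training data and a query point.
train_leaves = forest.apply(X)
query_leaves = forest.apply(X[:1])

# Pool every training target that shares a leaf with the query, per tree,
# then take quantiles over the pooled targets.
pooled = np.concatenate(
    [y[train_leaves[:, t] == query_leaves[0, t]] for t in range(train_leaves.shape[1])]
)
print(np.quantile(pooled, [0.1, 0.5, 0.9]))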
@@ -761,6 +1080,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +1095,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -828,7 +1151,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -841,12 +1164,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -881,6 +1220,229 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. 
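Aside on the explicit `classes` path added to `_validate_y_class_weight` above: when the full class list is supplied up front (as `partial_fit` requires), labels are encoded against that fixed ordering instead of whatever `np.unique` finds in the current batch. A small illustrative sketch:

import numpy as np

classes = np.array(["ham", "spam"])          # fixed, user-supplied class list
y_batch = np.array(["spam", "ham", "spam"])  # a mini-batch may miss some classes

# Same effect as the np.where lookup in the hunk above, vectorised;
# np.searchsorted assumes `classes` is sorted, which np.unique guarantees.
encoded = np.searchsorted(classes, y_batch)
print(encoded)  # [1 0 1]

# Encoding with np.unique on the batch alone would give a batch-dependent
# coding whenever a class is absent, which is why the fixed list matters.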
+ """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. 
Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -946,6 +1508,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1028,6 +1598,8 @@ def __init__( verbose=0, warm_start=False, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1040,6 +1612,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1065,6 +1639,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1360,6 +1942,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1517,6 +2109,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1533,6 +2127,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1543,6 +2138,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1736,6 +2333,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -1877,6 +2485,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1893,6 +2503,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1902,6 +2513,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2101,6 +2714,16 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. 
+ + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2247,6 +2870,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2263,6 +2888,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2273,6 +2899,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2460,6 +3088,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2586,6 +3224,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2602,6 +3242,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2611,6 +3252,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2734,6 +3377,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2835,6 +3481,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2849,6 +3496,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2857,6 +3505,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2870,7 +3519,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2891,17 +3540,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. 
""" # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2921,6 +3573,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2928,7 +3583,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e792e768c17d..2e13800cd09bf 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,7 @@ import numpy as np from scipy.sparse import issparse -from sklearn.base import ( +from ..base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -24,13 +24,10 @@ clone, is_classifier, ) -from sklearn.utils import Bunch, check_random_state, compute_sample_weight -from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from sklearn.utils.multiclass import ( - _check_partial_fit_first_call, - check_classification_targets, -) -from sklearn.utils.validation import ( +from ..utils import Bunch, check_random_state, compute_sample_weight +from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from ..utils.multiclass import check_classification_targets +from ..utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, @@ -38,7 +35,6 @@ check_is_fitted, validate_data, ) - from . import _criterion, _splitter, _tree from ._criterion import BaseCriterion from ._splitter import BaseSplitter From d3788bfa41df61ecba8d1281ae175e74f9558dda Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:07:17 -0400 Subject: [PATCH 44/54] Fixed Signed-off-by: Adam Li --- meson.build | 2 ++ sklearn/meson.build | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/meson.build b/meson.build index 3f14108f77998..0f7f0262677a2 100644 --- a/meson.build +++ b/meson.build @@ -44,6 +44,8 @@ endif tempita = files('sklearn/_build_utils/tempita.py') +option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') + py = import('python').find_installation(pure: false) # Copy all the .py files to the install dir, rather than using diff --git a/sklearn/meson.build b/sklearn/meson.build index 4bf896fcdeaef..a31ff09a1170a 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,14 +193,16 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' -) +if get_option('enable_custom_target') + custom_target('write_built_with_meson_file', + output: '_built_with_meson.py', + command: [ + py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' + ], + install: true, + install_dir: py.get_install_dir() / 'sklearn' + ) +endif extensions = ['_isotonic'] From 8f32f299ba28d276bc031f6b185006bd0a52a9cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:10:23 -0400 Subject: [PATCH 45/54] Fixed Signed-off-by: Adam Li --- meson.build | 2 -- meson_options.txt | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) create mode 100644 meson_options.txt diff --git a/meson.build b/meson.build index 0f7f0262677a2..3f14108f77998 100644 --- a/meson.build +++ b/meson.build @@ -44,8 +44,6 @@ endif tempita = files('sklearn/_build_utils/tempita.py') -option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') - py = import('python').find_installation(pure: false) # Copy all the .py files to the install dir, rather than using diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000000000..a6cf17b45a8c4 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1 @@ +option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') From 8c6be9f3024f3519cfa2159e2db2c5125c3e9e56 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 6 Sep 2024 16:16:28 -0400 Subject: [PATCH 46/54] Fixed Signed-off-by: Adam Li --- meson_options.txt | 1 - sklearn/meson.build | 18 ++++++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) delete mode 100644 meson_options.txt diff --git a/meson_options.txt b/meson_options.txt deleted file mode 100644 index a6cf17b45a8c4..0000000000000 --- a/meson_options.txt +++ /dev/null @@ -1 +0,0 @@ -option('enable_custom_target', type: 'boolean', value: true, description: 'Enable custom target') diff --git a/sklearn/meson.build b/sklearn/meson.build index a31ff09a1170a..4099346f554ca 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,16 +193,14 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-if get_option('enable_custom_target') - custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' - ) -endif +# custom_target('write_built_with_meson_file', +# output: '_built_with_meson.py', +# command: [ +# py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' +# ], +# install: true, +# install_dir: py.get_install_dir() / 'sklearn' +# ) extensions = ['_isotonic'] From 5b074dd386af2791c57c556c89a65528e62a3c15 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:01:01 -0400 Subject: [PATCH 47/54] Reverting back to imports Signed-off-by: Adam Li --- sklearn/ensemble/_base.py | 18 ++++++++++++------ sklearn/ensemble/_forest.py | 24 ++++++++++++------------ sklearn/ensemble/tests/test_forest.py | 2 ++ sklearn/tree/_classes.py | 10 +++++----- 4 files changed, 31 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 2789dd234294e..9adae766cebad 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -8,12 +8,18 @@ import numpy as np from joblib import effective_n_jobs -from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor -from ..utils import Bunch, check_random_state -from ..utils._tags import get_tags -from ..utils._user_interface import _print_elapsed_time -from ..utils.metadata_routing import _routing_enabled -from ..utils.metaestimators import _BaseComposition +from sklearn.base import ( + BaseEstimator, + MetaEstimatorMixin, + clone, + is_classifier, + is_regressor, +) +from sklearn.utils import Bunch, check_random_state +from sklearn.utils._tags import get_tags +from sklearn.utils._user_interface import _print_elapsed_time +from sklearn.utils.metadata_routing import _routing_enabled +from sklearn.utils.metaestimators import _BaseComposition def _fit_single_estimator( diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ab063a72057de..3f6ed9d5040ab 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -46,7 +46,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -54,9 +54,9 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -65,20 +65,20 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( +from sklearn.utils import check_random_state, 
compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import get_tags +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( _check_feature_names_in, _check_sample_weight, _num_samples, check_is_fitted, validate_data, ) -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from ._base import BaseEnsemble, _partition_estimators diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index b9579c2135572..51fbb3e823726 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1821,6 +1821,7 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.skip() @pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classification_toy_withbins(name): """Check classification on a toy dataset.""" @@ -1843,6 +1844,7 @@ def test_classification_toy_withbins(name): assert leaf_indices.shape == (len(X), clf.n_estimators) +@pytest.mark.skip() @pytest.mark.parametrize("name", FOREST_REGRESSORS) @pytest.mark.parametrize( "criterion", ("squared_error", "absolute_error", "friedman_mse") diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e13800cd09bf..e0f30bf864010 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -24,10 +24,10 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, From 80959211c228bc50e928ffefe30ff2457d7814e9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:06:19 -0400 Subject: [PATCH 48/54] Fixed Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 28 ++++++++++++++++------------ sklearn/tree/_classes.py | 6 +++++- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 3f6ed9d5040ab..b01a27f14462d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -36,10 +36,10 @@ class calls the ``fit`` method of each sub-estimator on random samples # SPDX-License-Identifier: BSD-3-Clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -54,22 +54,20 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from 
sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from ..tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils._tags import get_tags -from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) from sklearn.utils.parallel import Parallel, delayed from sklearn.utils.validation import ( _check_feature_names_in, @@ -78,9 +76,15 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, validate_data, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from ._base import BaseEnsemble, _partition_estimators +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e0f30bf864010..2e792e768c17d 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -26,7 +26,10 @@ ) from sklearn.utils import Bunch, check_random_state, compute_sample_weight from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, @@ -35,6 +38,7 @@ check_is_fitted, validate_data, ) + from . import _criterion, _splitter, _tree from ._criterion import BaseCriterion from ._splitter import BaseSplitter From 960b589554982b2d08404186bf57a4de83862e80 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sun, 8 Sep 2024 21:28:36 -0400 Subject: [PATCH 49/54] Fix validate Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 2e792e768c17d..32bb14e7827b4 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1397,8 +1397,8 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): # csr. 
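The "Fix validate" change in this commit (and the follow-up flip-flops below) appears to track an upstream API move: the private `self._validate_data(...)` method was replaced by the module-level helper `sklearn.utils.validation.validate_data(estimator, ...)`, which takes the estimator as its first argument but otherwise accepts the same keywords (`reset`, `validate_separately`, and the usual `check_array` options such as `dtype` and `accept_sparse`). A minimal sketch of the new calling convention, using a hypothetical estimator rather than the patched tree classes:

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data

class TinyClassifier(ClassifierMixin, BaseEstimator):
    """Hypothetical estimator, only here to show the validate_data call."""

    def fit(self, X, y):
        # old style: X, y = self._validate_data(X, y, dtype=np.float64)
        # new style: pass the estimator explicitly as the first argument
        X, y = validate_data(self, X, y, dtype=np.float64)
        self.classes_ = np.unique(y)
        return self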
check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, reset=False, validate_separately=(check_X_params, check_y_params) + X, y = validate_data( + self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) ) if issparse(X): X.sort_indices() From 4551602a68b5410dbf67b13f5acbdc64705b0c62 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:37:39 -0400 Subject: [PATCH 50/54] Fix partial fit Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 10 +++++++++- sklearn/tree/_classes.py | 18 +++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b01a27f14462d..1cba7ddbb20b5 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1254,7 +1254,15 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self : object Returns the instance itself. """ - self._validate_params() + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 32bb14e7827b4..292c0e1e8a063 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -252,7 +252,7 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - if y is not None or self.__sklearn_tags__().requires_y: + if y is not None or self.__sklearn_tags__().required: X, y = validate_data( self, X, y, validate_separately=(check_X_params, check_y_params) ) @@ -1375,7 +1375,15 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): self : DecisionTreeClassifier Fitted estimator. """ - self._validate_params() + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + ensure_all_finite=False, + ) # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) @@ -1398,7 +1406,11 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), ) if issparse(X): X.sort_indices() From dd58597a04ed339654b801669f9aa13e87555b18 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:40:51 -0400 Subject: [PATCH 51/54] Fix partial fit Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 10 +--------- sklearn/tree/_classes.py | 16 ++-------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 1cba7ddbb20b5..b01a27f14462d 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1254,15 +1254,7 @@ def partial_fit(self, X, y, sample_weight=None, classes=None): self : object Returns the instance itself. 
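The streaming interface these partial_fit commits are wiring up is easiest to see from the caller's side. A minimal usage sketch, assuming the `classes=` keyword added by this patch series (it is not part of upstream scikit-learn): the first call has to announce every class that may ever appear, and later calls only update the already-fitted trees.

import numpy as np
from sklearn.ensemble import RandomForestClassifier

rng = np.random.default_rng(0)
X1, y1 = rng.normal(size=(200, 4)), rng.integers(0, 3, size=200)
X2, y2 = rng.normal(size=(200, 4)), rng.integers(0, 3, size=200)

forest = RandomForestClassifier(n_estimators=10, random_state=0)
forest.partial_fit(X1, y1, classes=np.arange(3))  # first batch: declare all classes
forest.partial_fit(X2, y2)                        # later batches update each tree
print(forest.predict(X2[:5]))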
""" - X, y = validate_data( - self, - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - ensure_all_finite=False, - ) + self._validate_params() # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 292c0e1e8a063..206005fad8e1b 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1375,15 +1375,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): self : DecisionTreeClassifier Fitted estimator. """ - X, y = validate_data( - self, - X, - y, - multi_output=True, - accept_sparse="csc", - dtype=DTYPE, - ensure_all_finite=False, - ) + self._validate_params() # validate input parameters first_call = _check_partial_fit_first_call(self, classes=classes) @@ -1406,11 +1398,7 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, - X, - y, - reset=False, - validate_separately=(check_X_params, check_y_params), + self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) ) if issparse(X): X.sort_indices() From e4b9728cb8667d0a40ed0c6c45f0414811f5f1f8 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 9 Sep 2024 10:48:05 -0400 Subject: [PATCH 52/54] Adding unit test for partial fit Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 8 ++++++-- sklearn/tree/tests/test_tree.py | 17 ++++++++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 206005fad8e1b..4b89ea8e87513 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -252,7 +252,7 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - if y is not None or self.__sklearn_tags__().required: + if y is not None or self.__sklearn_tags__().target_tags.required: X, y = validate_data( self, X, y, validate_separately=(check_X_params, check_y_params) ) @@ -1398,7 +1398,11 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) X, y = validate_data( - self, X, y, reset=False, validate_separately=(check_X_params, check_y_params) + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), ) if issparse(X): X.sort_indices() diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index fee65b96cc865..6a199211743ee 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -54,7 +54,10 @@ ignore_warnings, skip_if_32bit, ) -from sklearn.utils.estimator_checks import check_sample_weights_invariance +from sklearn.utils.estimator_checks import ( + check_sample_weights_invariance, + parametrize_with_checks, +) from sklearn.utils.fixes import ( _IS_32BIT, COO_CONTAINERS, @@ -235,6 +238,18 @@ def assert_tree_equal(d, s, message): ) +@parametrize_with_checks( + [ + DecisionTreeClassifier(), + DecisionTreeRegressor(), + ExtraTreeClassifier(), + ExtraTreeRegressor(), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + def test_classification_toy(): # Check classification on a toy dataset. 
for name, Tree in CLF_TREES.items(): From dda0df612b8a46e0d87a5d600e4fa696e54978d1 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Wed, 5 Mar 2025 13:42:55 -0500 Subject: [PATCH 53/54] FIX remove regressor multi_label tag (#71) #### Reference Issues/PRs https://github.com/neurodata/treeple/pull/339 #### What does this implement/fix? Explain your changes. #### Any other comments? --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 57a4750c612bd..99aa86157d6e9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1753,7 +1753,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.regressor_tags.multi_label = True + # tags.regressor_tags.multi_label = True TODO: add regression support return tags diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 7a49c6dc93485..2ce58759d8253 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,7 +15,6 @@ import numpy as np from scipy.sparse import issparse -from sklearn.utils import metadata_routing from sklearn.base import ( BaseEstimator, ClassifierMixin, @@ -25,7 +24,12 @@ clone, is_classifier, ) -from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils import ( + Bunch, + check_random_state, + compute_sample_weight, + metadata_routing, +) from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import ( _check_partial_fit_first_call, From 0e43e917a6734fc61a8c9999bc4b4a563476ec58 Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Thu, 10 Jul 2025 14:18:48 -0400 Subject: [PATCH 54/54] FIX remove xfail_checks (#74) According to errors in https://github.com/neurodata/treeple/pull/361, `xfail_checks` is not available in the tags. --- sklearn/ensemble/_forest.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 99aa86157d6e9..3a8e4d86a66e0 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -2161,11 +2161,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -2548,11 +2548,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -3675,9 +3675,9 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." 
- ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags
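The last two commits touch the same mechanism: after the tags refactor, `__sklearn_tags__()` returns a dataclass obtained from `super().__sklearn_tags__()` whose fields are mutated directly, and the old `_xfail_checks` dictionary is no longer an attribute of it (hence the commented-out blocks above). A minimal sketch of that pattern in a hypothetical custom estimator, restricted to fields that already appear in this patch series:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class TinyRegressor(RegressorMixin, BaseEstimator):
    """Hypothetical estimator, only here to illustrate the tags override."""

    def fit(self, X, y):
        self.mean_ = float(np.mean(y))
        return self

    def predict(self, X):
        return np.full(len(X), self.mean_)

    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.target_tags.required = True  # same field the patched _fit checks
        # tags.regressor_tags.multi_label is left at its default: multi-output y
        # support is not advertised, mirroring PATCH 53 above
        return tags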