diff --git a/sklearn/ensemble/_base.py b/sklearn/ensemble/_base.py index 386c4875a1804..45c4fda13999b 100644 --- a/sklearn/ensemble/_base.py +++ b/sklearn/ensemble/_base.py @@ -8,12 +8,18 @@ import numpy as np from joblib import effective_n_jobs -from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier, is_regressor -from ..utils import Bunch, check_random_state -from ..utils._tags import get_tags -from ..utils._user_interface import _print_elapsed_time -from ..utils.metadata_routing import _routing_enabled -from ..utils.metaestimators import _BaseComposition +from sklearn.base import ( + BaseEstimator, + MetaEstimatorMixin, + clone, + is_classifier, + is_regressor, +) +from sklearn.utils import Bunch, check_random_state +from sklearn.utils._tags import get_tags +from sklearn.utils._user_interface import _print_elapsed_time +from sklearn.utils.metadata_routing import _routing_enabled +from sklearn.utils.metaestimators import _BaseComposition def _fit_single_estimator( diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 92713eecec9dd..3a8e4d86a66e0 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -39,13 +39,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -53,9 +54,29 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils._tags import get_tags +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, + type_of_target, +) +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, + validate_data, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -64,19 +85,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils._tags import get_tags -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, - validate_data, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -93,14 
+101,18 @@ def _get_n_samples_bootstrap(n_samples, max_samples): """ Get the number of samples in a bootstrap sample. + The expected total number of unique samples in a bootstrap sample is + required to be at most ``n_samples - 1``. + This is equivalent to the expected number of out-of-bag samples being at + least 1. + Parameters ---------- n_samples : int Number of samples in the dataset. max_samples : int or float The maximum number of samples to draw from the total available: - - if float, this indicates a fraction of the total and should be - the interval `(0.0, 1.0]`; + - if float, this indicates a fraction of the total; - if int, this indicates the exact number of samples; - if None, this indicates the total number of samples. @@ -113,12 +125,21 @@ def _get_n_samples_bootstrap(n_samples, max_samples): return n_samples if isinstance(max_samples, Integral): - if max_samples > n_samples: - msg = "`max_samples` must be <= n_samples={} but got value {}" - raise ValueError(msg.format(n_samples, max_samples)) + expected_oob_samples = (1 - np.exp(-max_samples / n_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max_samples if isinstance(max_samples, Real): + expected_oob_samples = (1 - np.exp(-max_samples)) * n_samples + if expected_oob_samples >= n_samples - 1: + raise ValueError( + "The expected number of unique samples in the bootstrap sample" + f" must be at most {n_samples - 1}. It is: {expected_oob_samples}" + ) return max(round(n_samples * max_samples), 1) @@ -160,6 +181,7 @@ def _parallel_build_trees( class_weight=None, n_samples_bootstrap=None, missing_values_in_feature_mask=None, + classes=None, ): """ Private function used to fit a single tree in parallel.""" @@ -192,6 +214,7 @@ def _parallel_build_trees( sample_weight=curr_sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, ) else: tree._fit( @@ -200,6 +223,50 @@ def _parallel_build_trees( sample_weight=sample_weight, check_input=False, missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + + return tree + + +def _parallel_update_trees( + tree, + bootstrap, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, + classes=None, +): + """ + Private function used to fit a single tree in parallel.""" + if verbose > 1: + print("Updating tree %d of %d" % (tree_idx + 1, n_trees)) + + if bootstrap: + n_samples = X.shape[0] + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) + + tree.partial_fit( + X[indices, :], + y[indices], + sample_weight=sample_weight, + check_input=False, + classes=classes, + ) + else: + tree.partial_fit( + X, + y, + sample_weight=sample_weight, + check_input=False, + classes=classes, ) return tree @@ -226,6 +293,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -243,6 +315,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -258,6 +332,8 @@ def __init__( self.warm_start = warm_start 
self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -277,6 +353,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -326,7 +411,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None): + def fit(self, X, y, sample_weight=None, classes=None): """ Build a forest of trees from the training set (X, y). @@ -348,6 +433,9 @@ def fit(self, X, y, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object @@ -416,7 +504,7 @@ def fit(self, X, y, sample_weight=None): self._n_samples, self.n_outputs_ = y.shape - y, expanded_class_weight = self._validate_y_class_weight(y) + y, expanded_class_weight = self._validate_y_class_weight(y, classes=classes) if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) @@ -455,6 +543,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -473,41 +593,18 @@ def fit(self, X, y, sample_weight=None): # would have got if we hadn't used a warm_start. 
random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [ - self._make_estimator(append=False, random_state=random_state) - for i in range(n_more_estimators) - ] - - # Parallel loop: we prefer the threading backend as the Cython code - # for fitting the trees is internally releasing the Python GIL - # making threading more efficient than multiprocessing in - # that case. However, for joblib 0.12+ we respect any - # parallel_backend contexts set at a higher level, - # since correctness does not rely on using threads. - trees = Parallel( - n_jobs=self.n_jobs, - verbose=self.verbose, - prefer="threads", - )( - delayed(_parallel_build_trees)( - t, - self.bootstrap, - X, - y, - sample_weight, - i, - len(trees), - verbose=self.verbose, - class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap, - missing_values_in_feature_mask=missing_values_in_feature_mask, - ) - for i, t in enumerate(trees) + # construct the trees in parallel + self._construct_trees( + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, ) - # Collect newly grown trees - self.estimators_.extend(trees) - if self.oob_score and ( n_more_estimators > 0 or not hasattr(self, "oob_score_") ): @@ -540,6 +637,53 @@ def fit(self, X, y, sample_weight=None): return self + def _construct_trees( + self, + X, + y, + sample_weight, + random_state, + n_samples_bootstrap, + missing_values_in_feature_mask, + classes, + n_more_estimators, + ): + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_build_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + missing_values_in_feature_mask=missing_values_in_feature_mask, + classes=classes, + ) + for i, t in enumerate(trees) + ) + + # Collect newly grown trees + self.estimators_.extend(trees) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, scoring_function=None): """Compute and set the OOB score and attributes. @@ -622,7 +766,7 @@ def _compute_oob_predictions(self, X, y): return oob_pred - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): # Default implementation return y, None @@ -682,6 +826,174 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. 
+ """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + [ + est.tree_.leaf_nodes_samples[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ] + ) + + # get quantiles across all leaf node samples + try: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, method=method + ) + except TypeError: + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] 
= self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _get_estimators_indices(self): # Get drawn indices along both sample and feature axes for tree in self.estimators_: @@ -737,6 +1049,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. 
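Illustrative usage of the leaf-sample / quantile API added by the hunks above (a sketch based on the new ``store_leaf_values`` parameter and the ``predict_quantiles`` / ``get_leaf_node_samples`` forest methods introduced in this patch, mirroring the new ``test_multioutput_quantiles`` test; the synthetic data below is hypothetical and these methods exist only on the patched branch, not in released scikit-learn):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, size=(200, 4))
    y = X[:, 0] + 0.1 * rng.randn(200)

    # store_leaf_values=True is required; otherwise both methods raise RuntimeError
    est = RandomForestRegressor(n_estimators=50, store_leaf_values=True, random_state=0)
    est.fit(X, y)

    # (n_samples, n_quantiles) array of per-sample quantile predictions,
    # computed from the training samples stored in the reached leaves
    y_quant = est.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])

    # list of per-sample arrays of shape (n_leaf_node_samples, n_outputs),
    # stacked over the leaves reached in every tree of the forest
    leaf_samples = est.get_leaf_node_samples(X[:5])
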
@@ -760,6 +1083,8 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -773,6 +1098,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -827,7 +1154,7 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None): y, np.argmax(self.oob_decision_function_, axis=1) ) - def _validate_y_class_weight(self, y): + def _validate_y_class_weight(self, y, classes=None): check_classification_targets(y) y = np.copy(y) @@ -840,12 +1167,28 @@ def _validate_y_class_weight(self, y): self.n_classes_ = [] y_store_unique_indices = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = np.unique( - y[:, k], return_inverse=True - ) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(y.shape[0]): + for j in range(self.n_outputs_): + y_store_unique_indices[i, j] = np.where( + self.classes_[j] == y[i, j] + )[0][0] + else: + for k in range(self.n_outputs_): + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_store_unique_indices if self.class_weight is not None: @@ -880,6 +1223,229 @@ def _validate_y_class_weight(self, y): return y, expanded_class_weight + def partial_fit(self, X, y, sample_weight=None, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : object + Returns the instance itself. 
+ """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + classes=classes, + ) + return self + + X, y = validate_data( + self, + X, + y, + multi_output=True, + accept_sparse="csc", + dtype=DTYPE, + force_all_finite=False, + reset=first_call, + ) + + if issparse(y): + raise ValueError("sparse multilabel-indicator for y is not supported.") + + if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) + + if issparse(X): + # Pre-sort indices to avoid that each individual tree of the + # ensemble sorts the indices. + X.sort_indices() + + y = np.atleast_1d(y) + if y.ndim == 2 and y.shape[1] == 1: + warn( + ( + "A column-vector y was passed when a 1d array was" + " expected. Please change the shape of y to " + "(n_samples,), for example using ravel()." + ), + DataConversionWarning, + stacklevel=2, + ) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + if self.criterion == "poisson": + if np.any(y < 0): + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) + if np.sum(y) <= 0: + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) + + self.n_outputs_ = y.shape[1] + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y, expanded_class_weight = self._validate_y_class_weight(y, classes) + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + + if not self.bootstrap and self.max_samples is not None: + raise ValueError( + "`max_sample` cannot be set if `bootstrap=False`. " + "Either switch to `bootstrap=True` or set " + "`max_sample=None`." + ) + elif self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples=X.shape[0], max_samples=self.max_samples + ) + else: + n_samples_bootstrap = None + + self._validate_estimator() + + if not self.bootstrap and self.oob_score: + raise ValueError("Out of bag estimation only available if bootstrap=True") + + random_state = check_random_state(self.random_state) + + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. 
Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + + # We draw from the random state to get the random state we + # would have got if we hadn't used a warm_start. + random_state.randint(MAX_INT, size=len(self.estimators_)) + + # Parallel loop: we prefer the threading backend as the Cython code + # for fitting the trees is internally releasing the Python GIL + # making threading more efficient than multiprocessing in + # that case. However, for joblib 0.12+ we respect any + # parallel_backend contexts set at a higher level, + # since correctness does not rely on using threads. + Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + prefer="threads", + )( + delayed(_parallel_update_trees)( + t, + self.bootstrap, + X, + y, + sample_weight, + i, + len(self.estimators_), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + classes=classes[0], + ) + for i, t in enumerate(self.estimators_) + ) + + if self.oob_score: + y_type = type_of_target(y) + if y_type in ("multiclass-multioutput", "unknown"): + # FIXME: we could consider to support multiclass-multioutput if + # we introduce or reuse a constructor parameter (e.g. + # oob_score) allowing our user to pass a callable defining the + # scoring strategy on OOB sample. + raise ValueError( + "The type of target cannot be used to compute OOB " + f"estimates. Got {y_type} while only the following are " + "supported: continuous, continuous-multioutput, binary, " + "multiclass, multilabel-indicator." + ) + + if callable(self.oob_score): + self._set_oob_score_and_attributes( + X, y, scoring_function=self.oob_score + ) + else: + self._set_oob_score_and_attributes(X, y) + + # Decapsulate classes_ attributes + if hasattr(self, "classes_") and self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + return self + def predict(self, X): """ Predict class for X. @@ -945,6 +1511,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1027,6 +1601,8 @@ def __init__( verbose=0, warm_start=False, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1039,6 +1615,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1064,6 +1642,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1167,7 +1753,7 @@ def _compute_partial_dependence_recursion(self, grid, target_features): def __sklearn_tags__(self): tags = super().__sklearn_tags__() - tags.regressor_tags.multi_label = True + # tags.regressor_tags.multi_label = True TODO: add regression support return tags @@ -1361,6 +1947,16 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1518,6 +2114,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1534,6 +2132,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1544,6 +2143,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1560,11 +2161,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -1749,6 +2350,17 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. Used for + speeding up training time. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. 
- 1: monotonically increasing @@ -1890,6 +2502,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1906,6 +2520,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -1915,6 +2530,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1931,11 +2548,11 @@ def __init__( def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." + # ), + # } return tags @@ -2126,6 +2743,16 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2272,6 +2899,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2288,6 +2917,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2298,6 +2928,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2487,6 +3119,16 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + max_bins : int, default=255 + The maximum number of bins to use for non-missing values. + + **This is an experimental feature**. + + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + + **This is an experimental feature**. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonically increasing @@ -2613,6 +3255,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -2629,6 +3273,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", "monotonic_cst", ), bootstrap=bootstrap, @@ -2638,6 +3283,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2761,6 +3408,9 @@ class RandomTreesEmbedding(TransformerMixin, BaseForest): new forest. See :term:`Glossary ` and :ref:`tree_ensemble_warm_start` for details. + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
+ Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance @@ -2862,6 +3512,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2876,6 +3527,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2884,6 +3536,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2897,7 +3550,7 @@ def __init__( def _set_oob_score_and_attributes(self, X, y, scoring_function=None): raise NotImplementedError("OOB score not supported by tree embedding") - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator. @@ -2918,17 +3571,20 @@ def fit(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : object Returns the instance itself. """ # Parameters are validated in fit_transform - self.fit_transform(X, y, sample_weight=sample_weight) + self.fit_transform(X, y, sample_weight=sample_weight, classes=classes) return self @_fit_context(prefer_skip_nested_validation=True) - def fit_transform(self, X, y=None, sample_weight=None): + def fit_transform(self, X, y=None, sample_weight=None, classes=None): """ Fit estimator and transform dataset. @@ -2948,6 +3604,9 @@ def fit_transform(self, X, y=None, sample_weight=None): classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- X_transformed : sparse matrix of shape (n_samples, n_out) @@ -2955,7 +3614,7 @@ def fit_transform(self, X, y=None, sample_weight=None): """ rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) - super().fit(X, y, sample_weight=sample_weight) + super().fit(X, y, sample_weight=sample_weight, classes=classes) self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output) output = self.one_hot_encoder_.fit_transform(self.apply(X)) @@ -3016,9 +3675,9 @@ def transform(self, X): def __sklearn_tags__(self): tags = super().__sklearn_tags__() # TODO: replace by a statistical test, see meta-issue #16298 - tags._xfail_checks = { - "check_sample_weight_equivalence": ( - "sample_weight is not equivalent to removing/repeating samples." - ), - } + # tags._xfail_checks = { + # "check_sample_weight_equivalence": ( + # "sample_weight is not equivalent to removing/repeating samples." 
+ # ), + # } return tags diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 8f85f2f7aa3cd..1f48d6eaf8ccd 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -365,6 +365,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") _parameter_constraints.pop("monotonic_cst") diff --git a/sklearn/ensemble/meson.build b/sklearn/ensemble/meson.build index bc5868b3a0104..e595757136a2d 100644 --- a/sklearn/ensemble/meson.build +++ b/sklearn/ensemble/meson.build @@ -2,6 +2,7 @@ py.extension_module( '_gradient_boosting', ['_gradient_boosting.pyx'] + utils_cython_tree, dependencies: [np_dep], + override_options : ['optimization=3', 'cython_language=cpp'], cython_args: cython_args, subdir: 'sklearn/ensemble', install: true diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index aadf230fd751e..51fbb3e823726 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. 
IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + @pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classification_toy(name): """Check classification on a toy dataset.""" @@ -1542,7 +1656,10 @@ def test_max_samples_bootstrap(name): def test_large_max_samples_exception(name): # Check invalid `max_samples` est = FOREST_CLASSIFIERS_REGRESSORS[name](bootstrap=True, max_samples=int(1e9)) - match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + # TODO: remove the following line when the issue is fixed + # https://github.com/scikit-learn/scikit-learn/issues/28507 + # match = "`max_samples` must be <= n_samples=6 but got value 1000000000" + match = "The expected number of unique samples" with pytest.raises(ValueError, match=match): est.fit(X, y) @@ -1704,6 +1821,116 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): forest.fit(X, y) +@pytest.mark.skip() +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.skip() +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ + + @pytest.mark.parametrize("seed", [None, 1]) @pytest.mark.parametrize("bootstrap", [True, False]) @pytest.mark.parametrize("ForestClass", FOREST_CLASSIFIERS_REGRESSORS.values()) diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 8008b8c028085..9b6c8382c1fd5 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -10,7 +10,11 @@ from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression from sklearn.datasets import make_friedman1 from sklearn.decomposition import PCA -from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier +from sklearn.ensemble import ( + HistGradientBoostingClassifier, + RandomForestClassifier, + RandomForestRegressor, +) from sklearn.exceptions import NotFittedError from sklearn.feature_selection import SelectFromModel from sklearn.linear_model import ( @@ -409,7 +413,7 @@ def test_partial_fit(): assert_array_almost_equal(X_transform, transformer.transform(data)) # check that if est doesn't have partial_fit, neither does SelectFromModel - transformer = SelectFromModel(estimator=RandomForestClassifier()) + transformer = SelectFromModel(estimator=RandomForestRegressor()) assert not hasattr(transformer, "partial_fit") diff --git a/sklearn/meson.build b/sklearn/meson.build index eaf1b98e60cc2..7c01ad73f9ce4 100644 --- a/sklearn/meson.build +++ b/sklearn/meson.build @@ -193,14 +193,14 @@ cython_args += scikit_learn_cython_args # Write file in Meson build dir to be able to figure out from Python code # whether scikit-learn was built with Meson. Adapted from pandas # _version_meson.py. 
-custom_target('write_built_with_meson_file', - output: '_built_with_meson.py', - command: [ - py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' - ], - install: true, - install_dir: py.get_install_dir() / 'sklearn' -) +# custom_target('write_built_with_meson_file', +# output: '_built_with_meson.py', +# command: [ +# py, '-c', 'with open("sklearn/_built_with_meson.py", "w") as f: f.write("")' +# ], +# install: true, +# install_dir: py.get_install_dir() / 'sklearn' +# ) extensions = ['_isotonic'] diff --git a/sklearn/neighbors/meson.build b/sklearn/neighbors/meson.build index 22f81d597948b..0d850bc4fd76e 100644 --- a/sklearn/neighbors/meson.build +++ b/sklearn/neighbors/meson.build @@ -40,7 +40,8 @@ neighbors_extension_metadata = { '_partition_nodes': {'sources': ['_partition_nodes.pyx'], 'override_options': ['cython_language=cpp'], 'dependencies': [np_dep]}, - '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep]}, + '_quad_tree': {'sources': ['_quad_tree.pyx'], 'dependencies': [np_dep], + 'override_options': ['cython_language=cpp']}, } foreach ext_name, ext_dict : neighbors_extension_metadata diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 93246a1376e85..2ce58759d8253 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -15,9 +15,7 @@ import numpy as np from scipy.sparse import issparse -from sklearn.utils import metadata_routing - -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -26,10 +24,18 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import ( + Bunch, + check_random_state, + compute_sample_weight, + metadata_routing, +) +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import ( + _check_partial_fit_first_call, + check_classification_targets, +) +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, @@ -37,9 +43,10 @@ check_is_fitted, validate_data, ) + from . 
import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -105,6 +112,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_samples_split": [ Interval(Integral, 2, None, closed="left"), Interval(RealNotInt, 0.0, 1.0, closed="right"), + StrOptions({"sqrt", "log2"}), ], "min_samples_leaf": [ Interval(Integral, 1, None, closed="left"), @@ -121,6 +129,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], "monotonic_cst": ["array-like", None], } @@ -140,6 +149,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): self.criterion = criterion @@ -154,6 +164,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values self.monotonic_cst = monotonic_cst def get_depth(self): @@ -235,6 +246,7 @@ def _fit( sample_weight=None, check_input=True, missing_values_in_feature_mask=None, + classes=None, ): random_state = check_random_state(self.random_state) @@ -249,9 +261,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", ensure_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = validate_data( - self, X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self.__sklearn_tags__().target_tags.required: + X, y = validate_data( + self, X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = validate_data(self, X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -264,7 +279,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -278,45 +293,73 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None - - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + y = np.atleast_1d(y) + expanded_class_weight = None - self.n_outputs_ = y.shape[1] + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - if is_classification: - check_classification_targets(y) - y = np.copy(y) + self.n_outputs_ = y.shape[1] - self.classes_ = [] - self.n_classes_ = [] + if is_classification: + check_classification_targets(y) + y = np.copy(y) + + self.classes_ = [] + self.n_classes_ = [] + + if self.class_weight is not None: + y_original = np.copy(y) + + y_encoded = np.zeros(y.shape, dtype=int) + if classes is not None: + classes = np.atleast_1d(classes) + if classes.ndim == 1: + classes = np.array([classes]) + + for k in classes: + self.classes_.append(np.array(k)) + self.n_classes_.append(np.array(k).shape[0]) + + for i in range(n_samples): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(self.classes_[j] == y[i, j])[0][ + 0 + ] + else: + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique( + y[:, k], return_inverse=True + ) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - if self.class_weight is not None: - y_original = np.copy(y) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self._n_classes_ = self.n_classes_ + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) - + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -324,13 +367,19 @@ def _fit( else: # float min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) - if isinstance(self.min_samples_split, numbers.Integral): + if isinstance(self.min_samples_split, str): + if self.min_samples_split == "sqrt": + min_samples_split = max(1, int(np.sqrt(self.n_features_in_))) + elif self.min_samples_split == "log2": + min_samples_split = max(1, int(np.log2(self.n_features_in_))) + elif isinstance(self.min_samples_split, numbers.Integral): min_samples_split = self.min_samples_split else: # float min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) - min_samples_split = max(min_samples_split, 2 * min_samples_leaf) + self.min_samples_split_ = min_samples_split + self.min_samples_leaf_ = min_samples_leaf if isinstance(self.max_features, str): if self.max_features == "sqrt": @@ -351,16 +400,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if 
sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -371,11 +414,66 @@ def _fit( min_weight_leaf = self.min_weight_fraction_leaf * n_samples else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + self.min_weight_leaf_ = min_weight_leaf + + # build the actual tree now with the parameters + self = self._build_tree( + X=X, + y=y, + sample_weight=sample_weight, + missing_values_in_feature_mask=missing_values_in_feature_mask, + min_samples_leaf=min_samples_leaf, + min_weight_leaf=min_weight_leaf, + max_leaf_nodes=max_leaf_nodes, + min_samples_split=min_samples_split, + max_depth=max_depth, + random_state=random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. + """ + n_samples = X.shape[0] # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -388,7 +486,6 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS - splitter = self.splitter if self.monotonic_cst is None: monotonic_cst = None else: @@ -427,8 +524,9 @@ def _fit( # Since self.monotonic_cst encodes constraints on probabilities of the # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.monotonic_cst_ = monotonic_cst - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -457,6 +555,7 @@ def _fit( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -467,8 +566,8 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): @@ -476,7 +575,75 @@ def _fit( self.classes_ = self.classes_[0] self._prune_tree() + return self + + def _update_tree(self, X, y, sample_weight): + # Update tree + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + monotonic_cst = self.monotonic_cst_ + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. 
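An illustrative usage sketch (not part of the patch) for the string options added to ``min_samples_split`` in the hunks above, assuming a build of scikit-learn that includes this diff; ``"sqrt"`` and ``"log2"`` are resolved against ``n_features_in_`` during fitting and the resolved value is exposed as the new ``min_samples_split_`` attribute:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 16))
    y = (X[:, 0] > 0.0).astype(int)

    # "sqrt" resolves to max(1, int(sqrt(n_features_in_))) and "log2" to
    # max(1, int(log2(n_features_in_))); the resolved value is stored on the
    # fitted estimator as ``min_samples_split_``.
    clf = DecisionTreeClassifier(min_samples_split="sqrt", random_state=0).fit(X, y)
    print(clf.min_samples_split_)  # 4 for the 16 features above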
+ n_samples = X.shape[0] + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self._n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + + SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) + + self._prune_tree() return self def _validate_X_predict(self, X, check_input): @@ -528,6 +695,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -554,6 +724,134 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like of length (n_samples,) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. 
+ check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + try: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, method=method + ) + except TypeError: + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -832,6 +1130,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -896,6 +1204,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. 
+ + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -948,7 +1268,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -967,6 +1290,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -983,10 +1307,18 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree classifier from the training set (X, y). Parameters @@ -1010,20 +1342,127 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self + def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None): + """Update a decision tree classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you do. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Must be provided at the first call to partial_fit, can be omitted + in subsequent calls. + + Returns + ------- + self : DecisionTreeClassifier + Fitted estimator. + """ + self._validate_params() + + # validate input parameters + first_call = _check_partial_fit_first_call(self, classes=classes) + + # Fit if no tree exists yet + if first_call: + self.fit( + X, + y, + sample_weight=sample_weight, + check_input=check_input, + classes=classes, + ) + return self + + if check_input: + # Need to validate separately here. + # We can't pass multi_ouput=True because that would allow y to be + # csr. 
+ check_X_params = dict(dtype=DTYPE, accept_sparse="csc") + check_y_params = dict(ensure_2d=False, dtype=None) + X, y = validate_data( + self, + X, + y, + reset=False, + validate_separately=(check_X_params, check_y_params), + ) + if issparse(X): + X.sort_indices() + + if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: + raise ValueError( + "No support for np.int64 index based sparse matrices" + ) + + if X.shape[1] != self.n_features_in_: + raise ValueError( + f"X has {X.shape[1]} features, but {self.__class__.__name__} " + f"is expecting {self.n_features_in_} features as input." + ) + + y = np.atleast_1d(y) + + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) + + check_classification_targets(y) + y = np.copy(y) + + classes = self.classes_ + if self.n_outputs_ == 1: + classes = [classes] + + y_encoded = np.zeros(y.shape, dtype=int) + for i in range(X.shape[0]): + for j in range(self.n_outputs_): + y_encoded[i, j] = np.where(classes[j] == y[i, j])[0][0] + y = y_encoded + + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + self._update_tree(X, y, sample_weight) + + return self + def predict_proba(self, X, check_input=True): """Predict class probabilities of the input samples X. @@ -1231,6 +1670,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1283,6 +1732,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- DecisionTreeClassifier : A decision tree classifier. 
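A hedged sketch of the new ``partial_fit`` on ``DecisionTreeClassifier`` defined above (again assuming this patched build): the first call must receive ``classes`` so the label encoding is fixed once, and later calls update the existing tree through ``_update_tree``:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.RandomState(0)
    X1, X2 = rng.normal(size=(100, 4)), rng.normal(size=(100, 4))
    y1, y2 = (X1[:, 0] > 0).astype(int), (X2[:, 0] > 0).astype(int)

    clf = DecisionTreeClassifier(random_state=0)
    # First call: ``classes`` must list every label that can ever appear,
    # even if some of them only show up in a later batch.
    clf.partial_fit(X1, y1, classes=[0, 1, 2])
    # Later calls may omit ``classes``; the existing tree is updated in place.
    clf.partial_fit(X2, y2)
    print(clf.classes_)       # [0 1 2]
    print(clf.predict(X2[:5]))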
@@ -1331,7 +1792,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1349,6 +1810,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1363,11 +1825,19 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @_fit_context(prefer_skip_nested_validation=True) - def fit(self, X, y, sample_weight=None, check_input=True): + def fit( + self, + X, + y, + sample_weight=None, + check_input=True, + classes=None, + ): """Build a decision tree regressor from the training set (X, y). Parameters @@ -1390,6 +1860,9 @@ def fit(self, X, y, sample_weight=None, check_input=True): Allow to bypass several input checking. Don't use this parameter unless you know what you're doing. + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + Returns ------- self : DecisionTreeRegressor @@ -1401,6 +1874,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): y, sample_weight=sample_weight, check_input=check_input, + classes=classes, ) return self @@ -1583,6 +2057,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1647,6 +2131,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeRegressor : An extremely randomized tree regressor. @@ -1702,6 +2198,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1717,6 +2214,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) @@ -1863,6 +2361,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + monotonic_cst : array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase @@ -1912,6 +2420,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + min_samples_split_ : float + The minimum number of samples needed to split a node in the tree building. + + min_weight_leaf_ : float + The minimum number of weighted samples in a leaf. + + monotonic_cst_ : array-like of int of shape (n_features,) + The monotonicity constraints enforced on each feature. + + min_samples_leaf_ : int + The minimum number of samples needed for a leaf node. + See Also -------- ExtraTreeClassifier : An extremely randomized tree classifier. @@ -1962,6 +2482,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, monotonic_cst=None, ): super().__init__( @@ -1976,6 +2497,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, monotonic_cst=monotonic_cst, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 84d2e800d6a87..ff2a80f939b9f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -2,24 +2,20 @@ # SPDX-License-Identifier: BSD-3-Clause # See _criterion.pyx for implementation details. +from libcpp.vector cimport vector from ..utils._typedefs cimport float64_t, int8_t, intp_t -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const float64_t[:, ::1] y # Values of y cdef const float64_t[:] sample_weight # Sample weights cdef const intp_t[:] sample_indices # Sample indices in X, y cdef intp_t start # samples[start:pos] are the samples in the left node cdef intp_t pos # samples[pos:end] are the samples in the right node cdef intp_t end - cdef intp_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef intp_t n_outputs # Number of outputs cdef intp_t n_samples # Number of samples @@ -30,24 +26,14 @@ cdef class Criterion: cdef float64_t weighted_n_right # Weighted number of samples in the right node cdef float64_t weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. 
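Because the estimator-level ``criterion`` constraints earlier in this diff are widened from ``Hidden(Criterion)`` to ``Hidden(BaseCriterion)``, a pre-built criterion object can still be passed directly. A small sketch relying on the private ``sklearn.tree._criterion`` API, for illustration only and assuming the patched extensions are compiled:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.tree._criterion import Gini

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])

    # Gini subclasses ClassificationCriterion -> Criterion -> BaseCriterion,
    # so an instance satisfies the widened Hidden(BaseCriterion) constraint.
    crit = Gini(1, np.array([2], dtype=np.intp))  # n_outputs=1, n_classes=[2]
    clf = DecisionTreeClassifier(criterion=crit, random_state=0).fit(X, y)
    print(clf.predict(X))  # [0 0 1 1]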
# Methods - cdef int init( - self, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - float64_t weighted_n_samples, - const intp_t[:] sample_indices, - intp_t start, - intp_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, intp_t n_missing) noexcept nogil - cdef int reset(self) except -1 nogil - cdef int reverse_reset(self) except -1 nogil - cdef int update(self, intp_t new_pos) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t reverse_reset(self) except -1 nogil + cdef intp_t update(self, intp_t new_pos) except -1 nogil cdef float64_t node_impurity(self) noexcept nogil cdef void children_impurity( self, @@ -58,13 +44,6 @@ cdef class Criterion: self, float64_t* dest ) noexcept nogil - cdef void clip_node_value( - self, - float64_t* dest, - float64_t lower_bound, - float64_t upper_bound - ) noexcept nogil - cdef float64_t middle_value(self) noexcept nogil cdef float64_t impurity_improvement( self, float64_t impurity_parent, @@ -72,19 +51,55 @@ cdef class Criterion: float64_t impurity_right ) noexcept nogil cdef float64_t proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const float64_t[:, ::1] y # Values of y + cdef intp_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef intp_t init( + self, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] sample_indices, + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, intp_t n_missing) noexcept nogil + cdef bint check_monotonicity( - self, - int8_t monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, + self, + int8_t monotonic_cst, + float64_t lower_bound, + float64_t upper_bound, ) noexcept nogil cdef inline bint _check_monotonicity( - self, - int8_t monotonic_cst, - float64_t lower_bound, - float64_t upper_bound, - float64_t sum_left, - float64_t sum_right, + self, + int8_t monotonic_cst, + float64_t lower_bound, + float64_t upper_bound, + float64_t sum_left, + float64_t sum_right, + ) noexcept nogil + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil + cdef float64_t middle_value(self) noexcept nogil + + cdef void node_samples( + self, + vector[vector[float64_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9f3db83399569..adb13b8929a59 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,27 +1,42 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef float64_t EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. 
+cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -29,68 +44,21 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - float64_t weighted_n_samples, - const intp_t[:] sample_indices, - intp_t start, - intp_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=float64_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=float64_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : float64_t - The total weight of the samples being considered - sample_indices : ndarray, dtype=intp_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : intp_t - The first sample to be used on this node - end : intp_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, intp_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: intp_t - Number of missing values for specific feature. - """ - pass - - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. This method must be implemented by the subclass. """ pass - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. This method must be implemented by the subclass. """ pass - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. This updates the collected statistics by moving sample_indices[pos:new_pos] @@ -146,16 +114,6 @@ cdef class Criterion: """ pass - cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil: - pass - - cdef float64_t middle_value(self) noexcept nogil: - """Compute the middle value of a split for monotonicity constraints - - This method is implemented in ClassificationCriterion and RegressionCriterion. - """ - pass - cdef float64_t proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. 
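The partition invariant described in the ``BaseCriterion`` docstring above can be illustrated with a small pure-Python mock; this is illustration only, since the real bookkeeping lives in the nogil Cython methods:

    import numpy as np

    # Node-local view kept by the criterion: sample_indices[start:end] are the
    # samples of the current node, partitioned around ``pos`` into the two
    # children, with weighted statistics accumulated per side.
    sample_indices = np.array([7, 2, 5, 9, 0, 4])
    sample_weight = np.ones(10)
    start, pos, end = 0, 4, 6

    left = sample_indices[start:pos]    # samples[start:pos] -> left child
    right = sample_indices[pos:end]     # samples[pos:end]  -> right child
    weighted_n_left = sample_weight[left].sum()
    weighted_n_right = sample_weight[right].sum()
    print(weighted_n_left, weighted_n_right)  # 4.0 2.0 with unit weights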
@@ -210,6 +168,95 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : intp_t + The index of the first sample to be used on computation of criteria of the current node. + end : intp_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef intp_t init( + self, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + float64_t weighted_n_samples, + const intp_t[:] sample_indices, + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=float64_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=float64_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : float64_t + The total weight of the samples being considered + sample_indices : ndarray, dtype=intp_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, intp_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: intp_t + Number of missing values for specific feature. + """ + pass + + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil: + pass + + cdef float64_t middle_value(self) noexcept nogil: + """Compute the middle value of a split for monotonicity constraints + + This method is implemented in ClassificationCriterion and RegressionCriterion. + """ + pass + cdef bint check_monotonicity( self, cnp.int8_t monotonic_cst, @@ -243,6 +290,33 @@ cdef class Criterion: cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[float64_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[float64_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef intp_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, float64_t[:, ::1] sum_1, @@ -336,20 +410,15 @@ cdef class ClassificationCriterion(Criterion): return (type(self), (self.n_outputs, np.asarray(self.n_classes)), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -364,18 +433,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=intp_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. - start : intp_t - The first sample to use in the mask - end : intp_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef intp_t i @@ -388,12 +463,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(float64_t)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -404,7 +479,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -439,7 +513,7 @@ cdef class ClassificationCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -456,7 +530,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -473,7 +547,7 @@ cdef class ClassificationCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. 
Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -687,13 +761,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -811,7 +882,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -849,28 +919,33 @@ cdef class RegressionCriterion(Criterion): def __reduce__(self): return (type(self), (self.n_outputs, self.n_samples), self.__getstate__()) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end, ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0. cdef intp_t i @@ -883,10 +958,10 @@ cdef class RegressionCriterion(Criterion): memset(&self.sum_total[0], 0, self.n_outputs * sizeof(float64_t)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -898,7 +973,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -936,7 +1010,7 @@ cdef class RegressionCriterion(Criterion): self.weighted_n_missing += w - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start.""" self.pos = self.start _move_sums_regression( @@ -949,7 +1023,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end.""" self.pos = self.end _move_sums_regression( @@ -962,7 +1036,7 @@ cdef class RegressionCriterion(Criterion): ) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left.""" cdef const float64_t[:] sample_weight = self.sample_weight cdef const intp_t[:] sample_indices = self.sample_indices @@ -1066,7 +1140,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
- MSE = var_left + var_right """ @@ -1227,31 +1300,39 @@ cdef class MAE(RegressionCriterion): self.left_child_ptr = cnp.PyArray_DATA(self.left_child) self.right_child_ptr = cnp.PyArray_DATA(self.right_child) - cdef int init( + cdef intp_t init( self, const float64_t[:, ::1] y, const float64_t[:] sample_weight, float64_t weighted_n_samples, const intp_t[:] sample_indices, - intp_t start, - intp_t end, ) except -1 nogil: """Initialize the criterion. This initializes the criterion at node sample_indices[start:end] and children sample_indices[start:start] and sample_indices[start:end]. """ - cdef intp_t i, p, k - cdef float64_t w = 1.0 - # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + intp_t start, + intp_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef intp_t i, p, k + cdef float64_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. cdef void** left_child = self.left_child_ptr @@ -1262,10 +1343,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1280,7 +1361,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, intp_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1289,7 +1369,7 @@ cdef class MAE(RegressionCriterion): with gil: raise ValueError("missing values is not supported for MAE.") - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the criterion at pos=start. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1320,7 +1400,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int reverse_reset(self) except -1 nogil: + cdef intp_t reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1348,7 +1428,7 @@ cdef class MAE(RegressionCriterion): weight) return 0 - cdef int update(self, intp_t new_pos) except -1 nogil: + cdef intp_t update(self, intp_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -1571,6 +1651,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 6726d0c67bfb1..e8e45af79fb28 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -11,9 +11,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 42c6c6d935a9c..393c81fd8b9a9 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -2,12 +2,11 @@ # SPDX-License-Identifier: BSD-3-Clause # See _splitter.pyx for details. +from libcpp.vector cimport vector -from ..utils._typedefs cimport ( - float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t -) -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport ParentInfo +from ..utils._typedefs cimport float32_t, float64_t, intp_t, int8_t, int32_t, uint8_t, uint32_t cdef struct SplitRecord: @@ -25,16 +24,17 @@ cdef struct SplitRecord: uint8_t missing_go_to_left # Controls if missing values go to the left node. intp_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion - cdef public intp_t max_features # Number of features to test - cdef public intp_t min_samples_leaf # Min samples in a leaf + cdef public intp_t max_features # Number of features to test + cdef public intp_t min_samples_leaf # Min samples in a leaf cdef public float64_t min_weight_leaf # Minimum weight in a leaf cdef object random_state # Random state @@ -51,14 +51,6 @@ cdef class Splitter: cdef intp_t start # Start position for the current node cdef intp_t end # End position for the current node - cdef const float64_t[:, ::1] y - # Monotonicity constraints for each feature. - # The encoding is as follows: - # -1: monotonic decrease - # 0: no constraint - # +1: monotonic increase - cdef const int8_t[:] monotonic_cst - cdef bint with_monotonic_cst cdef const float64_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -78,29 +70,60 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
# Methods - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const uint8_t[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, intp_t start, intp_t end, float64_t* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, ParentInfo* parent, SplitRecord* split, ) except -1 nogil - cdef void node_value(self, float64_t* dest) noexcept nogil + cdef float64_t node_impurity(self) noexcept nogil + cdef intp_t pointer_size(self) noexcept nogil - cdef void clip_node_value(self, float64_t* dest, float64_t lower_bound, float64_t upper_bound) noexcept nogil +cdef class Splitter(BaseSplitter): + """Base class for supervised splitters.""" - cdef float64_t node_impurity(self) noexcept nogil + cdef public Criterion criterion # Impurity criterion + cdef const float64_t[:, ::1] y + + # Monotonicity constraints for each feature. + # The encoding is as follows: + # -1: monotonic decrease + # 0: no constraint + # +1: monotonic increase + cdef const int8_t[:] monotonic_cst + cdef bint with_monotonic_cst + + cdef int init( + self, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + const uint8_t[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[float64_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil + + cdef void clip_node_value( + self, + float64_t* dest, + float64_t lower_bound, + float64_t upper_bound + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index b557a4d1c6300..d69bfa067c92c 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -55,13 +55,94 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. + + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset( + self, + intp_t start, + intp_t end, + float64_t* weighted_n_node_samples + ) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. 
+ + Parameters + ---------- + start : intp_t + The index of the first sample to consider + end : intp_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=float64_t pointer + The total weight of those samples + """ + pass + + cdef int node_split( + self, + ParentInfo* parent, + SplitRecord* split, + ) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + + Parameters + ---------- + impurity : float64_t + The impurity of the current node. + split : SplitRecord pointer + A pointer to a memory-allocated SplitRecord object which will be filled with the + split chosen. + lower_bound : float64_t + The lower bound of the monotonic constraint if used. + upper_bound : float64_t + The upper bound of the monotonic constraint if used. + """ + pass + + cdef void node_value(self, float64_t* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef float64_t node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef intp_t pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__( self, Criterion criterion, @@ -70,6 +151,7 @@ cdef class Splitter: float64_t min_weight_leaf, object random_state, const int8_t[:] monotonic_cst, + *argv ): """ Parameters @@ -97,7 +179,6 @@ cdef class Splitter: Monotonicity constraints """ - self.criterion = criterion self.n_samples = 0 @@ -110,19 +191,13 @@ cdef class Splitter: self.monotonic_cst = monotonic_cst self.with_monotonic_cst = monotonic_cst is not None - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, self.min_samples_leaf, self.min_weight_leaf, self.random_state, - self.monotonic_cst), self.__getstate__()) + self.monotonic_cst.base if self.monotonic_cst is not None else None), self.__getstate__()) cdef int init( self, @@ -156,7 +231,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. 
""" - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef intp_t n_samples = X.shape[0] @@ -194,8 +268,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset( @@ -222,14 +309,7 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 @@ -260,13 +340,67 @@ cdef class Splitter: self.criterion.clip_node_value(dest, lower_bound, upper_bound) + cdef void node_samples(self, vector[vector[float64_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef float64_t node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef inline bint check_presplit_conditions( + self, + SplitRecord* current_split, + intp_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + + Returns 1 if not a valid split, and 0 if it is. + """ + cdef intp_t min_samples_leaf = self.min_samples_leaf + cdef intp_t end_non_missing = self.end - n_missing + cdef intp_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef inline bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + + Returns 1 if condition is not met, and 0 if it is. + """ + cdef float64_t min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + -cdef inline int node_split_best( +cdef inline intp_t node_split_best( Splitter splitter, Partitioner partitioner, Criterion criterion, @@ -396,7 +530,6 @@ cdef inline int node_split_best( criterion.init_missing(n_missing) # initialize even when n_missing == 0 # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. 
@@ -417,18 +550,12 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if monotonicity constraints are not satisfied @@ -444,8 +571,7 @@ cdef inline int node_split_best( continue # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -470,6 +596,13 @@ cdef inline int node_split_best( # test time, we send missing values to the branch that contains # the most samples during training time. if n_missing == 0: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + current_split.missing_go_to_left = n_left > n_right else: current_split.missing_go_to_left = missing_go_to_left @@ -569,8 +702,6 @@ cdef inline int node_split_random( cdef intp_t n_features = splitter.n_features cdef intp_t max_features = splitter.max_features - cdef intp_t min_samples_leaf = splitter.min_samples_leaf - cdef float64_t min_weight_leaf = splitter.min_weight_leaf cdef uint32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -706,7 +837,7 @@ cdef inline int node_split_random( n_right = end_non_missing - current_split.pos + n_missing # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if splitter.check_presplit_conditions(¤t_split, n_missing, missing_go_to_left) == 1: continue # Evaluate split @@ -715,23 +846,22 @@ cdef inline int node_split_random( criterion.reset() criterion.update(current_split.pos) - # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): - continue - # Reject if monotonicity constraints are not satisfied if ( - with_monotonic_cst and - monotonic_cst[current_split.feature] != 0 and - not criterion.check_monotonicity( - monotonic_cst[current_split.feature], - lower_bound, - upper_bound, - ) + with_monotonic_cst and + monotonic_cst[current_split.feature] != 0 and + not criterion.check_monotonicity( + monotonic_cst[current_split.feature], + lower_bound, + upper_bound, + ) ): continue + # Reject if min_weight_leaf is not satisfied + if splitter.check_postsplit_conditions() == 1: + continue + current_proxy_improvement = criterion.proxy_impurity_improvement() if current_proxy_improvement > best_proxy_improvement: @@ -805,9 +935,9 @@ cdef class BestSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + SplitRecord* split, ) except -1 nogil: return node_split_best( self, @@ -833,9 +963,9 @@ cdef class BestSparseSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + 
SplitRecord* split, ) except -1 nogil: return node_split_best( self, @@ -861,9 +991,9 @@ cdef class RandomSplitter(Splitter): ) cdef int node_split( - self, - ParentInfo* parent_record, - SplitRecord* split, + self, + ParentInfo* parent_record, + SplitRecord* split, ) except -1 nogil: return node_split_random( self, diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2cadca4564a87..47a3cd2bd5c9d 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -4,12 +4,15 @@ # See _tree.pyx for details. import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -23,7 +26,6 @@ cdef struct Node: float64_t weighted_n_node_samples # Weighted number of samples at the node uint8_t missing_go_to_left # Whether features have missing values - cdef struct ParentInfo: # Structure to store information about the parent of a node # This is passed to the splitter, to provide information about the previous split @@ -33,16 +35,7 @@ cdef struct ParentInfo: float64_t impurity # the impurity of the parent intp_t n_constant_features # the number of constant features found in parent -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public intp_t n_features # Number of features in X - cdef intp_t* n_classes # Number of classes in y[:, k] - cdef public intp_t n_outputs # Number of outputs in y - cdef public intp_t max_n_classes # max(n_classes) +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. 
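
The pxd refactor starting here splits the former monolithic `Tree` into a `BaseTree` that owns the node/value buffers and traversal plumbing, leaving feature evaluation as an overridable hook (`_compute_feature`, declared further down in this pxd). A loose Python analogy of that pattern, purely illustrative and not part of the diff; the class and variable names below are made up, and the "oblique" motivation is only implied by later comments in the diff:

import numpy as np

class BaseTreeSketch:
    """Owns the traversal logic; how a node reads a feature is left to subclasses."""
    def _compute_feature(self, X, sample_index, node):
        raise NotImplementedError

    def goes_left(self, X, sample_index, node):
        # Shared by all tree flavours: compare the computed feature to the threshold.
        return self._compute_feature(X, sample_index, node) <= node["threshold"]

class AxisAlignedTreeSketch(BaseTreeSketch):
    def _compute_feature(self, X, sample_index, node):
        # Mirrors the axis-aligned case in the diff: read a single column of X.
        return X[sample_index, node["feature"]]

X = np.array([[0.5, 2.0]])
node = {"feature": 1, "threshold": 1.0}
print(AxisAlignedTreeSketch().goes_left(X, 0, node))  # False -> sample goes right
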
@@ -54,19 +47,33 @@ cdef class Tree: cdef intp_t value_stride # = n_outputs * max_n_classes # Methods - cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - uint8_t missing_go_to_left) except -1 nogil + cdef intp_t _add_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil cdef int _resize(self, intp_t capacity) except -1 nogil cdef int _resize_c(self, intp_t capacity=*) except -1 nogil - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef intp_t _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +85,64 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil + cdef float32_t _compute_feature( + self, + const float32_t[:, :] X_ndarray, + intp_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout + cdef public intp_t n_features # Number of features in X + cdef intp_t* n_classes # Number of classes in y[:, k] + cdef public intp_t n_outputs # Number of outputs in y + cdef public intp_t max_n_classes # max(n_classes) + + # Enables the use of the tree to store distributions of the output to allow + # arbitrary usage of the leaves. This is used in the quantile + # estimators for example.
+ # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[intp_t, vector[vector[float64_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, intp_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -93,11 +158,23 @@ cdef class TreeBuilder: cdef Splitter splitter # Splitting algorithm - cdef intp_t min_samples_split # Minimum number of samples in an internal node - cdef intp_t min_samples_leaf # Minimum number of samples in a leaf - cdef float64_t min_weight_leaf # Minimum weight in a leaf - cdef intp_t max_depth # Maximal tree depth - cdef float64_t min_impurity_decrease # Impurity threshold for early stopping + cdef intp_t min_samples_split # Minimum number of samples in an internal node + cdef intp_t min_samples_leaf # Minimum number of samples in a leaf + cdef float64_t min_weight_leaf # Minimum weight in a leaf + cdef intp_t max_depth # Maximal tree depth + cdef float64_t min_impurity_decrease # Impurity threshold for early stopping + cdef cnp.ndarray initial_roots # Leaf nodes for streaming updates + + cdef uint8_t store_leaf_values # Whether to store leaf values + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=*, + const uint8_t[::1] missing_values_in_feature_mask=*, + ) cpdef build( self, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 7e6946a718a81..943d5e6148538 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -2,22 +2,23 @@ # SPDX-License-Identifier: BSD-3-Clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap from libcpp.stack cimport stack from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.sparse import issparse @@ -26,12 +27,13 @@ from scipy.sparse import csr_matrix from ._utils cimport safe_realloc from ._utils cimport sizet_ptr_to_ndarray + cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, - int nd, cnp.npy_intp* dims, + intp_t nd, cnp.npy_intp* dims, cnp.npy_intp* strides, - void* data, int flags, object obj) - int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) + void* data, intp_t flags, object obj) + intp_t PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj) # ============================================================================= # Types and constants @@ -74,6 +76,17 @@ cdef inline void _init_parent_record(ParentInfo* record) noexcept nogil: cdef class TreeBuilder: """Interface for different tree building strategies.""" + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=None, 
+ const uint8_t[::1] missing_values_in_feature_mask=None, + ): + """Build a decision tree from the training set (X, y).""" + pass + cpdef build( self, Tree tree, @@ -124,6 +137,7 @@ cdef class TreeBuilder: return X, y, sample_weight + # Depth first builder --------------------------------------------------------- # A record on the stack for depth-first tree growing cdef struct StackRecord: @@ -137,18 +151,104 @@ cdef struct StackRecord: float64_t lower_bound float64_t upper_bound + cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, intp_t min_samples_split, - intp_t min_samples_leaf, float64_t min_weight_leaf, - intp_t max_depth, float64_t min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + float64_t min_impurity_decrease, + uint8_t store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(DepthFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) + + cpdef initialize_node_queue( + self, + Tree tree, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight=None, + const uint8_t[::1] missing_values_in_feature_mask=None, + ): + """Initialize a list of roots""" + X, y, sample_weight = self._check_input(X, y, sample_weight) + + # organize samples by decision paths + paths = tree.decision_path(X) + cdef intp_t PARENT + cdef intp_t CHILD + cdef intp_t i + false_roots = {} + X_copy = {} + y_copy = {} + for i in range(X.shape[0]): + # collect depths from the node paths + depth_i = paths[i].indices.shape[0] - 1 + PARENT = depth_i - 1 + CHILD = depth_i + + # find leaf node's & their parent node's IDs + if PARENT < 0: + parent_i = 0 + else: + parent_i = paths[i].indices[PARENT] + child_i = paths[i].indices[CHILD] + left = 0 + if tree.children_left[parent_i] == child_i: + left = 1 # leaf node is left child + + # organize samples by the leaf they fall into (false root) + # leaf nodes are marked by parent node and + # their relative position (left or right child) + if (parent_i, left) in false_roots: + false_roots[(parent_i, left)][0] += 1 + X_copy[(parent_i, left)].append(X[i]) + y_copy[(parent_i, left)].append(y[i]) + else: + false_roots[(parent_i, left)] = [1, depth_i] + X_copy[(parent_i, left)] = [X[i]] + y_copy[(parent_i, left)] = [y[i]] + + X_list = [] + y_list = [] + + # reorder the samples according to parent node IDs + for key, value in reversed(sorted(X_copy.items())): + X_list = X_list + value + y_list = y_list + y_copy[key] + cdef object X_new = np.array(X_list) + cdef cnp.ndarray y_new = np.array(y_list) + + # initialize the splitter using sorted samples + cdef Splitter splitter = self.splitter + splitter.init(X_new, y_new, sample_weight, missing_values_in_feature_mask) + + # convert dict to numpy array and store value + self.initial_roots = np.array(list(false_roots.items())) cpdef build( self, @@ -163,16 +263,6 @@ cdef 
class DepthFirstTreeBuilder(TreeBuilder): # check input X, y, sample_weight = self._check_input(X, y, sample_weight) - # Initial capacity - cdef intp_t init_capacity - - if tree.max_depth <= 10: - init_capacity = (2 ** (tree.max_depth + 1)) - 1 - else: - init_capacity = 2047 - - tree._resize(init_capacity) - # Parameters cdef Splitter splitter = self.splitter cdef intp_t max_depth = self.max_depth @@ -181,36 +271,75 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef intp_t min_samples_split = self.min_samples_split cdef float64_t min_impurity_decrease = self.min_impurity_decrease - # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + cdef uint8_t store_leaf_values = self.store_leaf_values + cdef cnp.ndarray initial_roots = self.initial_roots + + # Initial capacity + cdef intp_t init_capacity + cdef bint first = 0 + if initial_roots is None: + # Recursive partition (without actual recursion) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + + if tree.max_depth <= 10: + init_capacity = (2 ** (tree.max_depth + 1)) - 1 + else: + init_capacity = 2047 + + tree._resize(init_capacity) + first = 1 + else: + # convert numpy array back to dict + false_roots = {} + for key_value_pair in initial_roots: + false_roots[tuple(key_value_pair[0])] = key_value_pair[1] + + # reset the root array + self.initial_roots = None - cdef intp_t start - cdef intp_t end + cdef intp_t start = 0 + cdef intp_t end = 0 cdef intp_t depth cdef intp_t parent cdef bint is_left cdef intp_t n_node_samples = splitter.n_samples cdef float64_t weighted_n_node_samples - cdef SplitRecord split cdef intp_t node_id + cdef float64_t right_child_min, left_child_min, right_child_max, left_child_max + + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) cdef float64_t middle_value - cdef float64_t left_child_min - cdef float64_t left_child_max - cdef float64_t right_child_min - cdef float64_t right_child_max cdef bint is_leaf - cdef bint first = 1 - cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t max_depth_seen = -1 if first else tree.max_depth + + cdef intp_t rc = 0 cdef stack[StackRecord] builder_stack + cdef stack[StackRecord] update_stack cdef StackRecord stack_record cdef ParentInfo parent_record _init_parent_record(&parent_record) - with nogil: + if not first: + # push reached leaf nodes onto stack + for key, value in reversed(sorted(false_roots.items())): + end += value[0] + update_stack.push({ + "start": start, + "end": end, + "depth": value[1], + "parent": key[0], + "is_left": key[1], + "impurity": tree.impurity[key[0]], + "n_constant_features": 0, + "lower_bound": -INFINITY, + "upper_bound": INFINITY, + }) + start += value[0] + else: # push root node onto stack builder_stack.push({ "start": 0, @@ -224,6 +353,137 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "upper_bound": INFINITY, }) + with nogil: + while not update_stack.empty(): + stack_record = update_stack.top() + update_stack.pop() + + start = stack_record.start + end = stack_record.end + depth = stack_record.depth + parent = stack_record.parent + is_left = stack_record.is_left + parent_record.impurity = stack_record.impurity + parent_record.n_constant_features = stack_record.n_constant_features + parent_record.lower_bound = stack_record.lower_bound + parent_record.upper_bound = stack_record.upper_bound + + n_node_samples = end - start + splitter.node_reset(start, end, &weighted_n_node_samples) + + is_leaf = (depth >= max_depth 
or + n_node_samples < min_samples_split or + n_node_samples < 2 * min_samples_leaf or + weighted_n_node_samples < 2 * min_weight_leaf) + + if first: + parent_record.impurity = splitter.node_impurity() + first = 0 + + # impurity == 0 with tolerance due to rounding errors + is_leaf = is_leaf or parent_record.impurity <= EPSILON + + if not is_leaf: + splitter.node_split( + &parent_record, + split_ptr, + ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + + # If EPSILON=0 in the below comparison, float precision + # issues stop splitting, producing trees that are + # dissimilar to v0.18 + is_leaf = (is_leaf or split.pos >= end or + (split.improvement + EPSILON < + min_impurity_decrease)) + + node_id = tree._update_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, + n_node_samples, weighted_n_node_samples, + split.missing_go_to_left) + + if node_id == INTPTR_MAX: + rc = -1 + break + + # Store value for all nodes, to facilitate tree/model + # inspection and interpretation + splitter.node_value(tree.value + node_id * tree.value_stride) + if splitter.with_monotonic_cst: + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) + + if not is_leaf: + if ( + not splitter.with_monotonic_cst or + splitter.monotonic_cst[split.feature] == 0 + ): + # Split on a feature with no monotonicity constraint + + # Current bounds must always be propagated to both children. + # If a monotonic constraint is active, bounds are used in + # node value clipping. + left_child_min = right_child_min = parent_record.lower_bound + left_child_max = right_child_max = parent_record.upper_bound + elif splitter.monotonic_cst[split.feature] == 1: + # Split on a feature with monotonic increase constraint + left_child_min = parent_record.lower_bound + right_child_max = parent_record.upper_bound + + # Lower bound for right child and upper bound for left child + # are set to the same value. + middle_value = splitter.criterion.middle_value() + right_child_min = middle_value + left_child_max = middle_value + else: # i.e. splitter.monotonic_cst[split.feature] == -1 + # Split on a feature with monotonic decrease constraint + right_child_min = parent_record.lower_bound + left_child_max = parent_record.upper_bound + + # Lower bound for left child and upper bound for right child + # are set to the same value. 
+ middle_value = splitter.criterion.middle_value() + left_child_min = middle_value + right_child_max = middle_value + + # Push right child on stack + builder_stack.push({ + "start": split.pos, + "end": end, + "depth": depth + 1, + "parent": node_id, + "is_left": 0, + "impurity": split.impurity_right, + "n_constant_features": parent_record.n_constant_features, + "lower_bound": right_child_min, + "upper_bound": right_child_max, + }) + + # Push left child on stack + builder_stack.push({ + "start": start, + "end": split.pos, + "depth": depth + 1, + "parent": node_id, + "is_left": 1, + "impurity": split.impurity_left, + "n_constant_features": parent_record.n_constant_features, + "lower_bound": left_child_min, + "upper_bound": left_child_max, + }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) + + if depth > max_depth_seen: + max_depth_seen = depth + while not builder_stack.empty(): stack_record = builder_stack.top() builder_stack.pop() @@ -248,7 +508,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if first: parent_record.impurity = splitter.node_impurity() - first = 0 + first=0 # impurity == 0 with tolerance due to rounding errors is_leaf = is_leaf or parent_record.impurity <= EPSILON @@ -256,8 +516,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( &parent_record, - &split, + split_ptr, ) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -265,10 +530,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, parent_record.impurity, - n_node_samples, weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + parent_record.impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -278,7 +542,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): # inspection and interpretation splitter.node_value(tree.value + node_id * tree.value_stride) if splitter.with_monotonic_cst: - splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound) + splitter.clip_node_value( + tree.value + node_id * tree.value_stride, + parent_record.lower_bound, + parent_record.upper_bound + ) if not is_leaf: if ( @@ -338,6 +606,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "lower_bound": left_child_min, "upper_bound": left_child_max, }) + elif store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -347,10 +618,13 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() - # Best first builder ---------------------------------------------------------- cdef struct FrontierRecord: # Record of information of a Node, the frontier for a split. 
Those records are @@ -393,10 +667,18 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef intp_t max_leaf_nodes - def __cinit__(self, Splitter splitter, intp_t min_samples_split, - intp_t min_samples_leaf, min_weight_leaf, - intp_t max_depth, intp_t max_leaf_nodes, - float64_t min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + intp_t min_samples_split, + intp_t min_samples_leaf, + float64_t min_weight_leaf, + intp_t max_depth, + intp_t max_leaf_nodes, + float64_t min_impurity_decrease, + uint8_t store_leaf_values=False, + cnp.ndarray initial_roots=None, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -404,6 +686,20 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values + self.initial_roots = initial_roots + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return(BestFirstTreeBuilder, (self.splitter, + self.min_samples_split, + self.min_samples_leaf, + self.min_weight_leaf, + self.max_depth, + self.max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + self.initial_roots)) cpdef build( self, @@ -422,6 +718,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef Splitter splitter = self.splitter cdef intp_t max_leaf_nodes = self.max_leaf_nodes + cdef uint8_t store_leaf_values = self.store_leaf_values + # Recursive partition (without actual recursion) splitter.init(X, y, sample_weight, missing_values_in_feature_mask) @@ -438,7 +736,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_split_nodes = max_leaf_nodes - 1 cdef bint is_leaf cdef intp_t max_depth_seen = -1 - cdef int rc = 0 + cdef intp_t rc = 0 cdef Node* node cdef ParentInfo parent_record @@ -480,6 +778,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -573,7 +874,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if rc == -1: raise MemoryError() - cdef inline int _add_split_node( + cdef inline intp_t _add_split_node( self, Splitter splitter, Tree tree, @@ -588,6 +889,13 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split + + # Note: we create a <*SplitRecord> pointer here in order to allow subclasses + # to know what kind of SplitRecord to use. In some cases, ObliqueSplitRecord + # might be used. The split pointer here knows the size of the underlying Record + # because the subclassed splitter will define "pointer_size" accordingly. 
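
The comment just above is the crux of the `SplitRecord*` changes in this file: callers allocate `splitter.pointer_size()` bytes (see the `malloc(splitter.pointer_size())` call that follows) so a subclass splitter can return a larger record through the same `node_split` interface. A rough ctypes illustration of why the allocation size must come from the splitter rather than a fixed `sizeof(SplitRecord)`; the field layouts below are hypothetical and only sketch the idea:

import ctypes

class SplitRecord(ctypes.Structure):
    # Hypothetical subset of the base record's fields.
    _fields_ = [("feature", ctypes.c_ssize_t),
                ("pos", ctypes.c_ssize_t),
                ("threshold", ctypes.c_double)]

class ObliqueSplitRecord(ctypes.Structure):
    # Hypothetical extension: same leading fields plus extra payload.
    _fields_ = [("feature", ctypes.c_ssize_t),
                ("pos", ctypes.c_ssize_t),
                ("threshold", ctypes.c_double),
                ("n_proj", ctypes.c_ssize_t)]

# A builder that always allocated sizeof(SplitRecord) would truncate the
# subclass's record; asking the splitter for its own record size avoids that.
print(ctypes.sizeof(SplitRecord), ctypes.sizeof(ObliqueSplitRecord))
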
+ cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef intp_t node_id cdef intp_t n_node_samples cdef float64_t min_impurity_decrease = self.min_impurity_decrease @@ -613,8 +921,12 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if not is_leaf: splitter.node_split( parent_record, - &split, + split_ptr, ) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -624,7 +936,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, parent_record.impurity, + split_ptr, parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -660,236 +972,50 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = parent_record.impurity res.impurity_right = parent_record.impurity + free(split_ptr) return 0 # ============================================================================= # Tree # ============================================================================= +cdef class BaseTree: + """Base class for Cython tree models. -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : intp_t - The number of nodes (internal nodes + leaves) in the tree. - - capacity : intp_t - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. + Downstream classes must implement methods to actually traverse the tree. + """ + cdef int _resize( + self, + intp_t capacity + ) except -1 nogil: + """Resize all inner arrays to `capacity`, if `capacity` == -1, then + double the size of the inner arrays. - max_depth : intp_t - The depth of the tree, i.e. the maximum depth of its leaves. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if self._resize_c(capacity) != 0: + # Acquire gil only if we need to raise + with gil: + raise MemoryError() - children_left : array of intp_t, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. + cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: + """Guts of _resize - children_right : array of intp_t, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + """ + if capacity == self.capacity and self.nodes != NULL: + return 0 - n_leaves : intp_t - Number of leaves in the tree. 
+ if capacity == INTPTR_MAX: + if self.capacity == 0: + capacity = 3 # default initial value + else: + capacity = 2 * self.capacity - feature : array of intp_t, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of float64_t, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of float64_t, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of float64_t, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of intp_t, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of float64_t, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. - - missing_go_to_left : array of bool, shape [node_count] - missing_go_to_left[i] holds a bool indicating whether or not there were - missing values at node i. - """ - # Wrap for outside world. - # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): - """Constructor.""" - cdef intp_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef intp_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - 
sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(float64_t)) - - cdef int _resize(self, intp_t capacity) except -1 nogil: - """Resize all inner arrays to `capacity`, if `capacity` == -1, then - double the size of the inner arrays. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if self._resize_c(capacity) != 0: - # Acquire gil only if we need to raise - with gil: - raise MemoryError() - - cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil: - """Guts of _resize - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - """ - if capacity == self.capacity and self.nodes != NULL: - return 0 - - if capacity == INTPTR_MAX: - if self.capacity == 0: - capacity = 3 # default initial value - else: - capacity = 2 * self.capacity - - safe_realloc(&self.nodes, capacity) - safe_realloc(&self.value, capacity * self.value_stride) + safe_realloc(&self.nodes, capacity) + safe_realloc(&self.value, capacity * self.value_stride) if capacity > self.capacity: # value memory is initialised to 0 to enable classifier argmax @@ -906,16 +1032,100 @@ cdef class Tree: self.capacity = capacity return 0 - cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, - float64_t weighted_n_node_samples, - uint8_t missing_go_to_left) except -1 nogil: + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + node_id : intp_t + The index of the node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node, + intp_t node_id, + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. 
+ node : Node* + The pointer to the node that will hold the leaf node. + node_id : intp_t + The index of the node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef float32_t _compute_feature( + self, + const float32_t[:, :] X_ndarray, + intp_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef float32_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef intp_t _add_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. - Returns (size_t)(-1) on error. + Parameters + ---------- + parent : intp_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : float64_t + The impurity of the node to be added. + n_node_samples : intp_t + The number of samples in the node. + weighted_n_node_samples : float64_t + The weight of the samples in the node. + + Returns (intp_t)(-1) on error. """ cdef intp_t node_id = self.node_count @@ -935,28 +1145,62 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out + cdef inline intp_t _update_node( + self, + intp_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + float64_t impurity, + intp_t n_node_samples, + float64_t weighted_n_node_samples, + uint8_t missing_go_to_left + ) except -1 nogil: + """Update a node on the tree. + + The updated node remains on the same position. + + Returns (intp_t)(-1) on error. 
+ """ + cdef intp_t node_id + if is_left: + node_id = self.nodes[parent].left_child + else: + node_id = self.nodes[parent].right_child + + if node_id >= self.capacity: + if self._resize_c() != 0: + return INTPTR_MAX + + cdef Node* node = &self.nodes[node_id] + node.impurity = impurity + node.n_node_samples = n_node_samples + node.weighted_n_node_samples = weighted_n_node_samples + + if is_leaf: + if self._set_leaf_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + else: + if self._set_split_node(split_node, node, node_id) != 1: + with gil: + raise RuntimeError + node.missing_go_to_left = missing_go_to_left + + return node_id cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" @@ -991,9 +1235,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -1061,7 +1306,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -1110,6 +1354,9 @@ cdef class Tree: cdef Node* node = NULL cdef intp_t i = 0 + # the feature index + cdef float64_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -1121,7 +1368,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1250,27 +1499,22 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef float64_t normalizer = 0. + cdef intp_t i = 0 - cdef cnp.float64_t[:] importances = np.zeros(self.n_features) + cdef float64_t[:] importances = np.zeros(self.n_features) with nogil: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1284,46 +1528,29 @@ cdef class Tree: for i in range(self.n_features): importances[i] /= normalizer - return np.asarray(importances) - - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. - """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr - - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. 
- - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + return np.asarray(importances) + + cdef void _compute_feature_importances( + self, + float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. + + Wrapped in a private function to allow subclassing that + computes feature importances. + """ + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right + + left = &nodes[node.left_child] + right = &nodes[node.right_child] + + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, float32_t[:, ::1] X, const intp_t[::1] target_features, @@ -1432,6 +1659,286 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : intp_t + The number of nodes (internal nodes + leaves) in the tree. + + capacity : intp_t + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : intp_t + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of intp_t, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of intp_t, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of intp_t, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of float64_t, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of float64_t, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of float64_t, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of intp_t, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of float64_t, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. 
+ + leaf_node_samples : dict of node id to numpy array of shapes (n_samples_node, n_features) + A dictionary mapping leaf nodes to the samples of data that are used + to fit the prediction at each leaf. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, *args): + """Constructor.""" + cdef intp_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef intp_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[intp_t, vector[vector[float64_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for 
unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(float64_t)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, intp_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef intp_t i, j + cdef intp_t n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[float64_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[intp_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef intp_t i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1610,7 +2117,7 @@ cdef class _PathFinder(_CCPPruneController): cdef float64_t[:] impurities cdef uint32_t count - def __cinit__(self, intp_t node_count): + def __cinit__(self, intp_t node_count): self.ccp_alphas = np.zeros(shape=(node_count), dtype=np.float64) self.impurities = np.zeros(shape=(node_count), dtype=np.float64) self.count = 0 @@ -1907,7 +2414,7 @@ cdef void _build_pruned_tree( # value_stride for original tree and new tree are the same intp_t value_stride = orig_tree.value_stride intp_t max_depth_seen = -1 - int rc = 0 + intp_t rc = 0 Node* node float64_t* orig_value_ptr float64_t* new_value_ptr @@ -1915,6 +2422,8 @@ cdef void _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1940,8 +2449,12 @@ cdef void _build_pruned_tree( rc = -2 break + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index bc1d7668187d7..8094537f885d4 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -3,10 +3,13 @@ # See _utils.pyx for details. 
+import numpy as np cimport numpy as cnp -from ._tree cimport Node +cnp.import_array() + from ..neighbors._quad_tree cimport Cell from ..utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t +from ._tree cimport Node cdef enum: @@ -35,7 +38,7 @@ ctypedef fused realloc_ptr: (Cell*) (Node**) -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil cdef cnp.ndarray sizet_ptr_to_ndarray(intp_t* data, intp_t size) @@ -66,12 +69,12 @@ cdef class WeightedPQueue: cdef WeightedPQueueRecord* array_ cdef bint is_empty(self) noexcept nogil - cdef int reset(self) except -1 nogil + cdef intp_t reset(self) except -1 nogil cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil cdef float64_t get_weight_from_index(self, intp_t index) noexcept nogil cdef float64_t get_value_from_index(self, intp_t index) noexcept nogil @@ -87,14 +90,14 @@ cdef class WeightedMedianCalculator: cdef intp_t k cdef float64_t sum_w_0_k # represents sum(weights[0:k]) = w[0] + w[1] + ... + w[k-1] cdef intp_t size(self) noexcept nogil - cdef int push(self, float64_t data, float64_t weight) except -1 nogil - cdef int reset(self) except -1 nogil - cdef int update_median_parameters_post_push( + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil + cdef intp_t reset(self) except -1 nogil + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil - cdef int update_median_parameters_post_remove( + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil cdef float64_t get_median(self) noexcept nogil diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index c5e936ae48eb1..113c61dcf9f3b 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,13 +1,14 @@ # Authors: The scikit-learn developers # SPDX-License-Identifier: BSD-3-Clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() from ..utils._random cimport our_rand_r @@ -16,7 +17,7 @@ from ..utils._random cimport our_rand_r # Helper functions # ============================================================================= -cdef int safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: +cdef intp_t safe_realloc(realloc_ptr* p, size_t nelems) except -1 nogil: # sizeof(realloc_ptr[0]) would be more like idiomatic C, 
but causes Cython # 0.20.1 to crash. cdef size_t nbytes = nelems * sizeof(p[0][0]) @@ -96,7 +97,7 @@ cdef class WeightedPQueue: def __dealloc__(self): free(self.array_) - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedPQueue to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -113,7 +114,7 @@ cdef class WeightedPQueue: cdef intp_t size(self) noexcept nogil: return self.array_ptr - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push record on the array. Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -145,7 +146,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr + 1 return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a specific value/weight record from the array. Returns 0 if successful, -1 if record not found.""" cdef intp_t array_ptr = self.array_ptr @@ -173,7 +174,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Remove the top (minimum) element from array. Returns 0 if successful, -1 if nothing to remove.""" cdef intp_t array_ptr = self.array_ptr @@ -194,7 +195,7 @@ cdef class WeightedPQueue: self.array_ptr = array_ptr - 1 return 0 - cdef int peek(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t peek(self, float64_t* data, float64_t* weight) noexcept nogil: """Write the top element from array to a pointer. Returns 0 if successful, -1 if nothing to write.""" cdef WeightedPQueueRecord* array = self.array_ @@ -271,7 +272,7 @@ cdef class WeightedMedianCalculator: WeightedMedianCalculator""" return self.samples.size() - cdef int reset(self) except -1 nogil: + cdef intp_t reset(self) except -1 nogil: """Reset the WeightedMedianCalculator to its state at construction Return -1 in case of failure to allocate memory (and raise MemoryError) @@ -285,13 +286,13 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k = 0 return 0 - cdef int push(self, float64_t data, float64_t weight) except -1 nogil: + cdef intp_t push(self, float64_t data, float64_t weight) except -1 nogil: """Push a value and its associated weight to the WeightedMedianCalculator Return -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
""" - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -302,7 +303,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_push( + cdef intp_t update_median_parameters_post_push( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, @@ -344,11 +345,11 @@ cdef class WeightedMedianCalculator: self.sum_w_0_k += self.samples.get_weight_from_index(self.k-1) return 0 - cdef int remove(self, float64_t data, float64_t weight) noexcept nogil: + cdef intp_t remove(self, float64_t data, float64_t weight) noexcept nogil: """Remove a value from the MedianHeap, removing it from consideration in the median calculation """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -359,11 +360,11 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int pop(self, float64_t* data, float64_t* weight) noexcept nogil: + cdef intp_t pop(self, float64_t* data, float64_t* weight) noexcept nogil: """Pop a value from the MedianHeap, starting from the left and moving to the right. """ - cdef int return_value + cdef intp_t return_value cdef float64_t original_median = 0.0 if self.size() != 0: @@ -379,7 +380,7 @@ cdef class WeightedMedianCalculator: original_median) return return_value - cdef int update_median_parameters_post_remove( + cdef intp_t update_median_parameters_post_remove( self, float64_t data, float64_t weight, float64_t original_median) noexcept nogil: """Update the parameters used in the median calculation, diff --git a/sklearn/tree/meson.build b/sklearn/tree/meson.build index 3e16af150b7ae..04d1d5f353d02 100644 --- a/sklearn/tree/meson.build +++ b/sklearn/tree/meson.build @@ -4,16 +4,16 @@ tree_extension_metadata = { 'override_options': ['cython_language=cpp', 'optimization=3']}, '_splitter': {'sources': ['_splitter.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_partitioner': {'sources': ['_partitioner.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_criterion': {'sources': ['_criterion.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, '_utils': {'sources': ['_utils.pyx'], - 'override_options': ['optimization=3']}, + 'override_options': ['cython_language=cpp', 'optimization=3']}, } foreach ext_name, ext_dict : tree_extension_metadata diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index fb5af073fc8c6..656a7257521ce 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -54,6 +54,9 @@ ignore_warnings, skip_if_32bit, ) +from sklearn.utils.estimator_checks import ( + parametrize_with_checks, +) from sklearn.utils.fixes import ( _IS_32BIT, COO_CONTAINERS, @@ -234,6 +237,18 @@ def assert_tree_equal(d, s, message): ) +@parametrize_with_checks( + [ + DecisionTreeClassifier(), + DecisionTreeRegressor(), + ExtraTreeClassifier(), + ExtraTreeRegressor(), + ] +) +def test_sklearn_compatible_estimator(estimator, check): + check(estimator) + + def test_classification_toy(): # Check classification on a toy dataset. 
     for name, Tree in CLF_TREES.items():
@@ -892,7 +907,7 @@ def test_pickle():
     else:
         X, y = diabetes.data, diabetes.target

-    est = TreeEstimator(random_state=0)
+    est = TreeEstimator(random_state=0, store_leaf_values=True)
     est.fit(X, y)
     score = est.score(X, y)

@@ -911,6 +926,7 @@ def test_pickle():
         "n_node_samples",
         "weighted_n_node_samples",
         "value",
+        "leaf_nodes_samples",
     ]
     fitted_attribute = {
         attribute: getattr(est.tree_, attribute) for attribute in attributes
@@ -925,14 +941,25 @@ def test_pickle():
         score == score2
     ), "Failed to generate same score after pickling with {0}".format(name)
     for attribute in fitted_attribute:
-        assert_array_equal(
-            getattr(est2.tree_, attribute),
-            fitted_attribute[attribute],
-            err_msg=(
-                f"Failed to generate same attribute {attribute} after pickling with"
-                f" {name}"
-            ),
-        )
+        if attribute == "leaf_nodes_samples":
+            for key in fitted_attribute[attribute].keys():
+                assert_array_equal(
+                    getattr(est2.tree_, attribute)[key],
+                    fitted_attribute[attribute][key],
+                    err_msg=(
+                        f"Failed to generate same attribute {attribute} after"
+                        f" pickling with {name}"
+                    ),
+                )
+        else:
+            assert_array_equal(
+                getattr(est2.tree_, attribute),
+                fitted_attribute[attribute],
+                err_msg=(
+                    f"Failed to generate same attribute {attribute} after pickling"
+                    f" with {name}"
+                ),
+            )


 def test_multioutput():
@@ -2345,8 +2372,8 @@ def test_min_sample_split_1_error(Tree):
     # min_samples_split=1 is invalid
     tree = Tree(min_samples_split=1)
     msg = (
-        r"'min_samples_split' .* must be an int in the range \[2, inf\) "
-        r"or a float in the range \(0.0, 1.0\]"
+        r"'min_samples_split' .* must be an int in the range \[2, inf\)"
+        r".* a float in the range \(0.0, 1.0\]"
    )
     with pytest.raises(ValueError, match=msg):
         tree.fit(X, y)
@@ -2358,7 +2385,9 @@ def test_missing_values_best_splitter_on_equal_nodes_no_missing(criterion):
     X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
     y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])

-    dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
+    dtc = DecisionTreeRegressor(
+        random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True
+    )
     dtc.fit(X, y)

     # Goes to right node because it has the most data points
@@ -2640,6 +2669,151 @@ def test_sample_weight_non_uniform(make_data, Tree):
     assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X))


+@pytest.mark.parametrize(
+    "tree_name",
+    ALL_TREES,
+)
+def test_leaf_node_samples(tree_name):
+    """Test getting leaf node samples from fitted tree."""
+    tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False)
+    tree.fit(X_small, y_small)
+
+    # Check that the leaf node samples are not stored by default
+    assert tree.tree_.leaf_nodes_samples == dict()
+
+    # error should be raised if trying to predict quantiles
+    assert hasattr(tree, "predict_quantiles")
+    for meth in ["predict_quantiles", "get_leaf_node_samples"]:
+        if hasattr(tree, meth):
+            with pytest.raises(
+                RuntimeError,
+                match="leaf node samples",
+            ):
+                getattr(tree, meth)(X_small)
+
+    quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True)
+    quantile_tree.fit(X_small, y_small)
+
+    score = tree.score(X_small, y_small)
+    new_score = quantile_tree.score(X_small, y_small)
+    assert np.isclose(score, new_score)
+
+    # Check that the leaf node samples are what they should be
+    X_leaves = quantile_tree.apply(X_small)
+    for idx in range(X_leaves.shape[0]):
+        leaf_idx = X_leaves[idx]
+        assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx]
+    assert set(np.unique(X_leaves)) == set(
+        quantile_tree.tree_.leaf_nodes_samples.keys()
+    )
+
+
+@pytest.mark.parametrize(
+    "name",
+    ALL_TREES,
+)
+def test_quantile_tree_predict(name):
+    TreeEstimator = ALL_TREES[name]
+
+    # test quantile prediction
+    est = TreeEstimator(store_leaf_values=True, random_state=0)
+
+    # fit on binary results in perfect leaves, so all quantiles are the same
+    est.fit(X_small, y_small)
+    pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(est.predict(X_small), pred[:, 0])
+    assert_array_equal(est.predict(X_small), pred[:, 1])
+    assert_array_equal(est.predict(X_small), pred[:, 2])
+    assert_array_equal(pred[:, 0], y_small)
+    assert np.unique(pred, axis=1).shape[1] == 1
+
+    est.fit(X_small[:-5], y_small[:-5])
+    held_out_X = X_small[-5:, :]
+    pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(est.predict(held_out_X), pred[:, 0])
+    assert_array_equal(est.predict(held_out_X), pred[:, 1])
+    assert_array_equal(est.predict(held_out_X), pred[:, 2])
+
+    # fit on real data
+    est.fit(iris.data, iris.target)
+    pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9])
+    assert_array_equal(pred[:, 0], iris.target)
+    assert_array_equal(pred[:, 1], iris.target)
+    assert_array_equal(pred[:, 2], iris.target)
+
+
+@pytest.mark.parametrize(
+    "name",
+    ALL_TREES,
+)
+def test_quantile_tree_predict_impure_leaves(name):
+    TreeEstimator = ALL_TREES[name]
+
+    # test quantile prediction
+    est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4)
+    # fit on binary results with constrained depth will result in impure leaves
+    est.fit(X_small, y_small)
+    pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9])
+    assert np.unique(pred, axis=1).shape[1] > 1
+
+
+def test_multioutput_quantiles():
+    # Check estimators on multi-output problems.
+    X = [
+        [-2, -1],
+        [-1, -1],
+        [-1, -2],
+        [1, 1],
+        [1, 2],
+        [2, 1],
+        [-2, 1],
+        [-1, 1],
+        [-1, 2],
+        [2, -1],
+        [1, -1],
+        [1, -2],
+    ]
+
+    y = [
+        [-1, 0],
+        [-1, 0],
+        [-1, 0],
+        [1, 1],
+        [1, 1],
+        [1, 1],
+        [-1, 2],
+        [-1, 2],
+        [-1, 2],
+        [1, 3],
+        [1, 3],
+        [1, 3],
+    ]
+
+    T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
+    y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
+
+    # toy classification problem
+    for name, TreeClassifier in CLF_TREES.items():
+        clf = TreeClassifier(random_state=0, store_leaf_values=True)
+        clf.fit(X, y)
+
+        y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75])
+        y_hat = y_hat.squeeze()
+        assert_array_equal(y_hat[:, 0], y_true)
+        assert_array_equal(y_hat[:, 1], y_true)
+        assert_array_equal(y_hat[:, 2], y_true)
+        assert y_hat.shape == (4, 3, 2)
+
+    # toy regression problem
+    for name, TreeRegressor in REG_TREES.items():
+        reg = TreeRegressor(random_state=0, store_leaf_values=True)
+        y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75])
+        assert_array_equal(y_hat[:, 0], y_true)
+        assert_array_equal(y_hat[:, 1], y_true)
+        assert_array_equal(y_hat[:, 2], y_true)
+        assert y_hat.shape == (4, 3, 2)
+
+
 def test_deterministic_pickle():
     # Non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/27268
diff --git a/sklearn/utils/_random.pyx b/sklearn/utils/_random.pyx
index f0e649e60fe7c..3a2bff3368d3e 100644
--- a/sklearn/utils/_random.pyx
+++ b/sklearn/utils/_random.pyx
@@ -11,7 +11,10 @@ The module contains:
     * Fast rand_r alternative based on xor shifts
 """
 import numpy as np
-from . import check_random_state
+
+# XXX: use an absolute import instead of the relative import to make reuse in scikit-tree easier
+# from .utils import check_random_state
+from sklearn.utils import check_random_state
 from ._typedefs cimport intp_t