From 0ac8a71ee2ad670d7b3a39201f63e1b48938204c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 28 Mar 2023 19:58:02 -0700 Subject: [PATCH 01/28] [EXAMPLE DIFF] (Tree featuresv2) Fork of sklearn that maintains all necessary refactorings to enable downstream functionality (#32) #### Reference Issues/PRs This is the most up-to-date PR branch to consolidate all proposed refactor changes that work with: - unsupervised trees - oblique trees - no performance/runtime regressions against main #### What does this implement/fix? Explain your changes. Incorporates refactors to: Internal Cython of scikit-learn's: - criterion - splitter - tree Internals of Python in scikit-learns: - python Tree Adds the basic implementation of oblique trees. The implementation of oblique trees has been tested on all sklearn's `check_estimator` testing function and has error-checking bounds for the new hyperparameter introduced, which is `feature_combinations` that defaults to ``min(1.5, n_features)``. TODO: 1. [ ] ~Add honest support for trees (splitting the data at the Python API level)~ 2. [x] Build wheels 3. [ ] ~Brainstorm unit-tests, or weekly checks to determine when our fork is out-of-date compared to upstream sklearn~ 4. [x] Revamp README for the fork #### Any other comments? [cd build] --------- Signed-off-by: Adam Li Co-authored-by: Chester Huynh Co-authored-by: Parth Vora --- .circleci/config.yml | 33 +- .cirrus.star | 4 +- .github/workflows/check-changelog.yml | 3 +- .github/workflows/check-manifest.yml | 2 +- .github/workflows/labeler-module.yml | 4 +- .github/workflows/update_tracking_issue.yml | 2 +- .github/workflows/wheels.yml | 33 +- .gitignore | 1 + Makefile | 3 + README.rst | 322 ++++++---- build_tools/azure/install.sh | 2 +- build_tools/azure/install_win.sh | 2 +- doc/Makefile | 2 + doc/conf.py | 3 +- doc/modules/tree.rst | 61 +- examples/tree/plot_iris_dtc.py | 4 - setup.py | 45 +- sklearn/ensemble/_forest.py | 108 +++- sklearn/ensemble/tests/test_forest.py | 171 +++++ sklearn/tree/_classes.py | 162 +++-- sklearn/tree/_criterion.pxd | 45 +- sklearn/tree/_criterion.pyx | 285 ++++----- sklearn/tree/_splitter.pxd | 41 +- sklearn/tree/_splitter.pyx | 165 +++-- sklearn/tree/_tree.pxd | 90 ++- sklearn/tree/_tree.pyx | 659 ++++++++++++-------- sklearn/tree/tests/test_tree.py | 32 +- 27 files changed, 1499 insertions(+), 785 deletions(-) mode change 100755 => 100644 setup.py diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..e4e66b5c57f49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,22 +94,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. 
+ # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 061d0094b38c5..8092711f07e45 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml 
b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 89600846100a8..1e28896f50be6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index 5ea64dc0d6cac..148027b30f59f 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,6 @@ doc-noplot: inplace code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 5e2de6a6d8b46..fbdfdaa95ef4c 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,36 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. +It is currently maintained by a team of volunteers. -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. -It is currently maintained by a team of volunteers. +Why a fork? 
+-----------
+Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize
+and improve the extensibility of the code are currently unsupported, or may take a long time.
+Advanced tree models that also leverage the robustness of scikit-learn are nonetheless desirable.
+
+However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package
+altogether is undesirable because it results in a tree codebase that is inherently different
+and not compatible with ``scikit-learn``. For example, `quantile-forests `_,
+and `EconML `_ do this, and their current tree submodules
+cannot take advantage of improvements made in upstream ``scikit-learn``.
+
+An example of seamless integration would be `scikit-survival `_, which
+only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees.
 
-Website: https://scikit-learn.org
+Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop
+a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule,
+and can also be synced with upstream changes in ``scikit-learn``. This enables this fork to always
+take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable
+tree API.
 
 Installation
 ------------
 
@@ -73,133 +89,195 @@ scikit-learn requires:
 - joblib (>= |JoblibMinVersion|)
 - threadpoolctl (>= |ThreadpoolctlMinVersion|)
 
-=======
+============================
+Installing scikit-learn-tree
+============================
 
-**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
-scikit-learn 1.0 and later require Python 3.7 or newer.
-scikit-learn 1.1 and later require Python 3.8 or newer.
+Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
+tree submodule in a few ways documented in the :ref:`changelog of the fork
+<fork-changelog>`.
 
-Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and
-classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
-For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
-A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
-require pandas >= |PandasMinVersion|, some examples require seaborn >=
-|SeabornMinVersion| and plotly >= |PlotlyMinVersion|.
+We release versions of scikit-learn-tree in an analogous fashion to
+scikit-learn main. Due to maintenance resources, we only release on PyPI
+and therefore recommend installing with ``pip``.
 
-User installation
-~~~~~~~~~~~~~~~~~
+There are different ways to install scikit-learn-tree:
 
-If you already have a working installation of numpy and scipy,
-the easiest way to install scikit-learn is using ``pip``::
+  * :ref:`Install the latest official release <install_fork_release>`. This
+    is the best approach for most users. It will provide a stable version
+    and pre-built packages are available for most platforms.
+
+  * :ref:`Building the package from source
+    <install_source>`. This is best for users who want the
+    latest-and-greatest features and aren't afraid of running
+    brand-new code. This is also needed for users who wish to contribute to the
+    project.
 
-    pip install -U scikit-learn
+.. _install_fork_release:
 
-or ``conda``::
+Installing the latest release
+-----------------------------
+We release wheels for common platforms, so the package is installable via pip.
 
-    conda install -c conda-forge scikit-learn
 
+..
prompt:: bash $ + + pip install scikit-learn-tree -The documentation includes more detailed `installation instructions `_. +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` -Changelog ---------- + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -See the `changelog `__ -for a history of notable changes to scikit-learn. +.. _install_source: + +Building from source +-------------------- +If you are a developer and are interested in helping maintain, or add some new +features to the fork, the building from source instructions are exactly the same +as that of scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source. Development ------------ +=========== -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, specifically to maintain the fork. +Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, +or improves the tree submodule in anyway will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
-
-Help and Support
------------------
-
-Documentation
-~~~~~~~~~~~~~
-
-- HTML documentation (stable release): https://scikit-learn.org
-- HTML documentation (development version): https://scikit-learn.org/dev/
-- FAQ: https://scikit-learn.org/stable/faq.html
-
-Communication
-~~~~~~~~~~~~~
-
-- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
-- Gitter: https://gitter.im/scikit-learn/scikit-learn
-- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos
-- Blog: https://blog.scikit-learn.org
-- Calendar: https://blog.scikit-learn.org/calendar/
-- Twitter: https://twitter.com/scikit_learn
-- Twitter (commits): https://twitter.com/sklearn_commits
-- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
-- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions
-- Website: https://scikit-learn.org
-- LinkedIn: https://www.linkedin.com/company/scikit-learn
-- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists
-- Facebook: https://www.facebook.com/scikitlearnofficial/
-- Instagram: https://www.instagram.com/scikitlearnofficial/
-- TikTok: https://www.tiktok.com/@scikit.learn
-
-Citation
-~~~~~~~~
-
-If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn
+.. _fork-changelog:
+
+Major Changes of the Fork
+=========================
+
+The purpose of this page is to illustrate some of the main features that
+``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes an
+understanding of the core package ``scikit-learn`` and also of decision tree
+models. Please refer to our :ref:`installation instructions
+<install_fork_release>` for installing ``scikit-learn-tree``.
+
+Scikit-learn-tree otherwise operates as a stand-in for upstream ``scikit-learn``.
+It is used in packages exactly the same way and will support all features
+in the corresponding version of ``scikit-learn``. For example, if you
+are interested in the features of the ``NearestNeighbors`` algorithm in ``scikit-learn`` v1.2.2,
+then, if ``scikit-learn-tree`` has a v1.2.2 release, that release will have
+all of those features.
+
+The breaking API changes will be with respect to anything in the ``tree`` submodule,
+and related Forest ensemble models. See below for a detailed list of breaking changes.
+
+See: https://scikit-learn.org/ for documentation on scikit-learn main.
+
+Our Philosophy
+--------------
+Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes
+as possible, such that incorporating upstream changes into the fork requires minimal effort.
+
+Candidate changes and PRs accepted into the fork are those that:
+
+- improve compatibility with upstream ``scikit-learn`` main
+- enable improved extensibility of tree models
+
+Decision tree generalizations
+-----------------------------
+
+``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+decision tree model (classifier and regressor), which has a few fundamental limitations
+that prevent 3rd parties from utilizing the existing class without forking a large
+amount of copy/pasted Python and Cython code. We highlight those limitations here
+and then describe how we generalize each of them.
+
+Cython Internal Private API:
+
+Note that the Cython API for scikit-learn is still not a publicly supported API, so it may
+change without warning.
+
+- leaf and split nodes: These nodes are treated the same way and there is no internal
+  API for setting them differently. Quantile trees and causal trees inherently generalize
+  how leaf nodes are set.
+- Criterion class: The criterion class currently assumes a supervised learning interface.
+  - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria.
+- Splitter class: The splitter class currently assumes a supervised learning interface and
+  does not provide a way of generalizing the way split candidates are proposed.
+  - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and ``Splitter.node_split`` function. For example, this enables oblique splits to be considered.
+- Tree class: The tree class currently assumes a supervised learning interface and does not
+  provide a way of generalizing the type of tree.
+  - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now.
+- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various
+  stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions
+  may be extended. For example, in causal trees, one may want the splitter to also account for
+  a minimal degree of heterogeneity (i.e. variance) in its child nodes.
+
+Python API:
+
+- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y``
+  parameter is required to be passed in, which is not necessary for general tree-based models.
+  For example, an unsupervised tree may pass in ``y=None``.
+  - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined.
+- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter``
+  and ``Tree`` Cython classes used: The current codebase requires users to define custom
+  criteria and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents
+  users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper.
+  Moreover, the ``Tree`` class is not customizable.
+  - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them (see the sketch after this list).
+- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning
+  features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational
+  trick that can significantly increase runtime efficiency and also help prevent overfitting in trees, since
+  the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable
+  random forests and their variants to scale to millions of samples.
+  - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses (see the usage sketch after this list). The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below.
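+
+As a small, non-authoritative sketch of the ``max_bins`` keyword introduced by this fork: the
+data and the value ``max_bins=255`` below are only examples, and ``apply`` returns one leaf
+index per sample and per tree, as in upstream ``scikit-learn``
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> X = [[ 1,  2,  3],  # 2 samples, 3 features
+    ...      [11, 12, 13]]
+    >>> y = [0, 1]  # classes of each sample
+    >>> forest = RandomForestClassifier(n_estimators=10, random_state=0, max_bins=255).fit(X, y)
+    >>> forest.apply(X).shape  # one leaf index per (sample, tree) pair
+    (2, 10)
+
+Similarly, a 3rd-party package can hook into tree construction by overriding the private
+``BaseDecisionTree._build_tree`` method. The snippet below is only a structural sketch of that
+pattern: the class name is made up, and a real extension would construct its own criterion,
+splitter, or tree inside the override instead of delegating to the default builder::
+
+    from sklearn.tree import DecisionTreeClassifier
+
+    class MyCustomTree(DecisionTreeClassifier):
+        """Toy subclass that hooks into tree construction via ``_build_tree``."""
+
+        def _build_tree(self, X, y, sample_weight, min_samples_leaf,
+                        min_weight_leaf, max_leaf_nodes, min_samples_split,
+                        max_depth, random_state):
+            # A real extension would instantiate a custom Criterion/Splitter/Tree
+            # here; this sketch simply delegates to the default builder.
+            return super()._build_tree(
+                X, y, sample_weight, min_samples_leaf, min_weight_leaf,
+                max_leaf_nodes, min_samples_split, max_depth, random_state,
+            )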
+ +Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` +and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned in to e.g. 255 bins). This would not only save RAM since ``uint8`` storage of millions +of samples would result in many GB saved, but also improved runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more \ No newline at end of file diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5238cd1121d2e..db5b5d9414053 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccache" +CCACHE_LINKS_DIR="/tmp/ccachev2" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/doc/Makefile b/doc/Makefile index b56a1289cd581..c728bbbfd033e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -53,6 +53,8 @@ html: @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# rm $(BUILDDIR)/html/stable/index.html +# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo diff --git a/doc/conf.py b/doc/conf.py index 52b084b331c8c..01e0a332dd54f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -103,7 +103,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 789b0bab616ca..7fa12fd16d487 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -612,11 +614,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space-complexity. + +Space-complexity and storing the OT pickled on disc is also a consideration. OTs +at every node need to store an additional vector of feature indices and vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters are the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations`` are different or special to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also possibly + lets the user to sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, scikit-learn recommends +always trying both model using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 14f6506b5810f..0dcca718bc6f0 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index f5522600f623f..e39e39455b7bc --- a/setup.py +++ b/setup.py @@ -30,19 +30,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -170,11 +170,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -221,10 +221,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,7 +306,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -374,9 +374,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 19203da4fce1f..a3c29e4a269ce 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +from time import time from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn import threading @@ -72,10 +73,11 @@ class calls the ``fit`` method 
of each sub-estimator on random samples _check_sample_weight, _check_feature_names_in, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import _num_samples from ..utils._param_validation import Interval, StrOptions from ..utils._param_validation import RealNotInt - +from ._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", @@ -210,6 +212,10 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], } @abstractmethod @@ -228,6 +234,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -244,6 +251,7 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins def apply(self, X): """ @@ -263,6 +271,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -420,6 +437,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -628,6 +677,35 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. 
+ + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + def _accumulate_prediction(predict, X, out, lock): """ @@ -669,6 +747,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -683,6 +762,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) @staticmethod @@ -856,6 +936,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -937,6 +1025,7 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator, @@ -950,6 +1039,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) def predict(self, X): @@ -975,6 +1065,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1399,6 +1497,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1423,6 +1522,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -1734,6 +1834,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1757,6 +1858,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2084,6 +2186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2108,6 +2211,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2406,6 +2510,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2429,6 +2534,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9bf0bb2becd9b..0150340f24bc6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. 
+ + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1791,3 +1905,60 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. 
+ ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b175275ea92dc..bd54483bf2dfe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -40,8 +40,8 @@ from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import DepthFirstTreeBuilder from ._tree import BestFirstTreeBuilder from ._tree import Tree @@ -174,7 +174,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit(self, X, y, sample_weight=None, check_input=True): + def fit(self, X, y=None, sample_weight=None, check_input=True): self._validate_params() random_state = check_random_state(self.random_state) @@ -184,9 +184,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) if issparse(X): X.sort_indices() @@ -195,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -209,45 +212,56 @@ def fit(self, X, y, sample_weight=None, check_input=True): # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) - self.classes_ = [] - self.n_classes_ = [] + self.classes_ = [] + self.n_classes_ = [] - if self.class_weight is not None: - y_original = np.copy(y) + if self.class_weight is not None: + y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -299,16 +313,10 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -320,10 +328,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. 
+ """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -337,7 +398,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -385,8 +446,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -817,7 +876,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -1173,7 +1235,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..2e179e78e8c3f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,6 +4,8 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -15,13 +17,11 @@ from ._tree cimport SIZE_t # Type for indices and counters from ._tree cimport INT32_t # Signed 32 bit integer from ._tree cimport UINT32_t # Unsigned 32 bit integer -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y @@ -37,19 +37,7 @@ cdef class Criterion: cdef double weighted_n_left # Weighted number of samples in the left node cdef double weighted_n_right # Weighted number of samples in the right node - # The criterion object is maintained such that left and right collected - # statistics correspond to samples[start:pos] and samples[pos:end]. - - # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil + # Core methods that criterion class _must_ implement. 
cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -71,6 +59,25 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -88,4 +95,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split - cdef double[::1] sum_right # Same as above, but for the right side of the split + cdef double[::1] sum_right # Same as above, but for the right side of the split \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7cd7bbb0e3c1b..c94914daa0e0b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -9,6 +9,8 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -29,11 +31,20 @@ from ._utils cimport WeightedMedianCalculator # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. - +cdef class BaseCriterion: + """This is an abstract interface for criterion. For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -41,61 +52,23 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. - Parameters ---------- new_pos : SIZE_t @@ -105,7 +78,6 @@ cdef class Criterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. - Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -116,11 +88,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. - Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. - Parameters ---------- impurity_left : double pointer @@ -134,10 +104,8 @@ cdef class Criterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. - Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. - Parameters ---------- dest : double pointer @@ -147,12 +115,10 @@ cdef class Criterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -167,28 +133,21 @@ cdef class Criterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. - This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: - N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) - where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, - Parameters ---------- impurity_parent : double The initial impurity of the parent node before the split - impurity_left : double The impurity of the left child - impurity_right : double The impurity of the right child - Return ------ double : improvement in impurity after the split occurs @@ -199,6 +158,61 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. 
+ The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics + such as the mean in regression and class probabilities in classification. + Instances of this class are responsible for compute splits' impurity difference + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -206,7 +220,6 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. - Parameters ---------- n_outputs : SIZE_t @@ -254,18 +267,11 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -277,18 +283,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -301,12 +313,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -317,11 +329,9 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -338,7 +348,6 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -355,10 +364,8 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- new_pos : SIZE_t @@ -428,7 +435,6 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. - Parameters ---------- dest : double pointer @@ -443,23 +449,17 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. - This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The cross-entropy is then defined as - cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -481,10 +481,8 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). - Parameters ---------- impurity_left : double pointer @@ -516,24 +514,18 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. 
- This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -559,10 +551,8 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. - Parameters ---------- impurity_left : double pointer @@ -601,24 +591,20 @@ cdef class Gini(ClassificationCriterion): cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. - This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -648,23 +634,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -673,14 +665,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -692,7 +684,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" @@ -785,13 +776,11 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
- MSE = var_left + var_right """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -807,22 +796,16 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The MSE proxy is derived from - sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 - Neglecting constant terms, this gives: - - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -839,7 +822,6 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -883,7 +865,6 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -895,12 +876,10 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -933,26 +912,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -963,10 +946,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -981,11 +964,9 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1016,7 +997,6 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1044,7 +1024,6 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1107,7 +1086,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1132,7 +1110,6 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1179,21 +1156,17 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. - Uses the formula (35) in Friedman's original Gradient Boosting paper: - diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1234,9 +1207,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. - Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): @@ -1255,7 +1226,6 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. 
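The closed-form scores referenced in the FriedmanMSE and Poisson docstrings above are straightforward to reproduce in NumPy. A rough, unweighted sketch for intuition only; the Cython criteria operate on weighted, per-output sums:

```python
import numpy as np


def friedman_improvement(y_left, y_right):
    # Formula (35) from Friedman's gradient boosting paper, as used by FriedmanMSE:
    # improvement = n_left * n_right * (mean_left - mean_right)**2 / (n_left + n_right)
    n_l, n_r = len(y_left), len(y_right)
    diff = np.mean(y_left) - np.mean(y_right)
    return n_l * n_r * diff ** 2 / (n_l + n_r)


def poisson_half_deviance(y_true, y_pred):
    # Half Poisson deviance (the factor 2 is skipped, as in the criterion),
    # assuming strictly positive y_true for simplicity.
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    return np.mean(y_true * np.log(y_true / y_pred) + y_pred - y_true)
```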
@@ -1265,24 +1235,18 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The Poisson proxy is derived from: - sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) - Neglecting constant terms, this gives - - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ @@ -1312,7 +1276,6 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..b0207ab0a715d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,12 +4,14 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight @@ -28,14 +30,15 @@ cdef struct SplitRecord: double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -54,7 +57,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -74,27 +76,38 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight + ) except -1 + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..17a747433d1a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -8,7 +8,10 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause from ._criterion cimport Criterion @@ -43,16 +46,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.threshold = 0. self.improvement = -INFINITY -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. 
+ + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -75,7 +140,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -86,11 +150,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass def __reduce__(self): return (type(self), (self.criterion, @@ -127,7 +186,6 @@ cdef class Splitter: are assumed to have uniform weight. This is represented as a Cython memoryview. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -165,6 +223,19 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -187,30 +258,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -221,6 +273,41 @@ cdef class Splitter: return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + if (((current_split.pos - self.start) < min_samples_leaf) or + ((self.end - current_split.pos) < min_samples_leaf)): + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -229,7 +316,7 @@ cdef class Splitter: ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -349,15 +436,13 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -645,8 +730,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue # Evaluate split @@ -656,8 +740,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1966651d8c89a..8140733a9fc26 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -13,6 +13,8 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -33,40 +35,32 @@ cdef struct Node: SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node - -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
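The `check_presplit_conditions` / `check_postsplit_conditions` hooks used in the best- and random-split loops above encapsulate the old inline `min_samples_leaf` / `min_weight_leaf` rejection tests, so subclasses can add their own stopping rules. In plain Python the default checks amount to the following (illustrative only; the sign convention is inverted here so the functions return True when the split is acceptable):

```python
def presplit_ok(pos, start, end, min_samples_leaf):
    # Default check_presplit_conditions: both children keep >= min_samples_leaf samples.
    return (pos - start) >= min_samples_leaf and (end - pos) >= min_samples_leaf


def postsplit_ok(weighted_n_left, weighted_n_right, min_weight_leaf):
    # Default check_postsplit_conditions: both children carry >= min_weight_leaf weight.
    return weighted_n_left >= min_weight_leaf and weighted_n_right >= min_weight_leaf
```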
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +72,49 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -91,8 +128,7 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. 
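The `_compute_feature` hook declared above is the main extension point for non-axis-aligned trees: the default implementation reads a single column of X, while an oblique-tree subclass could return a linear projection of the row instead. A rough Python rendering of the two behaviours (the oblique variant is hypothetical; in such a subclass the projection weights would be stored per node):

```python
import numpy as np


def compute_feature_axis_aligned(X, sample_index, feature):
    # Default BaseTree._compute_feature: the raw value of one feature column.
    return X[sample_index, feature]


def compute_feature_oblique(X, sample_index, proj_weights):
    # Hypothetical oblique override: a linear combination of the feature values.
    return np.dot(X[sample_index, :], proj_weights)
```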
- cdef Splitter splitter # Splitting algorithm - + cdef Splitter splitter cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..e5b759aee23df 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -22,6 +22,8 @@ from libcpp.vector cimport vector from libcpp.algorithm cimport pop_heap from libcpp.algorithm cimport push_heap from libcpp cimport bool +from cython.operator cimport dereference as deref +from libc.stdlib cimport malloc, free import struct @@ -83,6 +85,7 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= + cdef class TreeBuilder: """Interface for different tree building strategies.""" @@ -196,9 +199,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -248,7 +253,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -256,8 +266,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: @@ -297,6 +307,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -462,6 +476,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -483,7 +499,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -493,7 +513,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, + split_ptr, impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: return -1 @@ -522,7 +542,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + + free(split_ptr) return 0 @@ -530,190 +551,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. + + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - nodes = 
memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -725,7 +571,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -756,14 +605,87 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil: - """Add a node to the tree. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil: + """Add a node to the tree. The new node registers itself as the child of its parent. - + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -784,28 +706,18 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -835,13 +747,20 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature value + cdef DTYPE_t feature_value = 0 + with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + + # compute the feature value to compare against threshold + feature_value = self._compute_feature(X_ndarray, i, node) + if feature_value <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -902,7 +821,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -951,6 +869,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -962,7 +883,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1091,8 +1014,6 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count @@ -1105,13 +1026,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] - - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) + self._compute_feature_importances( + importances, node) + node += 1 for i in range(self.n_features): @@ -1127,44 +1044,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. 
+ + Wrapped in a private function to allow subclassing that + computes feature importances. """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1273,6 +1173,237 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. 
+ + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + 
expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1755,6 +1886,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1771,8 +1904,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1f3a9bf394b9b..69f948839259a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=1) + clf = Tree(random_state=0, max_features=X.shape[1]) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=3, + n_informative=4, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 3, "Failed with {0}".format(name) + assert n_important == 4, "Failed with {0}".format(name) # Check on iris 
that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -def test_importances_raises(): +@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) +def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. - clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,6 +653,7 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) + # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -677,7 +678,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - + rng = np.random.RandomState(42) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -828,7 +829,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -918,6 +919,7 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ + # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1031,7 +1033,6 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) - # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1052,6 +1053,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1062,11 +1068,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - def test_sample_weight(): # Check sample weighting. 
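The core structural change in this first patch is that node construction now goes through ``_set_leaf_node``/``_set_split_node`` and every traversal compares the threshold against ``self._compute_feature(X_ndarray, i, node)`` instead of reading ``X[i, node.feature]`` directly, so a subclass can redefine what the per-node feature value means without duplicating ``apply``/``decision_path``/``compute_feature_importances``. A rough pure-Python sketch of that idea (the class and function names below are illustrative only, not the fork's actual Cython API)::

    import numpy as np

    class SplitNode:
        """Axis-aligned split: compares one column of x against a threshold."""
        def __init__(self, feature, threshold, left, right):
            self.feature, self.threshold = feature, threshold
            self.left, self.right = left, right

        def compute_feature(self, x):
            return x[self.feature]

    class ProjectionSplitNode(SplitNode):
        """Generalized split: compares a linear combination of columns."""
        def __init__(self, weights, threshold, left, right):
            super().__init__(feature=None, threshold=threshold, left=left, right=right)
            self.weights = np.asarray(weights)

        def compute_feature(self, x):
            return float(self.weights @ np.asarray(x))

    def apply_one(node, x):
        """Shared traversal loop; only compute_feature differs between node types."""
        while isinstance(node, SplitNode):
            node = node.left if node.compute_feature(x) <= node.threshold else node.right
        return node  # a leaf label

    root = ProjectionSplitNode(weights=[0.5, 0.5], threshold=1.0,
                               left="leaf_L", right="leaf_R")
    print(apply_one(root, [0.4, 0.4]))  # leaf_L
    print(apply_one(root, [1.2, 1.4]))  # leaf_R

The Cython hooks play the same role for the real ``Tree``: the traversal, importance, and decision-path code stays shared, and only the feature computation differs between the axis-aligned tree and a generalized one.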
@@ -1260,7 +1261,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if "ExtraTree" not in name: + if all(_name not in name for _name in ["ExtraTree"]): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1586,6 +1587,7 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) + # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1644,6 +1646,7 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1930,6 +1933,7 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) + if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 475bd05f779a4be4f301f751ac86ba6a998a219a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 29 Mar 2023 09:41:10 -0700 Subject: [PATCH 02/28] Docs (#39) #### Reference Issues/PRs Fixes README and wheel building --------- Signed-off-by: Adam Li --- README.rst | 36 ++++++++++++--------- build_tools/azure/install.sh | 2 +- build_tools/github/repair_windows_wheels.sh | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index fbdfdaa95ef4c..7a7bd41c42846 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,10 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 +================= +Scikit-learn-tree +================= + ``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -94,8 +98,7 @@ Installing scikit-learn-tree ============================ Scikit-learn-tree is a maintained fork of scikit-learn, which extends the -tree submodule in a few ways documented in :ref:`changelog of the fork -`. +tree submodule in a few ways documented in `fork_changelog`_. We release versions of scikit-learn-tree in an analagous fashion to scikit-learn main. Due to maintenance resources, we only release on PyPi @@ -103,12 +106,11 @@ and recommend therefore installing with ``pip``. There are different ways to install scikit-learn-tree: - * :ref:`Install the latest official release `. This + * Install the latest official release `install_fork_release`_. This is the best approach for most users. It will provide a stable version and pre-built packages are available for most platforms. - * :ref:`Building the package from source - `. This is best for users who want the + * Building the package from source `install_source`_. This is best for users who want the latest-and-greatest features and aren't afraid of running brand-new code. This is also needed for users who wish to contribute to the project. @@ -119,9 +121,7 @@ Installing the latest release ----------------------------- We release wheels for common distributions and this is thus installable via pip. -.. 
prompt:: bash $ - - pip install scikit-learn-tree + pip install scikit-learn-tree This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then can be used as a stand-in for any package that relies on the public API of ``sklearn``. @@ -146,9 +146,11 @@ features to the fork, the building from source instructions are exactly the same as that of scikit-learn main, so please refer to `scikit-learn documentation `_ for instructions on building from source. -Development =========== +Development +----------- + We welcome new contributors of all experience levels, specifically to maintain the fork. Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, or improves the tree submodule in anyway will be appreciated. @@ -158,15 +160,17 @@ The scikit-learn community goals are to be helpful, welcoming, and effective. Th has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -.. _fork-changelog: -Major Changes of the Fork ========================= +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + The purpose of this page is to illustrate some of the main features that ``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes a an understanding of core package ``scikit-learn`` and also decision trees -models. Please refer to our :ref:`installation instructions -` for installing ``scikit-learn-tree``. +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. Scikit-learn-tree though operates as a stand-in for upstream ``scikit-learn``. It is used in packages exactly the same way and will support all features @@ -193,7 +197,7 @@ Candidate changes and PRs accepted into the fork are those that: Decision tree generalizations ----------------------------- -``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier` +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ decision tree model (classifier and regressor), which has a few fundamental limitations that prevent 3rd parties from utilizing the existing class, without forking a large amount of copy/pasted Python and Cython code. We highlight those limitations here @@ -239,8 +243,8 @@ Python API: random forests and their variants to scale to millions of samples. - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below. -Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` -and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_ all work exactly the same as they would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend the Cython/Python API easily. 
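Because the wheel installs under the ``sklearn`` namespace, downstream code needs no changes; only the install name differs. A minimal sanity check of the stand-in behaviour, together with the forest-level ``max_bins`` extension described above (values here are illustrative; ``max_bins`` defaults to ``None``, meaning no binning)::

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    X, y = load_iris(return_X_y=True)

    # Imports and estimator APIs are identical to upstream scikit-learn.
    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(clf.score(X, y))

    # Fork extension: optional feature binning on forests.
    # 255 is only an example bin count, not a recommended setting.
    rf = RandomForestClassifier(n_estimators=10, max_bins=255, random_state=0).fit(X, y)
    print(rf.score(X, y))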
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index db5b5d9414053..5238cd1121d2e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccachev2" +CCACHE_LINKS_DIR="/tmp/ccache" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" From 706a74273bf736066b1d71eeed9da08c0943e311 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 4 Apr 2023 14:47:24 -0700 Subject: [PATCH 03/28] Release v1.2.2 #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .github/workflows/check-upstream.yml | 27 +++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check-upstream.yml diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 47bb893bd00a0..6d5af7c771fb8 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.3.dev0" +__version__ = "1.2.2" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From a22db039704399a31d466be861f2b5a86bbc51b3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Apr 2023 15:25:44 -0400 Subject: [PATCH 04/28] Update README Signed-off-by: Adam Li --- README.rst | 4 ++-- sklearn/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a7bd41c42846..444ead93017b9 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ Scikit-learn-tree ================= -``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +``scikit-learn-tree`` is an alias of scikit-learn. 
It is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -85,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6d5af7c771fb8..4d7badd6b678e 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.2.2" +__version__ = "1.3.0dev0" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 9c5321daa396e0fd01cc6e582a5dfcc8ccb1afe5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:09:58 -0400 Subject: [PATCH 05/28] Adding working submodule Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 23b999d76326e..f4a1a80123d26 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1022,6 +1022,7 @@ cdef class BaseTree: cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) From f82f2582c0c5e347fd9a6109129c3ae7853b0593 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:40:52 -0400 Subject: [PATCH 06/28] Merged main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3419c6fa08819..01975df22ef23 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -110,6 +110,8 @@ cdef class Splitter(BaseSplitter): cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, ) noexcept nogil cdef bint check_postsplit_conditions( self diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c8df3de1bb900..ae6cd772e37f7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -505,7 +505,7 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue criterion.update(current_split.pos) @@ -834,7 +834,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c8248ed65c36b..33a2a8308de5f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -764,7 +764,7 @@ cdef class BaseTree: # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_features = self._compute_feature(X_ndarray, i, node) + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... 
and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: From 7e38502806e954d9b3084f8a5e22602556236fe4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:42:44 -0400 Subject: [PATCH 07/28] Successful merge with the missing value support Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 2 ++ sklearn/tree/tests/test_tree.py | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 638c51f1101bc..21fa5b7c200b2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -388,6 +388,7 @@ def _fit( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -403,6 +404,7 @@ def _build_tree( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6be168e4c8e7c..eefae6cdaa3f6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=X.shape[1]) + clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=4, + n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 4, "Failed with {0}".format(name) + assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) -def test_importances_raises(clf): +def test_importances_raises(): # Check if variable importance before fit raises ValueError. 
+ clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,7 +653,6 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) - # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -678,7 +677,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - rng = np.random.RandomState(42) + weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -829,7 +828,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -919,7 +918,6 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ - # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1033,6 +1031,7 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) + # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1053,11 +1052,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1068,6 +1062,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + def test_sample_weight(): # Check sample weighting. 
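The ``feature_has_missing`` argument threaded through ``_fit``/``_build_tree`` and the ``isnan``/``missing_go_to_left`` branch in the traversal above come from merging upstream's missing-value support for trees. Assuming that behaviour carries over to the fork unchanged, the user-facing effect is simply that dense ``X`` may contain ``NaN`` at both fit and predict time::

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    X = np.array([[0.0], [1.0], [np.nan], [3.0], [4.0]])
    y = np.array([0, 0, 1, 1, 1])

    # Missing values are routed down one side of each split
    # (missing_go_to_left), chosen during fitting rather than raising.
    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(clf.predict([[np.nan], [0.5]]))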
@@ -1261,7 +1260,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if all(_name not in name for _name in ["ExtraTree"]): + if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1587,7 +1586,6 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) - # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1646,7 +1644,6 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] - assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1933,7 +1930,6 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) - if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 34a562130d9c92b083b6da99c27a12a7623226b7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:53:07 -0400 Subject: [PATCH 08/28] Add cyton headers Signed-off-by: Adam Li --- sklearn/tree/_criterion.pyx | 3 +++ sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/_tree.pyx | 3 +++ sklearn/tree/_utils.pyx | 3 +++ 4 files changed, 12 insertions(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9c59e75fedb10..8fbcafcaf1456 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ae6cd772e37f7..a58514d093ddf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33a2a8308de5f..2256b28c7df10 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..0a7522bcf4255 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly From f35c758189c8d38bfed56071b8c9a6cbbd39056f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:04:19 -0400 Subject: [PATCH 09/28] Fix imports to be absolute Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 32 ++++++++++++++++---------------- sklearn/tree/_export.py | 11 ++++++++--- sklearn/tree/_utils.pxd | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py 
index 21fa5b7c200b2..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,22 +25,22 @@ import numpy as np from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils.validation import assert_all_finite -from ..utils.validation import _assert_all_finite_element_wise -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import clone +from sklearn.base import RegressorMixin +from sklearn.base import is_classifier +from sklearn.base import MultiOutputMixin +from sklearn.utils import Bunch +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_sample_weight +from sklearn.utils.validation import assert_all_finite +from sklearn.utils.validation import _assert_all_finite_element_wise +from sklearn.utils import compute_sample_weight +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted +from sklearn.utils._param_validation import Hidden, Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt from ._criterion import BaseCriterion from ._splitter import BaseSplitter diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index e8dbe51138223..be545de0202d0 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -16,10 +16,15 @@ import numpy as np -from ..utils.validation import check_is_fitted, check_array -from ..utils._param_validation import Interval, validate_params, StrOptions, HasMethods +from sklearn.utils.validation import check_is_fitted, check_array +from sklearn.utils._param_validation import ( + Interval, + validate_params, + StrOptions, + HasMethods, +) -from ..base import is_classifier +from sklearn.base import is_classifier from . import _criterion from . 
import _tree diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..f7bae4c5c8553 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,7 +10,7 @@ cimport numpy as cnp from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell +from sklearn.neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a7522bcf4255..bc7e17f8766d8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -19,7 +19,7 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions From 45320b4d3ef05b4ccbe81e8c13676b1c755d1973 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:17:25 -0400 Subject: [PATCH 10/28] Fix forest import Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cc672bb6884d..4d9bf862bd806 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,11 +50,16 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier +from sklearn.base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, +) -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -63,21 +68,21 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.validation import ( +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.exceptions import DataConversionWarning +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.utils.parallel import delayed, Parallel +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, _check_feature_names_in, ) -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils.validation import _num_samples +from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt +from sklearn.ensemble._hist_gradient_boosting.binning 
import _BinMapper __all__ = [ "RandomForestClassifier", From 49526f026c46727aa272be7bdd7a44d0101c089f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:19:07 -0400 Subject: [PATCH 11/28] Fix classes and criterion Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ++++++++++++++++++++++++++++++++++++ sklearn/tree/_criterion.pxd | 11 ++++-- sklearn/tree/_criterion.pyx | 68 +++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..795c68c8b5081 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,6 +713,73 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() + def _get_y_for_leaves(self, X, sample_weight=None): + n_samples = X.shape[0] + + # get the predictions + X_leaves = self.apply(X) + + bootstrap_indices = np.empty(shape, dtype=np.int64) + for i, estimator in enumerate(self.estimators_): + # Get bootstrap indices. + if self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) + bootstrap_indices[:, i] = _generate_sample_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) + else: + bootstrap_indices[:, i] = np.arange(n_samples) + + # Get predictions on bootstrap indices. + X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] + + if sorter is not None: + # Reassign bootstrap indices to account for target sorting. + bootstrap_indices = np.argsort(sorter)[bootstrap_indices] + + bootstrap_indices += 1 # for sparse matrix (0s as empty) + + # Get the maximum number of nodes (internal + leaves) across trees. + # Get the maximum number of samples per leaf across trees (if needed). + max_node_count = 0 + max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf + for i, estimator in enumerate(self.estimators_): + node_count = estimator.tree_.node_count + if node_count > max_node_count: + max_node_count = node_count + if not leaf_subsample: + sample_count = np.max(np.bincount(X_leaves[:, i])) + if sample_count > max_samples_leaf: + max_samples_leaf = sample_count + + # Initialize NumPy array (more efficient serialization than dict/list). + shape = (self.n_estimators, max_node_count, max_samples_leaf) + y_train_leaves = np.zeros(shape, dtype=np.int64) + + for i, estimator in enumerate(self.estimators_): + # Group training indices by leaf node. + leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) + + if leaf_subsample: + random.seed(estimator.random_state) + + # Map each leaf node to its list of training indices. + for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): + y_indices = bootstrap_indices[:, i][leaf_values] + + if sample_weight is not None: + y_indices = y_indices[sample_weight[y_indices - 1] > 0] + + # Subsample leaf training indices (without replacement). + if leaf_subsample and max_samples_leaf < len(y_indices): + if not isinstance(y_indices, list): + y_indices = list(y_indices) + y_indices = random.sample(y_indices, max_samples_leaf) + + y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices + + return y_train_leaves + # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6cfc33c5bdcea..d72f22f8b348d 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,6 +11,8 @@ # See _criterion.pyx for implementation details. 
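``_get_y_for_leaves`` builds, per estimator, a padded array mapping every leaf node to the training indices that fall into it, shifted by +1 so that 0 can mean "empty slot", optionally bootstrap-resampled and subsampled per leaf. A rough NumPy sketch of the per-tree grouping step, using only public scikit-learn calls rather than the helper itself::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

    leaves = tree.apply(X)                        # leaf id of each training sample
    max_samples_leaf = np.max(np.bincount(leaves))
    n_nodes = tree.tree_.node_count

    # One row per node, zero-padded; stored indices are 1-based so that
    # 0 marks an empty slot, matching the sparse-friendly layout in the patch.
    y_train_leaves = np.zeros((n_nodes, max_samples_leaf), dtype=np.int64)
    for leaf_id in np.unique(leaves):
        idx = np.flatnonzero(leaves == leaf_id) + 1
        y_train_leaves[leaf_id, : len(idx)] = idx

    print(y_train_leaves.shape)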
+# from libcpp.vector cimport vector + from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight from ._tree cimport SIZE_t # Type for indices and counters @@ -19,7 +21,7 @@ from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef class BaseCriterion: - """Abstract interface for criterion.""" + """Abstract interface for criterion.""" # Internal structures cdef const DOUBLE_t[:] sample_weight # Sample weights @@ -70,13 +72,18 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil + # cdef void node_samples( + # self, + # vector[vector[DOUBLE_t]]* dest + # ) noexcept nogil + cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" cdef const DOUBLE_t[:, ::1] y # Values of y cdef SIZE_t n_missing # Number of missing values for the feature being evaluated cdef bint missing_go_to_left # Whether missing values go to the left node - + cdef int init( self, const DOUBLE_t[:, ::1] y, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8fbcafcaf1456..e9c02ab2fa43d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,13 @@ cdef class BaseCriterion: covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity in current node and in children nodes. + This object stores methods on how to calculate how good a split is using a set API. + Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: - the samples of left child node are stored in `samples[start:pos]` @@ -56,21 +59,25 @@ cdef class BaseCriterion: cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. + Parameters ---------- new_pos : SIZE_t @@ -80,6 +87,7 @@ cdef class BaseCriterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. + Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -90,9 +98,11 @@ cdef class BaseCriterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. + Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. + Parameters ---------- impurity_left : double pointer @@ -106,8 +116,10 @@ cdef class BaseCriterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. + Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. 
+ Parameters ---------- dest : double pointer @@ -117,10 +129,12 @@ cdef class BaseCriterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -135,6 +149,7 @@ cdef class BaseCriterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. + This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity @@ -142,6 +157,7 @@ cdef class BaseCriterion: where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, + Parameters ---------- impurity_parent : double @@ -150,6 +166,7 @@ cdef class BaseCriterion: The impurity of the left child impurity_right : double The impurity of the right child + Return ------ double : improvement in impurity after the split occurs @@ -166,10 +183,12 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. + The dataset array that we compute criteria on is assumed to consist of 'N' ordered samples or rows (i.e. sorted). Since we pass this by reference, we use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters ---------- start : SIZE_t @@ -182,11 +201,13 @@ cdef class BaseCriterion: cdef class Criterion(BaseCriterion): """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of impurity of a split on that node using the distribution of labels in parent and - children nodes. It also computes the output statistics - such as the mean in regression and class probabilities in classification. - Instances of this class are responsible for compute splits' impurity difference + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + Criterion is the base class for criteria used in supervised tree-based models with a homogeneous float64-dtyped y. """ @@ -198,8 +219,10 @@ cdef class Criterion(BaseCriterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -279,6 +302,7 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -331,8 +355,10 @@ cdef class ClassificationCriterion(Criterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
+ Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -426,6 +452,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -442,6 +469,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -458,8 +486,10 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- new_pos : SIZE_t @@ -532,6 +562,7 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. + Parameters ---------- dest : double pointer @@ -546,17 +577,20 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The cross-entropy is then defined as cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -578,8 +612,10 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). + Parameters ---------- impurity_left : double pointer @@ -611,11 +647,13 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The Gini Index is then defined as: index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 @@ -623,6 +661,7 @@ cdef class Gini(ClassificationCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -648,8 +687,10 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. 
+ Parameters ---------- impurity_left : double pointer @@ -726,6 +767,7 @@ cdef inline void _move_sums_regression( cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. + This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` @@ -736,6 +778,7 @@ cdef class RegressionCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -961,6 +1004,7 @@ cdef class MSE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -976,10 +1020,12 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The MSE proxy is derived from @@ -1002,6 +1048,7 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1045,6 +1092,7 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -1056,6 +1104,7 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -1154,6 +1203,7 @@ cdef class MAE(RegressionCriterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1184,6 +1234,7 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1211,6 +1262,7 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1273,6 +1325,7 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1297,6 +1350,7 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. 
the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1343,6 +1397,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1350,10 +1405,12 @@ cdef class FriedmanMSE(MSE): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1394,6 +1451,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. + Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the @@ -1413,6 +1471,7 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1422,10 +1481,12 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The Poisson proxy is derived from: @@ -1463,6 +1524,7 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ From 2105949178bf03660c13df1fd197abbbb57d826e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:22:15 -0400 Subject: [PATCH 12/28] Working.. Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 4 +++- sklearn/tree/_splitter.pxd | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index d72f22f8b348d..20020b4a5361c 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,7 +11,7 @@ # See _criterion.pyx for implementation details. 
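The cross-entropy and Gini definitions quoted in the docstrings above reduce to simple functions of the class proportions in a node. A standalone numeric check of those formulas (the log base is not pinned down in the docstring; natural log is used here, and the internal implementation may use a different base)::

    import numpy as np

    def node_class_proportions(y_in_node, n_classes):
        counts = np.bincount(y_in_node, minlength=n_classes)
        return counts / counts.sum()

    def entropy(p):
        # cross-entropy = -sum_k count_k * log(count_k), skipping empty classes
        p = p[p > 0]
        return float(-np.sum(p * np.log(p)))

    def gini(p):
        # index = 1 - sum_k count_k ** 2
        return float(1.0 - np.sum(p ** 2))

    p = node_class_proportions(np.array([0, 0, 0, 1, 2, 2]), n_classes=3)
    print(p)           # [0.5, 0.1667, 0.3333]
    print(entropy(p))  # ~1.011
    print(gini(p))     # ~0.611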
-# from libcpp.vector cimport vector +from libcpp.vector cimport vector from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e9c02ab2fa43d..d60cab3063c1b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,7 +34,9 @@ from ._utils cimport WeightedMedianCalculator cdef double EPSILON = 10 * np.finfo('double').eps cdef class BaseCriterion: - """This is an abstract interface for criterion. For example, a tree model could + """This is an abstract interface for criterion. + + For example, a tree model could be either supervisedly, or unsupervisedly computing impurity on samples of covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 01975df22ef23..fc49471569ecc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -97,7 +97,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y - + cdef int init( self, object X, From 9b07f2ab2b1b6f8f4ea1294fce1a5f9bd3be1a1d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:42:37 -0400 Subject: [PATCH 13/28] Add leaf storage ability Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ---------------------------- sklearn/tree/_criterion.pxd | 9 ++-- sklearn/tree/_criterion.pyx | 28 +++++++++--- sklearn/tree/_splitter.pxd | 3 ++ sklearn/tree/_splitter.pyx | 31 ++++++------- sklearn/tree/_tree.pxd | 19 ++++++-- sklearn/tree/_tree.pyx | 88 +++++++++++++++++++++++++------------ 7 files changed, 122 insertions(+), 123 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 795c68c8b5081..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,73 +713,6 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() - def _get_y_for_leaves(self, X, sample_weight=None): - n_samples = X.shape[0] - - # get the predictions - X_leaves = self.apply(X) - - bootstrap_indices = np.empty(shape, dtype=np.int64) - for i, estimator in enumerate(self.estimators_): - # Get bootstrap indices. - if self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) - bootstrap_indices[:, i] = _generate_sample_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - else: - bootstrap_indices[:, i] = np.arange(n_samples) - - # Get predictions on bootstrap indices. - X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] - - if sorter is not None: - # Reassign bootstrap indices to account for target sorting. - bootstrap_indices = np.argsort(sorter)[bootstrap_indices] - - bootstrap_indices += 1 # for sparse matrix (0s as empty) - - # Get the maximum number of nodes (internal + leaves) across trees. - # Get the maximum number of samples per leaf across trees (if needed). 
- max_node_count = 0 - max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf - for i, estimator in enumerate(self.estimators_): - node_count = estimator.tree_.node_count - if node_count > max_node_count: - max_node_count = node_count - if not leaf_subsample: - sample_count = np.max(np.bincount(X_leaves[:, i])) - if sample_count > max_samples_leaf: - max_samples_leaf = sample_count - - # Initialize NumPy array (more efficient serialization than dict/list). - shape = (self.n_estimators, max_node_count, max_samples_leaf) - y_train_leaves = np.zeros(shape, dtype=np.int64) - - for i, estimator in enumerate(self.estimators_): - # Group training indices by leaf node. - leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) - - if leaf_subsample: - random.seed(estimator.random_state) - - # Map each leaf node to its list of training indices. - for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): - y_indices = bootstrap_indices[:, i][leaf_values] - - if sample_weight is not None: - y_indices = y_indices[sample_weight[y_indices - 1] > 0] - - # Subsample leaf training indices (without replacement). - if leaf_subsample and max_samples_leaf < len(y_indices): - if not isinstance(y_indices, list): - y_indices = list(y_indices) - y_indices = random.sample(y_indices, max_samples_leaf) - - y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices - - return y_train_leaves - # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 20020b4a5361c..721b475f40436 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -72,10 +72,6 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil - # cdef void node_samples( - # self, - # vector[vector[DOUBLE_t]]* dest - # ) noexcept nogil cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" @@ -94,6 +90,11 @@ cdef class Criterion(BaseCriterion): cdef void init_sum_missing(self) cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d60cab3063c1b..c3f08ec859bee 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class BaseCriterion: in current node and in children nodes. This object stores methods on how to calculate how good a split is using - a set API. + a set API. Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: @@ -186,9 +186,9 @@ cdef class BaseCriterion: ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. - The dataset array that we compute criteria on is assumed to consist of 'N' - ordered samples or rows (i.e. sorted). Since we pass this by reference, we - use sample pointers to move the start and end around to consider only a subset of data. + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. 
Parameters @@ -252,10 +252,28 @@ cdef class Criterion(BaseCriterion): Number of missing values for specific feature. """ pass - + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil: + cdef SIZE_t i, j + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i][k].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fc49471569ecc..fb21f676e66cc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +from libcpp.vector cimport vector from ._criterion cimport BaseCriterion, Criterion @@ -106,6 +107,8 @@ cdef class Splitter(BaseSplitter): const unsigned char[::1] feature_has_missing, ) except -1 + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a58514d093ddf..7f21d5da545fb 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -53,12 +53,12 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.n_missing = 0 cdef class BaseSplitter: - """This is an abstract interface for splitters. + """This is an abstract interface for splitters. For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of covariates, labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage - scikit-learn's Cython code for splitting. + scikit-learn's Cython code for splitting. A splitter is usually used in conjunction with a criterion class, which explicitly handles computing the criteria, which we split on. The setting of that criterion class is handled @@ -112,7 +112,7 @@ cdef class BaseSplitter: cdef int pointer_size(self) noexcept nogil: """Size of the pointer for split records. - + Overriding this function allows one to use different subclasses of `SplitRecord`. """ @@ -156,7 +156,6 @@ cdef class Splitter(BaseSplitter): self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -281,6 +280,10 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" @@ -293,7 +296,7 @@ cdef class Splitter(BaseSplitter): bint missing_go_to_left, ) noexcept nogil: """Check stopping conditions pre-split. - + This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. 
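To make the intent of the new `Criterion.node_samples` / `Splitter.node_samples` hooks above concrete, here is a rough NumPy-level sketch of what they collect: the rows of `y` whose indices sit in `sample_indices[start:end]`, grouped per leaf node id the way the tree-level `value_samples` map stores them. This is an illustrative sketch only; `collect_node_samples`, `build_leaf_samples`, and `leaf_ranges` are made-up names, not part of the patch.

```python
import numpy as np

def collect_node_samples(y, sample_indices, start, end):
    """Python sketch of Criterion.node_samples: gather the (n_outputs,)
    target rows of every sample that landed in the current node."""
    return [y[j].tolist() for j in sample_indices[start:end]]

def build_leaf_samples(y, sample_indices, leaf_ranges):
    """Sketch of what the tree's value_samples map ends up holding:
    {leaf node id: list of y rows}, mirroring
    unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]."""
    return {
        node_id: collect_node_samples(y, sample_indices, start, end)
        for node_id, (start, end) in leaf_ranges.items()
    }

# Toy usage: five samples, one output, two leaves covering [0, 3) and [3, 5).
y = np.array([[1.0], [2.0], [3.0], [10.0], [11.0]])
sample_indices = np.array([0, 1, 2, 3, 4])
leaf_samples = build_leaf_samples(y, sample_indices, {5: (0, 3), 6: (3, 5)})
print(np.quantile([row[0] for row in leaf_samples[5]], 0.5))  # 2.0
```

In the patch itself the collection stays in Cython: the tree builders call `splitter.node_samples(&tree.value_samples[node_id])` for leaf nodes when `store_leaf_values` is enabled, so no Python-level loop like the one above is ever executed.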
@@ -301,7 +304,7 @@ cdef class Splitter(BaseSplitter): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef SIZE_t end_non_missing = self.end - n_missing cdef SIZE_t n_left, n_right - + if missing_go_to_left: n_left = current_split.pos - self.start + n_missing n_right = end_non_missing - current_split.pos @@ -312,14 +315,14 @@ cdef class Splitter(BaseSplitter): # Reject if min_samples_leaf is not guaranteed if n_left < min_samples_leaf or n_right < min_samples_leaf: return 1 - + return 0 cdef bint check_postsplit_conditions( self ) noexcept nogil: """Check stopping conditions after evaluating the split. - + This takes some metric that is stored in the Criterion object and checks against internal stop metrics. """ @@ -329,10 +332,10 @@ cdef class Splitter(BaseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): return 1 - + return 0 - + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -360,7 +363,7 @@ cdef inline void shift_missing_values_to_left_if_required( ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -504,9 +507,9 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - + current_split.pos = p - + # Reject if min_samples_leaf is not guaranteed if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue @@ -740,8 +743,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index cbe85886cd865..94714cc33400c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -14,6 +14,7 @@ import numpy as np cimport numpy as cnp from libcpp.vector cimport vector +from libcpp.unordered_map cimport unordered_map ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -36,6 +37,7 @@ cdef struct Node: DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node unsigned char missing_go_to_left # Whether features have missing values + cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. @@ -45,7 +47,14 @@ cdef class BaseTree: cdef Node* nodes # Array of nodes cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample - cdef double* value # Array of values prediction values for each node + cdef double* value # Array of values prediction values for each node + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil @@ -61,7 +70,7 @@ cdef class BaseTree: double weighted_n_node_samples, unsigned char missing_go_to_left ) except -1 nogil - + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) @@ -101,10 +110,10 @@ cdef class Tree(BaseTree): # The Supervised Tree object is a binary tree structure constructed by the # TreeBuilder. The tree structure is used for predictions and # feature importances. - # + # # Value of upstream properties: # - value_stride = n_outputs * max_n_classes - # - value = (capacity, n_outputs, max_n_classes) array of values + # - value = (capacity, n_outputs, max_n_classes) array of values # Input/Output layout for supervised tree cdef public SIZE_t n_features # Number of features in X @@ -137,6 +146,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2256b28c7df10..8ca98a64b42ab 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -158,15 +158,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -221,6 +229,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 + cdef int node_idx + cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -308,6 +318,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "is_left": 1, "impurity": split.impurity_left, "n_constant_features": n_constant_features}) + elif self.store_leaf_values and is_leaf: + with gil: + print('Storing leaf values...') + + # copy leaf values to leaf_values array + splitter.node_samples(&tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -317,7 +333,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen - + # free the memory created for the SplitRecord pointer free(split_ptr) @@ -364,10 +380,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -375,6 +398,7 
@@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -488,7 +512,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -553,7 +577,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + free(split_ptr) return 0 @@ -564,7 +588,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef class BaseTree: """Base class for Cython tree models. - + Downstream classes must implement """ cdef int _resize( @@ -622,7 +646,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set split node data. - + Parameters ---------- split_node : SplitRecord* @@ -641,7 +665,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set leaf node data. - + Parameters ---------- split_node : SplitRecord* @@ -655,9 +679,12 @@ cdef class BaseTree: node.threshold = _TREE_UNDEFINED return 1 - cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, - Node *node) noexcept nogil: + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: """Compute feature from a given data matrix, X. In axis-aligned trees, this is simply the value in the column of X @@ -668,7 +695,7 @@ cdef class BaseTree: return feature cdef SIZE_t _add_node( - self, + self, SIZE_t parent, bint is_left, bint is_leaf, @@ -679,7 +706,9 @@ cdef class BaseTree: unsigned char missing_go_to_left ) except -1 nogil: """Add a node to the tree. + The new node registers itself as the child of its parent. + Parameters ---------- parent : SIZE_t @@ -697,7 +726,7 @@ cdef class BaseTree: The number of samples in the node. weighted_n_node_samples : double The weight of the samples in the node. - + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -719,12 +748,12 @@ cdef class BaseTree: if is_leaf: if self._set_leaf_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError else: if self._set_split_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -796,8 +825,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -928,8 +957,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -1043,7 +1072,7 @@ cdef class BaseTree: # ... 
and node.right_child != _TREE_LEAF: self._compute_feature_importances( importances, node) - + node += 1 for i in range(self.n_features): @@ -1065,7 +1094,7 @@ cdef class BaseTree: Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. - + Wrapped in a private function to allow subclassing that computes feature importances. """ @@ -1321,6 +1350,9 @@ cdef class Tree(BaseTree): self.value = NULL self.nodes = NULL + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + def __dealloc__(self): """Destructor.""" # Free all inner structures From 21ccb30478bdff652118af59a4cd614a23f799d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 10:35:44 -0400 Subject: [PATCH 14/28] [ENH] Adding leaf node samples to be stored when "quantile" tree is turned on (#45) #### Reference Issues/PRs Addresses the quantile-trees part of: https://github.com/neurodata/scikit-tree/issues/29 #### What does this implement/fix? Explain your changes. 1. Stores for each leaf node a 2D numpy array of the y-samples (remember `y` is (n_samples, n_outputs)) 2. Does this all the way in Criterion 3. Only supports supervised tree/splitter/criterion 4. merges in `main` changes. #### Any other comments? --------- Signed-off-by: Adam Li --- doc/authors_emeritus.rst | 1 + doc/contributor_experience_team.rst | 12 +- doc/modules/classes.rst | 1 + doc/modules/learning_curve.rst | 42 +- doc/visualizations.rst | 1 + doc/whats_new/v1.3.rst | 286 +++++--- .../plot_kernel_ridge_regression.py | 1 + .../model_selection/plot_validation_curve.py | 46 +- sklearn/base.py | 10 +- sklearn/calibration.py | 7 +- sklearn/cluster/_affinity_propagation.py | 4 +- sklearn/cluster/_agglomerative.py | 5 +- sklearn/cluster/_bicluster.py | 4 +- sklearn/cluster/_birch.py | 8 +- sklearn/cluster/_bisect_k_means.py | 4 +- sklearn/cluster/_dbscan.py | 7 +- sklearn/cluster/_feature_agglomeration.py | 34 +- sklearn/cluster/_kmeans.py | 11 +- sklearn/cluster/_mean_shift.py | 3 +- sklearn/cluster/_optics.py | 7 +- sklearn/cluster/_spectral.py | 4 +- .../tests/test_feature_agglomeration.py | 24 + sklearn/compose/_column_transformer.py | 7 +- sklearn/compose/_target.py | 6 +- sklearn/covariance/_elliptic_envelope.py | 3 +- sklearn/covariance/_empirical_covariance.py | 3 +- sklearn/covariance/_graph_lasso.py | 5 +- sklearn/covariance/_robust_covariance.py | 3 +- sklearn/covariance/_shrunk_covariance.py | 8 +- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/datasets/_arff_parser.py | 5 +- sklearn/datasets/tests/test_openml.py | 4 +- sklearn/decomposition/_dict_learning.py | 7 +- sklearn/decomposition/_factor_analysis.py | 4 +- sklearn/decomposition/_fastica.py | 7 +- sklearn/decomposition/_incremental_pca.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_lda.py | 7 +- sklearn/decomposition/_nmf.py | 44 +- sklearn/decomposition/_pca.py | 7 +- sklearn/decomposition/_sparse_pca.py | 3 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/decomposition/tests/test_nmf.py | 27 + sklearn/discriminant_analysis.py | 9 +- sklearn/dummy.py | 7 +- sklearn/ensemble/_bagging.py | 8 +- sklearn/ensemble/_forest.py | 174 ++++- sklearn/ensemble/_gb.py | 8 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_iforest.py | 3 +- sklearn/ensemble/_stacking.py | 8 +- sklearn/ensemble/_voting.py | 11 +- sklearn/ensemble/_weight_boosting.py | 8 +- sklearn/ensemble/tests/test_forest.py | 51 ++ .../feature_extraction/_dict_vectorizer.py | 5 +- 
sklearn/feature_extraction/_hash.py | 4 +- sklearn/feature_extraction/image.py | 3 +- sklearn/feature_extraction/text.py | 14 +- sklearn/feature_selection/_from_model.py | 11 +- sklearn/feature_selection/_rfe.py | 11 +- sklearn/feature_selection/_sequential.py | 7 +- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 3 +- sklearn/gaussian_process/_gpc.py | 4 +- sklearn/gaussian_process/_gpr.py | 4 +- sklearn/impute/_base.py | 8 +- sklearn/impute/_iterative.py | 8 +- sklearn/impute/_knn.py | 3 +- sklearn/isotonic.py | 3 +- sklearn/kernel_approximation.py | 13 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 5 +- sklearn/linear_model/_bayes.py | 8 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_glm/glm.py | 4 +- sklearn/linear_model/_huber.py | 3 +- sklearn/linear_model/_least_angle.py | 10 +- sklearn/linear_model/_logistic.py | 9 +- sklearn/linear_model/_omp.py | 7 +- sklearn/linear_model/_passive_aggressive.py | 9 +- sklearn/linear_model/_quantile.py | 3 +- sklearn/linear_model/_ransac.py | 7 +- sklearn/linear_model/_ridge.py | 13 +- sklearn/linear_model/_stochastic_gradient.py | 13 +- sklearn/linear_model/_theil_sen.py | 3 +- sklearn/manifold/_isomap.py | 13 +- sklearn/manifold/_locally_linear.py | 5 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 11 +- sklearn/metrics/pairwise.py | 14 +- sklearn/mixture/_base.py | 4 +- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_plot.py | 680 +++++++++++++++--- sklearn/model_selection/_search.py | 6 +- .../_search_successive_halving.py | 6 +- sklearn/model_selection/tests/test_plot.py | 337 +++++++-- sklearn/multiclass.py | 29 +- sklearn/multioutput.py | 26 +- sklearn/naive_bayes.py | 12 +- sklearn/neighbors/_classification.py | 12 +- sklearn/neighbors/_graph.py | 11 +- sklearn/neighbors/_kde.py | 7 +- sklearn/neighbors/_lof.py | 7 +- sklearn/neighbors/_nca.py | 4 +- sklearn/neighbors/_nearest_centroid.py | 4 +- sklearn/neighbors/_regression.py | 12 +- sklearn/neighbors/_unsupervised.py | 6 +- .../neural_network/_multilayer_perceptron.py | 12 +- sklearn/neural_network/_rbm.py | 9 +- sklearn/pipeline.py | 16 +- sklearn/preprocessing/_data.py | 57 +- sklearn/preprocessing/_discretization.py | 3 +- sklearn/preprocessing/_encoders.py | 7 +- .../preprocessing/_function_transformer.py | 3 +- sklearn/preprocessing/_label.py | 10 +- sklearn/preprocessing/_polynomial.py | 6 +- sklearn/preprocessing/_target_encoder.py | 5 +- sklearn/preprocessing/tests/test_data.py | 19 + sklearn/random_projection.py | 4 +- sklearn/semi_supervised/_label_propagation.py | 3 +- sklearn/semi_supervised/_self_training.py | 7 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 7 +- sklearn/tests/test_metadata_routing.py | 15 + sklearn/tests/test_public_functions.py | 1 + sklearn/tree/_classes.py | 185 ++++- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 15 +- sklearn/tree/_splitter.pxd | 4 +- sklearn/tree/_splitter.pyx | 46 +- sklearn/tree/_tree.pxd | 20 +- sklearn/tree/_tree.pyx | 58 +- sklearn/tree/tests/test_tree.py | 175 ++++- sklearn/utils/_metadata_requests.py | 12 +- sklearn/utils/_plotting.py | 40 ++ sklearn/utils/estimator_checks.py | 19 +- sklearn/utils/tests/test_param_validation.py | 4 +- sklearn/utils/tests/test_plotting.py | 63 ++ sklearn/utils/tests/test_validation.py | 10 + sklearn/utils/validation.py | 51 +- 141 files changed, 2511 insertions(+), 797 deletions(-) create mode 100644 
sklearn/utils/tests/test_plotting.py diff --git a/doc/authors_emeritus.rst b/doc/authors_emeritus.rst index b979b77bba974..a56e2bc408ff4 100644 --- a/doc/authors_emeritus.rst +++ b/doc/authors_emeritus.rst @@ -20,6 +20,7 @@ - Wei Li - Paolo Losi - Gilles Louppe +- Chiara Marmo - Vincent Michel - Jarrod Millman - Alexandre Passos diff --git a/doc/contributor_experience_team.rst b/doc/contributor_experience_team.rst index 2e09d9069849a..00b658632302e 100644 --- a/doc/contributor_experience_team.rst +++ b/doc/contributor_experience_team.rst @@ -18,6 +18,10 @@

     Lucy Liu

+    Maxwell Liu
+
     Juan Martin Loyola

@@ -26,14 +30,6 @@

     Sylvain Marié

-    Chiara Marmo
-
-    Maxwell Liu
-
     Norbert Preining

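Before the documentation updates that follow, it helps to make the quantile use case from this patch's summary concrete: each leaf now keeps the 2-D array of `y` rows (shape `(n_samples_in_leaf, n_outputs)`) that reached it, so a downstream estimator can compute arbitrary quantiles at predict time instead of only the mean. Below is a minimal sketch of that downstream step, assuming some Python-level view of the stored map; `predict_quantile`, `tree_apply`, and `leaf_samples` are hypothetical names, since the patch itself only adds the Cython-side storage.

```python
import numpy as np

def predict_quantile(tree_apply, leaf_samples, X, q=0.5, output=0):
    """Quantile prediction on top of stored leaf samples.

    tree_apply   : callable mapping X to leaf node ids (like tree_.apply)
    leaf_samples : dict {leaf node id: (n_samples_in_leaf, n_outputs) array}
    """
    leaf_ids = tree_apply(X)
    return np.array([
        np.quantile(np.asarray(leaf_samples[leaf])[:, output], q)
        for leaf in leaf_ids
    ])

# Toy usage with a stand-in apply() that sends every row to leaf 3.
def fake_apply(X):
    return np.full(len(X), 3)

fake_leaf_samples = {3: np.array([[1.0], [4.0], [9.0]])}
print(predict_quantile(fake_apply, fake_leaf_samples, np.zeros((2, 1))))  # [4. 4.]
```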
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4961fb0fec366..204c300b1a9b8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1247,6 +1247,7 @@ Visualization :template: display_only_from_estimator.rst model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay .. _multiclass_ref: diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 0ce64063d4cd9..3d458a1a67416 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -71,7 +71,7 @@ The function :func:`validation_curve` can help in this case:: >>> import numpy as np >>> from sklearn.model_selection import validation_curve >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import Ridge + >>> from sklearn.svm import SVC >>> np.random.seed(0) >>> X, y = load_iris(return_X_y=True) @@ -80,30 +80,50 @@ The function :func:`validation_curve` can help in this case:: >>> X, y = X[indices], y[indices] >>> train_scores, valid_scores = validation_curve( - ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), - ... cv=5) + ... SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3), + ... ) >>> train_scores - array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.51..., 0.52..., 0.49..., 0.47..., 0.49...]]) + array([[0.90..., 0.94..., 0.91..., 0.89..., 0.92...], + [0.9... , 0.92..., 0.93..., 0.92..., 0.93...], + [0.97..., 1... , 0.98..., 0.97..., 0.99...]]) >>> valid_scores - array([[0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.46..., 0.25..., 0.50..., 0.49..., 0.52...]]) + array([[0.9..., 0.9... , 0.9... , 0.96..., 0.9... ], + [0.9..., 0.83..., 0.96..., 0.96..., 0.93...], + [1.... , 0.93..., 1.... , 1.... , 0.9... ]]) + +If you intend to plot the validation curves only, the class +:class:`~sklearn.model_selection.ValidationCurveDisplay` is more direct than +using matplotlib manually on the results of a call to :func:`validation_curve`. +You can use the method +:meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` similarly +to :func:`validation_curve` to generate and plot the validation curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import ValidationCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + ValidationCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10) + ) If the training score and the validation score are both low, the estimator will be underfitting. If the training score is high and the validation score is low, the estimator is overfitting and otherwise it is working very well. A low training score and a high validation score is usually not possible. Underfitting, overfitting, and a working model are shown in the in the plot below where we vary -the parameter :math:`\gamma` of an SVM on the digits dataset. +the parameter `gamma` of an SVM with an RBF kernel on the digits dataset. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png :target: ../auto_examples/model_selection/plot_validation_curve.html :align: center :scale: 50% - .. 
_learning_curve: Learning curve diff --git a/doc/visualizations.rst b/doc/visualizations.rst index f692fd8efd1df..9a44f6feb1b48 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -89,3 +89,4 @@ Display Objects metrics.PredictionErrorDisplay metrics.RocCurveDisplay model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb35a1db224b4..41c03293cf067 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -29,11 +29,6 @@ random sampling procedures. `transform_algorithm` is not the same as `fit_algorithm` and the number of iterations is small. :pr:`24871` by :user:`Omar Salman `. -- |Fix| Treat more consistently small values in the `W` and `H` matrices during the - `fit` and `transform` steps of :class:`decomposition.NMF` and - :class:`decomposition.MiniBatchNMF` which can produce different results than previous - versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. - - |Enhancement| The `sample_weight` parameter now will be used in centroids initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` and :class:`cluster.MiniBatchKMeans`. @@ -43,6 +38,11 @@ random sampling procedures. :user:`Jérémie du Boisberranger `, :user:`Guillaume Lemaitre `. +- |Fix| Treat more consistently small values in the `W` and `H` matrices during the + `fit` and `transform` steps of :class:`decomposition.NMF` and + :class:`decomposition.MiniBatchNMF` which can produce different results than previous + versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. + - |Fix| :class:`decomposition.KernelPCA` may produce different results through `inverse_transform` if `gamma` is `None`. Now it will be chosen correctly as `1/n_features` of the data that it is fitted on, while previously it might be @@ -51,6 +51,14 @@ random sampling procedures. used each time the kernel is called. :pr:`26337` by :user:`Yao Xiao `. +Changed displays +---------------- + +- |Enhancement| :class:`model_selection.LearningCurveDisplay` displays both the + train and test curves by default. You can set `score_type="test"` to keep the + past behaviour. + :pr:`25120` by :user:`Guillaume Lemaitre `. + Changes impacting all modules ----------------------------- @@ -201,23 +209,9 @@ Changelog :mod:`sklearn.cluster` ...................... -- |API| The `sample_weight` parameter in `predict` for - :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` - is now deprecated and will be removed in v1.5. - :pr:`25251` by :user:`Gleb Levitski `. - -- |Enhancement| The `sample_weight` parameter now will be used in centroids - initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` - and :class:`cluster.MiniBatchKMeans`. - This change will break backward compatibility, since numbers generated - from same random seeds will be different. - :pr:`25752` by :user:`Gleb Levitski `, - :user:`Jérémie du Boisberranger `, - :user:`Guillaume Lemaitre `. - - |MajorFeature| Added :class:`cluster.HDBSCAN`, a modern hierarchical density-based clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a - generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat + generalization of :class:`cluster.DBSCAN` by allowing for hierarchical instead of flat clustering, however it varies in its approach from :class:`cluster.OPTICS`. 
This algorithm is very robust with respect to its hyperparameters' values and can be used on a wide variety of data without much, if any, tuning. @@ -228,12 +222,30 @@ Changelog :pr:`26385` by :user:`Meekail Zain ` +- |Enhancement| The `sample_weight` parameter now will be used in centroids + initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` + and :class:`cluster.MiniBatchKMeans`. + This change will break backward compatibility, since numbers generated + from same random seeds will be different. + :pr:`25752` by :user:`Gleb Levitski `, + :user:`Jérémie du Boisberranger `, + :user:`Guillaume Lemaitre `. + +- |API| The `sample_weight` parameter in `predict` for + :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` + is now deprecated and will be removed in v1.5. + :pr:`25251` by :user:`Gleb Levitski `. + +- |API| The `Xred` argument in :func:`cluster.FeatureAgglomeration.inverse_transform` + is renamed to `Xt` and will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.compose` ...................... -- |Fix| `compose.ColumnTransformer` raises an informative error when the individual transformers of `ColumnTransformer` - output pandas dataframes with indexes that are not consistent with each other and the output is configured - to be pandas. :pr:`26286` by `Thomas Fan`_. +- |Fix| `compose.ColumnTransformer` raises an informative error when the individual + transformers of `ColumnTransformer` output pandas dataframes with indexes that are + not consistent with each other and the output is configured to be pandas. + :pr:`26286` by `Thomas Fan`_. - |Fix| :class:`compose.ColumnTransformer` correctly sets the output of the remainder when `set_output` is called. :pr:`26323` by `Thomas Fan`_. @@ -241,6 +253,14 @@ Changelog :mod:`sklearn.covariance` ......................... +- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be + consistent with :func:`covariance.graphical_lasso`. + :pr:`26033` by :user:`Genesis Valencia `. + +- |Fix| :func:`covariance.empirical_covariance` now gives an informative + error message when input is not appropriate. + :pr:`26108` by :user:`Quentin Barthélemy `. + - |API| Deprecates `cov_init` in :func:`covariance.graphical_lasso` in 1.3 since the parameter has no effect. It will be removed in 1.5. :pr:`26033` by :user:`Genesis Valencia `. @@ -256,20 +276,13 @@ Changelog :func:`covariance.graphical_lasso_path`, and :class:`covariance.GraphicalLassoCV`. :pr:`26033` by :user:`Genesis Valencia `. -- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be - consistent with :func:`covariance.graphical_lasso`. - :pr:`26033` by :user:`Genesis Valencia `. - -- |Fix| :func:`covariance.empirical_covariance` now gives an informative - error message when input is not appropriate. - :pr:`26108` by :user:`Quentin Barthélemy `. - :mod:`sklearn.datasets` ....................... -- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` - is deprecated and will be removed in v1.5. - :pr:`25784` by :user:`Jérémie du Boisberranger`. +- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using + the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the + pandas parser. + :pr:`26433` by :user:`Guillaume Lemaitre `. - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. @@ -279,28 +292,35 @@ Changelog the pandas parser. 
The parameter `read_csv_kwargs` allows to overwrite this behaviour. :pr:`26551` by :user:`Guillaume Lemaitre `. -- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using - the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the - pandas parser. - :pr:`26433` by :user:`Guillaume Lemaitre `. +- |Fix| :func:`dataasets.fetch_openml` will consistenly use `np.nan` as missing marker + with both parsers `"pandas"` and `"liac-arff"`. + :pr:`26579` by :user:`Guillaume Lemaitre `. + +- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` + is deprecated and will be removed in v1.5. + :pr:`25784` by :user:`Jérémie du Boisberranger`. :mod:`sklearn.decomposition` ............................ -- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter - `callback` for consistency with the function :func:`decomposition.dict_learning`. - :pr:`24871` by :user:`Omar Salman `. - - |Efficiency| :class:`decomposition.MiniBatchDictionaryLearning` and :class:`decomposition.MiniBatchSparsePCA` are now faster for small batch sizes by avoiding duplicate validations. :pr:`25490` by :user:`Jérémie du Boisberranger `. +- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter + `callback` for consistency with the function :func:`decomposition.dict_learning`. + :pr:`24871` by :user:`Omar Salman `. + - |Fix| Treat more consistently small values in the `W` and `H` matrices during the `fit` and `transform` steps of :class:`decomposition.NMF` and :class:`decomposition.MiniBatchNMF` which can produce different results than previous versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. +- |API| The `W` argument in :func:`decomposition.NMF.inverse_transform` and + :class:`decomposition.MiniBatchNMF.inverse_transform` is renamed to `Xt` and + will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.discriminant_analysis` .................................... @@ -364,6 +384,7 @@ Changelog :mod:`sklearn.exception` ........................ + - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised when a scikit-learn estimator is unpickled with a scikit-learn version that is inconsistent with the sckit-learn version the estimator was pickled with. @@ -393,6 +414,9 @@ Changelog - |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`. :pr:`25232` by :user:`Thijs van Weezel `. +- |Fix| :class:`impute.IterativeImputer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -420,12 +444,6 @@ Changelog now preserve dtype for `numpy.float32`. :pr:`25587` by :user:`Omar Salman `. -- |API| Deprecates `n_iter` in favor of `max_iter` in - :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. - `n_iter` will be removed in scikit-learn 1.5. This change makes those - estimators consistent with the rest of estimators. - :pr:`25697` by :user:`John Pangas `. - - |Enhancement| The `n_iter_` attribute has been included in :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. @@ -436,36 +454,41 @@ Changelog on linearly separable problems. :pr:`25214` by `Tom Dupre la Tour`_. +- |API| Deprecates `n_iter` in favor of `max_iter` in + :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. 
+ `n_iter` will be removed in scikit-learn 1.5. This change makes those + estimators consistent with the rest of estimators. + :pr:`25697` by :user:`John Pangas `. + +:mod:`sklearn.manifold` +....................... + +- |Fix| :class:`manifold.Isomap` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... -- |Efficiency| The computation of the expected mutual information in - :func:`metrics.adjusted_mutual_info_score` is now faster when the number of - unique labels is large and its memory usage is reduced in general. - :pr:`25713` by :user:`Kshitij Mathur `, - :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and - :user:`Jérémie du Boisberranger `. - - |Feature| Adds `zero_division=np.nan` to multiple classification metrics: - :func:`precision_score`, :func:`recall_score`, :func:`f1_score`, - :func:`fbeta_score`, :func:`precision_recall_fscore_support`, - :func:`classification_report`. When `zero_division=np.nan` and there is a + :func:`metrics.precision_score`, :func:`metrics.recall_score`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.precision_recall_fscore_support`, + :func:`metrics.classification_report`. When `zero_division=np.nan` and there is a zero division, the metric is undefined and is excluded from averaging. When not used for averages, the value returned is `np.nan`. :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. - :pr:`25432` by :user:`Julien Jerphanion `. - -- |Fix| Fixed :func:`classification_report` so that empty input will return - `np.nan`. Previously, "macro avg" and `weighted avg` would return - e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they - both return `np.nan`. - :pr:`25531` by :user:`Marc Torrellas Socastro `. +- |Feature| :func:`metrics.average_precision_score` now supports the + multiclass case. + :pr:`17388` by :user:`Geoffrey Bolmier ` and + :pr:`24769` by :user:`Ashwin Mathur `. -- |Fix| :func:`metric.ndcg_score` now gives a meaningful error message for input of - length 1. - :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. +- |Efficiency| The computation of the expected mutual information in + :func:`metrics.adjusted_mutual_info_score` is now faster when the number of + unique labels is large and its memory usage is reduced in general. + :pr:`25713` by :user:`Kshitij Mathur `, + :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and + :user:`Jérémie du Boisberranger `. - |Enhancement| :class:`metrics.silhouette_samples` nows accepts a sparse matrix of pairwise distances between samples, or a feature array. @@ -492,17 +515,23 @@ Changelog chance level. This line is exposed in the `chance_level_` attribute. :pr:`26019` by :user:`Yao Xiao `. -- |Fix| :func:`log_loss` raises a warning if the values of the parameter `y_pred` are - not normalized, instead of actually normalizing them in the metric. Starting from - 1.5 this will raise an error. :pr:`25299` by :user:`Omar Salman `. + +- |Fix| Fixed :func:`metrics.classification_report` so that empty input will return + `np.nan`. Previously, "macro avg" and `weighted avg` would return + e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they + both return `np.nan`. + :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be - removed in 1.5. 
:pr:`25299` by :user:`Omar Salman `. +- |Fix| :func:`metrics.ndcg_score` now gives a meaningful error message for input of + length 1. + :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. -- |Feature| :func:`metrics.average_precision_score` now supports the - multiclass case. - :pr:`17388` by :user:`Geoffrey Bolmier ` and - :pr:`24769` by :user:`Ashwin Mathur `. +- |Fix| :func:`metrics.log_loss` raises a warning if the values of the parameter + `y_pred` are not normalized, instead of actually normalizing them in the metric. + Starting from 1.5 this will raise an error. + :pr:`25299` by :user:`Omar Salman ` +- |API| The `eps` parameter of the :func:`metrics.log_loss` has been deprecated and + will be removed in 1.5. :pr:`25299` by :user:`Omar Salman `. + :mod:`sklearn.gaussian_process` ............................... @@ -524,6 +556,18 @@ Changelog :mod:`sklearn.model_selection` .............................. +- |MajorFeature| Added the class :class:`model_selection.ValidationCurveDisplay` + that allows easy plotting of validation curves obtained by the function + :func:`model_selection.validation_curve`. + :pr:`25120` by :user:`Guillaume Lemaitre `. + +- |API| The parameter `log_scale` in the class + :class:`model_selection.LearningCurveDisplay` has been deprecated in 1.3 and + will be removed in 1.5. The default scale can be overriden by setting it + directly on the `ax` object and will be set automatically from the spacing + of the data points otherwise. + :pr:`25120` by :user:`Guillaume Lemaitre `. + - |Enhancement| :func:`model_selection.cross_validate` accepts a new parameter `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. @@ -546,15 +590,15 @@ Changelog :mod:`sklearn.neighbors` ........................ -- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This - dissimilarity is not a metric and cannot be supported by the BallTree. - :pr:`25417` by :user:`Guillaume Lemaitre `. - - |Enhancement| The performance of :meth:`neighbors.KNeighborsClassifier.predict` and of :meth:`neighbors.KNeighborsClassifier.predict_proba` has been improved when `n_neighbors` is large and `algorithm="brute"` with non Euclidean metrics. :pr:`24076` by :user:`Meekail Zain `, :user:`Julien Jerphanion `. +- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This + dissimilarity is not a metric and cannot be supported by the BallTree. + :pr:`25417` by :user:`Guillaume Lemaitre `. + - |API| The support for metrics other than `euclidean` and `manhattan` and for callables in :class:`neighbors.NearestNeighbors` is deprecated and will be removed in version 1.5. :pr:`24083` by :user:`Valentin Laurent `. @@ -592,10 +636,24 @@ Changelog categorical encoding based on target mean conditioned on the value of the category. :pr:`25334` by `Thomas Fan`_. +- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping + infrequent categories into a single feature. Grouping infrequent categories + is enabled by specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. + +- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the + number of expanded terms a-priori when dealing with sparse `csr` matrices + in order to optimize the choice of `dtype` for `indices` and `indptr`. 
It + can now output `csr` matrices with `np.int32` `indices/indptr` components + when there are few enough elements, and will automatically use `np.int64` + for sufficiently large matrices. + :pr:`20524` by :user:`niuk-a ` and + :pr:`23731` by :user:`Meekail Zain ` + - |Enhancement| A new parameter `sparse_output` was added to - :class:`SplineTransformer`, available as of SciPy 1.8. If `sparse_output=True`, - :class:`SplineTransformer` returns a sparse CSR matrix. - :pr:`24145` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer`, available as of SciPy 1.8. If + `sparse_output=True`, :class:`preprocessing.SplineTransformer` returns a sparse + CSR matrix. :pr:`24145` by :user:`Christian Lorentzen `. - |Enhancement| Adds a `feature_name_combiner` parameter to :class:`preprocessing.OneHotEncoder`. This specifies a custom callable to create @@ -610,28 +668,35 @@ Changelog :pr:`24935` by :user:`Seladus `, :user:`Guillaume Lemaitre `, and :user:`Dea María Léon `, :pr:`25257` by :user:`Gleb Levitski `. -- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping - infrequent categories into a single feature. Grouping infrequent categories - is enabled by specifying how to select infrequent categories with - `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. - - |Enhancement| Subsampling through the `subsample` parameter can now be used in :class:`preprocessing.KBinsDiscretizer` regardless of the strategy used. :pr:`26424` by :user:`Jérémie du Boisberranger `. -- |API| The default value of the `subsample` parameter of - :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in - version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. - :pr:`26424` by :user:`Jérémie du Boisberranger `. +- |Fix| :class:`preprocessing.AdditiveChi2Sampler` is now stateless. + The `sample_interval_` attribute is deprecated and will be removed in 1.5. + :pr:`25190` by :user:`Vincent Maladière `. - |Fix| :class:`AdditiveChi2Sampler` is now stateless. The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. +- |Fix| :class:`preprocessing.PowerTransformer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. +- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves + constant features unchanged instead of transforming with an arbitrary value for + the `lambdas_` fitted parameter. + :pr:`26566` by :user:`Jérémie du Boisberranger `. + +- |API| The default value of the `subsample` parameter of + :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in + version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. + :pr:`26424` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.svm` .................. @@ -660,45 +725,36 @@ Changelog :mod:`sklearn.utils` .................... -- |API| :func:`estimator_checks.check_transformers_unfitted_stateless` has been +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. + +- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with + extension arrays and object dtypes by return an ndarray with object dtype. + :pr:`25814` by `Thomas Fan`_. 
+ +- |API| :func:`utils.estimator_checks.check_transformers_unfitted_stateless` has been introduced to ensure stateless transformers don't raise `NotFittedError` during `transform` with no prior call to `fit` or `fit_transform`. :pr:`25190` by :user:`Vincent Maladière `. -- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the - number of expanded terms a-priori when dealing with sparse `csr` matrices - in order to optimize the choice of `dtype` for `indices` and `indptr`. It - can now output `csr` matrices with `np.int32` `indices/indptr` components - when there are few enough elements, and will automatically use `np.int64` - for sufficiently large matrices. - :pr:`20524` by :user:`niuk-a ` and - :pr:`23731` by :user:`Meekail Zain ` - - |API| A `FutureWarning` is now raised when instantiating a class which inherits from a deprecated base class (i.e. decorated by :class:`utils.deprecated`) and which overrides the `__init__` method. :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. -- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. - -- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with - extension arrays and object dtypes by return an ndarray with object dtype. - :pr:`25814` by `Thomas Fan`_. - :mod:`sklearn.semi_supervised` .............................. -- |Enhancement| :meth:`LabelSpreading.fit` and :meth:`LabelPropagation.fit` now - accepts sparse metrics. +- |Enhancement| :meth:`semi_supervised.LabelSpreading.fit` and + :meth:`semi_supervised.LabelPropagation.fit` now accepts sparse metrics. :pr:`19664` by :user:`Kaushik Amar Das `. Miscellaneous ............. -- |Enhancement| Replace obsolete exceptions EnvironmentError, IOError and - WindowsError. +- |Enhancement| Replace obsolete exceptions `EnvironmentError`, `IOError` and + `WindowsError`. :pr:`26466` by :user:`Dimitri Papadopoulos ORfanos `. 
Code and Documentation Contributors diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index 20b8496ab18aa..fa7cb15446473 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -203,6 +203,7 @@ "scoring": "neg_mean_squared_error", "negate_score": True, "score_name": "Mean Squared Error", + "score_type": "test", "std_display_style": None, "ax": ax, } diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index 1b3c562594188..48aa19dfbc556 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -18,53 +18,23 @@ from sklearn.datasets import load_digits from sklearn.svm import SVC -from sklearn.model_selection import validation_curve +from sklearn.model_selection import ValidationCurveDisplay X, y = load_digits(return_X_y=True) subset_mask = np.isin(y, [1, 2]) # binary classification: 1 vs 2 X, y = X[subset_mask], y[subset_mask] -param_range = np.logspace(-6, -1, 5) -train_scores, test_scores = validation_curve( +disp = ValidationCurveDisplay.from_estimator( SVC(), X, y, param_name="gamma", - param_range=param_range, - scoring="accuracy", + param_range=np.logspace(-6, -1, 5), + score_type="both", n_jobs=2, + score_name="Accuracy", ) -train_scores_mean = np.mean(train_scores, axis=1) -train_scores_std = np.std(train_scores, axis=1) -test_scores_mean = np.mean(test_scores, axis=1) -test_scores_std = np.std(test_scores, axis=1) - -plt.title("Validation Curve with SVM") -plt.xlabel(r"$\gamma$") -plt.ylabel("Score") -plt.ylim(0.0, 1.1) -lw = 2 -plt.semilogx( - param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw -) -plt.fill_between( - param_range, - train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, - alpha=0.2, - color="darkorange", - lw=lw, -) -plt.semilogx( - param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw -) -plt.fill_between( - param_range, - test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, - alpha=0.2, - color="navy", - lw=lw, -) -plt.legend(loc="best") +disp.ax_.set_title("Validation Curve for SVM with an RBF kernel") +disp.ax_.set_xlabel(r"gamma (inverse radius of the RBF kernel)") +disp.ax_.set_ylim(0.0, 1.1) plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 5cced34d4b8f0..13bbcab96aa61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -27,7 +27,7 @@ from .utils.validation import _num_features from .utils.validation import _check_feature_names_in from .utils.validation import _generate_get_feature_names_out -from .utils.validation import check_is_fitted +from .utils.validation import _is_fitted, check_is_fitted from .utils._metadata_requests import _MetadataRequester from .utils.validation import _get_feature_names from .utils._estimator_html_repr import estimator_html_repr @@ -1131,7 +1131,13 @@ def decorator(fit_method): @functools.wraps(fit_method) def wrapper(estimator, *args, **kwargs): global_skip_validation = get_config()["skip_parameter_validation"] - if not global_skip_validation: + + # we don't want to validate again for each call to partial_fit + partial_fit_and_fitted = ( + fit_method.__name__ == "partial_fit" and _is_fitted(estimator) + ) + + if not global_skip_validation and not partial_fit_and_fitted: estimator._validate_params() with config_context( diff --git 
a/sklearn/calibration.py b/sklearn/calibration.py index 5e7bfe2ab4a31..e4869387f4166 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -25,6 +25,7 @@ RegressorMixin, clone, MetaEstimatorMixin, + _fit_context, ) from .preprocessing import label_binarize, LabelEncoder from .utils import ( @@ -318,6 +319,10 @@ def _get_estimator(self): return estimator + @_fit_context( + # CalibratedClassifierCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the calibrated model. @@ -341,8 +346,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns an instance of self. """ - self._validate_params() - check_classification_targets(y) X, y = indexable(X, y) if sample_weight is not None: diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 8a3c2c2acde62..1ffc5f07e8c50 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,6 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.validation import check_is_fitted @@ -469,6 +470,7 @@ def __init__( def _more_tags(self): return {"pairwise": self.affinity == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the clustering from features, or affinity matrix. @@ -488,8 +490,6 @@ def fit(self, X, y=None): self Returns the instance itself. """ - self._validate_params() - if self.affinity == "precomputed": accept_sparse = False else: diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 059056275ef3d..b7d08a45dcd80 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,6 +16,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, ClusterMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..metrics.pairwise import paired_distances from ..metrics.pairwise import _VALID_METRICS from ..metrics import DistanceMetric @@ -950,6 +951,7 @@ def __init__( self.metric = metric self.compute_distances = compute_distances + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering from features, or distance matrix. @@ -968,7 +970,6 @@ def fit(self, X, y=None): self : object Returns the fitted instance. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2) return self._fit(X) @@ -1324,6 +1325,7 @@ def __init__( ) self.pooling_func = pooling_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering on the data. @@ -1340,7 +1342,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, ensure_min_features=2) super()._fit(X.T) self._n_features_out = self.n_clusters_ diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index ba837bacc99d5..4133264626ebb 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -13,6 +13,7 @@ from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import check_scalar @@ -118,6 +119,7 @@ def __init__( def _check_parameters(self, n_samples): """Validate parameters depending on the input data.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Create a biclustering for X. @@ -134,8 +136,6 @@ def fit(self, X, y=None): self : object SpectralBiclustering instance. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=np.float64) self._check_parameters(X.shape[0]) self._fit(X) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 4c9d7921fdc70..e74630572a014 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -16,6 +16,7 @@ ClusterMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils.extmath import row_norms from ..utils._param_validation import Interval @@ -501,6 +502,7 @@ def __init__( self.compute_labels = compute_labels self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Build a CF Tree for the input data. @@ -518,9 +520,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - - self._validate_params() - return self._fit(X, partial=False) def _fit(self, X, partial): @@ -610,6 +609,7 @@ def _get_leaves(self): leaf_ptr = leaf_ptr.next_leaf_ return leaves + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X=None, y=None): """ Online learning. Prevents rebuilding of CFTree from scratch. @@ -629,8 +629,6 @@ def partial_fit(self, X=None, y=None): self Fitted estimator. """ - self._validate_params() - if X is None: # Perform just the final global clustering step. self._global_clustering() diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index fc2b38cc1bca9..959d78ae85009 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -6,6 +6,7 @@ import numpy as np import scipy.sparse as sp +from ..base import _fit_context from ._kmeans import _BaseKMeans from ._kmeans import _kmeans_single_elkan from ._kmeans import _kmeans_single_lloyd @@ -347,6 +348,7 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): cluster_to_bisect.split(best_labels, best_centers, scores) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute bisecting k-means clustering. @@ -373,8 +375,6 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index aa81ef27702e6..3c753935ac046 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import _VALID_METRICS from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils.validation import _check_sample_weight from ..utils._param_validation import Interval, StrOptions from ..neighbors import NearestNeighbors @@ -338,6 +339,10 @@ def __init__( self.p = p self.n_jobs = n_jobs + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Perform DBSCAN clustering from features, or distance matrix. 
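The hunks throughout this patch follow the same recipe: drop the explicit `self._validate_params()` call at the top of `fit`/`partial_fit` and decorate the method with `_fit_context`. A minimal sketch of what such a decorator does, assuming the `skip_parameter_validation` config key shown in the `sklearn/base.py` hunk above (simplified; the real decorator also skips re-validation for already-fitted `partial_fit` calls):

```python
import functools

from sklearn._config import config_context, get_config


def _fit_context_sketch(*, prefer_skip_nested_validation):
    """Simplified stand-in for the ``_fit_context`` decorator used above."""

    def decorator(fit_method):
        @functools.wraps(fit_method)
        def wrapper(estimator, *args, **kwargs):
            global_skip = get_config()["skip_parameter_validation"]
            if not global_skip:
                # replaces the removed ``self._validate_params()`` calls
                estimator._validate_params()
            # while fitting, optionally silence validation in nested estimators
            with config_context(
                skip_parameter_validation=prefer_skip_nested_validation or global_skip
            ):
                return fit_method(estimator, *args, **kwargs)

        return wrapper

    return decorator
```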
@@ -363,8 +368,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") if sample_weight is not None: diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 457a83dd41e71..55baf247a2931 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -5,10 +5,12 @@ # Author: V. Michel, A. Gramfort # License: BSD 3 clause +import warnings import numpy as np from ..base import TransformerMixin from ..utils.validation import check_is_fitted +from ..utils import metadata_routing from scipy.sparse import issparse ############################################################################### @@ -20,6 +22,11 @@ class AgglomerationTransform(TransformerMixin): A class for feature agglomeration via the transform interface. """ + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``Xred`` arg on ``inverse_transform``. + # TODO(1.5): remove when Xred is removed for inverse_transform. + __metadata_request__inverse_transform = {"Xred": metadata_routing.UNUSED} + def transform(self, X): """ Transform a new matrix using the built clustering. @@ -54,22 +61,43 @@ def transform(self, X): nX = np.array(nX).T return nX - def inverse_transform(self, Xred): + def inverse_transform(self, Xt=None, Xred=None): """ Inverse the transformation and return a vector of size `n_features`. Parameters ---------- - Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,) + Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,) The values to be assigned to each cluster of samples. + Xred : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : ndarray of shape (n_samples, n_features) or (n_features,) A vector of size `n_samples` with the values of `Xred` assigned to each of the cluster of samples. """ + if Xt is None and Xred is None: + raise TypeError("Missing required positional argument: Xt") + + if Xred is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `Xred`.") + + if Xred is not None: + warnings.warn( + ( + "Input argument `Xred` was renamed to `Xt` in v1.3 and will be" + " removed in v1.5." + ), + FutureWarning, + ) + Xt = Xred + check_is_fitted(self) unil, inverse = np.unique(self.labels_, return_inverse=True) - return Xred[..., inverse] + return Xt[..., inverse] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 971d5735fbe2b..b36999885a14e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -23,6 +23,7 @@ ClusterMixin, TransformerMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..metrics.pairwise import euclidean_distances from ..metrics.pairwise import _euclidean_distances @@ -1448,6 +1449,7 @@ def _warn_mkl_vcomp(self, n_active_threads): f" variable OMP_NUM_THREADS={n_active_threads}." ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -1475,8 +1477,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2057,6 +2057,7 @@ def _random_reassign(self): return True return False + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
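For the `_feature_agglomeration.py` hunk above, the renamed keyword behaves as in this sketch, which mirrors the new test added further below in `test_feature_agglomeration.py`:

```python
import numpy as np

from sklearn.cluster import FeatureAgglomeration

X = np.array([[0.0, 0.0, 1.0]])  # (n_samples, n_features)
agglo = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean).fit(X)
Xt = agglo.transform(X)

agglo.inverse_transform(Xt)       # new spelling, no warning
agglo.inverse_transform(Xred=Xt)  # still accepted, but emits a FutureWarning
# agglo.inverse_transform()                 -> TypeError (Xt is required)
# agglo.inverse_transform(Xt=Xt, Xred=Xt)   -> ValueError (only one may be given)
```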
@@ -2084,8 +2085,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2214,6 +2213,7 @@ def fit(self, X, y=None, sample_weight=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. @@ -2241,9 +2241,6 @@ def partial_fit(self, X, y=None, sample_weight=None): """ has_centers = hasattr(self, "cluster_centers_") - if not has_centers: - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 46a00ed3f0740..6b0f227d011f9 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -24,6 +24,7 @@ from ..utils.parallel import delayed, Parallel from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors from ..metrics.pairwise import pairwise_distances_argmin from .._config import config_context @@ -435,6 +436,7 @@ def __init__( self.n_jobs = n_jobs self.max_iter = max_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform clustering. @@ -451,7 +453,6 @@ def fit(self, X, y=None): self : object Fitted instance. """ - self._validate_params() X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f1c66ada2d4e..ca1c74d6f44e7 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -24,6 +24,7 @@ from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..metrics import pairwise_distances from scipy.sparse import issparse, SparseEfficiencyWarning @@ -288,6 +289,10 @@ def __init__( self.memory = memory self.n_jobs = n_jobs + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Perform OPTICS clustering. @@ -311,8 +316,6 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() - dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: msg = ( diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e0ab7da938bfd..f72db4b7c1da3 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,6 +15,7 @@ from scipy.sparse import csc_matrix from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils import check_random_state, as_float_array from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -649,6 +650,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform spectral clustering from features, or affinity matrix. @@ -671,8 +673,6 @@ def fit(self, X, y=None): self : object A fitted instance of the estimator. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=["csr", "csc", "coo"], diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 3e4aa816b79c0..3db2862384c74 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -2,9 +2,11 @@ Tests for sklearn.cluster._feature_agglomeration """ # Authors: Sergul Aydore 2017 +import warnings import numpy as np from numpy.testing import assert_array_equal +import pytest from sklearn.cluster import FeatureAgglomeration from sklearn.utils._testing import assert_array_almost_equal from sklearn.datasets import make_blobs @@ -53,3 +55,25 @@ def test_feature_agglomeration_feature_names_out(): assert_array_equal( [f"featureagglomeration{i}" for i in range(n_clusters)], names_out ) + + +# TODO(1.5): remove this test +def test_inverse_transform_Xred_deprecation(): + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + est = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean) + est.fit(X) + Xt = est.transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, Xred=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `Xred` was renamed to `Xt`"): + est.inverse_transform(Xred=Xt) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index aab021c0c8d4f..14349662cfee9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -14,6 +14,7 @@ from scipy import sparse from ..base import clone, TransformerMixin +from ..base import _fit_context from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer @@ -701,12 +702,15 @@ def fit(self, X, y=None): self : ColumnTransformer This estimator. """ - self._validate_params() # we use fit_transform to make sure to set sparse_output_ (for which we # need the transformed data) to have consistent output type in predict self.fit_transform(X, y=y) return self + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit all transformers, transform the data and concatenate results. @@ -728,7 +732,6 @@ def fit_transform(self, X, y=None): any result is a sparse matrix, everything will be converted to sparse matrices. """ - self._validate_params() self._check_feature_names(X, reset=True) X = _check_X(X) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index f31a5a49b641e..e926ed7abe324 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -7,6 +7,7 @@ import numpy as np from ..base import BaseEstimator, RegressorMixin, clone +from ..base import _fit_context from ..utils.validation import check_is_fitted from ..utils._tags import _safe_tags from ..utils import check_array, _safe_indexing @@ -197,6 +198,10 @@ def _fit_transformer(self, y): UserWarning, ) + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. 
+ prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the model according to the given training data. @@ -218,7 +223,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() if y is None: raise ValueError( f"This {self.__class__.__name__} estimator " diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 1ef0eedd62f64..c99f200592580 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -9,6 +9,7 @@ from ..utils.validation import check_is_fitted from ..metrics import accuracy_score from ..base import OutlierMixin +from ..base import _fit_context class EllipticEnvelope(OutlierMixin, MinCovDet): @@ -162,6 +163,7 @@ def __init__( ) self.contamination = contamination + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the EllipticEnvelope model. @@ -178,7 +180,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - # `_validate_params` is called in `MinCovDet` super().fit(X) self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) return self diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 7fc23f36d92d3..8083bfd2e1aa1 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -16,6 +16,7 @@ from .. import config_context from ..base import BaseEstimator +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import validate_params from ..utils.extmath import fast_logdet @@ -218,6 +219,7 @@ def get_precision(self): precision = linalg.pinvh(self.covariance_, check_finite=False) return precision + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the maximum likelihood covariance estimator to X. @@ -235,7 +237,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index afe21fa3a02f1..8575cc4f75801 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -16,6 +16,7 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils.validation import ( _is_arraylike_not_scalar, @@ -532,6 +533,7 @@ def __init__( self.alpha = alpha self.covariance = covariance + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso model to X. @@ -548,7 +550,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2) @@ -925,6 +926,7 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso covariance model to X. @@ -941,7 +943,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2) if self.assume_centered: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index f3dd6d60badf8..c723bba7a097b 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -15,6 +15,7 @@ from scipy.stats import chi2 from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array from ..utils._param_validation import Interval @@ -719,6 +720,7 @@ def __init__( self.support_fraction = support_fraction self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit a Minimum Covariance Determinant with the FastMCD algorithm. @@ -736,7 +738,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet") random_state = check_random_state(self.random_state) n_samples, n_features = X.shape diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 4bf3d9a490b6b..21d2e034b45d7 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,6 +18,7 @@ import numpy as np from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import Interval, validate_params @@ -237,6 +238,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1 ) self.shrinkage = shrinkage + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the shrunk covariance model to X. @@ -254,7 +256,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision @@ -533,6 +534,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, block_size=10 ) self.block_size = block_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Ledoit-Wolf shrunk covariance model to X. @@ -549,7 +551,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) X = self._validate_data(X) @@ -722,6 +723,7 @@ class OAS(EmpiricalCovariance): 0.0195... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Oracle Approximating Shrinkage covariance model to X. @@ -738,8 +740,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index a5e5a1ceff09a..da395d8f060fb 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,6 +16,7 @@ from ..base import BaseEstimator, RegressorMixin, TransformerMixin from ..base import MultiOutputMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_consistent_length from ..utils.fixes import sp_version from ..utils.fixes import parse_version @@ -208,6 +209,7 @@ def __init__( self.tol = tol self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -226,8 +228,6 @@ def fit(self, X, Y): self : object Fitted model. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 @@ -958,6 +958,7 @@ def __init__(self, n_components=2, *, scale=True, copy=True): self.scale = scale self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -974,8 +975,6 @@ def fit(self, X, Y): self : object Fitted estimator. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 7b2faa4b67f4d..bba06fbb74021 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -204,7 +204,10 @@ def _io_to_generator(gzip_file): if len(dfs) >= 2: dfs[0] = dfs[0].astype(dfs[1].dtypes) - frame = pd.concat(dfs, ignore_index=True) + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan) del dfs, first_df # cast the columns frame diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 42f64fba2037b..c13b82dd769d3 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -920,9 +920,7 @@ def datasets_missing_values(): (1119, "liac-arff", 9, 6, 0), (1119, "pandas", 9, 0, 6), # miceprotein - # 1 column has only missing values with object dtype - (40966, "liac-arff", 1, 76, 0), - # with casting it will be transformed to either float or Int64 + (40966, "liac-arff", 1, 77, 0), (40966, "pandas", 1, 77, 0), # titanic (40945, "liac-arff", 3, 6, 0), diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index ab2f87de4bb84..54b3590f5b62e 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1796,6 +1796,7 @@ def fit(self, X, y=None): self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model from data in X and return the transformed data. @@ -1813,8 +1814,6 @@ def fit_transform(self, X, y=None): V : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) method = "lasso_" + self.fit_algorithm @@ -2435,6 +2434,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Update the model using the data in X as a mini-batch. @@ -2454,9 +2454,6 @@ def partial_fit(self, X, y=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, dtype=[np.float64, np.float32], order="C", reset=not has_components ) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a6507d167b9cb..8c3d590b2c814 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -27,6 +27,7 @@ from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import fast_logdet, randomized_svd, squared_norm @@ -197,6 +198,7 @@ def __init__( self.random_state = random_state self.rotation = rotation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the FactorAnalysis model to X using SVD based approach. @@ -213,8 +215,6 @@ def fit(self, X, y=None): self : object FactorAnalysis class instance. """ - self._validate_params() - X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 680a6cd8bbee1..6dcf62c0ace3b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -16,6 +16,7 @@ from scipy import linalg from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted @@ -672,6 +673,7 @@ def g(x, fun_args): return S + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model and recover the sources from X. @@ -690,10 +692,9 @@ def fit_transform(self, X, y=None): Estimated sources obtained by transforming the data with the estimated unmixing matrix. """ - self._validate_params() - return self._fit_transform(X, compute_sources=True) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to X. @@ -711,8 +712,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - self._fit_transform(X, compute_sources=False) return self diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index d98a5f4fb3b7a..5ae5d58b06ca4 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -9,6 +9,7 @@ from scipy import linalg, sparse from ._base import _BasePCA +from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval from ..utils.extmath import svd_flip, _incremental_mean_and_var @@ -192,6 +193,7 @@ def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=Non self.copy = copy self.batch_size = batch_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X, using minibatches of size batch_size. @@ -209,8 +211,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self.components_ = None self.n_samples_seen_ = 0 self.mean_ = 0.0 @@ -243,6 +243,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, check_input=True): """Incremental fit with X. All of X is processed as a single batch. @@ -265,9 +266,6 @@ def partial_fit(self, X, y=None, check_input=True): """ first_pass = not hasattr(self, "components_") - if first_pass: - self._validate_params() - if check_input: if sparse.issparse(X): raise TypeError( diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index fadcd6f94a2f8..61d502a006c5e 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -19,6 +19,7 @@ from ..utils._param_validation import Interval, StrOptions from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -404,6 +405,7 @@ def _fit_inverse_transform(self, X_transformed, X): self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) self.X_transformed_fit_ = X_transformed + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -421,8 +423,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - if self.fit_inverse_transform and self.kernel == "precomputed": raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 21829d4fedab3..ab1ea5ebb5460 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -18,6 +18,7 @@ from joblib import effective_n_jobs from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -568,6 +569,7 @@ def _check_non_neg_array(self, X, reset_n_features, whom): return X + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online VB with Mini-Batch update. 
@@ -586,9 +588,6 @@ def partial_fit(self, X, y=None): """ first_time = not hasattr(self, "components_") - if first_time: - self._validate_params() - X = self._check_non_neg_array( X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" ) @@ -618,6 +617,7 @@ def partial_fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn model for the data X with variational Bayes method. @@ -637,7 +637,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - self._validate_params() X = self._check_non_neg_array( X, reset_n_features=True, whom="LatentDirichletAllocation.fit" ) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 67dd0c2ab7b70..d561583dec205 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array, gen_batches from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm @@ -31,6 +32,7 @@ StrOptions, validate_params, ) +from ..utils import metadata_routing EPSILON = np.finfo(np.float32).eps @@ -1122,6 +1124,11 @@ def non_negative_factorization( class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): """Base class for NMF and MiniBatchNMF.""" + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``W`` arg on ``inverse_transform``. + # TODO: remove when W is removed in v1.5 for inverse_transform + __metadata_request__inverse_transform = {"W": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left"), None], "init": [ @@ -1245,23 +1252,44 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self - def inverse_transform(self, W): + def inverse_transform(self, Xt=None, W=None): """Transform data back to its original space. .. versionadded:: 0.18 Parameters ---------- - W : {ndarray, sparse matrix} of shape (n_samples, n_components) + Xt : {ndarray, sparse matrix} of shape (n_samples, n_components) Transformed data matrix. + W : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Returns a data matrix of the original shape. """ + if Xt is None and W is None: + raise TypeError("Missing required positional argument: Xt") + + if W is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `W`.") + + if W is not None: + warnings.warn( + ( + "Input argument `W` was renamed to `Xt` in v1.3 and will be removed" + " in v1.5." + ), + FutureWarning, + ) + Xt = W + check_is_fitted(self) - return W @ self.components_ + return Xt @ self.components_ @property def _n_features_out(self): @@ -1539,6 +1567,7 @@ def _check_params(self, X): return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -1566,8 +1595,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2123,6 +2150,7 @@ def _minibatch_convergence( return False + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -2149,8 +2177,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. """ - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2288,6 +2314,7 @@ def transform(self, X): return W + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, W=None, H=None): """Update the model using the data in `X` as a mini-batch. @@ -2321,9 +2348,6 @@ def partial_fit(self, X, y=None, W=None, H=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e8c302fc47129..1d3c0678aca89 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -20,6 +20,7 @@ from scipy.sparse.linalg import svds from ._base import _BasePCA +from ..base import _fit_context from ..utils import check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.deprecation import deprecated @@ -414,6 +415,7 @@ def __init__( def n_features_(self): return self.n_features_in_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -431,11 +433,10 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self._fit(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model with X and apply the dimensionality reduction on X. @@ -458,8 +459,6 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ - self._validate_params() - U, S, Vt = self._fit(X) U = U[:, : self.n_components_] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 5974b86381e1a..93e4a2164a87f 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -12,6 +12,7 @@ from ..utils.validation import check_array, check_is_fitted from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ._dict_learning import dict_learning, MiniBatchDictionaryLearning @@ -53,6 +54,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -70,7 +72,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() random_state = check_random_state(self.random_state) X = self._validate_data(X) diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 999266a4f3f78..67f5c73028f15 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,6 +12,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -200,10 +201,10 @@ def fit(self, X, y=None): self : object Returns the transformer object. """ - # param validation is done in fit_transform self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit model to X and perform dimensionality reduction on X. @@ -220,7 +221,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - self._validate_params() X = self._validate_data(X, accept_sparse=["csr", "csc"], ensure_min_features=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 74218b83c6952..2b1ed4d91be5e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,6 +1,7 @@ import re import sys from io import StringIO +import warnings import numpy as np import scipy.sparse as sp @@ -906,3 +907,29 @@ def test_minibatch_nmf_verbose(): nmf.fit(A) finally: sys.stdout = old_stdout + + +# TODO(1.5): remove this test +def test_NMF_inverse_transform_W_deprecation(): + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + est = NMF( + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + Xt = est.fit_transform(A) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, W=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `W` was renamed to `Xt`"): + est.inverse_transform(W=Xt) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c8c0a656e5784..275f4ae4d3b30 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -17,6 +17,7 @@ from .base import BaseEstimator, TransformerMixin, ClassifierMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels @@ -546,6 +547,10 @@ def _solve_svd(self, X, y): self.coef_ = coef @ self.scalings_.T self.intercept_ -= self.xbar_ @ self.coef_.T + @_fit_context( + # LinearDiscriminantAnalysis.covariance_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the Linear Discriminant Analysis model. @@ -568,8 +573,6 @@ def fit(self, X, y): self : object Fitted estimator. 
""" - self._validate_params() - xp, _ = get_namespace(X) X, y = self._validate_data( @@ -865,6 +868,7 @@ def __init__( self.store_covariance = store_covariance self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -889,7 +893,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 25f910e8419f4..0d8519484d7a5 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -11,6 +11,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .base import MultiOutputMixin +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import StrOptions, Interval from .utils.validation import _num_samples @@ -142,6 +143,7 @@ def __init__(self, *, strategy="prior", random_state=None, constant=None): self.random_state = random_state self.constant = constant + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the baseline classifier. @@ -161,8 +163,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - self._strategy = self.strategy if self._strategy == "uniform" and sp.issparse(y): @@ -518,6 +518,7 @@ def __init__(self, *, strategy="mean", constant=None, quantile=None): self.constant = constant self.quantile = quantile + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the random regressor. @@ -537,8 +538,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - y = check_array(y, ensure_2d=False, input_name="y") if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index bad6dcfb033ec..0354413fdebfe 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -14,6 +14,7 @@ from ._base import BaseEnsemble, _partition_estimators from ..base import ClassifierMixin, RegressorMixin +from ..base import _fit_context from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, column_or_1d @@ -301,6 +302,10 @@ def __init__( self.random_state = random_state self.verbose = verbose + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). @@ -324,9 +329,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - - self._validate_params() - # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( X, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4d9bf862bd806..e715952947c04 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,7 +50,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from sklearn.base import is_classifier +from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, @@ -221,6 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], + "store_leaf_values": [bool], } @abstractmethod @@ -240,6 +241,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -257,6 +259,7 @@ def __init__( self.class_weight = class_weight self.max_samples = max_samples self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -333,6 +336,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -360,8 +364,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") @@ -717,6 +719,139 @@ def _bin_data(self, X, is_training_data): return X_binned + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles) or + (n_samples, n_quantiles, n_outputs) + The predicted values. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + ( + est.leaf_nodes_samples_[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ) + ) + + # get quantiles across all leaf node samples + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _accumulate_prediction(predict, X, out, lock): """ @@ -734,6 +869,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. 
+ """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -759,6 +905,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +921,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -1037,6 +1185,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1051,6 +1200,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1515,6 +1665,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1530,6 +1681,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1540,6 +1692,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1858,6 +2011,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1873,6 +2027,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1882,6 +2037,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2210,6 +2366,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2225,6 +2382,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2235,6 +2393,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2534,6 +2693,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2549,6 +2709,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2558,6 +2719,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2783,6 +2945,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2797,6 +2960,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2805,6 +2969,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2848,6 +3013,7 @@ def fit(self, X, y=None, sample_weight=None): self.fit_transform(X, y, sample_weight=sample_weight) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, 
X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -2873,8 +3039,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - self._validate_params() - rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) super().fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index df9904c8a9aa4..8d435873aeb5c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -28,6 +28,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin from ..base import is_classifier +from ..base import _fit_context from ._gradient_boosting import predict_stages from ._gradient_boosting import predict_stage @@ -146,6 +147,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") @abstractmethod @@ -376,6 +378,10 @@ def _check_initialized(self): """Check that the estimator is initialized, raising an error if not.""" check_is_fitted(self) + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. @@ -412,8 +418,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object Fitted estimator. """ - self._validate_params() - if not self.warm_start: self._clear_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 976335ea684d0..79b640057abe5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -18,6 +18,7 @@ PinballLoss, ) from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier +from ...base import _fit_context from ...utils import check_random_state, resample, compute_sample_weight from ...utils.validation import ( check_is_fitted, @@ -336,6 +337,7 @@ def _check_interaction_cst(self, n_features): return constraints + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -357,8 +359,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - fit_start_time = time() acc_find_split_time = 0.0 # time spent finding the best splits acc_apply_split_time = 0.0 # time spent splitting nodes diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index bb016fa33185b..048a1d69395e2 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -20,6 +20,7 @@ from ..utils._param_validation import RealNotInt from ..utils.validation import check_is_fitted, _num_samples from ..base import OutlierMixin +from ..base import _fit_context from ._bagging import BaseBagging @@ -265,6 +266,7 @@ def _parallel_args(self): # copies. return {"prefer": "threads"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -287,7 +289,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype) if issparse(X): # Pre-sort indices to avoid that each individual tree of the diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 10f7a606f20c9..5b3486edfeb33 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..base import _fit_context from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock @@ -159,6 +160,10 @@ def _method_name(name, estimator, method): return method_name + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -184,9 +189,6 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - - self._validate_params() - # all_estimators contains all estimators, the one to be fitted and the # 'drop' string. names, all_estimators = self._validate_estimators() diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 1c250cbe11a06..f8f4d2c4c197f 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -22,6 +22,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from ..base import _fit_context from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder @@ -308,6 +309,10 @@ def __init__( self.flatten_transform = flatten_transform self.verbose = verbose + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -332,7 +337,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() check_classification_targets(y) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: raise NotImplementedError( @@ -572,6 +576,10 @@ def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -594,7 +602,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index b2aff503b0bb0..569609e6326e5 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -34,7 +34,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor - +from ..base import _fit_context from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax @@ -103,6 +103,10 @@ def _check_X(self, X): reset=False, ) + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
@@ -124,8 +128,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index d96f5c76842bf..a78e12a5a5181 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1984,3 +1984,54 @@ def test_regression_criterion_withbins(name, criterion): criterion, score, ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b51ccceaac9d1..60e2cb3b7ad84 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted @@ -133,6 +134,7 @@ def _add_iterable_element( indices.append(vocab[feature_name]) values.append(self.dtype(vv)) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn a list of feature name -> indices mappings. @@ -153,7 +155,6 @@ def fit(self, X, y=None): self : object DictVectorizer class instance. """ - self._validate_params() feature_names = [] vocab = {} @@ -286,6 +287,7 @@ def _transform(self, X, fitting): return result_matrix + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Learn a list of feature name -> indices mappings and transform X. @@ -309,7 +311,6 @@ def fit_transform(self, X, y=None): Xa : {array, sparse matrix} Feature vectors; always 2-d. 
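
The new `test_multioutput_quantiles` test above is the clearest picture of the API that `store_leaf_values` enables. The following usage sketch is inferred from that test and assumes this fork is installed in place of stock scikit-learn; the keyword and both methods do not exist upstream, and the exact return shapes are read off the test's assertions rather than documented here, for example::

    from sklearn.ensemble import RandomForestRegressor

    X = [[-2, -1], [-1, -1], [1, 1], [2, 1]]
    y = [[0.0, 1.0], [0.0, 1.0], [1.0, 2.0], [1.0, 2.0]]  # two outputs per sample

    est = RandomForestRegressor(
        n_estimators=10,
        random_state=0,
        bootstrap=False,
        store_leaf_values=True,  # new keyword threaded through every forest above
    )
    est.fit(X, y)

    # Shape appears to be (n_samples, n_quantiles, n_outputs), matching the
    # y_pred[:, 1, :] indexing in the test.
    q = est.predict_quantiles([[0, 0]], quantiles=[0.25, 0.5, 0.75])

    # One entry per query sample; each entry's second axis matches n_outputs_,
    # i.e. the training targets stored in the leaf the sample falls into.
    leaves = est.get_leaf_node_samples([[0, 0]])
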
""" - self._validate_params() return self._transform(X, fitting=True) def inverse_transform(self, X, dict_type=dict): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 1f2513e70eed5..e1b5e5f2561fe 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -8,6 +8,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ._hashing_fast import transform as _hashing_transform from ..utils._param_validation import Interval, StrOptions @@ -121,6 +122,7 @@ def __init__( self.n_features = n_features self.alternate_sign = alternate_sign + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): """Only validates estimator's parameters. @@ -140,8 +142,6 @@ def fit(self, X=None, y=None): self : object FeatureHasher class instance. """ - # repeat input validation for grid search (which calls set_params) - self._validate_params() return self def transform(self, raw_X): diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 89bdd7557f583..beea3e23e0adc 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._param_validation import Hidden, Interval, validate_params from ..utils._param_validation import RealNotInt @@ -561,6 +562,7 @@ def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.max_patches = max_patches self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validate the parameters of the estimator. @@ -583,7 +585,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() return self def transform(self, X): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 21863d75eff2f..3201e3a0d51bb 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,6 +25,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS @@ -801,6 +802,7 @@ def __init__( self.alternate_sign = alternate_sign self.dtype = dtype + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Only validates estimator's parameters. @@ -820,10 +822,9 @@ def partial_fit(self, X, y=None): self : object HashingVectorizer instance. """ - # TODO: only validate during the first call - self._validate_params() return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -843,8 +844,6 @@ def fit(self, X, y=None): self : object HashingVectorizer instance. """ - self._validate_params() - # triggers a parameter validation if isinstance(X, str): raise ValueError( @@ -1338,6 +1337,7 @@ def fit(self, raw_documents, y=None): self.fit_transform(raw_documents) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return document-term matrix. 
@@ -1365,7 +1365,6 @@ def fit_transform(self, raw_documents, y=None): "Iterable over raw text documents expected, string object received." ) - self._validate_params() self._validate_ngram_range() self._warn_for_unused_params() self._validate_vocabulary() @@ -1639,6 +1638,7 @@ def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=Fal self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn the idf vector (global term weights). @@ -1655,8 +1655,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - # large sparse data is not supported for 32bit platforms because # _document_frequency uses np.bincount which works on arrays of # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 @@ -2073,6 +2071,7 @@ def _check_params(self): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, raw_documents, y=None): """Learn vocabulary and idf from training set. @@ -2089,7 +2088,6 @@ def fit(self, raw_documents, y=None): self : object Fitted vectorizer. """ - self._validate_params() self._check_params() self._warn_for_unused_params() self._tfidf = TfidfTransformer( diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 7b8de4ae03585..47f98d89e8abe 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -9,6 +9,7 @@ from ._base import SelectorMixin from ._base import _get_feature_importances from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..base import _fit_context from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted, check_scalar, _num_features from ..utils._param_validation import HasMethods, Interval, Options @@ -320,6 +321,10 @@ def _check_max_features(self, X): ) self.max_features_ = max_features + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -340,7 +345,6 @@ def fit(self, X, y=None, **fit_params): self : object Fitted estimator. """ - self._validate_params() self._check_max_features(X) if self.prefit: @@ -375,6 +379,10 @@ def threshold_(self): return _calculate_threshold(self.estimator, scores, self.threshold) @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. 
@@ -398,7 +406,6 @@ def partial_fit(self, X, y=None, **fit_params): first_call = not hasattr(self, "estimator_") if first_call: - self._validate_params() self._check_max_features(X) if self.prefit: diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 214ac9e0c30cf..932d66449ae22 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -22,6 +22,7 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier +from ..base import _fit_context from ..model_selection import check_cv from ..model_selection._validation import _score from ..metrics import check_scoring @@ -228,6 +229,10 @@ def classes_(self): """ return self.estimator_.classes_ + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the RFE model and then the underlying estimator on the selected features. @@ -248,7 +253,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() return self._fit(X, y, **fit_params) def _fit(self, X, y, step_score=None, **fit_params): @@ -649,6 +653,10 @@ def __init__( self.n_jobs = n_jobs self.min_features_to_select = min_features_to_select + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, groups=None): """Fit the RFE model and automatically tune the number of selected features. @@ -674,7 +682,6 @@ def fit(self, X, y, groups=None): self : object Fitted estimator. """ - self._validate_params() tags = self._get_tags() X, y = self._validate_data( X, diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 8a61bdee0c554..0fbe91273053b 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -7,6 +7,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._param_validation import RealNotInt from ..utils._tags import _safe_tags @@ -179,6 +180,10 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Learn the features to select from X. @@ -197,8 +202,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - tags = self._get_tags() X = self._validate_data( X, diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 18e23d105b8bb..f4355c39f88cd 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse from ..base import BaseEstimator +from ..base import _fit_context from ..preprocessing import LabelBinarizer from ..utils import as_float_array, check_array, check_X_y, safe_sqr, safe_mask from ..utils.extmath import safe_sparse_dot, row_norms @@ -473,6 +474,7 @@ class _BaseFilter(SelectorMixin, BaseEstimator): def __init__(self, score_func): self.score_func = score_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Run score function on (X, y) and get the appropriate features. 
@@ -490,8 +492,6 @@ def fit(self, X, y): self : object Returns the instance itself. """ - self._validate_params() - X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], multi_output=True ) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7c8db9cc7fa55..073a22c6ad92b 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,6 +4,7 @@ import numpy as np from ..base import BaseEstimator +from ..base import _fit_context from ._base import SelectorMixin from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted @@ -76,6 +77,7 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): def __init__(self, threshold=0.0): self.threshold = threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn empirical variances from X. @@ -94,7 +96,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4a88034768870..50a8739372972 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -13,6 +13,7 @@ from scipy.special import erf, expit from ..base import BaseEstimator, ClassifierMixin, clone +from ..base import _fit_context from .kernels import Kernel, RBF, CompoundKernel, ConstantKernel as C from ..utils.validation import check_is_fitted from ..utils import check_random_state @@ -679,6 +680,7 @@ def __init__( self.multi_class = multi_class self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process classification model. @@ -695,8 +697,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - if isinstance(self.kernel, CompoundKernel): raise ValueError("kernel cannot be a CompoundKernel") diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b7141f71b884..49fcab40c25f8 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from .kernels import Kernel, RBF, ConstantKernel as C from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state @@ -214,6 +215,7 @@ def __init__( self.n_targets = n_targets self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process regression model. @@ -230,8 +232,6 @@ def fit(self, X, y): self : object GaussianProcessRegressor class instance. 
""" - self._validate_params() - if self.kernel is None: # Use an RBF kernel as default self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( 1.0, length_scale_bounds="fixed" diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b2f296c91740e..37fc43731514a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -11,6 +11,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions, MissingValues from ..utils.fixes import _mode from ..utils.sparsefuncs import _get_median @@ -348,6 +349,7 @@ def _validate_input(self, X, in_fit): return X + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on `X`. @@ -365,8 +367,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" @@ -927,6 +927,7 @@ def _fit(self, X, y=None, precomputed=False): return missing_features_info[0] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the transformer on `X`. @@ -944,7 +945,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() self._fit(X, y) return self @@ -990,6 +990,7 @@ def transform(self, X): return imputer_mask + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Generate missing values indicator for `X`. @@ -1008,7 +1009,6 @@ def fit_transform(self, X, y=None): The missing indicator for input data. The data type of `Xt` will be boolean. """ - self._validate_params() imputer_mask = self._fit(X, y) if self.features_.size < self._n_features: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 41ed19b7a8948..f977e5bc23e6c 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -7,6 +7,7 @@ import numpy as np from ..base import clone +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import ( @@ -627,7 +628,7 @@ def _initial_imputation(self, X, in_fit=False): strategy=self.initial_strategy, fill_value=self.fill_value, keep_empty_features=self.keep_empty_features, - ) + ).set_output(transform="default") X_filled = self.initial_imputer_.fit_transform(X) else: X_filled = self.initial_imputer_.transform(X) @@ -681,6 +682,10 @@ def _validate_limit(limit, limit_type, n_features): ) return limit + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the imputer on `X` and return the transformed `X`. @@ -698,7 +703,6 @@ def fit_transform(self, X, y=None): Xt : array-like, shape (n_samples, n_features) The imputed input data. 
""" - self._validate_params() self.random_state_ = getattr( self, "random_state_", check_random_state(self.random_state) ) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 5735709dd7f29..915f8cbdb3fcb 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -6,6 +6,7 @@ import numpy as np from ._base import _BaseImputer +from ..base import _fit_context from ..utils.validation import FLOAT_DTYPES from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import _NAN_METRICS @@ -199,6 +200,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): return np.ma.average(donors, axis=1, weights=weight_matrix).data + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object The fitted `KNNImputer` class instance. """ - self._validate_params() # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): force_all_finite = True diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index aa1521ab697d0..a1cf95b95591b 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,6 +11,7 @@ import math from .base import BaseEstimator, TransformerMixin, RegressorMixin +from .base import _fit_context from .utils import check_array, check_consistent_length from .utils.validation import _check_sample_weight, check_is_fitted from .utils._param_validation import Interval, StrOptions @@ -310,6 +311,7 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): # prediction speed). return X, y + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model using X, y as training data. @@ -338,7 +340,6 @@ def fit(self, X, y, sample_weight=None): X is stored for future use, as :meth:`transform` needs X to interpolate new input data. """ - self._validate_params() check_params = dict(accept_sparse=False, ensure_2d=False) X = check_array( X, input_name="X", dtype=[np.float64, np.float32], **check_params diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index faa098e634937..7f190a2b66823 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -23,6 +23,7 @@ from .base import BaseEstimator from .base import TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .utils import check_random_state from .utils import deprecated from .utils.extmath import safe_sparse_dot @@ -139,6 +140,7 @@ def __init__( self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -160,8 +162,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csc") random_state = check_random_state(self.random_state) @@ -338,6 +338,7 @@ def __init__(self, *, gamma=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -358,8 +359,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -498,6 +497,7 @@ def __init__(self, *, skewedness=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -518,7 +518,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -665,6 +664,7 @@ def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -686,7 +686,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") check_non_negative(X, "X in AdditiveChi2Sampler.fit") @@ -1011,6 +1010,7 @@ def __init__( self.random_state = random_state self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit estimator to data. @@ -1032,7 +1032,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 111e62938f096..a7bfeefaef651 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -8,6 +8,7 @@ import numpy as np from .base import BaseEstimator, RegressorMixin, MultiOutputMixin +from .base import _fit_context from .utils._param_validation import Interval, StrOptions from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel @@ -170,6 +171,7 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. @@ -190,8 +192,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - # Convert data X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 06d8664dc013b..92c067c850225 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from numbers import Integral from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES @@ -642,6 +643,7 @@ def __init__( self.n_jobs = n_jobs self.positive = positive + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit linear model. @@ -665,9 +667,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - - self._validate_params() - n_jobs_ = self.n_jobs accept_sparse = False if self.positive else ["csr", "csc", "coo"] diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 887c6a3ebcbbc..37dc3b81511f5 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -13,6 +13,7 @@ from ._base import LinearModel, _preprocess_data, _rescale_data from ..base import RegressorMixin +from ..base import _fit_context from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight @@ -267,6 +268,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model. @@ -288,8 +290,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True) @@ -665,6 +665,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -683,9 +684,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data( diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index ea1ee3115ea93..829c0ab6149f1 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -18,6 +18,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ._base import _preprocess_data from ..utils import check_array, check_scalar from ..utils.validation import check_random_state @@ -851,6 +852,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Fit model with coordinate descent. @@ -886,8 +888,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - if self.alpha == 0: warnings.warn( ( @@ -1475,6 +1475,7 @@ def _is_multitask(self): def path(X, y, **kwargs): """Compute path with coordinate descent.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit linear model with coordinate descent. @@ -1502,9 +1503,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of fitted model. """ - - self._validate_params() - # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a @@ -2343,6 +2341,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent. @@ -2367,8 +2366,6 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be csr. check_X_params = dict( diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index caf37a0f473e0..b1bc460f24dff 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -20,6 +20,7 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin +from ...base import _fit_context from ...utils import check_array from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, StrOptions @@ -168,6 +169,7 @@ def __init__( self.warm_start = warm_start self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit a Generalized Linear Model. @@ -187,8 +189,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a7b848f647560..def2ae273d5c4 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,6 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils._param_validation import Interval @@ -273,6 +274,7 @@ def __init__( self.fit_intercept = fit_intercept self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -293,7 +295,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted `HuberRegressor` estimator. """ - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 4be8bb730a0ae..e6c653eb80bb3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -20,6 +20,7 @@ from ._base import LinearModel, LinearRegression from ._base import _deprecate_normalize, _preprocess_data from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore @@ -1097,6 +1098,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None): self._set_intercept(X_offset, y_offset, X_scale) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, Xy=None): """Fit the model using X, y as training data. @@ -1118,8 +1120,6 @@ def fit(self, X, y, Xy=None): self : object Returns an instance of self. """ - self._validate_params() - X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) _normalize = _deprecate_normalize( @@ -1691,6 +1691,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1707,8 +1708,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -2216,6 +2215,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, copy_X=None): """Fit the model using X, y as training data. 
@@ -2237,8 +2237,6 @@ def fit(self, X, y, copy_X=None): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3db27d9cc3163..30a0f40a0f2fd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -24,6 +24,7 @@ from ._linear_loss import LinearModelLoss from ._sag import sag_solver from ._glm.glm import NewtonCholeskySolver +from ..base import _fit_context from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear @@ -1132,6 +1133,7 @@ def __init__( self.n_jobs = n_jobs self.l1_ratio = l1_ratio + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit the model according to the given training data. @@ -1161,9 +1163,6 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty != "elasticnet" and self.l1_ratio is not None: @@ -1745,6 +1744,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -1766,9 +1766,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted LogisticRegressionCV estimator. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty == "elasticnet": diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index b1dc1e352fd62..df451a99417b0 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -15,6 +15,7 @@ from ._base import LinearModel, _pre_fit, _deprecate_normalize from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..utils import as_float_array, check_array from ..utils.parallel import delayed, Parallel from ..utils._param_validation import Hidden, Interval, StrOptions @@ -725,6 +726,7 @@ def __init__( self.normalize = normalize self.precompute = precompute + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -741,8 +743,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -1042,6 +1042,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1058,8 +1059,6 @@ def fit(self, X, y): self : object Returns an instance of self. 
""" - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 2cacd4f78cc54..a9c81799c8ca3 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -5,6 +5,7 @@ from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions @@ -220,6 +221,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Fit linear model with Passive Aggressive algorithm. @@ -245,7 +247,6 @@ def partial_fit(self, X, y, classes=None): Fitted estimator. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -276,6 +277,7 @@ def partial_fit(self, X, y, classes=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -298,7 +300,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "hinge" else "pa2" @@ -504,6 +505,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Fit linear model with Passive Aggressive algorithm. @@ -521,7 +523,6 @@ def partial_fit(self, X, y): Fitted estimator. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" @@ -538,6 +539,7 @@ def partial_fit(self, X, y): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -560,7 +562,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 081e3da5b51b7..b4a5581386a5f 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -9,6 +9,7 @@ from scipy.optimize import linprog from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..exceptions import ConvergenceWarning from ..utils import _safe_indexing @@ -141,6 +142,7 @@ def __init__( self.solver = solver self.solver_options = solver_options + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -160,7 +162,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns self. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2474a25f07199..1c12ecc13a258 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight @@ -283,6 +284,10 @@ def __init__( self.random_state = random_state self.loss = loss + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit estimator using RANSAC algorithm. @@ -313,8 +318,6 @@ def fit(self, X, y, sample_weight=None): `is_data_valid` and `is_model_valid` return False for all `max_trials` randomly chosen sub-samples. """ - self._validate_params() - # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 28ef7cbd43eb7..893b10d1d93ae 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -25,6 +25,7 @@ from ._base import _preprocess_data, _rescale_data from ._sag import sag_solver from ..base import MultiOutputMixin, RegressorMixin, is_classifier +from ..base import _fit_context from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array @@ -1114,6 +1115,7 @@ def __init__( random_state=random_state, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. @@ -1134,8 +1136,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data( X, @@ -1423,6 +1423,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier model. @@ -1446,8 +1447,6 @@ def fit(self, X, y, sample_weight=None): self : object Instance of the estimator. """ - self._validate_params() - X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) super().fit(X, Y, sample_weight=sample_weight) @@ -2354,6 +2353,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): 0.5166... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with cv. @@ -2383,8 +2383,6 @@ def fit(self, X, y, sample_weight=None): cross-validation takes the sample weights into account when computing the validation score. """ - self._validate_params() - super().fit(X, y, sample_weight=sample_weight) return self @@ -2533,6 +2531,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier with cv. @@ -2555,8 +2554,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() - # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept # all sparse format. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 2f27bdee7968b..bc8f31016c6f8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -13,6 +13,7 @@ from numbers import Integral, Real from ..base import clone, is_classifier +from ..base import _fit_context from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset from ..base import BaseEstimator, RegressorMixin, OutlierMixin @@ -805,6 +806,7 @@ def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter self._standard_intercept = np.atleast_1d(self.intercept_) self.intercept_ = self._standard_intercept + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -839,7 +841,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -869,6 +870,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -897,7 +899,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Returns an instance of self. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -1470,6 +1471,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -1496,7 +1498,6 @@ def partial_fit(self, X, y, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) return self._partial_fit( @@ -1565,6 +1566,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -1590,7 +1592,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Fitted `SGDRegressor` estimator. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -2366,6 +2367,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. @@ -2386,7 +2388,6 @@ def partial_fit(self, X, y=None, sample_weight=None): Returns a fitted instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) alpha = self.nu / 2 @@ -2453,6 +2454,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. 
@@ -2485,7 +2487,6 @@ def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._more_validate_params() alpha = self.nu / 2 diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 67d6ca532a8ab..72c2d897681c4 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -19,6 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval from ..utils.parallel import delayed, Parallel @@ -395,6 +396,7 @@ def _check_subparams(self, n_samples, n_features): return n_subsamples, n_subpopulation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit linear model. @@ -410,7 +412,6 @@ def fit(self, X, y): self : returns an instance of self. Fitted `TheilSenRegressor` estimator. """ - self._validate_params() random_state = check_random_state(self.random_state) X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 92206721aac15..0917ef7d207bc 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors, kneighbors_graph from ..neighbors import radius_neighbors_graph from ..utils.validation import check_is_fitted @@ -235,7 +236,7 @@ def _fit_transform(self, X): tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs, - ) + ).set_output(transform="default") if self.n_neighbors is not None: nbg = kneighbors_graph( @@ -332,6 +333,10 @@ def reconstruction_error(self): evals = self.kernel_pca_.eigenvalues_ return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -350,10 +355,13 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._fit_transform(X) return self + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the model from data in X and transform X. @@ -371,7 +379,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 10a22b12dfd1d..6f57b0627b8be 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -17,6 +17,7 @@ TransformerMixin, _UnstableArchMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_random_state, check_array from ..utils._arpack import _init_arpack_v0 @@ -759,6 +760,7 @@ def _fit_transform(self, X): ) self._n_features_out = self.embedding_.shape[1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -775,10 +777,10 @@ def fit(self, X, y=None): self : object Fitted `LocallyLinearEmbedding` class instance. 
""" - self._validate_params() self._fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Compute the embedding vectors for data X and transform X. @@ -795,7 +797,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) Returns the instance itself. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 7fc46325a1ae1..6b7a818b94ea8 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -13,6 +13,7 @@ import warnings from ..base import BaseEstimator +from ..base import _fit_context from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression @@ -569,10 +570,10 @@ def fit(self, X, y=None, init=None): self : object Fitted estimator. """ - # parameter will be validated in `fit_transform` call self.fit_transform(X, init=init) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, init=None): """ Fit the data from `X`, and returns the embedded coordinates. @@ -597,7 +598,6 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 8291d8326eb05..af965a1362b8f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -17,6 +17,7 @@ from scipy.sparse.csgraph import laplacian as csgraph_laplacian from ..base import BaseEstimator +from ..base import _fit_context from ..utils import ( check_array, check_random_state, @@ -652,6 +653,7 @@ def _get_affinity_matrix(self, X, Y=None): self.affinity_matrix_ = self.affinity(X) return self.affinity_matrix_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -674,8 +676,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6ef6ce999cb08..c372ddcca3c2e 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -17,6 +17,7 @@ from numbers import Integral, Real from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative @@ -1078,6 +1079,10 @@ def _tsne( return X_embedded + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit X into an embedded space and return that transformed output. @@ -1099,12 +1104,15 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. 
""" - self._validate_params() self._check_params_vs_input(X) embedding = self._fit(X) self.embedding_ = embedding return self.embedding_ + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit X into an embedded space. @@ -1126,7 +1134,6 @@ def fit(self, X, y=None): X_new : array of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ - self._validate_params() self.fit_transform(X) return self diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 67b04e9382acb..dbe5b76f0f4c9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -919,8 +919,9 @@ def haversine_distances(X, Y=None): in radians. The dimension of the data must be 2. .. math:: - D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) - + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] Parameters ---------- @@ -1220,6 +1221,13 @@ def paired_cosine_distances(X, Y): } +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + } +) def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Compute the paired distances between X and Y. @@ -1278,8 +1286,6 @@ def paired_distances(X, Y, *, metric="euclidean", **kwds): for i in range(len(X)): distances[i] = metric(X[i], Y[i]) return distances - else: - raise ValueError("Unknown distance %s" % metric) # Kernels diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a298dfec6a0da..fbca4f1d49dcd 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -16,6 +16,7 @@ from ..cluster import kmeans_plusplus from ..base import BaseEstimator from ..base import DensityMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -182,6 +183,7 @@ def fit(self, X, y=None): self.fit_predict(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_predict(self, X, y=None): """Estimate model parameters using X and predict the labels for X. @@ -209,8 +211,6 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - self._validate_params() - X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2) if X.shape[0] < self.n_components: raise ValueError( diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 76dc02e625408..4a3f5d1e239a8 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -33,6 +33,7 @@ from ._search import ParameterSampler from ._plot import LearningCurveDisplay +from ._plot import ValidationCurveDisplay if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -74,6 +75,7 @@ "permutation_test_score", "train_test_split", "validation_curve", + "ValidationCurveDisplay", ] diff --git a/sklearn/model_selection/_plot.py b/sklearn/model_selection/_plot.py index 6a6133a722251..bc5a600e57234 100644 --- a/sklearn/model_selection/_plot.py +++ b/sklearn/model_selection/_plot.py @@ -1,10 +1,140 @@ +import warnings + import numpy as np -from . import learning_curve +from . 
import learning_curve, validation_curve from ..utils import check_matplotlib_support +from ..utils._plotting import _validate_score_name, _interval_max_min_ratio + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + log_scale="deprecated", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." + ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + # TODO(1.5): to be removed + if log_scale != "deprecated": + warnings.warn( + ( + "The `log_scale` parameter is deprecated as of version 1.3 " + "and will be removed in 1.5. You can use display.ax_.set_xscale " + "and display.ax_.set_yscale instead." + ), + FutureWarning, + ) + xscale = "log" if log_scale else "linear" + else: + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") -class LearningCurveDisplay: + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): """Learning Curve visualization. It is recommended to use @@ -12,7 +142,10 @@ class LearningCurveDisplay: create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. 
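The automatic x-scale selection in `_plot_curve` above keys off the ratio between the largest and smallest gap of the x values. A small sketch of that heuristic, assuming `_interval_max_min_ratio` (imported from `sklearn/utils/_plotting.py`) behaves roughly like this:

import numpy as np

def interval_max_min_ratio(data):
    # Ratio between the largest and smallest spacing of the sorted x values.
    gaps = np.diff(np.sort(data))
    return gaps.max() / gaps.min()

x_linear = np.linspace(0.1, 0.9, num=5)   # evenly spaced -> ratio ~1 -> linear axis
x_log = np.logspace(-1, 0, num=5)         # growing gaps -> ratio > 5 -> log axis
print(interval_max_min_ratio(x_linear) > 5)  # False
print(interval_max_min_ratio(x_log) > 5)     # True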
All parameters are stored as attributes. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. .. versionadded:: 1.2 @@ -29,9 +162,12 @@ class LearningCurveDisplay: Scores on test set. score_name : str, default=None - The name of the score used in `learning_curve`. It will be used to - decorate the y-axis. If `None`, the generic name `"Score"` will be - used. + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. Attributes ---------- @@ -89,8 +225,8 @@ def plot( *, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -111,16 +247,25 @@ def plot( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. If - `None`, the generic name "Score" will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. - log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.set_xscale` and `display.ax_.set_yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If None, no standard deviation representation is @@ -143,98 +288,19 @@ def plot( display : :class:`~sklearn.model_selection.LearningCurveDisplay` Object that stores computed values. """ - check_matplotlib_support(f"{self.__class__.__name__}.plot") - - import matplotlib.pyplot as plt - - if ax is None: - _, ax = plt.subplots() - - if negate_score: - train_scores, test_scores = -self.train_scores, -self.test_scores - else: - train_scores, test_scores = self.train_scores, self.test_scores - - if std_display_style not in ("errorbar", "fill_between", None): - raise ValueError( - f"Unknown std_display_style: {std_display_style}. Should be one of" - " 'errorbar', 'fill_between', or None." - ) - - if score_type not in ("test", "train", "both"): - raise ValueError( - f"Unknown score_type: {score_type}. Should be one of 'test', " - "'train', or 'both'." 
- ) - - if score_type == "train": - scores = {"Training metric": train_scores} - elif score_type == "test": - scores = {"Testing metric": test_scores} - else: # score_type == "both" - scores = {"Training metric": train_scores, "Testing metric": test_scores} - - if std_display_style in ("fill_between", None): - # plot the mean score - if line_kw is None: - line_kw = {} - - self.lines_ = [] - for line_label, score in scores.items(): - self.lines_.append( - *ax.plot( - self.train_sizes, - score.mean(axis=1), - label=line_label, - **line_kw, - ) - ) - self.errorbar_ = None - self.fill_between_ = None # overwritten below by fill_between - - if std_display_style == "errorbar": - if errorbar_kw is None: - errorbar_kw = {} - - self.errorbar_ = [] - for line_label, score in scores.items(): - self.errorbar_.append( - ax.errorbar( - self.train_sizes, - score.mean(axis=1), - score.std(axis=1), - label=line_label, - **errorbar_kw, - ) - ) - self.lines_, self.fill_between_ = None, None - elif std_display_style == "fill_between": - if fill_between_kw is None: - fill_between_kw = {} - default_fill_between_kw = {"alpha": 0.5} - fill_between_kw = {**default_fill_between_kw, **fill_between_kw} - - self.fill_between_ = [] - for line_label, score in scores.items(): - self.fill_between_.append( - ax.fill_between( - self.train_sizes, - score.mean(axis=1) - score.std(axis=1), - score.mean(axis=1) + score.std(axis=1), - **fill_between_kw, - ) - ) - - score_name = self.score_name if score_name is None else score_name - - ax.legend() - if log_scale: - ax.set_xscale("log") - ax.set_xlabel("Number of samples in the training set") - ax.set_ylabel(f"{score_name}") - - self.ax_ = ax - self.figure_ = ax.figure + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale=log_scale, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") return self @classmethod @@ -259,8 +325,8 @@ def from_estimator( ax=None, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -268,6 +334,11 @@ def from_estimator( ): """Create a learning curve display from an estimator. + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + Parameters ---------- estimator : object type that implements the "fit" and "predict" methods @@ -368,16 +439,25 @@ def from_estimator( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. - If `None`, the generic `"Score"` name will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. 
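The `score_name` inference rules described just above can be summarized in a few lines. A sketch of the behavior the docstring spells out (assumption: this mirrors the private `_validate_score_name` helper from `sklearn/utils/_plotting.py`, which is the authoritative implementation):

def infer_score_name(score_name, scoring, negate_score):
    if score_name is not None:          # an explicit name always wins
        return score_name
    if scoring is None:
        return "Negative score" if negate_score else "Score"
    name = scoring if isinstance(scoring, str) else scoring.__name__
    if name.startswith("neg_"):
        # drop the prefix; spell out "negative" only when scores are not negated
        name = name[len("neg_"):] if negate_score else f"negative_{name[len('neg_'):]}"
    return name.replace("_", " ").capitalize()

print(infer_score_name(None, "neg_mean_squared_error", negate_score=True))
# Mean squared error
print(infer_score_name(None, "neg_mean_squared_error", negate_score=False))
# Negative mean squared error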
- log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.xscale` and `display.ax_.yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If `None`, no representation of the standard deviation @@ -414,7 +494,7 @@ def from_estimator( """ check_matplotlib_support(f"{cls.__name__}.from_estimator") - score_name = "Score" if score_name is None else score_name + score_name = _validate_score_name(score_name, scoring, negate_score) train_sizes, train_scores, test_scores = learning_curve( estimator, @@ -451,3 +531,377 @@ def from_estimator( fill_between_kw=fill_between_kw, errorbar_kw=errorbar_kw, ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : ndarray of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. 
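As the deprecation notes above say, the `log_scale` flag is replaced by setting the scale directly on the returned display's axes (matplotlib's `Axes.set_xscale`). A minimal usage sketch; the dataset and estimator are arbitrary placeholders and matplotlib must be installed:

from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
display = LearningCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0), X, y, train_sizes=[0.3, 0.6, 0.9]
)
display.ax_.set_xscale("log")  # instead of the deprecated log_scale=True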
+ + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale="deprecated", + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selectionKFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=param_range, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1621dd324f81c..695614f4e1fa0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -26,6 +26,7 @@ from ..base import BaseEstimator, is_classifier, clone from ..base import MetaEstimatorMixin +from ..base import _fit_context from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts @@ -753,6 +754,10 @@ def _select_best_index(refit, refit_metric, results): best_index = results[f"rank_test_{refit_metric}"].argmin() return best_index + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -786,7 +791,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params): self : object Instance of fitted estimator. """ - self._validate_params() estimator = self.estimator refit_metric = "score" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 4826e7931d4d6..a061d7283b46d 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -7,6 +7,7 @@ from ._search import BaseSearchCV from . import ParameterGrid, ParameterSampler from ..base import is_classifier +from ..base import _fit_context from ._split import check_cv, _yields_constant_splits from ..metrics._scorer import get_scorer_names from ..utils import resample @@ -211,6 +212,10 @@ def _select_best_index(refit, refit_metric, results): return last_iter_indices[best_idx] + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -238,7 +243,6 @@ def fit(self, X, y=None, groups=None, **fit_params): self : object Instance of fitted estimator. 
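A brief usage sketch of what the search-estimator hunks above imply: the search object's own hyper-parameters are checked when `fit` runs, while the wrapped estimator (marked "not validated yet" in the comments) still validates itself inside its own `fit`. The estimator and grid below are arbitrary placeholders:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0]}, cv=3)
search.fit(X, y)            # parameter validation happens here, not in __init__
print(search.best_params_)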
""" - self._validate_params() self._checked_cv_orig = check_cv( self.cv, y, classifier=is_classifier(self.estimator) ) diff --git a/sklearn/model_selection/tests/test_plot.py b/sklearn/model_selection/tests/test_plot.py index 762af8fe08336..6baa211d2dc6e 100644 --- a/sklearn/model_selection/tests/test_plot.py +++ b/sklearn/model_selection/tests/test_plot.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from sklearn.datasets import load_iris @@ -5,8 +6,8 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal -from sklearn.model_selection import learning_curve -from sklearn.model_selection import LearningCurveDisplay +from sklearn.model_selection import learning_curve, validation_curve +from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay @pytest.fixture @@ -21,18 +22,22 @@ def data(): ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), ], ) -def test_learning_curve_display_parameters_validation( - pyplot, data, params, err_type, err_msg +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params ): """Check that we raise a proper error when passing invalid parameters.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] with pytest.raises(err_type, match=err_msg): - LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, **params - ) + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) def test_learning_curve_display_default_usage(pyplot, data): @@ -63,7 +68,7 @@ def test_learning_curve_display_default_usage(pyplot, data): assert display.ax_.get_ylabel() == "Score" _, legend_labels = display.ax_.get_legend_handles_labels() - assert legend_labels == ["Testing metric"] + assert legend_labels == ["Train", "Test"] train_sizes_abs, train_scores, test_scores = learning_curve( estimator, X, y, train_sizes=train_sizes @@ -74,21 +79,63 @@ def test_learning_curve_display_default_usage(pyplot, data): assert_allclose(display.test_scores, test_scores) -def test_learning_curve_display_negate_score(pyplot, data): +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert display.param_range == param_range + 
assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the `negate_score` parameter calling `from_estimator` and `plot`. """ X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) positive_scores = display.lines_[0].get_data()[1] @@ -96,22 +143,18 @@ def test_learning_curve_display_negate_score(pyplot, data): assert display.ax_.get_ylabel() == "Score" negate_score = True - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, negate_score=negate_score + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) negative_scores = display.lines_[0].get_data()[1] assert (negative_scores <= 0).all() assert_allclose(negative_scores, -positive_scores) - assert display.ax_.get_ylabel() == "Score" + assert display.ax_.get_ylabel() == "Negative score" negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) assert display.ax_.get_ylabel() == "Score" display.plot(negate_score=not negate_score) @@ -122,23 +165,30 @@ def test_learning_curve_display_negate_score(pyplot, data): @pytest.mark.parametrize( "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] ) -def test_learning_curve_display_score_name(pyplot, data, score_name, ylabel): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): """Check that we can overwrite the default score name shown on the y-axis.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.ax_.get_ylabel() == ylabel X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.score_name == ylabel @@ -166,7 +216,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric"] + assert legend_label == ["Train"] if std_display_style 
is None: assert len(display.lines_) == 1 @@ -191,7 +241,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Testing metric"] + assert legend_label == ["Test"] if std_display_style is None: assert len(display.lines_) == 1 @@ -216,7 +266,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric", "Testing metric"] + assert legend_label == ["Train", "Test"] if std_display_style is None: assert len(display.lines_) == 2 @@ -235,100 +285,220 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): assert_allclose(y_data_test, test_scores.mean(axis=1)) -def test_learning_curve_display_log_scale(pyplot, data): - """Check the behaviour of the parameter `log_scale`.""" +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=True + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range ) - assert display.ax_.get_xscale() == "log" - assert display.ax_.get_yscale() == "linear" + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=False + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, ) - assert display.ax_.get_xscale() == "linear" - assert display.ax_.get_yscale() == "linear" + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is 
None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale -def test_learning_curve_display_std_display_style(pyplot, data): + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the parameter `std_display_style`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) import matplotlib as mpl - train_sizes = [0.3, 0.6, 0.9] std_display_style = None - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 std_display_style = "fill_between" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None - assert len(display.fill_between_) == 1 - assert isinstance(display.fill_between_[0], mpl.collections.PolyCollection) + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 
std_display_style = "errorbar" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) assert display.lines_ is None - assert len(display.errorbar_) == 1 - assert isinstance(display.errorbar_[0], mpl.container.ErrorbarContainer) + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 -def test_learning_curve_display_plot_kwargs(pyplot, data): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the different plotting keyword arguments: `line_kw`, `fill_between_kw`, and `errorbar_kw`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] std_display_style = "fill_between" line_kw = {"color": "red"} fill_between_kw = {"color": "red", "alpha": 1.0} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, line_kw=line_kw, fill_between_kw=fill_between_kw, @@ -342,13 +512,36 @@ def test_learning_curve_display_plot_kwargs(pyplot, data): std_display_style = "errorbar" errorbar_kw = {"color": "red"} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, errorbar_kw=errorbar_kw, ) assert display.errorbar_[0].lines[0].get_color() == "red" + + +# TODO(1.5): to be removed +def test_learning_curve_display_deprecate_log_scale(data, pyplot): + """Check that we warn for the deprecated parameter `log_scale`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=True + ) + + assert display.ax_.get_xscale() == "log" + assert display.ax_.get_yscale() == "linear" + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=False + ) + + assert display.ax_.get_xscale() == "linear" + assert display.ax_.get_yscale() == "linear" diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 74684e608d3c1..4c30bcdb6cac3 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -43,6 +43,7 @@ from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MultiOutputMixin from .base import MetaEstimatorMixin, is_regressor +from .base import _fit_context from .preprocessing import LabelBinarizer from .metrics.pairwise import pairwise_distances_argmin from .utils import check_random_state @@ -296,6 +297,10 @@ def __init__(self, estimator, *, n_jobs=None, verbose=0): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + 
prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -313,8 +318,6 @@ def fit(self, X, y): self : object Instance of fitted estimator. """ - self._validate_params() - # A sparse LabelBinarizer, with sparse_output=True, has been shown to # outperform or match a dense label binarizer in all cases and has also # resulted in less or equal memory consumption in the fit_ovr function @@ -348,6 +351,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -376,8 +383,6 @@ def partial_fit(self, X, y, classes=None): Instance of partially fitted estimator. """ if _check_partial_fit_first_call(self, classes): - self._validate_params() - if not hasattr(self.estimator, "partial_fit"): raise ValueError( ("Base estimator {0}, doesn't have partial_fit method").format( @@ -655,6 +660,10 @@ def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -671,7 +680,6 @@ def fit(self, X, y): self : object The fitted underlying estimator. """ - self._validate_params() # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], force_all_finite=False @@ -706,6 +714,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -735,8 +747,6 @@ def partial_fit(self, X, y, classes=None): """ first_call = _check_partial_fit_first_call(self, classes) if first_call: - self._validate_params() - self.estimators_ = [ clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2) @@ -968,6 +978,10 @@ def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.random_state = random_state self.n_jobs = n_jobs + @_fit_context( + # OutputCodeClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -984,7 +998,6 @@ def fit(self, X, y): self : object Returns a fitted instance of self. """ - self._validate_params() y = self._validate_data(X="no_validation", y=y) random_state = check_random_state(self.random_state) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90c1f04f7e46a..8bb954e976f4c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -28,6 +28,7 @@ RegressorMixin, clone, is_classifier, + _fit_context, ) from .model_selection import cross_val_predict from .utils import _print_elapsed_time, check_random_state, Bunch @@ -104,6 +105,10 @@ def __init__(self, estimator, *, n_jobs=None): self.n_jobs = n_jobs @_available_if_estimator_has("partial_fit") + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_params): """Incrementally fit a separate model for each class output. 
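For the meta-estimators above (`OneVsRestClassifier`, `OneVsOneClassifier`, `OutputCodeClassifier`, the `MultiOutput*` wrappers), the decorator replaces the explicit `self._validate_params()` calls, so invalid hyper-parameters surface when `fit` is called rather than at construction. A hedged usage sketch; the bad `n_jobs` value is only an illustration:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

X, y = make_classification(n_samples=60, n_classes=3, n_informative=4, random_state=0)
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000), n_jobs="two")  # no error yet
try:
    ovr.fit(X, y)          # hyper-parameters are validated here
except ValueError as exc:  # InvalidParameterError is a ValueError subclass
    print(exc)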
@@ -151,9 +156,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para first_time = not hasattr(self, "estimators_") - if first_time: - self._validate_params() - y = self._validate_data(X="no_validation", y=y, multi_output=True) if y.ndim == 1: @@ -203,6 +205,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para return self + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the model to data, separately for each output variable. @@ -230,8 +236,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - if not hasattr(self.estimator, "fit"): raise ValueError("The base estimator should implement a fit method") @@ -887,6 +891,10 @@ class labels for each estimator in the chain. [0.0321..., 0.9935..., 0.0625...]]) """ + @_fit_context( + # ClassifierChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -917,8 +925,6 @@ def fit(self, X, Y, **fit_params): "See the User Guide for more information." ) - self._validate_params() - super().fit(X, Y, **fit_params) self.classes_ = [ estimator.classes_ for chain_idx, estimator in enumerate(self.estimators_) @@ -1109,6 +1115,10 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): [2., 0.]]) """ + @_fit_context( + # RegressorChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -1131,8 +1141,6 @@ def fit(self, X, Y, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - super().fit(X, Y, **fit_params) return self diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 20858ac8b5577..76d7189385828 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -22,6 +22,7 @@ from scipy.special import logsumexp from .base import BaseEstimator, ClassifierMixin +from .base import _fit_context from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize @@ -239,6 +240,7 @@ def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Gaussian Naive Bayes according to X, y. @@ -262,7 +264,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() y = self._validate_data(y=y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight @@ -346,6 +347,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): return total_mu, total_var + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -386,8 +388,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self : object Returns the instance itself. 
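A small usage sketch of the incremental naive Bayes API touched here; with the decorator in place, hyper-parameters are validated on the call to `fit`/`partial_fit` itself. The toy data is arbitrary:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [1.0], [0.2], [1.2]])
y = np.array([0, 1, 0, 1])

clf = GaussianNB()
clf.partial_fit(X[:2], y[:2], classes=[0, 1])  # first batch must list all classes
clf.partial_fit(X[2:], y[2:])                  # later batches reuse them
print(clf.predict([[0.1], [1.1]]))             # [0 1]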
""" - self._validate_params() - return self._partial_fit( X, y, classes, _refit=False, sample_weight=sample_weight ) @@ -643,6 +643,7 @@ def _check_alpha(self): return np.maximum(alpha, alpha_lower_bound) return alpha + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -682,9 +683,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): """ first_call = not hasattr(self, "classes_") - if first_call: - self._validate_params() - X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape @@ -728,6 +726,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self._update_class_log_prior(class_prior=class_prior) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Naive Bayes classifier according to X, y. @@ -748,7 +747,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X, y = self._check_X_y(X, y) _, n_features = X.shape diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index dbc070987d5d0..e3e2049a8f8e5 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -18,6 +18,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin +from ..base import _fit_context from ..metrics._pairwise_distances_reduction import ArgKminClassMode from ..utils._param_validation import StrOptions from sklearn.neighbors._base import _check_precomputed @@ -203,6 +204,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -221,8 +226,6 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -572,6 +575,10 @@ def __init__( self.weights = weights self.outlier_label = outlier_label + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors classifier from the training dataset. @@ -590,7 +597,6 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. """ - self._validate_params() self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 418761c2d21ee..e815d12e293c9 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions from ..utils.validation import check_is_fitted @@ -372,6 +373,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the k-nearest neighbors transformer from the training dataset. @@ -388,7 +393,6 @@ def fit(self, X, y=None): self : KNeighborsTransformer The fitted k-nearest neighbors transformer. 
""" - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self @@ -600,6 +604,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the radius neighbors transformer from the training dataset. @@ -617,7 +625,6 @@ def fit(self, X, y=None): self : RadiusNeighborsTransformer The fitted radius neighbors transformer. """ - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..7f7b38497d209 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -10,6 +10,7 @@ from scipy.special import gammainc from ..base import BaseEstimator +from ..base import _fit_context from ..neighbors._base import VALID_METRICS from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted @@ -185,6 +186,10 @@ def _choose_algorithm(self, algorithm, metric): ) return algorithm + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Fit the Kernel Density model on the data. @@ -208,8 +213,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - algorithm = self._choose_algorithm(self.algorithm, self.metric) if isinstance(self.bandwidth, str): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 90b3b0aa3d8ce..40cdc9ab5fb9d 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ..base import OutlierMixin +from ..base import _fit_context from numbers import Real from ..utils._param_validation import Interval, StrOptions @@ -256,6 +257,10 @@ def fit_predict(self, X, y=None): return self.fit(X)._predict() + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the local outlier factor detector from the training dataset. @@ -273,8 +278,6 @@ def fit(self, X, y=None): self : LocalOutlierFactor The fitted local outlier factor detector. """ - self._validate_params() - self._fit(X) n_samples = self.n_samples_fit_ diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 4a83fcc7bc080..246f0adcb36ad 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -15,6 +15,7 @@ from ..utils.extmath import softmax from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..decomposition import PCA from ..utils.multiclass import check_classification_targets @@ -215,6 +216,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data. @@ -231,8 +233,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # Validate the inputs X and y, and converts y to numerical classes. 
X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 7b9c2479747d3..315393bf597e4 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -13,6 +13,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import pairwise_distances_argmin from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted @@ -122,6 +123,7 @@ def __init__(self, metric="euclidean", *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """ Fit the NearestCentroid model according to the given training data. @@ -140,8 +142,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - if isinstance(self.metric, str) and self.metric not in ( "manhattan", "euclidean", diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 003b534074ecd..b2050345c9833 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,6 +17,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions @@ -194,6 +195,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {"pairwise": self.metric == "precomputed"} + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors regressor from the training dataset. @@ -212,8 +217,6 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -422,6 +425,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -440,7 +447,6 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ - self._validate_params() return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 53e69495b9ed4..05607f0bd0c71 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,5 @@ """Unsupervised nearest neighbors learner""" +from ..base import _fit_context from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin @@ -155,6 +156,10 @@ def __init__( n_jobs=n_jobs, ) + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. @@ -172,5 +177,4 @@ def fit(self, X, y=None): self : NearestNeighbors The fitted nearest neighbors estimator. 
""" - self._validate_params() return self._fit(X) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 5c4bc5a39aa2d..fb8eab2f1776d 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -21,6 +21,7 @@ RegressorMixin, ) from ..base import is_classifier +from ..base import _fit_context from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer from ..metrics import accuracy_score, r2_score @@ -727,6 +728,7 @@ def _update_no_improvement_count(self, early_stopping, X_val, y_val): if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model to data matrix X and target(s) y. @@ -744,8 +746,6 @@ def fit(self, X, y): self : object Returns a trained MLP model. """ - self._validate_params() - return self._fit(X, y, incremental=False) def _check_solver(self): @@ -1170,6 +1170,7 @@ def _score(self, X, y): return accuracy_score(y, self._predict(X, check_input=False)) @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Update the model with a single iteration over the given data. @@ -1194,9 +1195,6 @@ def partial_fit(self, X, y, classes=None): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - if _check_partial_fit_first_call(self, classes): self._label_binarizer = LabelBinarizer() if type_of_target(y).startswith("multilabel"): @@ -1624,6 +1622,7 @@ def _validate_input(self, X, y, incremental, reset): return X, y @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Update the model with a single iteration over the given data. @@ -1640,7 +1639,4 @@ def partial_fit(self, X, y): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - return self._fit(X, y, incremental=True) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 0624145116180..2ded6533d8d96 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..base import TransformerMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot @@ -269,6 +270,7 @@ def gibbs(self, v): return v_ + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Fit the model to the partial segment of the data X. @@ -285,9 +287,6 @@ def partial_fit(self, X, y=None): self : BernoulliRBM The fitted model. """ - - self._validate_params() - first_pass = not hasattr(self, "components_") X = self._validate_data( X, accept_sparse="csr", dtype=np.float64, reset=first_pass @@ -380,6 +379,7 @@ def score_samples(self, X): fe_ = self._free_energy(v_) return v.shape[1] * log_logistic(fe_ - fe) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to the data X. @@ -396,9 +396,6 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=(np.float64, np.float32)) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8c5dc3bd82917..43b6b7eb0c939 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from .base import clone, TransformerMixin +from .base import _fit_context from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -385,6 +386,10 @@ def _fit(self, X, y=None, **fit_params_steps): self.steps[step_idx] = (name, fitted_transformer) return X + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the model. @@ -411,7 +416,6 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -429,6 +433,10 @@ def _can_fit_transform(self): ) @available_if(_can_fit_transform) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator. @@ -456,7 +464,6 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -505,6 +512,10 @@ def predict(self, X, **predict_params): return self.steps[-1][1].predict(Xt, **predict_params) @available_if(_final_estimator_has("fit_predict")) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_predict(self, X, y=None, **fit_params): """Transform the data, and apply `fit_predict` with the final estimator. @@ -533,7 +544,6 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 013f1f57e9373..139022a9897e6 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -22,6 +22,7 @@ TransformerMixin, OneToOneFeatureMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_array from ..utils._param_validation import Interval, Options, StrOptions, validate_params @@ -435,6 +436,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of min and max on X for later scaling. @@ -456,8 +458,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. 
""" - self._validate_params() - feature_range = self.feature_range if feature_range[0] >= feature_range[1]: raise ValueError( @@ -838,6 +838,7 @@ def fit(self, X, y=None, sample_weight=None): self._reset() return self.partial_fit(X, y, sample_weight) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Online computation of mean and std on X for later scaling. @@ -870,8 +871,6 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - self._validate_params() - first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1183,6 +1182,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of max absolute value of X for later scaling. @@ -1204,8 +1204,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - first_pass = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1514,6 +1512,7 @@ def __init__( self.unit_variance = unit_variance self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the median and quantiles to be used for scaling. @@ -1531,8 +1530,6 @@ def fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - # at fit, convert sparse matrices to csc for optimized computation of # the quantiles X = self._validate_data( @@ -1972,6 +1969,7 @@ def __init__(self, norm="l2", *, copy=True): self.norm = norm self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -1991,7 +1989,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2155,6 +2152,7 @@ def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -2174,7 +2172,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2634,6 +2631,7 @@ def _sparse_fit(self, X, random_state): # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the quantiles used for transforming. @@ -2653,8 +2651,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - if self.n_quantiles > self.subsample: raise ValueError( "The number of quantiles cannot be greater than" @@ -3101,6 +3097,7 @@ def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): self.standardize = standardize self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Estimate the optimal parameter lambda for each feature. @@ -3120,10 +3117,10 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._fit(X, y=y, force_transform=False) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit `PowerTransformer` to `X`, then transform `X`. @@ -3141,7 +3138,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_features) Transformed data. 
""" - self._validate_params() return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): @@ -3150,24 +3146,37 @@ def _fit(self, X, y=None, force_transform=False): if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + optim_function = { "box-cox": self._box_cox_optimize, "yeo-johnson": self._yeo_johnson_optimize, }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + with np.errstate(invalid="ignore"): # hide NaN warnings - self.lambdas_ = np.array([optim_function(col) for col in X.T]) + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) - if self.standardize or force_transform: - transform_function = { - "box-cox": boxcox, - "yeo-johnson": self._yeo_johnson_transform, - }[self.method] - for i, lmbda in enumerate(self.lambdas_): - with np.errstate(invalid="ignore"): # hide NaN warnings - X[:, i] = transform_function(X[:, i], lmbda) + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) if self.standardize: - self._scaler = StandardScaler(copy=False) + self._scaler = StandardScaler(copy=False).set_output(transform="default") if force_transform: X = self._scaler.fit_transform(X) else: diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 220950586a6ef..ac7432027f462 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -11,6 +11,7 @@ from . import OneHotEncoder from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import Hidden, Interval, StrOptions, Options from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -192,6 +193,7 @@ def __init__( self.subsample = subsample self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit the estimator. @@ -216,7 +218,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, dtype="numeric") if self.dtype in (np.float64, np.float32): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1fc4b16a52467..de3f983d7ae6f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from scipy import sparse from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in @@ -953,6 +954,7 @@ def _compute_n_features_outs(self): return output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit OneHotEncoder to X. @@ -971,8 +973,6 @@ def fit(self, X, y=None): self Fitted encoder. 
""" - self._validate_params() - if self.sparse != "deprecated": warnings.warn( ( @@ -1446,6 +1446,7 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1464,8 +1465,6 @@ def fit(self, X, y=None): self : object Fitted encoder. """ - self._validate_params() - if self.handle_unknown == "use_encoded_value": if is_scalar_nan(self.unknown_value): if np.dtype(self.dtype).kind != "f": diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index c250c5cd0226e..d7bf1810e61c0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,6 +3,7 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, @@ -197,6 +198,7 @@ def _check_inverse_transform(self, X): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit transformer by checking X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object FunctionTransformer class instance. """ - self._validate_params() X = self._check_input(X, reset=True) if self.check_inverse and not (self.func is None or self.inverse_func is None): self._check_inverse_transform(X) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index ca8607b06c2e2..f656329607ee3 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -16,7 +16,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin - +from ..base import _fit_context from ..utils.sparsefuncs import min_max_axis from ..utils._param_validation import Interval, validate_params from ..utils import column_or_1d @@ -268,6 +268,7 @@ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): self.pos_label = pos_label self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit label binarizer. @@ -282,9 +283,6 @@ def fit(self, y): self : object Returns the instance itself. """ - - self._validate_params() - if self.neg_label >= self.pos_label: raise ValueError( f"neg_label={self.neg_label} must be strictly less than " @@ -761,6 +759,7 @@ def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit the label sets binarizer, storing :term:`classes_`. @@ -776,7 +775,6 @@ def fit(self, y): self : object Fitted estimator. """ - self._validate_params() self._cached_dict = None if self.classes is None: @@ -794,6 +792,7 @@ def fit(self, y): self.classes_[:] = classes return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, y): """Fit the label sets binarizer and transform the given label sets. 
@@ -814,7 +813,6 @@ def fit_transform(self, y): if self.classes is not None: return self.fit(y).transform(y) - self._validate_params() self._cached_dict = None # Automatically increment on new class diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 08ccf6355fc4e..1dfee8a088114 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -12,6 +12,7 @@ from scipy.special import comb from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.fixes import sp_version, parse_version from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight @@ -299,6 +300,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(name) return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Compute number of output features. @@ -316,7 +318,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() _, n_features = self._validate_data(X, accept_sparse=True).shape if isinstance(self.degree, Integral): @@ -802,6 +803,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(f"{input_features[i]}_sp_{j}") return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. @@ -823,8 +825,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted transformer. """ - self._validate_params() - X = self._validate_data( X, reset=True, diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index 9100d72194a32..9dd33ddfa3cce 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -4,6 +4,7 @@ from ._encoders import _BaseEncoder from ..base import OneToOneFeatureMixin +from ..base import _fit_context from ._target_encoder_fast import _fit_encoding_fast from ._target_encoder_fast import _fit_encoding_fast_auto_smooth from ..utils.validation import _check_y, check_consistent_length @@ -176,6 +177,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the :class:`TargetEncoder` to X and y. @@ -192,10 +194,10 @@ def fit(self, X, y): self : object Fitted encoder. """ - self._validate_params() self._fit_encodings_all(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y): """Fit :class:`TargetEncoder` and transform X with the target encoding. 
@@ -219,7 +221,6 @@ def fit_transform(self, X, y): """ from ..model_selection import KFold, StratifiedKFold # avoid circular import - self._validate_params() X_ordinal, X_known_mask, y, n_categories = self._fit_encodings_all(X, y) # The cv splitter is voluntarily restricted to *KFold to enforce non diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2e6fd810fedac..c00de906a7dbb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out(): names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 9e9620e089521..ca0ee41784ab5 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,7 +36,7 @@ from .base import BaseEstimator, TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin - +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import Interval, StrOptions, validate_params from .utils.extmath import safe_sparse_dot @@ -356,6 +356,7 @@ def _compute_inverse_components(self): components = components.toarray() return linalg.pinv(components, check_finite=False) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Generate a sparse random projection matrix. @@ -374,7 +375,6 @@ def fit(self, X, y=None): self : object BaseRandomProjection class instance. """ - self._validate_params() X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] ) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 95fad0713d558..9d7786bc1d67e 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -64,6 +64,7 @@ from scipy.sparse import csgraph from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import rbf_kernel from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot @@ -230,6 +231,7 @@ class labels. probabilities /= normalizer return probabilities + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit a semi-supervised label propagation model to X. @@ -254,7 +256,6 @@ def fit(self, X, y): self : object Returns the instance itself. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 2438658ed89c8..c4706df1754da 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -4,6 +4,7 @@ import numpy as np from ..base import MetaEstimatorMixin, clone, BaseEstimator +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils.validation import check_is_fitted from ..utils.metaestimators import available_if @@ -171,6 +172,10 @@ def __init__( self.max_iter = max_iter self.verbose = verbose + @_fit_context( + # SelfTrainingClassifier.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """ Fit self-training classifier using `X`, `y` as training data. @@ -189,8 +194,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. X, y = self._validate_data( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 55919099e027c..a54c31cecb6e1 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -11,6 +11,7 @@ from . import _liblinear as liblinear # type: ignore from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state @@ -143,6 +144,7 @@ def _more_tags(self): # Used by cross_val_score. return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. @@ -176,8 +178,6 @@ def fit(self, X, y, sample_weight=None): If X is a dense array, then the other methods will not support sparse matrices as input. """ - self._validate_params() - rnd = check_random_state(self.random_state) sparse = sp.isspmatrix(X) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index e035e74a05e2c..a438d007da970 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ._base import _fit_liblinear, _get_liblinear_solver_type, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin +from ..base import _fit_context from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, LinearModel from ..utils import deprecated from ..utils.validation import _num_samples @@ -272,6 +273,7 @@ def __init__( self.penalty = penalty self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -296,8 +298,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, @@ -529,6 +529,7 @@ def __init__( self.dual = dual self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -553,8 +554,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. 
""" - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3b00b5a244ee8..a6e74c12f6e45 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -653,6 +653,21 @@ def fit(self, X, y, prop=None, **kwargs): Klass().fit(None, None) # for coverage +def test_removing_non_existing_param_raises(): + """Test that removing a metadata using UNUSED which doesn't exist raises.""" + + class InvalidRequestRemoval(BaseEstimator): + # `fit` (in this class or a parent) requests `prop`, but we don't want + # it requested at all. + __metadata_request__fit = {"prop": metadata_routing.UNUSED} + + def fit(self, X, y, **kwargs): + return self + + with pytest.raises(ValueError, match="Trying to remove parameter"): + InvalidRequestRemoval().get_metadata_routing() + + def test_method_metadata_request(): mmr = MethodMetadataRequest(owner="test", method="fit") diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py index 3157e344cbef3..99f7f22d92e3d 100644 --- a/sklearn/tests/test_public_functions.py +++ b/sklearn/tests/test_public_functions.py @@ -241,6 +241,7 @@ def _check_function_param_validation( "sklearn.metrics.pairwise.manhattan_distances", "sklearn.metrics.pairwise.nan_euclidean_distances", "sklearn.metrics.pairwise.paired_cosine_distances", + "sklearn.metrics.pairwise.paired_distances", "sklearn.metrics.pairwise.paired_euclidean_distances", "sklearn.metrics.pairwise.paired_manhattan_distances", "sklearn.metrics.pairwise.polynomial_kernel", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..64a444db0b228 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -31,6 +31,7 @@ from sklearn.base import RegressorMixin from sklearn.base import is_classifier from sklearn.base import MultiOutputMixin +from sklearn.base import _fit_context from sklearn.utils import Bunch from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight @@ -120,6 +121,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": [bool], } @abstractmethod @@ -138,6 +140,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): self.criterion = criterion self.splitter = splitter @@ -151,6 +154,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values def get_depth(self): """Return the depth of the decision tree. @@ -180,7 +184,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return not issparse(X) and self._get_tags()["allow_nan"] - def _compute_feature_has_missing(self, X): + def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -192,7 +196,7 @@ def _compute_feature_has_missing(self, X): Returns ------- - feature_has_missing : ndarray of shape (n_features,), or None + missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. 
If missing values are not supported or there are no missing values, return None. """ @@ -213,13 +217,17 @@ def _compute_feature_has_missing(self, X): if not np.isnan(overall_sum): return None - feature_has_missing = _any_isnan_axis0(X) - return feature_has_missing + missing_values_in_feature_mask = _any_isnan_axis0(X) + return missing_values_in_feature_mask def _fit( - self, X, y, sample_weight=None, check_input=True, feature_has_missing=None + self, + X, + y, + sample_weight=None, + check_input=True, + missing_values_in_feature_mask=None, ): - self._validate_params() random_state = check_random_state(self.random_state) if check_input: @@ -227,7 +235,7 @@ def _fit( # We can't pass multi_output=True because that would allow y to be # csr. - # _compute_feature_has_missing will check for finite values and + # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( dtype=DTYPE, accept_sparse="csc", force_all_finite=False @@ -240,7 +248,9 @@ def _fit( else: X = self._validate_data(X, **check_X_params) - feature_has_missing = self._compute_feature_has_missing(X) + missing_values_in_feature_mask = ( + self._compute_missing_values_in_feature_mask(X) + ) if issparse(X): X.sort_indices() @@ -388,7 +398,7 @@ def _fit( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -397,6 +407,9 @@ def _fit( random_state, ) + if self.store_leaf_values: + self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples + return self def _build_tree( @@ -404,7 +417,7 @@ def _build_tree( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -483,6 +496,7 @@ def _build_tree( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -493,9 +507,10 @@ def _build_tree( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, feature_has_missing) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -551,6 +566,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -577,6 +595,128 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_nodes_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -851,6 +991,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -896,6 +1046,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + leaf_nodes_samples_ : dict + A dictionary of leaf node index and the y_train samples in that leaf. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -965,6 +1118,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -979,8 +1133,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). @@ -1327,6 +1483,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1340,8 +1497,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree regressor from the training set (X, y). @@ -1653,6 +1812,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1667,6 +1827,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @@ -1880,6 +2041,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1893,4 +2055,5 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 721b475f40436..31c10ccfe4f93 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -92,7 +92,7 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c3f08ec859bee..dfa64c1184df5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -258,9 +258,17 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil: - cdef SIZE_t i, j + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef SIZE_t i, j, k # Resize the destination vector of vectors dest.resize(self.n_node_samples) @@ -272,7 +280,8 @@ cdef class Criterion(BaseCriterion): # Get the sample values for each output for k in range(self.n_outputs): - dest[i][k].push_back(self.y[j, k]) + dest[i].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fb21f676e66cc..915b2baa30e94 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -104,10 +104,10 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7f21d5da545fb..1f3d164370b95 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -168,7 +168,7 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: """Initialize the splitter. @@ -245,7 +245,7 @@ cdef class Splitter(BaseSplitter): self.end ) - if feature_has_missing is not None: + if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() return 0 @@ -280,7 +280,7 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) @@ -903,19 +903,19 @@ cdef class DensePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask def __init__( self, const DTYPE_t[:, :] X, SIZE_t[::1] samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): self.X = X self.samples = samples self.feature_values = feature_values - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -938,13 +938,13 @@ cdef class DensePartitioner: const DTYPE_t[:, :] X = self.X SIZE_t[::1] samples = self.samples SIZE_t n_missing = 0 - const unsigned char[::1] feature_has_missing = self.feature_has_missing + const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # Sort samples along that feature; by # copying the values into an array and # sorting the array in a manner which utilizes the cache more # effectively. - if feature_has_missing is not None and feature_has_missing[current_feature]: + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: i, current_end = self.start, self.end - 1 # Missing values are placed at the end and do not participate in the sorting. 
while i <= current_end: @@ -1113,7 +1113,7 @@ cdef class SparsePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask cdef const DTYPE_t[::1] X_data cdef const INT32_t[::1] X_indices @@ -1134,7 +1134,7 @@ cdef class SparsePartitioner: SIZE_t[::1] samples, SIZE_t n_samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): if not isspmatrix_csc(X): raise ValueError("X should be in csc format") @@ -1158,7 +1158,7 @@ cdef class SparsePartitioner: for p in range(n_samples): self.index_to_samples[samples[p]] = p - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -1529,11 +1529,11 @@ cdef class BestSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1555,11 +1555,11 @@ cdef class BestSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1581,11 +1581,11 @@ cdef class RandomSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1607,11 +1607,11 @@ cdef class RandomSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, 
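Across all splitter and partitioner variants above, `feature_has_missing` is renamed to `missing_values_in_feature_mask`; it is the same per-feature boolean mask, computed once at the Python level before building. A plain-NumPy equivalent of that computation, for illustration only (the actual helper is the Cython `_any_isnan_axis0`, guarded by a finiteness check):

import numpy as np

def missing_values_in_feature_mask(X):
    # Illustrative NumPy sketch of the mask threaded through the splitters:
    # None when nothing is missing, otherwise True for columns with any NaN.
    X = np.asarray(X, dtype=np.float32)
    if not np.isnan(X.sum()):  # cheap overall check first
        return None
    return np.isnan(X).any(axis=0)

X = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, 6.0]])
print(missing_values_in_feature_mask(X))  # [False  True False]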
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 94714cc33400c..828c99a2f4ea1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -49,13 +49,6 @@ cdef class BaseTree: cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample cdef double* value # Array of values prediction values for each node - # Enables the use of tree to store distributions of the output to allow - # arbitrary usage of the the leaves. This is used in the quantile - # estimators for example. - # for storing samples at each leaf node with leaf's node ID as the key and - # the sample values as the value - cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples - # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil @@ -121,9 +114,18 @@ cdef class Tree(BaseTree): cdef public SIZE_t n_outputs # Number of outputs in y cdef public SIZE_t max_n_classes # max(n_classes) + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + # Methods cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) cpdef cnp.ndarray predict(self, object X) @@ -146,7 +148,7 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cdef unsigned char store_leaf_values # Whether to store leaf values + cdef unsigned char store_leaf_values # Whether to store leaf values cpdef build( self, @@ -154,7 +156,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=*, - const unsigned char[::1] feature_has_missing=*, + const unsigned char[::1] missing_values_in_feature_mask=*, ) cdef _check_input( diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8ca98a64b42ab..1565ab441969d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -100,7 +100,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" pass @@ -182,7 +182,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -208,7 +208,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef SIZE_t start cdef SIZE_t end @@ -229,8 +229,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 - cdef int node_idx - cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -319,11 +317,8 @@ 
cdef class DepthFirstTreeBuilder(TreeBuilder): "impurity": split.impurity_left, "n_constant_features": n_constant_features}) elif self.store_leaf_values and is_leaf: - with gil: - print('Storing leaf values...') - # copy leaf values to leaf_values array - splitter.node_samples(&tree.value_samples[node_id]) + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -406,7 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -418,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef vector[FrontierRecord] frontier cdef FrontierRecord record @@ -459,6 +454,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -1321,6 +1319,14 @@ cdef class Tree(BaseTree): def value(self): return self._get_value_ndarray()[:self.node_count] + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): @@ -1374,6 +1380,7 @@ cdef class Tree(BaseTree): d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples return d def __setstate__(self, d): @@ -1407,6 +1414,35 @@ cdef class Tree(BaseTree): memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(double)) + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as 
a 3-d NumPy array. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index eefae6cdaa3f6..44a19b3dc0520 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -890,7 +890,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -909,6 +909,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -923,14 +924,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2634,3 +2646,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + 
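A rough end-to-end sketch of what these tests exercise (``store_leaf_values``, the ``tree_.leaf_nodes_samples`` mapping and its pickling support are this fork's additions; everything else is stock scikit-learn):

    import pickle
    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    est = DecisionTreeRegressor(max_depth=3, store_leaf_values=True, random_state=0)
    est.fit(X, y)

    # one entry of stored training targets per leaf, keyed by the leaf's node id
    leaf_ids = est.apply(X)
    assert set(np.unique(leaf_ids)) == set(est.tree_.leaf_nodes_samples.keys())

    # the stored samples round-trip through pickle together with the tree
    est2 = pickle.loads(pickle.dumps(est))
    for node_id, arr in est.tree_.leaf_nodes_samples.items():
        np.testing.assert_array_equal(arr, est2.tree_.leaf_nodes_samples[node_id])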
assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 82b3eec69b461..a1cd934c13756 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -241,8 +241,14 @@ def add_request( if alias == param: alias = True - if alias == UNUSED and param in self._requests: - del self._requests[param] + if alias == UNUSED: + if param in self._requests: + del self._requests[param] + else: + raise ValueError( + f"Trying to remove parameter {param} with UNUSED which doesn't" + " exist." 
+ ) else: self._requests[param] = alias @@ -1155,7 +1161,7 @@ def _build_request_for_signature(cls, router, method): # ignore the first parameter of the method, which is usually "self" params = list(inspect.signature(getattr(cls, method)).parameters.items())[1:] for pname, param in params: - if pname in {"X", "y", "Y"}: + if pname in {"X", "y", "Y", "Xt", "yt"}: continue if param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD}: continue diff --git a/sklearn/utils/_plotting.py b/sklearn/utils/_plotting.py index cc301b509e386..c0671046c9cd4 100644 --- a/sklearn/utils/_plotting.py +++ b/sklearn/utils/_plotting.py @@ -1,3 +1,5 @@ +import numpy as np + from . import check_consistent_length, check_matplotlib_support from .multiclass import type_of_target from .validation import _check_pos_label_consistency @@ -56,3 +58,41 @@ def _validate_from_predictions_params( name = name if name is not None else "Classifier" return pos_label, name + + +def _validate_score_name(score_name, scoring, negate_score): + """Validate the `score_name` parameter. + + If `score_name` is provided, we just return it as-is. + If `score_name` is `None`, we use `Score` if `negate_score` is `False` and + `Negative score` otherwise. + If `score_name` is a string or a callable, we infer the name. We replace `_` by + spaces and capitalize the first letter. We remove `neg_` and replace it by + `"Negative"` if `negate_score` is `False` or just remove it otherwise. + """ + if score_name is not None: + return score_name + elif scoring is None: + return "Negative score" if negate_score else "Score" + else: + score_name = scoring.__name__ if callable(scoring) else scoring + if negate_score: + if score_name.startswith("neg_"): + score_name = score_name[4:] + else: + score_name = f"Negative {score_name}" + elif score_name.startswith("neg_"): + score_name = f"Negative {score_name[4:]}" + score_name = score_name.replace("_", " ") + return score_name.capitalize() + + +def _interval_max_min_ratio(data): + """Compute the ratio between the largest and smallest inter-point distances. + + A value larger than 5 typically indicates that the parameter range would + better be displayed with a log scale while a linear scale would be more + suitable otherwise. + """ + diff = np.diff(np.sort(data)) + return diff.max() / diff.min() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index cb1e0f2b1fa4d..7d8e673210ff7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4424,7 +4424,7 @@ def _output_from_fit_transform(transformer, name, X, df, y): return outputs -def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): +def _check_generated_dataframe(name, case, index, outputs_default, outputs_pandas): import pandas as pd X_trans, feature_names_default = outputs_default @@ -4434,7 +4434,12 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. 
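The axis-scale heuristic added to ``_plotting.py`` above can be reproduced by hand; a small sketch of the computation it performs (the helper itself is the private ``_interval_max_min_ratio``, the name below is only illustrative):

    import numpy as np

    def max_min_gap_ratio(data):
        # ratio between the largest and smallest gap of the sorted values
        gaps = np.diff(np.sort(data))
        return gaps.max() / gaps.min()

    max_min_gap_ratio(np.linspace(0, 1, 5))     # == 1.0, a linear scale is fine
    max_min_gap_ratio(np.geomspace(0.1, 1, 5))  # ~ 5.6, a log scale reads better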
- expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) + # If a dataframe is passed into transform, then the output should have the same + # index + expected_index = index if case.endswith("df") else None + expected_dataframe = pd.DataFrame( + X_trans, columns=feature_names_pandas, copy=False, index=expected_index + ) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4469,7 +4474,8 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4483,7 +4489,7 @@ def check_set_output_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) @@ -4511,7 +4517,8 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4528,5 +4535,5 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) diff --git a/sklearn/utils/tests/test_param_validation.py b/sklearn/utils/tests/test_param_validation.py index 528a667a3f58e..022f9f373a049 100644 --- a/sklearn/utils/tests/test_param_validation.py +++ b/sklearn/utils/tests/test_param_validation.py @@ -6,6 +6,7 @@ from sklearn._config import config_context, get_config from sklearn.base import BaseEstimator +from sklearn.base import _fit_context from sklearn.model_selection import LeaveOneOut from sklearn.utils import deprecated from sklearn.utils._param_validation import Hidden @@ -60,8 +61,9 @@ class _Estimator(BaseEstimator): def __init__(self, a): self.a = a + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): - self._validate_params() + pass @pytest.mark.parametrize("interval_type", [Integral, Real]) diff --git a/sklearn/utils/tests/test_plotting.py b/sklearn/utils/tests/test_plotting.py new file mode 100644 index 0000000000000..00b1f7f74fcd0 --- /dev/null +++ b/sklearn/utils/tests/test_plotting.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.utils._plotting import _validate_score_name, _interval_max_min_ratio + + +def metric(): + pass # pragma: no cover + + +def neg_metric(): + pass # pragma: no cover + + +@pytest.mark.parametrize( + "score_name, scoring, negate_score, expected_score_name", + [ + ("accuracy", None, False, "accuracy"), # do not transform the name + (None, "accuracy", False, "Accuracy"), # capitalize the name + (None, "accuracy", True, "Negative accuracy"), # add "Negative" + (None, "neg_mean_absolute_error", False, 
"Negative mean absolute error"), + (None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_" + ("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name + (None, None, False, "Score"), # default name + (None, None, True, "Negative score"), # default name but negated + ("Some metric", metric, False, "Some metric"), # do not transform the name + ("Some metric", metric, True, "Some metric"), # do not transform the name + (None, metric, False, "Metric"), # default name + (None, metric, True, "Negative metric"), # default name but negated + ("Some metric", neg_metric, False, "Some metric"), # do not transform the name + ("Some metric", neg_metric, True, "Some metric"), # do not transform the name + (None, neg_metric, False, "Negative metric"), # default name + (None, neg_metric, True, "Metric"), # default name but negated + ], +) +def test_validate_score_name(score_name, scoring, negate_score, expected_score_name): + """Check that we return the right score name.""" + assert ( + _validate_score_name(score_name, scoring, negate_score) == expected_score_name + ) + + +# In the following test, we check the value of the max to min ratio +# for parameter value intervals to check that using a decision threshold +# of 5. is a good heuristic to decide between linear and log scales on +# common ranges of parameter values. +@pytest.mark.parametrize( + "data, lower_bound, upper_bound", + [ + # Such a range could be clearly displayed with either log scale or linear + # scale. + (np.geomspace(0.1, 1, 5), 5, 6), + # Checking that the ratio is still positive on a negative log scale. + (-np.geomspace(0.1, 1, 10), 7, 8), + # Evenly spaced parameter values lead to a ratio of 1. + (np.linspace(0, 1, 5), 0.9, 1.1), + # This is not exactly spaced on a log scale but we will benefit from treating + # it as such for visualization. 
+ ([1, 2, 5, 10, 20, 50], 20, 40), + ], +) +def test_inverval_max_min_ratio(data, lower_bound, upper_bound): + assert lower_bound < _interval_max_min_ratio(data) < upper_bound diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4a765d1404794..2d39279f81745 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,6 +42,7 @@ from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, + _is_fitted, check_is_fitted, check_consistent_length, assert_all_finite, @@ -848,23 +849,32 @@ def fit(self, X, y): msg = "not fitted" est = MyEstimator() + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.a_ = "a" + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.b_ = "b" + assert _is_fitted(est, attributes=["a_", "b_"]) check_is_fitted(est, attributes=["a_", "b_"]) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6179d91c2a491..8ceef15986567 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1369,6 +1369,44 @@ def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=Fal return array +def _is_fitted(estimator, attributes=None, all_or_any=all): + """Determine if an estimator is fitted + + Parameters + ---------- + estimator : estimator instance + Estimator instance for which the check is performed. + + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + + all_or_any : callable, {all, any}, default=all + Specify whether all or any of the given attributes must exist. + + Returns + ------- + fitted : bool + Whether the estimator is fitted. 
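A minimal sketch of how the new private helper behaves, following the precedence in the implementation below (explicit ``attributes`` first, then ``__sklearn_is_fitted__``, then the trailing-underscore convention):

    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.validation import _is_fitted

    lr = LogisticRegression()
    assert not _is_fitted(lr)                 # no fitted attributes yet
    lr.fit([[0.0], [1.0], [2.0]], [0, 1, 1])
    assert _is_fitted(lr)                     # coef_, classes_, ... now exist
    assert _is_fitted(lr, attributes=["coef_", "intercept_"])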
+ """ + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + return all_or_any([hasattr(estimator, attr) for attr in attributes]) + + if hasattr(estimator, "__sklearn_is_fitted__"): + return estimator.__sklearn_is_fitted__() + + fitted_attrs = [ + v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") + ] + return len(fitted_attrs) > 0 + + def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. @@ -1425,18 +1463,7 @@ def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." % (estimator)) - if attributes is not None: - if not isinstance(attributes, (list, tuple)): - attributes = [attributes] - fitted = all_or_any([hasattr(estimator, attr) for attr in attributes]) - elif hasattr(estimator, "__sklearn_is_fitted__"): - fitted = estimator.__sklearn_is_fitted__() - else: - fitted = [ - v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") - ] - - if not fitted: + if not _is_fitted(estimator, attributes, all_or_any): raise NotFittedError(msg % {"name": type(estimator).__name__}) From 855ee192407d19b51adb4f50a49c6752ee80c820 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 20:32:20 -0400 Subject: [PATCH 15/28] Add quantile Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e715952947c04..b43bbeaf0b435 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -730,7 +730,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 64a444db0b228..d7d8cedb63696 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -641,7 +641,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. From 3f5cb6597e36a08f651f8f0eb7324e9658a14bea Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 11:05:43 -0400 Subject: [PATCH 16/28] Add check input Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 -- sklearn/tree/_classes.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b43bbeaf0b435..c51c489dbd5dd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -731,8 +731,6 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): method : str, optional The method to interpolate, by default 'linear'. Can be any keyword argument accepted by :func:`~np.quantile`. - check_input : bool, optional - Whether or not to check input, by default True. 
Returns ------- diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d7d8cedb63696..78454b8854d26 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -602,6 +602,8 @@ def get_leaf_node_samples(self, X, check_input=True): ---------- X : array-like of shape (n_samples, n_features) Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. Returns ------- From 7401ddcb19a42132cf46e79a14b22a2bdfb8519c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:35:39 -0400 Subject: [PATCH 17/28] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 78454b8854d26..c75c933c49b39 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -607,12 +607,11 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_nodes_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array has shape (n_leaf_node_samples, n_outputs). """ if not self.store_leaf_values: raise RuntimeError( From 13e29135bd0b640f3bf325ec40a22a879096b719 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:41:17 -0400 Subject: [PATCH 18/28] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c75c933c49b39..2d83a94dc8ec1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1167,7 +1167,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, From 43aa3ef51ca96b58b00a178954d033579db09de9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 10:41:44 -0400 Subject: [PATCH 19/28] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c51c489dbd5dd..5482ebcaf1d41 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -817,12 +817,11 @@ def get_leaf_node_samples(self, X): Returns ------- - leaf_node_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_node_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
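A short sketch of the ragged structure documented above (shapes follow this docstring; ``store_leaf_values`` and ``get_leaf_node_samples`` are additions from this fork):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    X = np.arange(30, dtype=float).reshape(-1, 1)
    y = X.ravel() ** 2
    rf = RandomForestRegressor(n_estimators=5, max_depth=2,
                               store_leaf_values=True, random_state=0).fit(X, y)

    per_sample = rf.get_leaf_node_samples(X[:3])
    assert len(per_sample) == 3               # one array per query sample
    for arr in per_sample:
        # training targets collected from the leaves this query sample reaches;
        # the first dimension varies from sample to sample
        assert arr.shape[1] == 1              # n_outputs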
""" check_is_fitted(self) # Check data From fe3072f4ee28f49d590e7b437bf01bffd61ab917 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 11:01:09 -0400 Subject: [PATCH 20/28] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5482ebcaf1d41..9fd3af21b1fd9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -696,7 +696,6 @@ def _bin_data(self, X, is_training_data): If is_training_data, then fit the _bin_mapper attribute. Else, the binned data is converted to a C-contiguous array. """ - description = "training" if is_training_data else "validation" if self.verbose: print( From 2d4de9aff7567bf796626aed4f27149f6ccf399c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:33:55 -0400 Subject: [PATCH 21/28] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9fd3af21b1fd9..f85efb0b0a43b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,9 +733,9 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles) or - (n_samples, n_quantiles, n_outputs) - The predicted values. + y : ndarray of shape (n_samples, n_quantiles, [n_output]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. """ if not self.store_leaf_values: raise RuntimeError( From 1c1ec8cff3a181b7a86a4df8a2aeb01fa7cdbe6a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:35:33 -0400 Subject: [PATCH 22/28] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f85efb0b0a43b..3eb61c9497918 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,7 +733,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles, [n_output]) + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) The predicted values. The ``n_outputs`` dimension is present only for multi-output regressors. 
""" From 4bc651dd7916d7c267690ef0c9705b3f2d69c9d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 12:02:45 -0400 Subject: [PATCH 23/28] Remove some diff Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - sklearn/tree/_criterion.pyx | 18 ++++++++++++++++++ sklearn/tree/_tree.pxd | 3 ++- sklearn/tree/_tree.pyx | 2 -- sklearn/tree/tests/test_tree.py | 8 +++++--- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 74e60c64ce85f..e61f674d300c9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -511,7 +511,6 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 178a9adee9e80..2ddc02194c490 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -155,8 +155,10 @@ cdef class BaseCriterion: This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: + N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) + where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, @@ -165,8 +167,10 @@ cdef class BaseCriterion: ---------- impurity_parent : double The initial impurity of the parent node before the split + impurity_left : double The impurity of the left child + impurity_right : double The impurity of the right child @@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let + count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) + be the proportion of class k observations in node m. The cross-entropy is then defined as + cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ @@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The MSE proxy is derived from + sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 + Neglecting constant terms, this gives: + - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion): ---------- n_outputs : SIZE_t The number of targets to be predicted + n_samples : SIZE_t The total number of samples to fit on """ @@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. Uses the formula (35) in Friedman's original Gradient Boosting paper: + diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ @@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): @@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The Poisson proxy is derived from: + sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) + Neglecting constant terms, this gives + - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 7b933d905c79a..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -141,7 +141,8 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. - cdef Splitter splitter + cdef Splitter splitter # Splitting algorithm + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 24b01b96aa726..c44022f54d3a5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -61,7 +61,6 @@ cdef extern from "" namespace "std" nogil: from numpy import float32 as DTYPE from numpy import float64 as DOUBLE - cdef double INFINITY = np.inf cdef double EPSILON = np.finfo('double').eps @@ -87,7 +86,6 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= - cdef class TreeBuilder: """Interface for different tree building strategies.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 792ba44b1302e..9be3dbd6f549e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -33,13 +33,15 @@ DENSE_SPLITTERS, SPARSE_SPLITTERS, ) -from sklearn.tree._tree import NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED -from sklearn.tree._tree import Tree as CythonTree from sklearn.tree._tree import ( + NODE_DTYPE, + TREE_LEAF, + TREE_UNDEFINED, _check_n_classes, _check_node_ndarray, _check_value_ndarray, ) +from sklearn.tree._tree import Tree as CythonTree from sklearn.utils import _IS_32BIT, compute_sample_weight from sklearn.utils._testing import ( assert_almost_equal, @@ -2424,7 +2426,7 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) dtc.fit(X, y) # Goes to right node because it has the most data points From cc035d04b9784e6facb7096a56c9c81801d819ec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 15:42:08 -0400 Subject: [PATCH 24/28] Fix regression error Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 18 +++++++++--------- sklearn/tree/_criterion.pyx | 3 ++- sklearn/tree/_splitter.pyx | 6 ++++++ sklearn/tree/tests/test_tree.py | 4 +++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d8a94940799c0..f2e0201d534cd 100644 --- a/sklearn/ensemble/_forest.py +++ 
b/sklearn/ensemble/_forest.py @@ -40,27 +40,28 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin, + _fit_context, + is_classifier, ) - -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder from sklearn.tree import ( BaseDecisionTree, @@ -69,8 +70,8 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeClassifier, ExtraTreeRegressor, ) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed @@ -80,9 +81,8 @@ class calls the ``fit`` method of each sub-estimator on random samples _num_samples, check_is_fitted, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators + +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2ddc02194c490..bd1bdef0a6a93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1496,10 +1496,11 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 007d55a589df7..bca38d5f04374 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -507,6 +507,12 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9be3dbd6f549e..0ce7a548c7bdb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2426,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points From 4840d4e3e3ef6175c4e1197c87c77f8fe06f10cf Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 18:26:04 -0400 Subject: [PATCH 25/28] Fix boolean Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f2e0201d534cd..b3feec10a3072 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -221,7 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e61f674d300c9..6825c36df155c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -123,7 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod From fdf2e2dbe1e1c316a1e2987aea31da26ebbec2cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:49:16 -0700 Subject: [PATCH 26/28] Added doc to store_leaf_values Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b3feec10a3072..34bebab399566 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -822,6 +822,11 @@ def get_leaf_node_samples(self, X): samples, since the number of samples that fall in a leaf node is variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
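The pre-split check above counts the candidate children before testing ``min_samples_leaf``; a plain-Python sketch of that bookkeeping (the Cython hunk is the authoritative version, the helper name here is illustrative):

    def candidate_child_sizes(pos, start, end_non_missing, n_missing,
                              missing_go_to_left):
        # samples with missing values are appended to whichever side they are
        # routed to, so the left/right counts must include them explicitly
        if missing_go_to_left:
            n_left = pos - start + n_missing
            n_right = end_non_missing - pos
        else:
            n_left = pos - start
            n_right = end_non_missing - pos + n_missing
        return n_left, n_right

    # e.g. 10 non-missing samples split after the 4th, plus 2 missing routed left
    assert candidate_child_sizes(4, 0, 10, 2, True) == (6, 6)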
""" + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -1520,6 +1525,9 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1879,6 +1887,9 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2232,6 +2243,9 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2576,6 +2590,9 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` From 5b7ce7e1c6842aac174ebc4b1b2a68a1f1e25a7d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:51:20 -0700 Subject: [PATCH 27/28] Merging main Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6825c36df155c..200f87b0b9ef3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1386,6 +1386,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1713,6 +1723,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1959,6 +1979,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- max_features_ : int From 9655d013870e3007d5c5a1898212a9d0eeea0968 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 13:03:26 -0700 Subject: [PATCH 28/28] Fix now Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 34bebab399566..768eeeaf1959f 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -63,13 +63,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from sklearn.tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions @@ -82,6 +75,13 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, ) +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ..tree._tree import DOUBLE, DTYPE __all__ = [