From 0ac8a71ee2ad670d7b3a39201f63e1b48938204c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 28 Mar 2023 19:58:02 -0700 Subject: [PATCH 01/28] [EXAMPLE DIFF] (Tree featuresv2) Fork of sklearn that maintains all necessary refactorings to enable downstream functionality (#32) #### Reference Issues/PRs This is the most up-to-date PR branch to consolidate all proposed refactor changes that work with: - unsupervised trees - oblique trees - no performance/runtime regressions against main #### What does this implement/fix? Explain your changes. Incorporates refactors to: Internal Cython of scikit-learn's: - criterion - splitter - tree Internals of Python in scikit-learns: - python Tree Adds the basic implementation of oblique trees. The implementation of oblique trees has been tested on all sklearn's `check_estimator` testing function and has error-checking bounds for the new hyperparameter introduced, which is `feature_combinations` that defaults to ``min(1.5, n_features)``. TODO: 1. [ ] ~Add honest support for trees (splitting the data at the Python API level)~ 2. [x] Build wheels 3. [ ] ~Brainstorm unit-tests, or weekly checks to determine when our fork is out-of-date compared to upstream sklearn~ 4. [x] Revamp README for the fork #### Any other comments? [cd build] --------- Signed-off-by: Adam Li Co-authored-by: Chester Huynh Co-authored-by: Parth Vora --- .circleci/config.yml | 33 +- .cirrus.star | 4 +- .github/workflows/check-changelog.yml | 3 +- .github/workflows/check-manifest.yml | 2 +- .github/workflows/labeler-module.yml | 4 +- .github/workflows/update_tracking_issue.yml | 2 +- .github/workflows/wheels.yml | 33 +- .gitignore | 1 + Makefile | 3 + README.rst | 322 ++++++---- build_tools/azure/install.sh | 2 +- build_tools/azure/install_win.sh | 2 +- doc/Makefile | 2 + doc/conf.py | 3 +- doc/modules/tree.rst | 61 +- examples/tree/plot_iris_dtc.py | 4 - setup.py | 45 +- sklearn/ensemble/_forest.py | 108 +++- sklearn/ensemble/tests/test_forest.py | 171 +++++ sklearn/tree/_classes.py | 162 +++-- sklearn/tree/_criterion.pxd | 45 +- sklearn/tree/_criterion.pyx | 285 ++++----- sklearn/tree/_splitter.pxd | 41 +- sklearn/tree/_splitter.pyx | 165 +++-- sklearn/tree/_tree.pxd | 90 ++- sklearn/tree/_tree.pyx | 659 ++++++++++++-------- sklearn/tree/tests/test_tree.py | 32 +- 27 files changed, 1499 insertions(+), 785 deletions(-) mode change 100755 => 100644 setup.py diff --git a/.circleci/config.yml b/.circleci/config.yml index e2f54c0665c78..e4e66b5c57f49 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -94,22 +94,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. 
+ # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 061d0094b38c5..8092711f07e45 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.0 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml 
b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index 89600846100a8..1e28896f50be6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index 5ea64dc0d6cac..148027b30f59f 100644 --- a/Makefile +++ b/Makefile @@ -63,3 +63,6 @@ doc-noplot: inplace code-analysis: flake8 sklearn | grep -v __init__ | grep -v external pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 5e2de6a6d8b46..fbdfdaa95ef4c 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,36 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. +It is currently maintained by a team of volunteers. -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. -It is currently maintained by a team of volunteers. +Why a fork? 
+-----------
+Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize
+and improve the extensibility of the code are currently unsupported, or may take a long time.
+Advanced tree models that also leverage the robustness of scikit-learn are nonetheless desirable.
+
+However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package
+altogether is undesirable because it results in a tree codebase that is inherently different
+and not compatible with ``scikit-learn``. For example, `quantile-forests `_,
+and `EconML `_ do this, and their current tree submodules
+cannot take advantage of improvements made in upstream ``scikit-learn``.
+
+An example of seamless integration would be `scikit-survival `_, which
+only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees.
 
-Website: https://scikit-learn.org
+Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop
+a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule,
+and can also be synced with upstream changes in ``scikit-learn``. This enables this fork to always
+take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable
+tree API.
 
 Installation
 ------------
 
@@ -73,133 +89,195 @@ scikit-learn requires:
 - joblib (>= |JoblibMinVersion|)
 - threadpoolctl (>= |ThreadpoolctlMinVersion|)
 
-=======
+============================
+Installing scikit-learn-tree
+============================
 
-**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.**
-scikit-learn 1.0 and later require Python 3.7 or newer.
-scikit-learn 1.1 and later require Python 3.8 or newer.
+Scikit-learn-tree is a maintained fork of scikit-learn, which extends the
+tree submodule in a few ways documented in the :ref:`changelog of the fork
+<fork-changelog>`.
 
-Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and
-classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|).
-For running the examples Matplotlib >= |MatplotlibMinVersion| is required.
-A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples
-require pandas >= |PandasMinVersion|, some examples require seaborn >=
-|SeabornMinVersion| and plotly >= |PlotlyMinVersion|.
+We release versions of scikit-learn-tree in an analogous fashion to
+scikit-learn main. Due to maintenance resources, we only release on PyPI
+and therefore recommend installing with ``pip``.
 
-User installation
-~~~~~~~~~~~~~~~~~
+There are different ways to install scikit-learn-tree:
 
-If you already have a working installation of numpy and scipy,
-the easiest way to install scikit-learn is using ``pip``::
+  * :ref:`Install the latest official release <install_fork_release>`. This
+    is the best approach for most users. It will provide a stable version
+    and pre-built packages are available for most platforms.
+
+  * :ref:`Building the package from source
+    <install_source>`. This is best for users who want the
+    latest-and-greatest features and aren't afraid of running
+    brand-new code. This is also needed for users who wish to contribute to the
+    project.
 
-    pip install -U scikit-learn
+.. _install_fork_release:
 
-or ``conda``::
+Installing the latest release
+-----------------------------
+We release wheels for common platforms, so the package is installable via pip.
 
-    conda install -c conda-forge scikit-learn
 
+..
prompt:: bash $ + + pip install scikit-learn-tree -The documentation includes more detailed `installation instructions `_. +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then +can be used as a stand-in for any package that relies on the public API of ``sklearn``. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` -Changelog ---------- + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -See the `changelog `__ -for a history of notable changes to scikit-learn. +.. _install_source: + +Building from source +-------------------- +If you are a developer and are interested in helping maintain, or add some new +features to the fork, the building from source instructions are exactly the same +as that of scikit-learn main, so please refer to `scikit-learn documentation `_ +for instructions on building from source. Development ------------ +=========== -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, specifically to maintain the fork. +Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, +or improves the tree submodule in anyway will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn -- Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
-
-Help and Support
------------------
-
-Documentation
-~~~~~~~~~~~~~
-
-- HTML documentation (stable release): https://scikit-learn.org
-- HTML documentation (development version): https://scikit-learn.org/dev/
-- FAQ: https://scikit-learn.org/stable/faq.html
-
-Communication
-~~~~~~~~~~~~~
-
-- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn
-- Gitter: https://gitter.im/scikit-learn/scikit-learn
-- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos
-- Blog: https://blog.scikit-learn.org
-- Calendar: https://blog.scikit-learn.org/calendar/
-- Twitter: https://twitter.com/scikit_learn
-- Twitter (commits): https://twitter.com/sklearn_commits
-- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn
-- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions
-- Website: https://scikit-learn.org
-- LinkedIn: https://www.linkedin.com/company/scikit-learn
-- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists
-- Facebook: https://www.facebook.com/scikitlearnofficial/
-- Instagram: https://www.instagram.com/scikitlearnofficial/
-- TikTok: https://www.tiktok.com/@scikit.learn
-
-Citation
-~~~~~~~~
-
-If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn
+.. _fork-changelog:
+
+Major Changes of the Fork
+=========================
+
+The purpose of this page is to illustrate some of the main features that
+``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes an
+understanding of the core package ``scikit-learn`` and also of decision tree
+models. Please refer to our :ref:`installation instructions
+<install_fork_release>` for installing ``scikit-learn-tree``.
+
+Scikit-learn-tree otherwise operates as a stand-in for upstream ``scikit-learn``.
+It is used in packages exactly the same way and will support all features
+in the corresponding version of ``scikit-learn``. For example, if you
+are interested in the features of the ``NearestNeighbors`` algorithm in ``scikit-learn`` v1.2.2,
+then, if ``scikit-learn-tree`` has a v1.2.2 release, that release will have
+all of those features.
+
+The breaking API changes will be with respect to anything in the ``tree`` submodule,
+and related Forest ensemble models. See below for a detailed list of breaking changes.
+
+See: https://scikit-learn.org/ for documentation on scikit-learn main.
+
+Our Philosophy
+--------------
+Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes
+as possible, such that incorporating upstream changes into the fork requires minimal effort.
+
+Candidate changes and PRs accepted into the fork are those that:
+
+- improve compatibility with upstream ``scikit-learn`` main
+- enable improved extensibility of tree models
+
+Decision tree generalizations
+-----------------------------
+
+``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier`
+decision tree model (classifier and regressor), which has a few fundamental limitations
+that prevent 3rd parties from utilizing the existing class without forking a large
+amount of copy/pasted Python and Cython code. We highlight those limitations here
+and then describe how we generalize each of them.
+
+Cython Internal Private API:
+
+Note that the Cython API for scikit-learn is still not a publicly supported API, so it may
+change without warning.
+
+- leaf and split nodes: These nodes are treated the same way and there is no internal
+  API for setting them differently. Quantile trees and causal trees inherently generalize
+  how leaf nodes are set.
+- Criterion class: The criterion class currently assumes a supervised learning interface.
+  - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria.
+- Splitter class: The splitter class currently assumes a supervised learning interface and
+  does not provide a way of generalizing the way split candidates are proposed.
+  - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and ``Splitter.node_split`` function. For example, this enables oblique splits to be considered.
+- Tree class: The tree class currently assumes a supervised learning interface and does not
+  provide a way of generalizing the type of tree.
+  - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now.
+- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various
+  stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions
+  may be extended. For example, in causal trees, one may want the splitter to also account for
+  a minimal degree of heterogeneity (i.e. variance) in its child nodes.
+
+Python API:
+
+- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y``
+  parameter is required to be passed in, which is not necessary for general tree-based models.
+  For example, an unsupervised tree may pass in ``y=None``.
+  - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined.
+- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter``
+  and ``Tree`` Cython classes used: The current codebase requires users to define custom
+  criteria and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents
+  users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper.
+  Moreover, the ``Tree`` class is not customizable.
+  - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them (see the sketch after this list).
+- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning
+  features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational
+  trick that can significantly increase runtime efficiency and also help prevent overfitting in trees, since
+  the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable
+  random forests and their variants to scale to millions of samples.
+  - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses (see the usage sketch after this list). The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below.
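+
+As a small, non-authoritative sketch of the ``max_bins`` keyword introduced by this fork: the
+data and the value ``max_bins=255`` below are only examples, and ``apply`` returns one leaf
+index per sample and per tree, as in upstream ``scikit-learn``
+
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> X = [[ 1,  2,  3],  # 2 samples, 3 features
+    ...      [11, 12, 13]]
+    >>> y = [0, 1]  # classes of each sample
+    >>> forest = RandomForestClassifier(n_estimators=10, random_state=0, max_bins=255).fit(X, y)
+    >>> forest.apply(X).shape  # one leaf index per (sample, tree) pair
+    (2, 10)
+
+Similarly, a 3rd-party package can hook into tree construction by overriding the private
+``BaseDecisionTree._build_tree`` method. The snippet below is only a structural sketch of that
+pattern: the class name is made up, and a real extension would construct its own criterion,
+splitter, or tree inside the override instead of delegating to the default builder::
+
+    from sklearn.tree import DecisionTreeClassifier
+
+    class MyCustomTree(DecisionTreeClassifier):
+        """Toy subclass that hooks into tree construction via ``_build_tree``."""
+
+        def _build_tree(self, X, y, sample_weight, min_samples_leaf,
+                        min_weight_leaf, max_leaf_nodes, min_samples_split,
+                        max_depth, random_state):
+            # A real extension would instantiate a custom Criterion/Splitter/Tree
+            # here; this sketch simply delegates to the default builder.
+            return super()._build_tree(
+                X, y, sample_weight, min_samples_leaf, min_weight_leaf,
+                max_leaf_nodes, min_samples_split, max_depth, random_state,
+            )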
+ +Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` +and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned in to e.g. 255 bins). This would not only save RAM since ``uint8`` storage of millions +of samples would result in many GB saved, but also improved runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more \ No newline at end of file diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5238cd1121d2e..db5b5d9414053 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccache" +CCACHE_LINKS_DIR="/tmp/ccachev2" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/doc/Makefile b/doc/Makefile index b56a1289cd581..c728bbbfd033e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -53,6 +53,8 @@ html: @echo @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable" +# rm $(BUILDDIR)/html/stable/index.html +# mv $(BUILDDIR)/html/stable/fork_index.html $(BUILDDIR)/html/stable/index.html html-noplot: $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable @echo diff --git a/doc/conf.py b/doc/conf.py index 52b084b331c8c..01e0a332dd54f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -103,7 +103,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index 789b0bab616ca..7fa12fd16d487 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -612,11 +614,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The run time cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space-complexity. + +Space-complexity and storing the OT pickled on disc is also a consideration. OTs +at every node need to store an additional vector of feature indices and vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters are the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations`` are different or special to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also possibly + lets the user to sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, therefore improving runtime + and storage complexity. + +Finally, when asking the question of when to use OTs vs DTs, scikit-learn recommends +always trying both model using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index 14f6506b5810f..0dcca718bc6f0 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index f5522600f623f..e39e39455b7bc --- a/setup.py +++ b/setup.py @@ -30,19 +30,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -170,11 +170,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -221,10 +221,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -306,7 +306,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -374,9 +374,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 19203da4fce1f..a3c29e4a269ce 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -40,6 +40,7 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause +from time import time from numbers import Integral, Real from warnings import catch_warnings, simplefilter, warn import threading @@ -72,10 +73,11 @@ class calls the ``fit`` method 
of each sub-estimator on random samples _check_sample_weight, _check_feature_names_in, ) +from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import _num_samples from ..utils._param_validation import Interval, StrOptions from ..utils._param_validation import RealNotInt - +from ._hist_gradient_boosting.binning import _BinMapper __all__ = [ "RandomForestClassifier", @@ -210,6 +212,10 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], } @abstractmethod @@ -228,6 +234,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -244,6 +251,7 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins def apply(self, X): """ @@ -263,6 +271,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -420,6 +437,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -628,6 +677,35 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. 
+ + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + def _accumulate_prediction(predict, X, out, lock): """ @@ -669,6 +747,7 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator=estimator, @@ -683,6 +762,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) @staticmethod @@ -856,6 +936,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -937,6 +1025,7 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, ): super().__init__( estimator, @@ -950,6 +1039,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, ) def predict(self, X): @@ -975,6 +1065,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1399,6 +1497,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1423,6 +1522,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -1734,6 +1834,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1757,6 +1858,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2084,6 +2186,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2108,6 +2211,7 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion @@ -2406,6 +2510,7 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2429,6 +2534,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, ) self.criterion = criterion diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 9bf0bb2becd9b..0150340f24bc6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -118,6 +118,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. 
+ + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1791,3 +1905,60 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. 
+ ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b175275ea92dc..bd54483bf2dfe 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -40,8 +40,8 @@ from ..utils._param_validation import Hidden, Interval, StrOptions from ..utils._param_validation import RealNotInt -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import DepthFirstTreeBuilder from ._tree import BestFirstTreeBuilder from ._tree import Tree @@ -174,7 +174,7 @@ def get_n_leaves(self): check_is_fitted(self) return self.tree_.n_leaves - def fit(self, X, y, sample_weight=None, check_input=True): + def fit(self, X, y=None, sample_weight=None, check_input=True): self._validate_params() random_state = check_random_state(self.random_state) @@ -184,9 +184,12 @@ def fit(self, X, y, sample_weight=None, check_input=True): # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) if issparse(X): X.sort_indices() @@ -195,7 +198,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -209,45 +212,56 @@ def fit(self, X, y, sample_weight=None, check_input=True): # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) + + y = np.atleast_1d(y) + expanded_class_weight = None - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. 
+ y = np.reshape(y, (-1, 1)) - self.n_outputs_ = y.shape[1] + self.n_outputs_ = y.shape[1] - if is_classification: - check_classification_targets(y) - y = np.copy(y) + if is_classification: + check_classification_targets(y) + y = np.copy(y) - self.classes_ = [] - self.n_classes_ = [] + self.classes_ = [] + self.n_classes_ = [] - if self.class_weight is not None: - y_original = np.copy(y) + if self.class_weight is not None: + y_original = np.copy(y) - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -299,16 +313,10 @@ def fit(self, X, y, sample_weight=None, check_input=True): max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -320,10 +328,63 @@ def fit(self, X, y, sample_weight=None, check_input=True): else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. 
+ """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -337,7 +398,7 @@ def fit(self, X, y, sample_weight=None, check_input=True): SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -385,8 +446,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -817,7 +876,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -1173,7 +1235,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 47f616c6bad50..2e179e78e8c3f 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,6 +4,8 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -15,13 +17,11 @@ from ._tree cimport SIZE_t # Type for indices and counters from ._tree cimport INT32_t # Signed 32 bit integer from ._tree cimport UINT32_t # Unsigned 32 bit integer -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y @@ -37,19 +37,7 @@ cdef class Criterion: cdef double weighted_n_left # Weighted number of samples in the left node cdef double weighted_n_right # Weighted number of samples in the right node - # The criterion object is maintained such that left and right collected - # statistics correspond to samples[start:pos] and samples[pos:end]. - - # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil + # Core methods that criterion class _must_ implement. 
cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -71,6 +59,25 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -88,4 +95,4 @@ cdef class RegressionCriterion(Criterion): cdef double[::1] sum_total # The sum of w*y. cdef double[::1] sum_left # Same as above, but for the left side of the split - cdef double[::1] sum_right # Same as above, but for the right side of the split + cdef double[::1] sum_right # Same as above, but for the right side of the split \ No newline at end of file diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 7cd7bbb0e3c1b..c94914daa0e0b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -9,6 +9,8 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause @@ -29,11 +31,20 @@ from ._utils cimport WeightedMedianCalculator # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. - +cdef class BaseCriterion: + """This is an abstract interface for criterion. For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -41,61 +52,23 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. - Parameters ---------- new_pos : SIZE_t @@ -105,7 +78,6 @@ cdef class Criterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. - Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -116,11 +88,9 @@ cdef class Criterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. - Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. - Parameters ---------- impurity_left : double pointer @@ -134,10 +104,8 @@ cdef class Criterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. - Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. - Parameters ---------- dest : double pointer @@ -147,12 +115,10 @@ cdef class Criterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -167,28 +133,21 @@ cdef class Criterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. - This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: - N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) - where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, - Parameters ---------- impurity_parent : double The initial impurity of the parent node before the split - impurity_left : double The impurity of the left child - impurity_right : double The impurity of the right child - Return ------ double : improvement in impurity after the split occurs @@ -199,6 +158,61 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. 
+ The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. + end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics + such as the mean in regression and class probabilities in classification. + Instances of this class are responsible for compute splits' impurity difference + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" @@ -206,7 +220,6 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. - Parameters ---------- n_outputs : SIZE_t @@ -254,18 +267,11 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -277,18 +283,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -301,12 +313,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -317,11 +329,9 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -338,7 +348,6 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -355,10 +364,8 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. - Parameters ---------- new_pos : SIZE_t @@ -428,7 +435,6 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. - Parameters ---------- dest : double pointer @@ -443,23 +449,17 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. - This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The cross-entropy is then defined as - cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -481,10 +481,8 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). - Parameters ---------- impurity_left : double pointer @@ -516,24 +514,18 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. 
- This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. - The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -559,10 +551,8 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. - Parameters ---------- impurity_left : double pointer @@ -601,24 +591,20 @@ cdef class Gini(ClassificationCriterion): cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. - This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -648,23 +634,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. cdef SIZE_t i @@ -673,14 +665,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -692,7 +684,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start.""" @@ -785,13 +776,11 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. 
- MSE = var_left + var_right """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -807,22 +796,16 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The MSE proxy is derived from - sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 - Neglecting constant terms, this gives: - - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -839,7 +822,6 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -883,7 +865,6 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. - MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -895,12 +876,10 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. - Parameters ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -933,26 +912,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. 
cdef void** left_child = self.left_child_ptr @@ -963,10 +946,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -981,11 +964,9 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1016,7 +997,6 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1044,7 +1024,6 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1107,7 +1086,6 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1132,7 +1110,6 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1179,21 +1156,17 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. - Uses the formula (35) in Friedman's original Gradient Boosting paper: - diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1234,9 +1207,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. - Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): @@ -1255,7 +1226,6 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. - Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. 
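The closed-form scores referenced in the FriedmanMSE and Poisson docstrings above are straightforward to reproduce in NumPy. A rough, unweighted sketch for intuition only; the Cython criteria operate on weighted, per-output sums:

```python
import numpy as np


def friedman_improvement(y_left, y_right):
    # Formula (35) from Friedman's gradient boosting paper, as used by FriedmanMSE:
    # improvement = n_left * n_right * (mean_left - mean_right)**2 / (n_left + n_right)
    n_l, n_r = len(y_left), len(y_right)
    diff = np.mean(y_left) - np.mean(y_right)
    return n_l * n_r * diff ** 2 / (n_l + n_r)


def poisson_half_deviance(y_true, y_pred):
    # Half Poisson deviance (the factor 2 is skipped, as in the criterion),
    # assuming strictly positive y_true for simplicity.
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    return np.mean(y_true * np.log(y_true / y_pred) + y_pred - y_true)
```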
@@ -1265,24 +1235,18 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. - This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. - The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. - The Poisson proxy is derived from: - sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) - Neglecting constant terms, this gives - - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ @@ -1312,7 +1276,6 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. - i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 13fec5974c3c5..b0207ab0a715d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,12 +4,14 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from ._criterion cimport BaseCriterion, Criterion from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight @@ -28,14 +30,15 @@ cdef struct SplitRecord: double impurity_left # Impurity of the left split. double impurity_right # Impurity of the right split. -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -54,7 +57,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -74,27 +76,38 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight + ) except -1 + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 83a80d90cc1b9..17a747433d1a8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -8,7 +8,10 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # + # License: BSD 3 clause from ._criterion cimport Criterion @@ -43,16 +46,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.threshold = 0. self.improvement = -INFINITY -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. + + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. 
+ + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -75,7 +140,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -86,11 +150,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass def __reduce__(self): return (type(self), (self.criterion, @@ -127,7 +186,6 @@ cdef class Splitter: are assumed to have uniform weight. This is represented as a Cython memoryview. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -165,6 +223,19 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -187,30 +258,11 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. - """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" @@ -221,6 +273,41 @@ cdef class Splitter: return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + + if (((current_split.pos - self.start) < min_samples_leaf) or + ((self.end - current_split.pos) < min_samples_leaf)): + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. 
The alternative would have been to use inheritance-based polymorphism @@ -229,7 +316,7 @@ cdef class Splitter: ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -349,15 +436,13 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -645,8 +730,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split) == 1: continue # Evaluate split @@ -656,8 +740,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 1966651d8c89a..8140733a9fc26 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -13,6 +13,8 @@ import numpy as np cimport numpy as cnp +from libcpp.vector cimport vector + ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight ctypedef cnp.npy_intp SIZE_t # Type for indices and counters @@ -33,40 +35,32 @@ cdef struct Node: SIZE_t n_node_samples # Number of samples at the node DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node - -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. - - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. 
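The `check_presplit_conditions` / `check_postsplit_conditions` hooks used in the best- and random-split loops above encapsulate the old inline `min_samples_leaf` / `min_weight_leaf` rejection tests, so subclasses can add their own stopping rules. In plain Python the default checks amount to the following (illustrative only; the sign convention is inverted here so the functions return True when the split is acceptable):

```python
def presplit_ok(pos, start, end, min_samples_leaf):
    # Default check_presplit_conditions: both children keep >= min_samples_leaf samples.
    return (pos - start) >= min_samples_leaf and (end - pos) >= min_samples_leaf


def postsplit_ok(weighted_n_left, weighted_n_right, min_weight_leaf):
    # Default check_postsplit_conditions: both children carry >= min_weight_leaf weight.
    return weighted_n_left >= min_weight_leaf and weighted_n_right >= min_weight_leaf
```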
cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -78,6 +72,49 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -91,8 +128,7 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. 
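The `_compute_feature` hook declared above is the main extension point for non-axis-aligned trees: the default implementation reads a single column of X, while an oblique-tree subclass could return a linear projection of the row instead. A rough Python rendering of the two behaviours (the oblique variant is hypothetical; in such a subclass the projection weights would be stored per node):

```python
import numpy as np


def compute_feature_axis_aligned(X, sample_index, feature):
    # Default BaseTree._compute_feature: the raw value of one feature column.
    return X[sample_index, feature]


def compute_feature_oblique(X, sample_index, proj_weights):
    # Hypothetical oblique override: a linear combination of the feature values.
    return np.dot(X[sample_index, :], proj_weights)
```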
- cdef Splitter splitter # Splitting algorithm - + cdef Splitter splitter cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 75eed058bfd4e..e5b759aee23df 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -22,6 +22,8 @@ from libcpp.vector cimport vector from libcpp.algorithm cimport pop_heap from libcpp.algorithm cimport push_heap from libcpp cimport bool +from cython.operator cimport dereference as deref +from libc.stdlib cimport malloc, free import struct @@ -83,6 +85,7 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= + cdef class TreeBuilder: """Interface for different tree building strategies.""" @@ -196,9 +199,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -248,7 +253,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -256,8 +266,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: @@ -297,6 +307,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -462,6 +476,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -483,7 +499,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -493,7 +513,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, + split_ptr, impurity, n_node_samples, weighted_n_node_samples) if node_id == INTPTR_MAX: return -1 @@ -522,7 +542,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + + free(split_ptr) return 0 @@ -530,190 +551,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. - - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. + + Downstream classes must implement """ - # Wrap for outside world. 
- # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - property n_classes: - def __get__(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - property children_left: - def __get__(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - property children_right: - def __get__(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - property n_leaves: - def __get__(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - property feature: - def __get__(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - property threshold: - def __get__(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - property impurity: - def __get__(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - property n_node_samples: - def __get__(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - property weighted_n_node_samples: - def __get__(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - property value: - def __get__(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - nodes = 
memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -725,7 +571,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -756,14 +605,87 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples) except -1 nogil: - """Add a node to the tree. + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples + ) except -1 nogil: + """Add a node to the tree. The new node registers itself as the child of its parent. - + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
""" cdef SIZE_t node_id = self.node_count @@ -784,28 +706,18 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -835,13 +747,20 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature value + cdef DTYPE_t feature_value = 0 + with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - if X_ndarray[i, node.feature] <= node.threshold: + + # compute the feature value to compare against threshold + feature_value = self._compute_feature(X_ndarray, i, node) + if feature_value <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -902,7 +821,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -951,6 +869,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -962,7 +883,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1091,8 +1014,6 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count @@ -1105,13 +1026,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] - - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) + self._compute_feature_importances( + importances, node) + node += 1 for i in range(self.n_features): @@ -1127,44 +1044,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. - - The array keeps a reference to this Tree, which manages the underlying - memory. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. 
+ + Wrapped in a private function to allow subclassing that + computes feature importances. """ - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1273,6 +1173,237 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. 
+ + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. + """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + property n_classes: + def __get__(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + property children_left: + def __get__(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + property children_right: + def __get__(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + property n_leaves: + def __get__(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + property feature: + def __get__(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + property threshold: + def __get__(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + property impurity: + def __get__(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + property n_node_samples: + def __get__(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + property weighted_n_node_samples: + def __get__(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + property value: + def __get__(self): + return self._get_value_ndarray()[:self.node_count] + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + 
expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + nodes = memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + value = memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. + """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1755,6 +1886,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1771,8 +1904,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 1f3a9bf394b9b..69f948839259a 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=1) + clf = Tree(random_state=0, max_features=X.shape[1]) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=3, + n_informative=4, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 3, "Failed with {0}".format(name) + assert n_important == 4, "Failed with {0}".format(name) # Check on iris 
that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -def test_importances_raises(): +@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) +def test_importances_raises(clf): # Check if variable importance before fit raises ValueError. - clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,6 +653,7 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) + # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -677,7 +678,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - + rng = np.random.RandomState(42) weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -828,7 +829,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -918,6 +919,7 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ + # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1031,7 +1033,6 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) - # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1052,6 +1053,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1062,11 +1068,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - def test_sample_weight(): # Check sample weighting. 
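The core structural change in this first patch is that node construction now goes through ``_set_leaf_node``/``_set_split_node`` and every traversal compares the threshold against ``self._compute_feature(X_ndarray, i, node)`` instead of reading ``X[i, node.feature]`` directly, so a subclass can redefine what the per-node feature value means without duplicating ``apply``/``decision_path``/``compute_feature_importances``. A rough pure-Python sketch of that idea (the class and function names below are illustrative only, not the fork's actual Cython API)::

    import numpy as np

    class SplitNode:
        """Axis-aligned split: compares one column of x against a threshold."""
        def __init__(self, feature, threshold, left, right):
            self.feature, self.threshold = feature, threshold
            self.left, self.right = left, right

        def compute_feature(self, x):
            return x[self.feature]

    class ProjectionSplitNode(SplitNode):
        """Generalized split: compares a linear combination of columns."""
        def __init__(self, weights, threshold, left, right):
            super().__init__(feature=None, threshold=threshold, left=left, right=right)
            self.weights = np.asarray(weights)

        def compute_feature(self, x):
            return float(self.weights @ np.asarray(x))

    def apply_one(node, x):
        """Shared traversal loop; only compute_feature differs between node types."""
        while isinstance(node, SplitNode):
            node = node.left if node.compute_feature(x) <= node.threshold else node.right
        return node  # a leaf label

    root = ProjectionSplitNode(weights=[0.5, 0.5], threshold=1.0,
                               left="leaf_L", right="leaf_R")
    print(apply_one(root, [0.4, 0.4]))  # leaf_L
    print(apply_one(root, [1.2, 1.4]))  # leaf_R

The Cython hooks play the same role for the real ``Tree``: the traversal, importance, and decision-path code stays shared, and only the feature computation differs between the axis-aligned tree and a generalized one.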
@@ -1260,7 +1261,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if "ExtraTree" not in name: + if all(_name not in name for _name in ["ExtraTree"]): est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1586,6 +1587,7 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) + # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1644,6 +1646,7 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] + assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1930,6 +1933,7 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) + if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 475bd05f779a4be4f301f751ac86ba6a998a219a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 29 Mar 2023 09:41:10 -0700 Subject: [PATCH 02/28] Docs (#39) #### Reference Issues/PRs Fixes README and wheel building --------- Signed-off-by: Adam Li --- README.rst | 36 ++++++++++++--------- build_tools/azure/install.sh | 2 +- build_tools/github/repair_windows_wheels.sh | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index fbdfdaa95ef4c..7a7bd41c42846 100644 --- a/README.rst +++ b/README.rst @@ -44,6 +44,10 @@ .. |PytestMinVersion| replace:: 5.3.1 .. |PlotlyMinVersion| replace:: 5.10.0 +================= +Scikit-learn-tree +================= + ``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -94,8 +98,7 @@ Installing scikit-learn-tree ============================ Scikit-learn-tree is a maintained fork of scikit-learn, which extends the -tree submodule in a few ways documented in :ref:`changelog of the fork -`. +tree submodule in a few ways documented in `fork_changelog`_. We release versions of scikit-learn-tree in an analagous fashion to scikit-learn main. Due to maintenance resources, we only release on PyPi @@ -103,12 +106,11 @@ and recommend therefore installing with ``pip``. There are different ways to install scikit-learn-tree: - * :ref:`Install the latest official release `. This + * Install the latest official release `install_fork_release`_. This is the best approach for most users. It will provide a stable version and pre-built packages are available for most platforms. - * :ref:`Building the package from source - `. This is best for users who want the + * Building the package from source `install_source`_. This is best for users who want the latest-and-greatest features and aren't afraid of running brand-new code. This is also needed for users who wish to contribute to the project. @@ -119,9 +121,7 @@ Installing the latest release ----------------------------- We release wheels for common distributions and this is thus installable via pip. -.. 
prompt:: bash $ - - pip install scikit-learn-tree + pip install scikit-learn-tree This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which then can be used as a stand-in for any package that relies on the public API of ``sklearn``. @@ -146,9 +146,11 @@ features to the fork, the building from source instructions are exactly the same as that of scikit-learn main, so please refer to `scikit-learn documentation `_ for instructions on building from source. -Development =========== +Development +----------- + We welcome new contributors of all experience levels, specifically to maintain the fork. Any contributions that make sure our fork is "better in-line" with scikit-learn upstream, or improves the tree submodule in anyway will be appreciated. @@ -158,15 +160,17 @@ The scikit-learn community goals are to be helpful, welcoming, and effective. Th has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -.. _fork-changelog: -Major Changes of the Fork ========================= +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + The purpose of this page is to illustrate some of the main features that ``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes a an understanding of core package ``scikit-learn`` and also decision trees -models. Please refer to our :ref:`installation instructions -` for installing ``scikit-learn-tree``. +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. Scikit-learn-tree though operates as a stand-in for upstream ``scikit-learn``. It is used in packages exactly the same way and will support all features @@ -193,7 +197,7 @@ Candidate changes and PRs accepted into the fork are those that: Decision tree generalizations ----------------------------- -``Scikit-learn`` provides an axis-aligned :class:`~sklearn.tree.DecisionTreeClassifier` +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ decision tree model (classifier and regressor), which has a few fundamental limitations that prevent 3rd parties from utilizing the existing class, without forking a large amount of copy/pasted Python and Cython code. We highlight those limitations here @@ -239,8 +243,8 @@ Python API: random forests and their variants to scale to millions of samples. - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class, and all its subclasses. The default behavior is no binning. The current implementation is not necessarily efficient. There are several improvements to be made. See below. -Overall, the existing tree models, such as :class:`~sklearn.tree.DecisionTreeClassifier` -and :class:`~sklearn.ensemble.RandomForestClassifier` all work exactly the same as they +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_ all work exactly the same as they would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend the Cython/Python API easily. 
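Because the wheel installs under the ``sklearn`` namespace, downstream code needs no changes; only the install name differs. A minimal sanity check of the stand-in behaviour, together with the forest-level ``max_bins`` extension described above (values here are illustrative; ``max_bins`` defaults to ``None``, meaning no binning)::

    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    X, y = load_iris(return_X_y=True)

    # Imports and estimator APIs are identical to upstream scikit-learn.
    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(clf.score(X, y))

    # Fork extension: optional feature binning on forests.
    # 255 is only an example bin count, not a recommended setting.
    rf = RandomForestClassifier(n_estimators=10, max_bins=255, random_state=0).fit(X, y)
    print(rf.score(X, y))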
diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index db5b5d9414053..5238cd1121d2e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -7,7 +7,7 @@ set -x source build_tools/shared.sh UNAMESTR=`uname` -CCACHE_LINKS_DIR="/tmp/ccachev2" +CCACHE_LINKS_DIR="/tmp/ccache" setup_ccache() { CCACHE_BIN=`which ccache || echo ""` diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. # In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" From 706a74273bf736066b1d71eeed9da08c0943e311 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 4 Apr 2023 14:47:24 -0700 Subject: [PATCH 03/28] Release v1.2.2 #### Reference Issues/PRs #### What does this implement/fix? Explain your changes. #### Any other comments? --------- Signed-off-by: Adam Li --- .github/workflows/check-upstream.yml | 27 +++++++++++++++++++++++++++ sklearn/__init__.py | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/check-upstream.yml diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 47bb893bd00a0..6d5af7c771fb8 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.3.dev0" +__version__ = "1.2.2" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From a22db039704399a31d466be861f2b5a86bbc51b3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 11 Apr 2023 15:25:44 -0400 Subject: [PATCH 04/28] Update README Signed-off-by: Adam Li --- README.rst | 4 ++-- sklearn/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7a7bd41c42846..444ead93017b9 100644 --- a/README.rst +++ b/README.rst @@ -48,7 +48,7 @@ Scikit-learn-tree ================= -``scikit-learn-tree`` is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line +``scikit-learn-tree`` is an alias of scikit-learn. 
It is a maintained fork of scikit-learn, which advances the tree submodule, while staying in-line with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is released under the name ``scikit-learn-tree`` to avoid confusion. @@ -85,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 6d5af7c771fb8..4d7badd6b678e 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -39,7 +39,7 @@ # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. # 'X.Y.dev0' is the canonical version of 'X.Y.dev' # -__version__ = "1.2.2" +__version__ = "1.3.0dev0" # On OSX, we can get a runtime error due to multiple OpenMP libraries loaded From 9c5321daa396e0fd01cc6e582a5dfcc8ccb1afe5 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:09:58 -0400 Subject: [PATCH 05/28] Adding working submodule Signed-off-by: Adam Li --- sklearn/tree/_tree.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 23b999d76326e..f4a1a80123d26 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1022,6 +1022,7 @@ cdef class BaseTree: cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) From f82f2582c0c5e347fd9a6109129c3ae7853b0593 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:40:52 -0400 Subject: [PATCH 06/28] Merged main Signed-off-by: Adam Li --- sklearn/tree/_splitter.pxd | 2 ++ sklearn/tree/_splitter.pyx | 4 ++-- sklearn/tree/_tree.pyx | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 3419c6fa08819..01975df22ef23 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -110,6 +110,8 @@ cdef class Splitter(BaseSplitter): cdef bint check_presplit_conditions( self, SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, ) noexcept nogil cdef bint check_postsplit_conditions( self diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c8df3de1bb900..ae6cd772e37f7 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -505,7 +505,7 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue criterion.update(current_split.pos) @@ -834,7 +834,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if splitter.check_presplit_conditions(current_split) == 1: + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index c8248ed65c36b..33a2a8308de5f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -764,7 +764,7 @@ cdef class BaseTree: # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_features = self._compute_feature(X_ndarray, i, node) + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... 
and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: From 7e38502806e954d9b3084f8a5e22602556236fe4 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:42:44 -0400 Subject: [PATCH 07/28] Successful merge with the missing value support Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 2 ++ sklearn/tree/tests/test_tree.py | 32 ++++++++++++++------------------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 638c51f1101bc..21fa5b7c200b2 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -388,6 +388,7 @@ def _fit( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -403,6 +404,7 @@ def _build_tree( X, y, sample_weight, + feature_has_missing, min_samples_leaf, min_weight_leaf, max_leaf_nodes, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6be168e4c8e7c..eefae6cdaa3f6 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -300,7 +300,7 @@ def test_xor(): clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) - clf = Tree(random_state=0, max_features=X.shape[1]) + clf = Tree(random_state=0, max_features=1) clf.fit(X, y) assert clf.score(X, y) == 1.0, "Failed with {0}".format(name) @@ -440,7 +440,7 @@ def test_importances(): X, y = datasets.make_classification( n_samples=5000, n_features=10, - n_informative=4, + n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, @@ -455,7 +455,7 @@ def test_importances(): n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10, "Failed with {0}".format(name) - assert n_important == 4, "Failed with {0}".format(name) + assert n_important == 3, "Failed with {0}".format(name) # Check on iris that importances are the same for all builders clf = DecisionTreeClassifier(random_state=0) @@ -466,9 +466,9 @@ def test_importances(): assert_array_equal(clf.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize("clf", [DecisionTreeClassifier()]) -def test_importances_raises(clf): +def test_importances_raises(): # Check if variable importance before fit raises ValueError. 
+ clf = DecisionTreeClassifier() with pytest.raises(ValueError): getattr(clf, "feature_importances_") @@ -653,7 +653,6 @@ def test_min_samples_leaf(): est.fit(X, y) out = est.tree_.apply(X) node_counts = np.bincount(out) - # drop inner nodes leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) @@ -678,7 +677,7 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): else: X = DATASETS[datasets]["X"].astype(np.float32) y = DATASETS[datasets]["y"] - rng = np.random.RandomState(42) + weights = rng.rand(X.shape[0]) total_weight = np.sum(weights) @@ -829,7 +828,7 @@ def test_min_impurity_decrease(): ) # Check with a much lower value of 0.0001 est3 = TreeEstimator( - max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=1 + max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0 ) # Check with a much lower value of 0.1 est4 = TreeEstimator( @@ -919,7 +918,6 @@ def test_pickle(): est2 = pickle.loads(serialized_object) assert type(est2) == est.__class__ - # score should match before/after pickling score2 = est2.score(X, y) assert ( score == score2 @@ -1033,6 +1031,7 @@ def test_memory_layout(): ALL_TREES.items(), [np.float64, np.float32] ): est = TreeEstimator(random_state=0) + # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target @@ -1053,11 +1052,6 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) - # Strided - X = np.asarray(iris.data[::3], dtype=dtype) - y = iris.target[::3] - assert_array_equal(est.fit(X, y).predict(X), y) - # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target @@ -1068,6 +1062,11 @@ def test_memory_layout(): y = iris.target assert_array_equal(est.fit(X, y).predict(X), y) + # Strided + X = np.asarray(iris.data[::3], dtype=dtype) + y = iris.target[::3] + assert_array_equal(est.fit(X, y).predict(X), y) + def test_sample_weight(): # Check sample weighting. 
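The ``feature_has_missing`` argument threaded through ``_fit``/``_build_tree`` and the ``isnan``/``missing_go_to_left`` branch in the traversal above come from merging upstream's missing-value support for trees. Assuming that behaviour carries over to the fork unchanged, the user-facing effect is simply that dense ``X`` may contain ``NaN`` at both fit and predict time::

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    X = np.array([[0.0], [1.0], [np.nan], [3.0], [4.0]])
    y = np.array([0, 0, 1, 1, 1])

    # Missing values are routed down one side of each split
    # (missing_go_to_left), chosen during fitting rather than raising.
    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    print(clf.predict([[np.nan], [0.5]]))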
@@ -1261,7 +1260,7 @@ def test_behaviour_constant_feature_after_splits(): y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3] for name, TreeEstimator in ALL_TREES.items(): # do not check extra random trees - if all(_name not in name for _name in ["ExtraTree"]): + if "ExtraTree" not in name: est = TreeEstimator(random_state=0, max_features=1) est.fit(X, y) assert est.tree_.max_depth == 2 @@ -1587,7 +1586,6 @@ def check_min_weight_leaf_split_level(name): sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2] _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight) - # skip for sparse inputs _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight) @@ -1646,7 +1644,6 @@ def check_decision_path(name): # Assert that leaves index are correct leaves = est.apply(X) leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)] - assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) # Ensure only one leave node per sample @@ -1933,7 +1930,6 @@ def assert_is_subtree(tree, subtree): def test_apply_path_readonly_all_trees(name, splitter, X_format): dataset = DATASETS["clf_small"] X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False) - if X_format == "dense": X_readonly = create_memmap_backed_data(X_small) else: From 34a562130d9c92b083b6da99c27a12a7623226b7 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 10:53:07 -0400 Subject: [PATCH 08/28] Add cyton headers Signed-off-by: Adam Li --- sklearn/tree/_criterion.pyx | 3 +++ sklearn/tree/_splitter.pyx | 3 +++ sklearn/tree/_tree.pyx | 3 +++ sklearn/tree/_utils.pyx | 3 +++ 4 files changed, 12 insertions(+) diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 9c59e75fedb10..8fbcafcaf1456 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index ae6cd772e37f7..a58514d093ddf 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 33a2a8308de5f..2256b28c7df10 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..0a7522bcf4255 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly From f35c758189c8d38bfed56071b8c9a6cbbd39056f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:04:19 -0400 Subject: [PATCH 09/28] Fix imports to be absolute Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 32 ++++++++++++++++---------------- sklearn/tree/_export.py | 11 ++++++++--- sklearn/tree/_utils.pxd | 2 +- sklearn/tree/_utils.pyx | 2 +- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py 
index 21fa5b7c200b2..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -25,22 +25,22 @@ import numpy as np from scipy.sparse import issparse -from ..base import BaseEstimator -from ..base import ClassifierMixin -from ..base import clone -from ..base import RegressorMixin -from ..base import is_classifier -from ..base import MultiOutputMixin -from ..utils import Bunch -from ..utils import check_random_state -from ..utils.validation import _check_sample_weight -from ..utils.validation import assert_all_finite -from ..utils.validation import _assert_all_finite_element_wise -from ..utils import compute_sample_weight -from ..utils.multiclass import check_classification_targets -from ..utils.validation import check_is_fitted -from ..utils._param_validation import Hidden, Interval, StrOptions -from ..utils._param_validation import RealNotInt +from sklearn.base import BaseEstimator +from sklearn.base import ClassifierMixin +from sklearn.base import clone +from sklearn.base import RegressorMixin +from sklearn.base import is_classifier +from sklearn.base import MultiOutputMixin +from sklearn.utils import Bunch +from sklearn.utils import check_random_state +from sklearn.utils.validation import _check_sample_weight +from sklearn.utils.validation import assert_all_finite +from sklearn.utils.validation import _assert_all_finite_element_wise +from sklearn.utils import compute_sample_weight +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import check_is_fitted +from sklearn.utils._param_validation import Hidden, Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt from ._criterion import BaseCriterion from ._splitter import BaseSplitter diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index e8dbe51138223..be545de0202d0 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -16,10 +16,15 @@ import numpy as np -from ..utils.validation import check_is_fitted, check_array -from ..utils._param_validation import Interval, validate_params, StrOptions, HasMethods +from sklearn.utils.validation import check_is_fitted, check_array +from sklearn.utils._param_validation import ( + Interval, + validate_params, + StrOptions, + HasMethods, +) -from ..base import is_classifier +from sklearn.base import is_classifier from . import _criterion from . 
import _tree diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..f7bae4c5c8553 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -10,7 +10,7 @@ cimport numpy as cnp from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell +from sklearn.neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 0a7522bcf4255..bc7e17f8766d8 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -19,7 +19,7 @@ import numpy as np cimport numpy as cnp cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions From 45320b4d3ef05b4ccbe81e8c13676b1c755d1973 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 8 Jun 2023 14:17:25 -0400 Subject: [PATCH 10/28] Fix forest import Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4cc672bb6884d..4d9bf862bd806 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,11 +50,16 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from ..base import is_classifier -from ..base import ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin +from sklearn.base import is_classifier +from sklearn.base import ( + ClassifierMixin, + MultiOutputMixin, + RegressorMixin, + TransformerMixin, +) -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -63,21 +68,21 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DTYPE, DOUBLE -from ..utils import check_random_state, compute_sample_weight -from ..exceptions import DataConversionWarning -from ._base import BaseEnsemble, _partition_estimators -from ..utils.parallel import delayed, Parallel -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.validation import ( +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.exceptions import DataConversionWarning +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.utils.parallel import delayed, Parallel +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.validation import ( check_is_fitted, _check_sample_weight, _check_feature_names_in, ) -from ..utils._openmp_helpers import _openmp_effective_n_threads -from ..utils.validation import _num_samples -from ..utils._param_validation import Interval, StrOptions -from ..utils._param_validation import RealNotInt -from ._hist_gradient_boosting.binning import _BinMapper +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils.validation import _num_samples +from sklearn.utils._param_validation import Interval, StrOptions +from sklearn.utils._param_validation import RealNotInt +from sklearn.ensemble._hist_gradient_boosting.binning 
import _BinMapper __all__ = [ "RandomForestClassifier", From 49526f026c46727aa272be7bdd7a44d0101c089f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:19:07 -0400 Subject: [PATCH 11/28] Fix classes and criterion Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ++++++++++++++++++++++++++++++++++++ sklearn/tree/_criterion.pxd | 11 ++++-- sklearn/tree/_criterion.pyx | 68 +++++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..795c68c8b5081 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,6 +713,73 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() + def _get_y_for_leaves(self, X, sample_weight=None): + n_samples = X.shape[0] + + # get the predictions + X_leaves = self.apply(X) + + bootstrap_indices = np.empty(shape, dtype=np.int64) + for i, estimator in enumerate(self.estimators_): + # Get bootstrap indices. + if self.bootstrap: + n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) + bootstrap_indices[:, i] = _generate_sample_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) + else: + bootstrap_indices[:, i] = np.arange(n_samples) + + # Get predictions on bootstrap indices. + X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] + + if sorter is not None: + # Reassign bootstrap indices to account for target sorting. + bootstrap_indices = np.argsort(sorter)[bootstrap_indices] + + bootstrap_indices += 1 # for sparse matrix (0s as empty) + + # Get the maximum number of nodes (internal + leaves) across trees. + # Get the maximum number of samples per leaf across trees (if needed). + max_node_count = 0 + max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf + for i, estimator in enumerate(self.estimators_): + node_count = estimator.tree_.node_count + if node_count > max_node_count: + max_node_count = node_count + if not leaf_subsample: + sample_count = np.max(np.bincount(X_leaves[:, i])) + if sample_count > max_samples_leaf: + max_samples_leaf = sample_count + + # Initialize NumPy array (more efficient serialization than dict/list). + shape = (self.n_estimators, max_node_count, max_samples_leaf) + y_train_leaves = np.zeros(shape, dtype=np.int64) + + for i, estimator in enumerate(self.estimators_): + # Group training indices by leaf node. + leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) + + if leaf_subsample: + random.seed(estimator.random_state) + + # Map each leaf node to its list of training indices. + for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): + y_indices = bootstrap_indices[:, i][leaf_values] + + if sample_weight is not None: + y_indices = y_indices[sample_weight[y_indices - 1] > 0] + + # Subsample leaf training indices (without replacement). + if leaf_subsample and max_samples_leaf < len(y_indices): + if not isinstance(y_indices, list): + y_indices = list(y_indices) + y_indices = random.sample(y_indices, max_samples_leaf) + + y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices + + return y_train_leaves + # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 6cfc33c5bdcea..d72f22f8b348d 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,6 +11,8 @@ # See _criterion.pyx for implementation details. 
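``_get_y_for_leaves`` builds, per estimator, a padded array mapping every leaf node to the training indices that fall into it, shifted by +1 so that 0 can mean "empty slot", optionally bootstrap-resampled and subsampled per leaf. A rough NumPy sketch of the per-tree grouping step, using only public scikit-learn calls rather than the helper itself::

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    tree = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)

    leaves = tree.apply(X)                        # leaf id of each training sample
    max_samples_leaf = np.max(np.bincount(leaves))
    n_nodes = tree.tree_.node_count

    # One row per node, zero-padded; stored indices are 1-based so that
    # 0 marks an empty slot, matching the sparse-friendly layout in the patch.
    y_train_leaves = np.zeros((n_nodes, max_samples_leaf), dtype=np.int64)
    for leaf_id in np.unique(leaves):
        idx = np.flatnonzero(leaves == leaf_id) + 1
        y_train_leaves[leaf_id, : len(idx)] = idx

    print(y_train_leaves.shape)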
+# from libcpp.vector cimport vector + from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight from ._tree cimport SIZE_t # Type for indices and counters @@ -19,7 +21,7 @@ from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef class BaseCriterion: - """Abstract interface for criterion.""" + """Abstract interface for criterion.""" # Internal structures cdef const DOUBLE_t[:] sample_weight # Sample weights @@ -70,13 +72,18 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil + # cdef void node_samples( + # self, + # vector[vector[DOUBLE_t]]* dest + # ) noexcept nogil + cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" cdef const DOUBLE_t[:, ::1] y # Values of y cdef SIZE_t n_missing # Number of missing values for the feature being evaluated cdef bint missing_go_to_left # Whether missing values go to the left node - + cdef int init( self, const DOUBLE_t[:, ::1] y, diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 8fbcafcaf1456..e9c02ab2fa43d 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -39,10 +39,13 @@ cdef class BaseCriterion: covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage scikit-learn's Cython code for criteria. + The downstream classes _must_ implement methods to compute the impurity in current node and in children nodes. + This object stores methods on how to calculate how good a split is using a set API. + Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: - the samples of left child node are stored in `samples[start:pos]` @@ -56,21 +59,25 @@ cdef class BaseCriterion: cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + This method must be implemented by the subclass. """ pass cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + This method must be implemented by the subclass. """ pass cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + This updates the collected statistics by moving sample_indices[pos:new_pos] from the right child to the left child. It must be implemented by the subclass. + Parameters ---------- new_pos : SIZE_t @@ -80,6 +87,7 @@ cdef class BaseCriterion: cdef double node_impurity(self) noexcept nogil: """Placeholder for calculating the impurity of the node. + Placeholder for a method which will evaluate the impurity of the current node, i.e. the impurity of sample_indices[start:end]. This is the primary function of the criterion class. The smaller the impurity the @@ -90,9 +98,11 @@ cdef class BaseCriterion: cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Placeholder for calculating the impurity of children. + Placeholder for a method which evaluates the impurity in children nodes, i.e. the impurity of sample_indices[start:pos] + the impurity of sample_indices[pos:end]. + Parameters ---------- impurity_left : double pointer @@ -106,8 +116,10 @@ cdef class BaseCriterion: cdef void node_value(self, double* dest) noexcept nogil: """Placeholder for storing the node value. + Placeholder for a method which will compute the node value of sample_indices[start:end] and save the value into dest. 
+ Parameters ---------- dest : double pointer @@ -117,10 +129,12 @@ cdef class BaseCriterion: cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -135,6 +149,7 @@ cdef class BaseCriterion: double impurity_left, double impurity_right) noexcept nogil: """Compute the improvement in impurity. + This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity @@ -142,6 +157,7 @@ cdef class BaseCriterion: where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, + Parameters ---------- impurity_parent : double @@ -150,6 +166,7 @@ cdef class BaseCriterion: The impurity of the left child impurity_right : double The impurity of the right child + Return ------ double : improvement in impurity after the split occurs @@ -166,10 +183,12 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. + The dataset array that we compute criteria on is assumed to consist of 'N' ordered samples or rows (i.e. sorted). Since we pass this by reference, we use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. + Parameters ---------- start : SIZE_t @@ -182,11 +201,13 @@ cdef class BaseCriterion: cdef class Criterion(BaseCriterion): """Interface for impurity criteria. + The supervised criterion computes the impurity of a node and the reduction of impurity of a split on that node using the distribution of labels in parent and - children nodes. It also computes the output statistics - such as the mean in regression and class probabilities in classification. - Instances of this class are responsible for compute splits' impurity difference + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + Criterion is the base class for criteria used in supervised tree-based models with a homogeneous float64-dtyped y. """ @@ -198,8 +219,10 @@ cdef class Criterion(BaseCriterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Placeholder for a method which will initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -279,6 +302,7 @@ cdef class ClassificationCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, cnp.ndarray[SIZE_t, ndim=1] n_classes): """Initialize attributes for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -331,8 +355,10 @@ cdef class ClassificationCriterion(Criterion): const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
+ Parameters ---------- y : ndarray, dtype=DOUBLE_t @@ -426,6 +452,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -442,6 +469,7 @@ cdef class ClassificationCriterion(Criterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -458,8 +486,10 @@ cdef class ClassificationCriterion(Criterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left child. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. + Parameters ---------- new_pos : SIZE_t @@ -532,6 +562,7 @@ cdef class ClassificationCriterion(Criterion): cdef void node_value(self, double* dest) noexcept nogil: """Compute the node value of sample_indices[start:end] and save it into dest. + Parameters ---------- dest : double pointer @@ -546,17 +577,20 @@ cdef class ClassificationCriterion(Criterion): cdef class Entropy(ClassificationCriterion): r"""Cross Entropy impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The cross-entropy is then defined as cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the cross-entropy criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -578,8 +612,10 @@ cdef class Entropy(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). + Parameters ---------- impurity_left : double pointer @@ -611,11 +647,13 @@ cdef class Entropy(ClassificationCriterion): cdef class Gini(ClassificationCriterion): r"""Gini Index impurity criterion. + This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) be the proportion of class k observations in node m. + The Gini Index is then defined as: index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 @@ -623,6 +661,7 @@ cdef class Gini(ClassificationCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Gini criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -648,8 +687,10 @@ cdef class Gini(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]) using the Gini index. 
+ Parameters ---------- impurity_left : double pointer @@ -726,6 +767,7 @@ cdef inline void _move_sums_regression( cdef class RegressionCriterion(Criterion): r"""Abstract regression criterion. + This handles cases where the target is a continuous value, and is evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` @@ -736,6 +778,7 @@ cdef class RegressionCriterion(Criterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -961,6 +1004,7 @@ cdef class MSE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MSE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -976,10 +1020,12 @@ cdef class MSE(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The MSE proxy is derived from @@ -1002,6 +1048,7 @@ cdef class MSE(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1045,6 +1092,7 @@ cdef class MSE(RegressionCriterion): cdef class MAE(RegressionCriterion): r"""Mean absolute error impurity criterion. + MAE = (1 / n)*(\sum_i |y_i - f_i|), where y_i is the true value and f_i is the predicted value.""" @@ -1056,6 +1104,7 @@ cdef class MAE(RegressionCriterion): def __cinit__(self, SIZE_t n_outputs, SIZE_t n_samples): """Initialize parameters for this criterion. + Parameters ---------- n_outputs : SIZE_t @@ -1154,6 +1203,7 @@ cdef class MAE(RegressionCriterion): cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1184,6 +1234,7 @@ cdef class MAE(RegressionCriterion): cdef int reverse_reset(self) except -1 nogil: """Reset the criterion at pos=end. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1211,6 +1262,7 @@ cdef class MAE(RegressionCriterion): cdef int update(self, SIZE_t new_pos) except -1 nogil: """Updated statistics by moving sample_indices[pos:new_pos] to the left. + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. """ @@ -1273,6 +1325,7 @@ cdef class MAE(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the MAE criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1297,6 +1350,7 @@ cdef class MAE(RegressionCriterion): cdef void children_impurity(self, double* p_impurity_left, double* p_impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. 
the impurity of the left child (sample_indices[start:pos]) and the impurity the right child (sample_indices[pos:end]). """ @@ -1343,6 +1397,7 @@ cdef class MAE(RegressionCriterion): cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. + Uses the formula (35) in Friedman's original Gradient Boosting paper: diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) @@ -1350,10 +1405,12 @@ cdef class FriedmanMSE(MSE): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. """ @@ -1394,6 +1451,7 @@ cdef class FriedmanMSE(MSE): cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. + Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the @@ -1413,6 +1471,7 @@ cdef class Poisson(RegressionCriterion): cdef double node_impurity(self) noexcept nogil: """Evaluate the impurity of the current node. + Evaluate the Poisson criterion as impurity of the current node, i.e. the impurity of sample_indices[start:end]. The smaller the impurity the better. @@ -1422,10 +1481,12 @@ cdef class Poisson(RegressionCriterion): cdef double proxy_impurity_improvement(self) noexcept nogil: """Compute a proxy of the impurity reduction. + This method is used to speed up the search for the best split. It is a proxy quantity such that the split that maximizes this value also maximizes the impurity improvement. It neglects all constant terms of the impurity decrease for a given split. + The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. The Poisson proxy is derived from: @@ -1463,6 +1524,7 @@ cdef class Poisson(RegressionCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) noexcept nogil: """Evaluate the impurity in children nodes. + i.e. the impurity of the left child (sample_indices[start:pos]) and the impurity of the right child (sample_indices[pos:end]) for Poisson. """ From 2105949178bf03660c13df1fd197abbbb57d826e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:22:15 -0400 Subject: [PATCH 12/28] Working.. Signed-off-by: Adam Li --- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 4 +++- sklearn/tree/_splitter.pxd | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index d72f22f8b348d..20020b4a5361c 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -11,7 +11,7 @@ # See _criterion.pyx for implementation details. 
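The cross-entropy and Gini definitions quoted in the docstrings above reduce to simple functions of the class proportions in a node. A standalone numeric check of those formulas (the log base is not pinned down in the docstring; natural log is used here, and the internal implementation may use a different base)::

    import numpy as np

    def node_class_proportions(y_in_node, n_classes):
        counts = np.bincount(y_in_node, minlength=n_classes)
        return counts / counts.sum()

    def entropy(p):
        # cross-entropy = -sum_k count_k * log(count_k), skipping empty classes
        p = p[p > 0]
        return float(-np.sum(p * np.log(p)))

    def gini(p):
        # index = 1 - sum_k count_k ** 2
        return float(1.0 - np.sum(p ** 2))

    p = node_class_proportions(np.array([0, 0, 0, 1, 2, 2]), n_classes=3)
    print(p)           # [0.5, 0.1667, 0.3333]
    print(entropy(p))  # ~1.011
    print(gini(p))     # ~0.611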
-# from libcpp.vector cimport vector +from libcpp.vector cimport vector from ._tree cimport DTYPE_t # Type of X from ._tree cimport DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index e9c02ab2fa43d..d60cab3063c1b 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -34,7 +34,9 @@ from ._utils cimport WeightedMedianCalculator cdef double EPSILON = 10 * np.finfo('double').eps cdef class BaseCriterion: - """This is an abstract interface for criterion. For example, a tree model could + """This is an abstract interface for criterion. + + For example, a tree model could be either supervisedly, or unsupervisedly computing impurity on samples of covariates, or labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 01975df22ef23..fc49471569ecc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -97,7 +97,7 @@ cdef class BaseSplitter: cdef class Splitter(BaseSplitter): cdef public Criterion criterion # Impurity criterion cdef const DOUBLE_t[:, ::1] y - + cdef int init( self, object X, From 9b07f2ab2b1b6f8f4ea1294fce1a5f9bd3be1a1d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 13 Jun 2023 15:42:37 -0400 Subject: [PATCH 13/28] Add leaf storage ability Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 67 ---------------------------- sklearn/tree/_criterion.pxd | 9 ++-- sklearn/tree/_criterion.pyx | 28 +++++++++--- sklearn/tree/_splitter.pxd | 3 ++ sklearn/tree/_splitter.pyx | 31 ++++++------- sklearn/tree/_tree.pxd | 19 ++++++-- sklearn/tree/_tree.pyx | 88 +++++++++++++++++++++++++------------ 7 files changed, 122 insertions(+), 123 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 795c68c8b5081..4fdd8f27cd652 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -713,73 +713,6 @@ def feature_importances_(self): return self.tree_.compute_feature_importances() - def _get_y_for_leaves(self, X, sample_weight=None): - n_samples = X.shape[0] - - # get the predictions - X_leaves = self.apply(X) - - bootstrap_indices = np.empty(shape, dtype=np.int64) - for i, estimator in enumerate(self.estimators_): - # Get bootstrap indices. - if self.bootstrap: - n_samples_bootstrap = _get_n_samples_bootstrap(n_samples, self.max_samples) - bootstrap_indices[:, i] = _generate_sample_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - else: - bootstrap_indices[:, i] = np.arange(n_samples) - - # Get predictions on bootstrap indices. - X_leaves[:, i] = X_leaves[bootstrap_indices[:, i], i] - - if sorter is not None: - # Reassign bootstrap indices to account for target sorting. - bootstrap_indices = np.argsort(sorter)[bootstrap_indices] - - bootstrap_indices += 1 # for sparse matrix (0s as empty) - - # Get the maximum number of nodes (internal + leaves) across trees. - # Get the maximum number of samples per leaf across trees (if needed). 
- max_node_count = 0 - max_samples_leaf = 0 if not leaf_subsample else max_samples_leaf - for i, estimator in enumerate(self.estimators_): - node_count = estimator.tree_.node_count - if node_count > max_node_count: - max_node_count = node_count - if not leaf_subsample: - sample_count = np.max(np.bincount(X_leaves[:, i])) - if sample_count > max_samples_leaf: - max_samples_leaf = sample_count - - # Initialize NumPy array (more efficient serialization than dict/list). - shape = (self.n_estimators, max_node_count, max_samples_leaf) - y_train_leaves = np.zeros(shape, dtype=np.int64) - - for i, estimator in enumerate(self.estimators_): - # Group training indices by leaf node. - leaf_indices, leaf_values_list = _group_by_value(X_leaves[:, i]) - - if leaf_subsample: - random.seed(estimator.random_state) - - # Map each leaf node to its list of training indices. - for leaf_idx, leaf_values in zip(leaf_indices, leaf_values_list): - y_indices = bootstrap_indices[:, i][leaf_values] - - if sample_weight is not None: - y_indices = y_indices[sample_weight[y_indices - 1] > 0] - - # Subsample leaf training indices (without replacement). - if leaf_subsample and max_samples_leaf < len(y_indices): - if not isinstance(y_indices, list): - y_indices = list(y_indices) - y_indices = random.sample(y_indices, max_samples_leaf) - - y_train_leaves[i, leaf_idx, : len(y_indices)] = y_indices - - return y_train_leaves - # ============================================================================= # Public estimators diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 20020b4a5361c..721b475f40436 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -72,10 +72,6 @@ cdef class BaseCriterion: SIZE_t end ) noexcept nogil - # cdef void node_samples( - # self, - # vector[vector[DOUBLE_t]]* dest - # ) noexcept nogil cdef class Criterion(BaseCriterion): """Abstract interface for supervised impurity criteria.""" @@ -94,6 +90,11 @@ cdef class Criterion(BaseCriterion): cdef void init_sum_missing(self) cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index d60cab3063c1b..c3f08ec859bee 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -46,7 +46,7 @@ cdef class BaseCriterion: in current node and in children nodes. This object stores methods on how to calculate how good a split is using - a set API. + a set API. Samples in the "current" node are stored in `samples[start:end]` which is partitioned around `pos` (an index in `start:end`) so that: @@ -186,9 +186,9 @@ cdef class BaseCriterion: ) noexcept nogil: """Abstract method which will set sample pointers in the criterion. - The dataset array that we compute criteria on is assumed to consist of 'N' - ordered samples or rows (i.e. sorted). Since we pass this by reference, we - use sample pointers to move the start and end around to consider only a subset of data. + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. This function should also update relevant statistics that the class uses to compute the final criterion. 
Parameters @@ -252,10 +252,28 @@ cdef class Criterion(BaseCriterion): Number of missing values for specific feature. """ pass - + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]* dest + ) noexcept nogil: + cdef SIZE_t i, j + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i][k].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fc49471569ecc..fb21f676e66cc 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -10,6 +10,7 @@ # License: BSD 3 clause # See _splitter.pyx for details. +from libcpp.vector cimport vector from ._criterion cimport BaseCriterion, Criterion @@ -106,6 +107,8 @@ cdef class Splitter(BaseSplitter): const unsigned char[::1] feature_has_missing, ) except -1 + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( self, diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index a58514d093ddf..7f21d5da545fb 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -53,12 +53,12 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.n_missing = 0 cdef class BaseSplitter: - """This is an abstract interface for splitters. + """This is an abstract interface for splitters. For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of covariates, labels, or both. Although scikit-learn currently only contains supervised tree methods, this class enables 3rd party packages to leverage - scikit-learn's Cython code for splitting. + scikit-learn's Cython code for splitting. A splitter is usually used in conjunction with a criterion class, which explicitly handles computing the criteria, which we split on. The setting of that criterion class is handled @@ -112,7 +112,7 @@ cdef class BaseSplitter: cdef int pointer_size(self) noexcept nogil: """Size of the pointer for split records. - + Overriding this function allows one to use different subclasses of `SplitRecord`. """ @@ -156,7 +156,6 @@ cdef class Splitter(BaseSplitter): self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -281,6 +280,10 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) + cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" @@ -293,7 +296,7 @@ cdef class Splitter(BaseSplitter): bint missing_go_to_left, ) noexcept nogil: """Check stopping conditions pre-split. - + This is typically a metric that is cheaply computed given the current proposed split, which is stored as a the `current_split` argument. 
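To make the intent of the new `Criterion.node_samples` / `Splitter.node_samples` hooks above concrete, here is a rough NumPy-level sketch of what they collect: the rows of `y` whose indices sit in `sample_indices[start:end]`, grouped per leaf node id the way the tree-level `value_samples` map stores them. This is an illustrative sketch only; `collect_node_samples`, `build_leaf_samples`, and `leaf_ranges` are made-up names, not part of the patch.

```python
import numpy as np

def collect_node_samples(y, sample_indices, start, end):
    """Python sketch of Criterion.node_samples: gather the (n_outputs,)
    target rows of every sample that landed in the current node."""
    return [y[j].tolist() for j in sample_indices[start:end]]

def build_leaf_samples(y, sample_indices, leaf_ranges):
    """Sketch of what the tree's value_samples map ends up holding:
    {leaf node id: list of y rows}, mirroring
    unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]."""
    return {
        node_id: collect_node_samples(y, sample_indices, start, end)
        for node_id, (start, end) in leaf_ranges.items()
    }

# Toy usage: five samples, one output, two leaves covering [0, 3) and [3, 5).
y = np.array([[1.0], [2.0], [3.0], [10.0], [11.0]])
sample_indices = np.array([0, 1, 2, 3, 4])
leaf_samples = build_leaf_samples(y, sample_indices, {5: (0, 3), 6: (3, 5)})
print(np.quantile([row[0] for row in leaf_samples[5]], 0.5))  # 2.0
```

In the patch itself the collection stays in Cython: the tree builders call `splitter.node_samples(&tree.value_samples[node_id])` for leaf nodes when `store_leaf_values` is enabled, so no Python-level loop like the one above is ever executed.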
@@ -301,7 +304,7 @@ cdef class Splitter(BaseSplitter): cdef SIZE_t min_samples_leaf = self.min_samples_leaf cdef SIZE_t end_non_missing = self.end - n_missing cdef SIZE_t n_left, n_right - + if missing_go_to_left: n_left = current_split.pos - self.start + n_missing n_right = end_non_missing - current_split.pos @@ -312,14 +315,14 @@ cdef class Splitter(BaseSplitter): # Reject if min_samples_leaf is not guaranteed if n_left < min_samples_leaf or n_right < min_samples_leaf: return 1 - + return 0 cdef bint check_postsplit_conditions( self ) noexcept nogil: """Check stopping conditions after evaluating the split. - + This takes some metric that is stored in the Criterion object and checks against internal stop metrics. """ @@ -329,10 +332,10 @@ cdef class Splitter(BaseSplitter): if ((self.criterion.weighted_n_left < min_weight_leaf) or (self.criterion.weighted_n_right < min_weight_leaf)): return 1 - + return 0 - + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -360,7 +363,7 @@ cdef inline void shift_missing_values_to_left_if_required( ctypedef fused Partitioner: DensePartitioner SparsePartitioner - + cdef inline int node_split_best( Splitter splitter, Partitioner partitioner, @@ -504,9 +507,9 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - + current_split.pos = p - + # Reject if min_samples_leaf is not guaranteed if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue @@ -740,8 +743,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index cbe85886cd865..94714cc33400c 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -14,6 +14,7 @@ import numpy as np cimport numpy as cnp from libcpp.vector cimport vector +from libcpp.unordered_map cimport unordered_map ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -36,6 +37,7 @@ cdef struct Node: DOUBLE_t weighted_n_node_samples # Weighted number of samples at the node unsigned char missing_go_to_left # Whether features have missing values + cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. @@ -45,7 +47,14 @@ cdef class BaseTree: cdef Node* nodes # Array of nodes cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample - cdef double* value # Array of values prediction values for each node + cdef double* value # Array of values prediction values for each node + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples # Generic Methods: These are generic methods used by any tree. 
cdef int _resize(self, SIZE_t capacity) except -1 nogil @@ -61,7 +70,7 @@ cdef class BaseTree: double weighted_n_node_samples, unsigned char missing_go_to_left ) except -1 nogil - + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) @@ -101,10 +110,10 @@ cdef class Tree(BaseTree): # The Supervised Tree object is a binary tree structure constructed by the # TreeBuilder. The tree structure is used for predictions and # feature importances. - # + # # Value of upstream properties: # - value_stride = n_outputs * max_n_classes - # - value = (capacity, n_outputs, max_n_classes) array of values + # - value = (capacity, n_outputs, max_n_classes) array of values # Input/Output layout for supervised tree cdef public SIZE_t n_features # Number of features in X @@ -137,6 +146,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 2256b28c7df10..8ca98a64b42ab 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -158,15 +158,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -221,6 +229,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 + cdef int node_idx + cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -308,6 +318,12 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "is_left": 1, "impurity": split.impurity_left, "n_constant_features": n_constant_features}) + elif self.store_leaf_values and is_leaf: + with gil: + print('Storing leaf values...') + + # copy leaf values to leaf_values array + splitter.node_samples(&tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -317,7 +333,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen - + # free the memory created for the SplitRecord pointer free(split_ptr) @@ -364,10 +380,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -375,6 +398,7 
@@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -488,7 +512,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """Adds node w/ partition ``[start, end)`` to the frontier. """ cdef SplitRecord split cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) - + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -553,7 +577,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.improvement = 0.0 res.impurity_left = impurity res.impurity_right = impurity - + free(split_ptr) return 0 @@ -564,7 +588,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef class BaseTree: """Base class for Cython tree models. - + Downstream classes must implement """ cdef int _resize( @@ -622,7 +646,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set split node data. - + Parameters ---------- split_node : SplitRecord* @@ -641,7 +665,7 @@ cdef class BaseTree: Node* node ) except -1 nogil: """Set leaf node data. - + Parameters ---------- split_node : SplitRecord* @@ -655,9 +679,12 @@ cdef class BaseTree: node.threshold = _TREE_UNDEFINED return 1 - cdef DTYPE_t _compute_feature(self, const DTYPE_t[:, :] X_ndarray, - SIZE_t sample_index, - Node *node) noexcept nogil: + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: """Compute feature from a given data matrix, X. In axis-aligned trees, this is simply the value in the column of X @@ -668,7 +695,7 @@ cdef class BaseTree: return feature cdef SIZE_t _add_node( - self, + self, SIZE_t parent, bint is_left, bint is_leaf, @@ -679,7 +706,9 @@ cdef class BaseTree: unsigned char missing_go_to_left ) except -1 nogil: """Add a node to the tree. + The new node registers itself as the child of its parent. + Parameters ---------- parent : SIZE_t @@ -697,7 +726,7 @@ cdef class BaseTree: The number of samples in the node. weighted_n_node_samples : double The weight of the samples in the node. - + Returns (size_t)(-1) on error. """ cdef SIZE_t node_id = self.node_count @@ -719,12 +748,12 @@ cdef class BaseTree: if is_leaf: if self._set_leaf_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError else: if self._set_split_node(split_node, node) != 1: - with gil: - raise RuntimeError + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -796,8 +825,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -928,8 +957,8 @@ cdef class BaseTree: # Extract input cdef const DTYPE_t[:] X_data = X.data - cdef const INT32_t[:] X_indices = X.indices - cdef const INT32_t[:] X_indptr = X.indptr + cdef const INT32_t[:] X_indices = X.indices + cdef const INT32_t[:] X_indptr = X.indptr cdef SIZE_t n_samples = X.shape[0] cdef SIZE_t n_features = X.shape[1] @@ -1043,7 +1072,7 @@ cdef class BaseTree: # ... 
and node.right_child != _TREE_LEAF: self._compute_feature_importances( importances, node) - + node += 1 for i in range(self.n_features): @@ -1065,7 +1094,7 @@ cdef class BaseTree: Node* node ) noexcept nogil: """Compute feature importances from a Node in the Tree. - + Wrapped in a private function to allow subclassing that computes feature importances. """ @@ -1321,6 +1350,9 @@ cdef class Tree(BaseTree): self.value = NULL self.nodes = NULL + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + def __dealloc__(self): """Destructor.""" # Free all inner structures From 21ccb30478bdff652118af59a4cd614a23f799d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 10:35:44 -0400 Subject: [PATCH 14/28] [ENH] Adding leaf node samples to be stored when "quantile" tree is turned on (#45) #### Reference Issues/PRs Addresses the quantile-trees part of: https://github.com/neurodata/scikit-tree/issues/29 #### What does this implement/fix? Explain your changes. 1. Stores for each leaf node a 2D numpy array of the y-samples (remember `y` is (n_samples, n_outputs)) 2. Does this all the way in Criterion 3. Only supports supervised tree/splitter/criterion 4. merges in `main` changes. #### Any other comments? --------- Signed-off-by: Adam Li --- doc/authors_emeritus.rst | 1 + doc/contributor_experience_team.rst | 12 +- doc/modules/classes.rst | 1 + doc/modules/learning_curve.rst | 42 +- doc/visualizations.rst | 1 + doc/whats_new/v1.3.rst | 286 +++++--- .../plot_kernel_ridge_regression.py | 1 + .../model_selection/plot_validation_curve.py | 46 +- sklearn/base.py | 10 +- sklearn/calibration.py | 7 +- sklearn/cluster/_affinity_propagation.py | 4 +- sklearn/cluster/_agglomerative.py | 5 +- sklearn/cluster/_bicluster.py | 4 +- sklearn/cluster/_birch.py | 8 +- sklearn/cluster/_bisect_k_means.py | 4 +- sklearn/cluster/_dbscan.py | 7 +- sklearn/cluster/_feature_agglomeration.py | 34 +- sklearn/cluster/_kmeans.py | 11 +- sklearn/cluster/_mean_shift.py | 3 +- sklearn/cluster/_optics.py | 7 +- sklearn/cluster/_spectral.py | 4 +- .../tests/test_feature_agglomeration.py | 24 + sklearn/compose/_column_transformer.py | 7 +- sklearn/compose/_target.py | 6 +- sklearn/covariance/_elliptic_envelope.py | 3 +- sklearn/covariance/_empirical_covariance.py | 3 +- sklearn/covariance/_graph_lasso.py | 5 +- sklearn/covariance/_robust_covariance.py | 3 +- sklearn/covariance/_shrunk_covariance.py | 8 +- sklearn/cross_decomposition/_pls.py | 7 +- sklearn/datasets/_arff_parser.py | 5 +- sklearn/datasets/tests/test_openml.py | 4 +- sklearn/decomposition/_dict_learning.py | 7 +- sklearn/decomposition/_factor_analysis.py | 4 +- sklearn/decomposition/_fastica.py | 7 +- sklearn/decomposition/_incremental_pca.py | 8 +- sklearn/decomposition/_kernel_pca.py | 4 +- sklearn/decomposition/_lda.py | 7 +- sklearn/decomposition/_nmf.py | 44 +- sklearn/decomposition/_pca.py | 7 +- sklearn/decomposition/_sparse_pca.py | 3 +- sklearn/decomposition/_truncated_svd.py | 4 +- sklearn/decomposition/tests/test_nmf.py | 27 + sklearn/discriminant_analysis.py | 9 +- sklearn/dummy.py | 7 +- sklearn/ensemble/_bagging.py | 8 +- sklearn/ensemble/_forest.py | 174 ++++- sklearn/ensemble/_gb.py | 8 +- .../gradient_boosting.py | 4 +- sklearn/ensemble/_iforest.py | 3 +- sklearn/ensemble/_stacking.py | 8 +- sklearn/ensemble/_voting.py | 11 +- sklearn/ensemble/_weight_boosting.py | 8 +- sklearn/ensemble/tests/test_forest.py | 51 ++ .../feature_extraction/_dict_vectorizer.py | 5 +- 
sklearn/feature_extraction/_hash.py | 4 +- sklearn/feature_extraction/image.py | 3 +- sklearn/feature_extraction/text.py | 14 +- sklearn/feature_selection/_from_model.py | 11 +- sklearn/feature_selection/_rfe.py | 11 +- sklearn/feature_selection/_sequential.py | 7 +- .../_univariate_selection.py | 4 +- .../feature_selection/_variance_threshold.py | 3 +- sklearn/gaussian_process/_gpc.py | 4 +- sklearn/gaussian_process/_gpr.py | 4 +- sklearn/impute/_base.py | 8 +- sklearn/impute/_iterative.py | 8 +- sklearn/impute/_knn.py | 3 +- sklearn/isotonic.py | 3 +- sklearn/kernel_approximation.py | 13 +- sklearn/kernel_ridge.py | 4 +- sklearn/linear_model/_base.py | 5 +- sklearn/linear_model/_bayes.py | 8 +- sklearn/linear_model/_coordinate_descent.py | 11 +- sklearn/linear_model/_glm/glm.py | 4 +- sklearn/linear_model/_huber.py | 3 +- sklearn/linear_model/_least_angle.py | 10 +- sklearn/linear_model/_logistic.py | 9 +- sklearn/linear_model/_omp.py | 7 +- sklearn/linear_model/_passive_aggressive.py | 9 +- sklearn/linear_model/_quantile.py | 3 +- sklearn/linear_model/_ransac.py | 7 +- sklearn/linear_model/_ridge.py | 13 +- sklearn/linear_model/_stochastic_gradient.py | 13 +- sklearn/linear_model/_theil_sen.py | 3 +- sklearn/manifold/_isomap.py | 13 +- sklearn/manifold/_locally_linear.py | 5 +- sklearn/manifold/_mds.py | 4 +- sklearn/manifold/_spectral_embedding.py | 4 +- sklearn/manifold/_t_sne.py | 11 +- sklearn/metrics/pairwise.py | 14 +- sklearn/mixture/_base.py | 4 +- sklearn/model_selection/__init__.py | 2 + sklearn/model_selection/_plot.py | 680 +++++++++++++++--- sklearn/model_selection/_search.py | 6 +- .../_search_successive_halving.py | 6 +- sklearn/model_selection/tests/test_plot.py | 337 +++++++-- sklearn/multiclass.py | 29 +- sklearn/multioutput.py | 26 +- sklearn/naive_bayes.py | 12 +- sklearn/neighbors/_classification.py | 12 +- sklearn/neighbors/_graph.py | 11 +- sklearn/neighbors/_kde.py | 7 +- sklearn/neighbors/_lof.py | 7 +- sklearn/neighbors/_nca.py | 4 +- sklearn/neighbors/_nearest_centroid.py | 4 +- sklearn/neighbors/_regression.py | 12 +- sklearn/neighbors/_unsupervised.py | 6 +- .../neural_network/_multilayer_perceptron.py | 12 +- sklearn/neural_network/_rbm.py | 9 +- sklearn/pipeline.py | 16 +- sklearn/preprocessing/_data.py | 57 +- sklearn/preprocessing/_discretization.py | 3 +- sklearn/preprocessing/_encoders.py | 7 +- .../preprocessing/_function_transformer.py | 3 +- sklearn/preprocessing/_label.py | 10 +- sklearn/preprocessing/_polynomial.py | 6 +- sklearn/preprocessing/_target_encoder.py | 5 +- sklearn/preprocessing/tests/test_data.py | 19 + sklearn/random_projection.py | 4 +- sklearn/semi_supervised/_label_propagation.py | 3 +- sklearn/semi_supervised/_self_training.py | 7 +- sklearn/svm/_base.py | 4 +- sklearn/svm/_classes.py | 7 +- sklearn/tests/test_metadata_routing.py | 15 + sklearn/tests/test_public_functions.py | 1 + sklearn/tree/_classes.py | 185 ++++- sklearn/tree/_criterion.pxd | 2 +- sklearn/tree/_criterion.pyx | 15 +- sklearn/tree/_splitter.pxd | 4 +- sklearn/tree/_splitter.pyx | 46 +- sklearn/tree/_tree.pxd | 20 +- sklearn/tree/_tree.pyx | 58 +- sklearn/tree/tests/test_tree.py | 175 ++++- sklearn/utils/_metadata_requests.py | 12 +- sklearn/utils/_plotting.py | 40 ++ sklearn/utils/estimator_checks.py | 19 +- sklearn/utils/tests/test_param_validation.py | 4 +- sklearn/utils/tests/test_plotting.py | 63 ++ sklearn/utils/tests/test_validation.py | 10 + sklearn/utils/validation.py | 51 +- 141 files changed, 2511 insertions(+), 797 deletions(-) create mode 100644 
sklearn/utils/tests/test_plotting.py diff --git a/doc/authors_emeritus.rst b/doc/authors_emeritus.rst index b979b77bba974..a56e2bc408ff4 100644 --- a/doc/authors_emeritus.rst +++ b/doc/authors_emeritus.rst @@ -20,6 +20,7 @@ - Wei Li - Paolo Losi - Gilles Louppe +- Chiara Marmo - Vincent Michel - Jarrod Millman - Alexandre Passos diff --git a/doc/contributor_experience_team.rst b/doc/contributor_experience_team.rst index 2e09d9069849a..00b658632302e 100644 --- a/doc/contributor_experience_team.rst +++ b/doc/contributor_experience_team.rst @@ -18,6 +18,10 @@

     Lucy Liu

+    Maxwell Liu
+
     Juan Martin Loyola

@@ -26,14 +30,6 @@

     Sylvain Marié

-    Chiara Marmo
-
-    Maxwell Liu
-
     Norbert Preining

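Before the documentation updates that follow, it helps to make the quantile use case from this patch's summary concrete: each leaf now keeps the 2-D array of `y` rows (shape `(n_samples_in_leaf, n_outputs)`) that reached it, so a downstream estimator can compute arbitrary quantiles at predict time instead of only the mean. Below is a minimal sketch of that downstream step, assuming some Python-level view of the stored map; `predict_quantile`, `tree_apply`, and `leaf_samples` are hypothetical names, since the patch itself only adds the Cython-side storage.

```python
import numpy as np

def predict_quantile(tree_apply, leaf_samples, X, q=0.5, output=0):
    """Quantile prediction on top of stored leaf samples.

    tree_apply   : callable mapping X to leaf node ids (like tree_.apply)
    leaf_samples : dict {leaf node id: (n_samples_in_leaf, n_outputs) array}
    """
    leaf_ids = tree_apply(X)
    return np.array([
        np.quantile(np.asarray(leaf_samples[leaf])[:, output], q)
        for leaf in leaf_ids
    ])

# Toy usage with a stand-in apply() that sends every row to leaf 3.
def fake_apply(X):
    return np.full(len(X), 3)

fake_leaf_samples = {3: np.array([[1.0], [4.0], [9.0]])}
print(predict_quantile(fake_apply, fake_leaf_samples, np.zeros((2, 1))))  # [4. 4.]
```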
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4961fb0fec366..204c300b1a9b8 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1247,6 +1247,7 @@ Visualization :template: display_only_from_estimator.rst model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay .. _multiclass_ref: diff --git a/doc/modules/learning_curve.rst b/doc/modules/learning_curve.rst index 0ce64063d4cd9..3d458a1a67416 100644 --- a/doc/modules/learning_curve.rst +++ b/doc/modules/learning_curve.rst @@ -71,7 +71,7 @@ The function :func:`validation_curve` can help in this case:: >>> import numpy as np >>> from sklearn.model_selection import validation_curve >>> from sklearn.datasets import load_iris - >>> from sklearn.linear_model import Ridge + >>> from sklearn.svm import SVC >>> np.random.seed(0) >>> X, y = load_iris(return_X_y=True) @@ -80,30 +80,50 @@ The function :func:`validation_curve` can help in this case:: >>> X, y = X[indices], y[indices] >>> train_scores, valid_scores = validation_curve( - ... Ridge(), X, y, param_name="alpha", param_range=np.logspace(-7, 3, 3), - ... cv=5) + ... SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 3), + ... ) >>> train_scores - array([[0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.93..., 0.94..., 0.92..., 0.91..., 0.92...], - [0.51..., 0.52..., 0.49..., 0.47..., 0.49...]]) + array([[0.90..., 0.94..., 0.91..., 0.89..., 0.92...], + [0.9... , 0.92..., 0.93..., 0.92..., 0.93...], + [0.97..., 1... , 0.98..., 0.97..., 0.99...]]) >>> valid_scores - array([[0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.90..., 0.84..., 0.94..., 0.96..., 0.93...], - [0.46..., 0.25..., 0.50..., 0.49..., 0.52...]]) + array([[0.9..., 0.9... , 0.9... , 0.96..., 0.9... ], + [0.9..., 0.83..., 0.96..., 0.96..., 0.93...], + [1.... , 0.93..., 1.... , 1.... , 0.9... ]]) + +If you intend to plot the validation curves only, the class +:class:`~sklearn.model_selection.ValidationCurveDisplay` is more direct than +using matplotlib manually on the results of a call to :func:`validation_curve`. +You can use the method +:meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` similarly +to :func:`validation_curve` to generate and plot the validation curve: + +.. plot:: + :context: close-figs + :align: center + + from sklearn.datasets import load_iris + from sklearn.model_selection import ValidationCurveDisplay + from sklearn.svm import SVC + from sklearn.utils import shuffle + X, y = load_iris(return_X_y=True) + X, y = shuffle(X, y, random_state=0) + ValidationCurveDisplay.from_estimator( + SVC(kernel="linear"), X, y, param_name="C", param_range=np.logspace(-7, 3, 10) + ) If the training score and the validation score are both low, the estimator will be underfitting. If the training score is high and the validation score is low, the estimator is overfitting and otherwise it is working very well. A low training score and a high validation score is usually not possible. Underfitting, overfitting, and a working model are shown in the in the plot below where we vary -the parameter :math:`\gamma` of an SVM on the digits dataset. +the parameter `gamma` of an SVM with an RBF kernel on the digits dataset. .. figure:: ../auto_examples/model_selection/images/sphx_glr_plot_validation_curve_001.png :target: ../auto_examples/model_selection/plot_validation_curve.html :align: center :scale: 50% - .. 
_learning_curve: Learning curve diff --git a/doc/visualizations.rst b/doc/visualizations.rst index f692fd8efd1df..9a44f6feb1b48 100644 --- a/doc/visualizations.rst +++ b/doc/visualizations.rst @@ -89,3 +89,4 @@ Display Objects metrics.PredictionErrorDisplay metrics.RocCurveDisplay model_selection.LearningCurveDisplay + model_selection.ValidationCurveDisplay diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index bb35a1db224b4..41c03293cf067 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -29,11 +29,6 @@ random sampling procedures. `transform_algorithm` is not the same as `fit_algorithm` and the number of iterations is small. :pr:`24871` by :user:`Omar Salman `. -- |Fix| Treat more consistently small values in the `W` and `H` matrices during the - `fit` and `transform` steps of :class:`decomposition.NMF` and - :class:`decomposition.MiniBatchNMF` which can produce different results than previous - versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. - - |Enhancement| The `sample_weight` parameter now will be used in centroids initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` and :class:`cluster.MiniBatchKMeans`. @@ -43,6 +38,11 @@ random sampling procedures. :user:`Jérémie du Boisberranger `, :user:`Guillaume Lemaitre `. +- |Fix| Treat more consistently small values in the `W` and `H` matrices during the + `fit` and `transform` steps of :class:`decomposition.NMF` and + :class:`decomposition.MiniBatchNMF` which can produce different results than previous + versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. + - |Fix| :class:`decomposition.KernelPCA` may produce different results through `inverse_transform` if `gamma` is `None`. Now it will be chosen correctly as `1/n_features` of the data that it is fitted on, while previously it might be @@ -51,6 +51,14 @@ random sampling procedures. used each time the kernel is called. :pr:`26337` by :user:`Yao Xiao `. +Changed displays +---------------- + +- |Enhancement| :class:`model_selection.LearningCurveDisplay` displays both the + train and test curves by default. You can set `score_type="test"` to keep the + past behaviour. + :pr:`25120` by :user:`Guillaume Lemaitre `. + Changes impacting all modules ----------------------------- @@ -201,23 +209,9 @@ Changelog :mod:`sklearn.cluster` ...................... -- |API| The `sample_weight` parameter in `predict` for - :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` - is now deprecated and will be removed in v1.5. - :pr:`25251` by :user:`Gleb Levitski `. - -- |Enhancement| The `sample_weight` parameter now will be used in centroids - initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` - and :class:`cluster.MiniBatchKMeans`. - This change will break backward compatibility, since numbers generated - from same random seeds will be different. - :pr:`25752` by :user:`Gleb Levitski `, - :user:`Jérémie du Boisberranger `, - :user:`Guillaume Lemaitre `. - - |MajorFeature| Added :class:`cluster.HDBSCAN`, a modern hierarchical density-based clustering algorithm. Similarly to :class:`cluster.OPTICS`, it can be seen as a - generalization of :class:`DBSCAN` by allowing for hierarchical instead of flat + generalization of :class:`cluster.DBSCAN` by allowing for hierarchical instead of flat clustering, however it varies in its approach from :class:`cluster.OPTICS`. 
This algorithm is very robust with respect to its hyperparameters' values and can be used on a wide variety of data without much, if any, tuning. @@ -228,12 +222,30 @@ Changelog :pr:`26385` by :user:`Meekail Zain ` +- |Enhancement| The `sample_weight` parameter now will be used in centroids + initialization for :class:`cluster.KMeans`, :class:`cluster.BisectingKMeans` + and :class:`cluster.MiniBatchKMeans`. + This change will break backward compatibility, since numbers generated + from same random seeds will be different. + :pr:`25752` by :user:`Gleb Levitski `, + :user:`Jérémie du Boisberranger `, + :user:`Guillaume Lemaitre `. + +- |API| The `sample_weight` parameter in `predict` for + :meth:`cluster.KMeans.predict` and :meth:`cluster.MiniBatchKMeans.predict` + is now deprecated and will be removed in v1.5. + :pr:`25251` by :user:`Gleb Levitski `. + +- |API| The `Xred` argument in :func:`cluster.FeatureAgglomeration.inverse_transform` + is renamed to `Xt` and will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.compose` ...................... -- |Fix| `compose.ColumnTransformer` raises an informative error when the individual transformers of `ColumnTransformer` - output pandas dataframes with indexes that are not consistent with each other and the output is configured - to be pandas. :pr:`26286` by `Thomas Fan`_. +- |Fix| `compose.ColumnTransformer` raises an informative error when the individual + transformers of `ColumnTransformer` output pandas dataframes with indexes that are + not consistent with each other and the output is configured to be pandas. + :pr:`26286` by `Thomas Fan`_. - |Fix| :class:`compose.ColumnTransformer` correctly sets the output of the remainder when `set_output` is called. :pr:`26323` by `Thomas Fan`_. @@ -241,6 +253,14 @@ Changelog :mod:`sklearn.covariance` ......................... +- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be + consistent with :func:`covariance.graphical_lasso`. + :pr:`26033` by :user:`Genesis Valencia `. + +- |Fix| :func:`covariance.empirical_covariance` now gives an informative + error message when input is not appropriate. + :pr:`26108` by :user:`Quentin Barthélemy `. + - |API| Deprecates `cov_init` in :func:`covariance.graphical_lasso` in 1.3 since the parameter has no effect. It will be removed in 1.5. :pr:`26033` by :user:`Genesis Valencia `. @@ -256,20 +276,13 @@ Changelog :func:`covariance.graphical_lasso_path`, and :class:`covariance.GraphicalLassoCV`. :pr:`26033` by :user:`Genesis Valencia `. -- |Fix| Allows `alpha=0` in :class:`covariance.GraphicalLasso` to be - consistent with :func:`covariance.graphical_lasso`. - :pr:`26033` by :user:`Genesis Valencia `. - -- |Fix| :func:`covariance.empirical_covariance` now gives an informative - error message when input is not appropriate. - :pr:`26108` by :user:`Quentin Barthélemy `. - :mod:`sklearn.datasets` ....................... -- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` - is deprecated and will be removed in v1.5. - :pr:`25784` by :user:`Jérémie du Boisberranger`. +- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using + the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the + pandas parser. + :pr:`26433` by :user:`Guillaume Lemaitre `. - |Fix| :func:`datasets.fetch_openml` returns improved data types when `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_. @@ -279,28 +292,35 @@ Changelog the pandas parser. 
The parameter `read_csv_kwargs` allows to overwrite this behaviour. :pr:`26551` by :user:`Guillaume Lemaitre `. -- |Enhancement| Allows to overwrite the parameters used to open the ARFF file using - the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the - pandas parser. - :pr:`26433` by :user:`Guillaume Lemaitre `. +- |Fix| :func:`dataasets.fetch_openml` will consistenly use `np.nan` as missing marker + with both parsers `"pandas"` and `"liac-arff"`. + :pr:`26579` by :user:`Guillaume Lemaitre `. + +- |API| The `data_transposed` argument of :func:`datasets.make_sparse_coded_signal` + is deprecated and will be removed in v1.5. + :pr:`25784` by :user:`Jérémie du Boisberranger`. :mod:`sklearn.decomposition` ............................ -- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter - `callback` for consistency with the function :func:`decomposition.dict_learning`. - :pr:`24871` by :user:`Omar Salman `. - - |Efficiency| :class:`decomposition.MiniBatchDictionaryLearning` and :class:`decomposition.MiniBatchSparsePCA` are now faster for small batch sizes by avoiding duplicate validations. :pr:`25490` by :user:`Jérémie du Boisberranger `. +- |Enhancement| :class:`decomposition.DictionaryLearning` now accepts the parameter + `callback` for consistency with the function :func:`decomposition.dict_learning`. + :pr:`24871` by :user:`Omar Salman `. + - |Fix| Treat more consistently small values in the `W` and `H` matrices during the `fit` and `transform` steps of :class:`decomposition.NMF` and :class:`decomposition.MiniBatchNMF` which can produce different results than previous versions. :pr:`25438` by :user:`Yotam Avidar-Constantini `. +- |API| The `W` argument in :func:`decomposition.NMF.inverse_transform` and + :class:`decomposition.MiniBatchNMF.inverse_transform` is renamed to `Xt` and + will be removed in v1.5. :pr:`26503` by `Adrin Jalali`_. + :mod:`sklearn.discriminant_analysis` .................................... @@ -364,6 +384,7 @@ Changelog :mod:`sklearn.exception` ........................ + - |Feature| Added :class:`exception.InconsistentVersionWarning` which is raised when a scikit-learn estimator is unpickled with a scikit-learn version that is inconsistent with the sckit-learn version the estimator was pickled with. @@ -393,6 +414,9 @@ Changelog - |Enhancement| Added the parameter `fill_value` to :class:`impute.IterativeImputer`. :pr:`25232` by :user:`Thijs van Weezel `. +- |Fix| :class:`impute.IterativeImputer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.inspection` ......................... @@ -420,12 +444,6 @@ Changelog now preserve dtype for `numpy.float32`. :pr:`25587` by :user:`Omar Salman `. -- |API| Deprecates `n_iter` in favor of `max_iter` in - :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. - `n_iter` will be removed in scikit-learn 1.5. This change makes those - estimators consistent with the rest of estimators. - :pr:`25697` by :user:`John Pangas `. - - |Enhancement| The `n_iter_` attribute has been included in :class:`linear_model.ARDRegression` to expose the actual number of iterations required to reach the stopping criterion. @@ -436,36 +454,41 @@ Changelog on linearly separable problems. :pr:`25214` by `Tom Dupre la Tour`_. +- |API| Deprecates `n_iter` in favor of `max_iter` in + :class:`linear_model.BayesianRidge` and :class:`linear_model.ARDRegression`. 
+ `n_iter` will be removed in scikit-learn 1.5. This change makes those + estimators consistent with the rest of estimators. + :pr:`25697` by :user:`John Pangas `. + +:mod:`sklearn.manifold` +....................... + +- |Fix| :class:`manifold.Isomap` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + :mod:`sklearn.metrics` ...................... -- |Efficiency| The computation of the expected mutual information in - :func:`metrics.adjusted_mutual_info_score` is now faster when the number of - unique labels is large and its memory usage is reduced in general. - :pr:`25713` by :user:`Kshitij Mathur `, - :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and - :user:`Jérémie du Boisberranger `. - - |Feature| Adds `zero_division=np.nan` to multiple classification metrics: - :func:`precision_score`, :func:`recall_score`, :func:`f1_score`, - :func:`fbeta_score`, :func:`precision_recall_fscore_support`, - :func:`classification_report`. When `zero_division=np.nan` and there is a + :func:`metrics.precision_score`, :func:`metrics.recall_score`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.precision_recall_fscore_support`, + :func:`metrics.classification_report`. When `zero_division=np.nan` and there is a zero division, the metric is undefined and is excluded from averaging. When not used for averages, the value returned is `np.nan`. :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |Fix| :func:`metric.manhattan_distances` now supports readonly sparse datasets. - :pr:`25432` by :user:`Julien Jerphanion `. - -- |Fix| Fixed :func:`classification_report` so that empty input will return - `np.nan`. Previously, "macro avg" and `weighted avg` would return - e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they - both return `np.nan`. - :pr:`25531` by :user:`Marc Torrellas Socastro `. +- |Feature| :func:`metrics.average_precision_score` now supports the + multiclass case. + :pr:`17388` by :user:`Geoffrey Bolmier ` and + :pr:`24769` by :user:`Ashwin Mathur `. -- |Fix| :func:`metric.ndcg_score` now gives a meaningful error message for input of - length 1. - :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. +- |Efficiency| The computation of the expected mutual information in + :func:`metrics.adjusted_mutual_info_score` is now faster when the number of + unique labels is large and its memory usage is reduced in general. + :pr:`25713` by :user:`Kshitij Mathur `, + :user:`Guillaume Lemaitre `, :user:`Omar Salman ` and + :user:`Jérémie du Boisberranger `. - |Enhancement| :class:`metrics.silhouette_samples` nows accepts a sparse matrix of pairwise distances between samples, or a feature array. @@ -492,17 +515,23 @@ Changelog chance level. This line is exposed in the `chance_level_` attribute. :pr:`26019` by :user:`Yao Xiao `. -- |Fix| :func:`log_loss` raises a warning if the values of the parameter `y_pred` are - not normalized, instead of actually normalizing them in the metric. Starting from - 1.5 this will raise an error. :pr:`25299` by :user:`Omar Salman `. + +- |Fix| Fixed :func:`metrics.classification_report` so that empty input will return + `np.nan`. Previously, "macro avg" and `weighted avg` would return + e.g. `f1-score=np.nan` and `f1-score=0.0`, being inconsistent. Now, they + both return `np.nan`. + :pr:`25531` by :user:`Marc Torrellas Socastro `. -- |API| The `eps` parameter of the :func:`log_loss` has been deprecated and will be - removed in 1.5. 
:pr:`25299` by :user:`Omar Salman `. +- |Fix| :func:`metrics.ndcg_score` now gives a meaningful error message for input of + length 1. + :pr:`25672` by :user:`Lene Preuss ` and :user:`Wei-Chun Chu `. -- |Feature| :func:`metrics.average_precision_score` now supports the - multiclass case. - :pr:`17388` by :user:`Geoffrey Bolmier ` and - :pr:`24769` by :user:`Ashwin Mathur `. +- |Fix| :func:`metrics.log_loss` raises a warning if the values of the parameter + `y_pred` are not normalized, instead of actually normalizing them in the metric. + Starting from 1.5 this will raise an error. + :pr:`25299` by :user:`Omar Salman ` +- |API| The `eps` parameter of the :func:`metrics.log_loss` has been deprecated and + will be removed in 1.5. :pr:`25299` by :user:`Omar Salman `. + :mod:`sklearn.gaussian_process` ............................... @@ -524,6 +556,18 @@ Changelog :mod:`sklearn.model_selection` .............................. +- |MajorFeature| Added the class :class:`model_selection.ValidationCurveDisplay` + that allows easy plotting of validation curves obtained by the function + :func:`model_selection.validation_curve`. + :pr:`25120` by :user:`Guillaume Lemaitre `. + +- |API| The parameter `log_scale` in the class + :class:`model_selection.LearningCurveDisplay` has been deprecated in 1.3 and + will be removed in 1.5. The default scale can be overriden by setting it + directly on the `ax` object and will be set automatically from the spacing + of the data points otherwise. + :pr:`25120` by :user:`Guillaume Lemaitre `. + - |Enhancement| :func:`model_selection.cross_validate` accepts a new parameter `return_indices` to return the train-test indices of each cv split. :pr:`25659` by :user:`Guillaume Lemaitre `. @@ -546,15 +590,15 @@ Changelog :mod:`sklearn.neighbors` ........................ -- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This - dissimilarity is not a metric and cannot be supported by the BallTree. - :pr:`25417` by :user:`Guillaume Lemaitre `. - - |Enhancement| The performance of :meth:`neighbors.KNeighborsClassifier.predict` and of :meth:`neighbors.KNeighborsClassifier.predict_proba` has been improved when `n_neighbors` is large and `algorithm="brute"` with non Euclidean metrics. :pr:`24076` by :user:`Meekail Zain `, :user:`Julien Jerphanion `. +- |Fix| Remove support for `KulsinskiDistance` in :class:`neighbors.BallTree`. This + dissimilarity is not a metric and cannot be supported by the BallTree. + :pr:`25417` by :user:`Guillaume Lemaitre `. + - |API| The support for metrics other than `euclidean` and `manhattan` and for callables in :class:`neighbors.NearestNeighbors` is deprecated and will be removed in version 1.5. :pr:`24083` by :user:`Valentin Laurent `. @@ -592,10 +636,24 @@ Changelog categorical encoding based on target mean conditioned on the value of the category. :pr:`25334` by `Thomas Fan`_. +- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping + infrequent categories into a single feature. Grouping infrequent categories + is enabled by specifying how to select infrequent categories with + `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. + +- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the + number of expanded terms a-priori when dealing with sparse `csr` matrices + in order to optimize the choice of `dtype` for `indices` and `indptr`. 
It + can now output `csr` matrices with `np.int32` `indices/indptr` components + when there are few enough elements, and will automatically use `np.int64` + for sufficiently large matrices. + :pr:`20524` by :user:`niuk-a ` and + :pr:`23731` by :user:`Meekail Zain ` + - |Enhancement| A new parameter `sparse_output` was added to - :class:`SplineTransformer`, available as of SciPy 1.8. If `sparse_output=True`, - :class:`SplineTransformer` returns a sparse CSR matrix. - :pr:`24145` by :user:`Christian Lorentzen `. + :class:`preprocessing.SplineTransformer`, available as of SciPy 1.8. If + `sparse_output=True`, :class:`preprocessing.SplineTransformer` returns a sparse + CSR matrix. :pr:`24145` by :user:`Christian Lorentzen `. - |Enhancement| Adds a `feature_name_combiner` parameter to :class:`preprocessing.OneHotEncoder`. This specifies a custom callable to create @@ -610,28 +668,35 @@ Changelog :pr:`24935` by :user:`Seladus `, :user:`Guillaume Lemaitre `, and :user:`Dea María Léon `, :pr:`25257` by :user:`Gleb Levitski `. -- |Feature| :class:`preprocessing.OrdinalEncoder` now supports grouping - infrequent categories into a single feature. Grouping infrequent categories - is enabled by specifying how to select infrequent categories with - `min_frequency` or `max_categories`. :pr:`25677` by `Thomas Fan`_. - - |Enhancement| Subsampling through the `subsample` parameter can now be used in :class:`preprocessing.KBinsDiscretizer` regardless of the strategy used. :pr:`26424` by :user:`Jérémie du Boisberranger `. -- |API| The default value of the `subsample` parameter of - :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in - version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. - :pr:`26424` by :user:`Jérémie du Boisberranger `. +- |Fix| :class:`preprocessing.AdditiveChi2Sampler` is now stateless. + The `sample_interval_` attribute is deprecated and will be removed in 1.5. + :pr:`25190` by :user:`Vincent Maladière `. - |Fix| :class:`AdditiveChi2Sampler` is now stateless. The `sample_interval_` attribute is deprecated and will be removed in 1.5. :pr:`25190` by :user:`Vincent Maladière `. +- |Fix| :class:`preprocessing.PowerTransformer` now correctly preserves the Pandas + Index when the `set_config(transform_output="pandas")`. :pr:`26454` by `Thomas Fan`_. + - |Fix| :class:`preprocessing.PowerTransformer` now correcly raises error when using `method="box-cox"` on data with a constant `np.nan` column. :pr:`26400` by :user:`Yao Xiao `. +- |Fix| :class:`preprocessing.PowerTransformer` with `method="yeo-johnson"` now leaves + constant features unchanged instead of transforming with an arbitrary value for + the `lambdas_` fitted parameter. + :pr:`26566` by :user:`Jérémie du Boisberranger `. + +- |API| The default value of the `subsample` parameter of + :class:`preprocessing.KBinsDiscretizer` will change from `None` to `200_000` in + version 1.5 when `strategy="kmeans"` or `strategy="uniform"`. + :pr:`26424` by :user:`Jérémie du Boisberranger `. + :mod:`sklearn.svm` .................. @@ -660,45 +725,36 @@ Changelog :mod:`sklearn.utils` .................... -- |API| :func:`estimator_checks.check_transformers_unfitted_stateless` has been +- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas + extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. + +- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with + extension arrays and object dtypes by return an ndarray with object dtype. + :pr:`25814` by `Thomas Fan`_. 
+ +- |API| :func:`utils.estimator_checks.check_transformers_unfitted_stateless` has been introduced to ensure stateless transformers don't raise `NotFittedError` during `transform` with no prior call to `fit` or `fit_transform`. :pr:`25190` by :user:`Vincent Maladière `. -- |Enhancement| :class:`preprocessing.PolynomialFeatures` now calculates the - number of expanded terms a-priori when dealing with sparse `csr` matrices - in order to optimize the choice of `dtype` for `indices` and `indptr`. It - can now output `csr` matrices with `np.int32` `indices/indptr` components - when there are few enough elements, and will automatically use `np.int64` - for sufficiently large matrices. - :pr:`20524` by :user:`niuk-a ` and - :pr:`23731` by :user:`Meekail Zain ` - - |API| A `FutureWarning` is now raised when instantiating a class which inherits from a deprecated base class (i.e. decorated by :class:`utils.deprecated`) and which overrides the `__init__` method. :pr:`25733` by :user:`Brigitta Sipőcz ` and :user:`Jérémie du Boisberranger `. -- |FIX| Fixes :func:`utils.validation.check_array` to properly convert pandas - extension arrays. :pr:`25813` and :pr:`26106` by `Thomas Fan`_. - -- |Fix| :func:`utils.validation.check_array` now supports pandas DataFrames with - extension arrays and object dtypes by return an ndarray with object dtype. - :pr:`25814` by `Thomas Fan`_. - :mod:`sklearn.semi_supervised` .............................. -- |Enhancement| :meth:`LabelSpreading.fit` and :meth:`LabelPropagation.fit` now - accepts sparse metrics. +- |Enhancement| :meth:`semi_supervised.LabelSpreading.fit` and + :meth:`semi_supervised.LabelPropagation.fit` now accepts sparse metrics. :pr:`19664` by :user:`Kaushik Amar Das `. Miscellaneous ............. -- |Enhancement| Replace obsolete exceptions EnvironmentError, IOError and - WindowsError. +- |Enhancement| Replace obsolete exceptions `EnvironmentError`, `IOError` and + `WindowsError`. :pr:`26466` by :user:`Dimitri Papadopoulos ORfanos `. 
Code and Documentation Contributors diff --git a/examples/miscellaneous/plot_kernel_ridge_regression.py b/examples/miscellaneous/plot_kernel_ridge_regression.py index 20b8496ab18aa..fa7cb15446473 100644 --- a/examples/miscellaneous/plot_kernel_ridge_regression.py +++ b/examples/miscellaneous/plot_kernel_ridge_regression.py @@ -203,6 +203,7 @@ "scoring": "neg_mean_squared_error", "negate_score": True, "score_name": "Mean Squared Error", + "score_type": "test", "std_display_style": None, "ax": ax, } diff --git a/examples/model_selection/plot_validation_curve.py b/examples/model_selection/plot_validation_curve.py index 1b3c562594188..48aa19dfbc556 100644 --- a/examples/model_selection/plot_validation_curve.py +++ b/examples/model_selection/plot_validation_curve.py @@ -18,53 +18,23 @@ from sklearn.datasets import load_digits from sklearn.svm import SVC -from sklearn.model_selection import validation_curve +from sklearn.model_selection import ValidationCurveDisplay X, y = load_digits(return_X_y=True) subset_mask = np.isin(y, [1, 2]) # binary classification: 1 vs 2 X, y = X[subset_mask], y[subset_mask] -param_range = np.logspace(-6, -1, 5) -train_scores, test_scores = validation_curve( +disp = ValidationCurveDisplay.from_estimator( SVC(), X, y, param_name="gamma", - param_range=param_range, - scoring="accuracy", + param_range=np.logspace(-6, -1, 5), + score_type="both", n_jobs=2, + score_name="Accuracy", ) -train_scores_mean = np.mean(train_scores, axis=1) -train_scores_std = np.std(train_scores, axis=1) -test_scores_mean = np.mean(test_scores, axis=1) -test_scores_std = np.std(test_scores, axis=1) - -plt.title("Validation Curve with SVM") -plt.xlabel(r"$\gamma$") -plt.ylabel("Score") -plt.ylim(0.0, 1.1) -lw = 2 -plt.semilogx( - param_range, train_scores_mean, label="Training score", color="darkorange", lw=lw -) -plt.fill_between( - param_range, - train_scores_mean - train_scores_std, - train_scores_mean + train_scores_std, - alpha=0.2, - color="darkorange", - lw=lw, -) -plt.semilogx( - param_range, test_scores_mean, label="Cross-validation score", color="navy", lw=lw -) -plt.fill_between( - param_range, - test_scores_mean - test_scores_std, - test_scores_mean + test_scores_std, - alpha=0.2, - color="navy", - lw=lw, -) -plt.legend(loc="best") +disp.ax_.set_title("Validation Curve for SVM with an RBF kernel") +disp.ax_.set_xlabel(r"gamma (inverse radius of the RBF kernel)") +disp.ax_.set_ylim(0.0, 1.1) plt.show() diff --git a/sklearn/base.py b/sklearn/base.py index 5cced34d4b8f0..13bbcab96aa61 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -27,7 +27,7 @@ from .utils.validation import _num_features from .utils.validation import _check_feature_names_in from .utils.validation import _generate_get_feature_names_out -from .utils.validation import check_is_fitted +from .utils.validation import _is_fitted, check_is_fitted from .utils._metadata_requests import _MetadataRequester from .utils.validation import _get_feature_names from .utils._estimator_html_repr import estimator_html_repr @@ -1131,7 +1131,13 @@ def decorator(fit_method): @functools.wraps(fit_method) def wrapper(estimator, *args, **kwargs): global_skip_validation = get_config()["skip_parameter_validation"] - if not global_skip_validation: + + # we don't want to validate again for each call to partial_fit + partial_fit_and_fitted = ( + fit_method.__name__ == "partial_fit" and _is_fitted(estimator) + ) + + if not global_skip_validation and not partial_fit_and_fitted: estimator._validate_params() with config_context( diff --git 
a/sklearn/calibration.py b/sklearn/calibration.py index 5e7bfe2ab4a31..e4869387f4166 100644 --- a/sklearn/calibration.py +++ b/sklearn/calibration.py @@ -25,6 +25,7 @@ RegressorMixin, clone, MetaEstimatorMixin, + _fit_context, ) from .preprocessing import label_binarize, LabelEncoder from .utils import ( @@ -318,6 +319,10 @@ def _get_estimator(self): return estimator + @_fit_context( + # CalibratedClassifierCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the calibrated model. @@ -341,8 +346,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns an instance of self. """ - self._validate_params() - check_classification_targets(y) X, y = indexable(X, y) if sample_weight is not None: diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py index 8a3c2c2acde62..1ffc5f07e8c50 100644 --- a/sklearn/cluster/_affinity_propagation.py +++ b/sklearn/cluster/_affinity_propagation.py @@ -12,6 +12,7 @@ from ..exceptions import ConvergenceWarning from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.validation import check_is_fitted @@ -469,6 +470,7 @@ def __init__( def _more_tags(self): return {"pairwise": self.affinity == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the clustering from features, or affinity matrix. @@ -488,8 +490,6 @@ def fit(self, X, y=None): self Returns the instance itself. """ - self._validate_params() - if self.affinity == "precomputed": accept_sparse = False else: diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py index 059056275ef3d..b7d08a45dcd80 100644 --- a/sklearn/cluster/_agglomerative.py +++ b/sklearn/cluster/_agglomerative.py @@ -16,6 +16,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, ClusterMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..metrics.pairwise import paired_distances from ..metrics.pairwise import _VALID_METRICS from ..metrics import DistanceMetric @@ -950,6 +951,7 @@ def __init__( self.metric = metric self.compute_distances = compute_distances + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering from features, or distance matrix. @@ -968,7 +970,6 @@ def fit(self, X, y=None): self : object Returns the fitted instance. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2) return self._fit(X) @@ -1324,6 +1325,7 @@ def __init__( ) self.pooling_func = pooling_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the hierarchical clustering on the data. @@ -1340,7 +1342,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, ensure_min_features=2) super()._fit(X.T) self._n_features_out = self.n_clusters_ diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py index ba837bacc99d5..4133264626ebb 100644 --- a/sklearn/cluster/_bicluster.py +++ b/sklearn/cluster/_bicluster.py @@ -13,6 +13,7 @@ from . 
import KMeans, MiniBatchKMeans from ..base import BaseEstimator, BiclusterMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import check_scalar @@ -118,6 +119,7 @@ def __init__( def _check_parameters(self, n_samples): """Validate parameters depending on the input data.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Create a biclustering for X. @@ -134,8 +136,6 @@ def fit(self, X, y=None): self : object SpectralBiclustering instance. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=np.float64) self._check_parameters(X.shape[0]) self._fit(X) diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index 4c9d7921fdc70..e74630572a014 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -16,6 +16,7 @@ ClusterMixin, BaseEstimator, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils.extmath import row_norms from ..utils._param_validation import Interval @@ -501,6 +502,7 @@ def __init__( self.compute_labels = compute_labels self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Build a CF Tree for the input data. @@ -518,9 +520,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - - self._validate_params() - return self._fit(X, partial=False) def _fit(self, X, partial): @@ -610,6 +609,7 @@ def _get_leaves(self): leaf_ptr = leaf_ptr.next_leaf_ return leaves + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X=None, y=None): """ Online learning. Prevents rebuilding of CFTree from scratch. @@ -629,8 +629,6 @@ def partial_fit(self, X=None, y=None): self Fitted estimator. """ - self._validate_params() - if X is None: # Perform just the final global clustering step. self._global_clustering() diff --git a/sklearn/cluster/_bisect_k_means.py b/sklearn/cluster/_bisect_k_means.py index fc2b38cc1bca9..959d78ae85009 100644 --- a/sklearn/cluster/_bisect_k_means.py +++ b/sklearn/cluster/_bisect_k_means.py @@ -6,6 +6,7 @@ import numpy as np import scipy.sparse as sp +from ..base import _fit_context from ._kmeans import _BaseKMeans from ._kmeans import _kmeans_single_elkan from ._kmeans import _kmeans_single_lloyd @@ -347,6 +348,7 @@ def _bisect(self, X, x_squared_norms, sample_weight, cluster_to_bisect): cluster_to_bisect.split(best_labels, best_centers, scores) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute bisecting k-means clustering. @@ -373,8 +375,6 @@ def fit(self, X, y=None, sample_weight=None): self Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index aa81ef27702e6..3c753935ac046 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -16,6 +16,7 @@ from ..metrics.pairwise import _VALID_METRICS from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils.validation import _check_sample_weight from ..utils._param_validation import Interval, StrOptions from ..neighbors import NearestNeighbors @@ -338,6 +339,10 @@ def __init__( self.p = p self.n_jobs = n_jobs + @_fit_context( + # DBSCAN.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Perform DBSCAN clustering from features, or distance matrix. 
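The hunks throughout this patch follow the same recipe: drop the explicit `self._validate_params()` call at the top of `fit`/`partial_fit` and decorate the method with `_fit_context`. A minimal sketch of what such a decorator does, assuming the `skip_parameter_validation` config key shown in the `sklearn/base.py` hunk above (simplified; the real decorator also skips re-validation for already-fitted `partial_fit` calls):

```python
import functools

from sklearn._config import config_context, get_config


def _fit_context_sketch(*, prefer_skip_nested_validation):
    """Simplified stand-in for the ``_fit_context`` decorator used above."""

    def decorator(fit_method):
        @functools.wraps(fit_method)
        def wrapper(estimator, *args, **kwargs):
            global_skip = get_config()["skip_parameter_validation"]
            if not global_skip:
                # replaces the removed ``self._validate_params()`` calls
                estimator._validate_params()
            # while fitting, optionally silence validation in nested estimators
            with config_context(
                skip_parameter_validation=prefer_skip_nested_validation or global_skip
            ):
                return fit_method(estimator, *args, **kwargs)

        return wrapper

    return decorator
```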
@@ -363,8 +368,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") if sample_weight is not None: diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 457a83dd41e71..55baf247a2931 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -5,10 +5,12 @@ # Author: V. Michel, A. Gramfort # License: BSD 3 clause +import warnings import numpy as np from ..base import TransformerMixin from ..utils.validation import check_is_fitted +from ..utils import metadata_routing from scipy.sparse import issparse ############################################################################### @@ -20,6 +22,11 @@ class AgglomerationTransform(TransformerMixin): A class for feature agglomeration via the transform interface. """ + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``Xred`` arg on ``inverse_transform``. + # TODO(1.5): remove when Xred is removed for inverse_transform. + __metadata_request__inverse_transform = {"Xred": metadata_routing.UNUSED} + def transform(self, X): """ Transform a new matrix using the built clustering. @@ -54,22 +61,43 @@ def transform(self, X): nX = np.array(nX).T return nX - def inverse_transform(self, Xred): + def inverse_transform(self, Xt=None, Xred=None): """ Inverse the transformation and return a vector of size `n_features`. Parameters ---------- - Xred : array-like of shape (n_samples, n_clusters) or (n_clusters,) + Xt : array-like of shape (n_samples, n_clusters) or (n_clusters,) The values to be assigned to each cluster of samples. + Xred : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : ndarray of shape (n_samples, n_features) or (n_features,) A vector of size `n_samples` with the values of `Xred` assigned to each of the cluster of samples. """ + if Xt is None and Xred is None: + raise TypeError("Missing required positional argument: Xt") + + if Xred is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `Xred`.") + + if Xred is not None: + warnings.warn( + ( + "Input argument `Xred` was renamed to `Xt` in v1.3 and will be" + " removed in v1.5." + ), + FutureWarning, + ) + Xt = Xred + check_is_fitted(self) unil, inverse = np.unique(self.labels_, return_inverse=True) - return Xred[..., inverse] + return Xt[..., inverse] diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 971d5735fbe2b..b36999885a14e 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -23,6 +23,7 @@ ClusterMixin, TransformerMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..metrics.pairwise import euclidean_distances from ..metrics.pairwise import _euclidean_distances @@ -1448,6 +1449,7 @@ def _warn_mkl_vcomp(self, n_active_threads): f" variable OMP_NUM_THREADS={n_active_threads}." ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute k-means clustering. @@ -1475,8 +1477,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2057,6 +2057,7 @@ def _random_reassign(self): return True return False + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute the centroids on X by chunking it into mini-batches. 
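For the `_feature_agglomeration.py` hunk above, the renamed keyword behaves as in this sketch, which mirrors the new test added further below in `test_feature_agglomeration.py`:

```python
import numpy as np

from sklearn.cluster import FeatureAgglomeration

X = np.array([[0.0, 0.0, 1.0]])  # (n_samples, n_features)
agglo = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean).fit(X)
Xt = agglo.transform(X)

agglo.inverse_transform(Xt)       # new spelling, no warning
agglo.inverse_transform(Xred=Xt)  # still accepted, but emits a FutureWarning
# agglo.inverse_transform()                 -> TypeError (Xt is required)
# agglo.inverse_transform(Xt=Xt, Xred=Xt)   -> ValueError (only one may be given)
```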
@@ -2084,8 +2085,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", @@ -2214,6 +2213,7 @@ def fit(self, X, y=None, sample_weight=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Update k means estimate on a single mini-batch X. @@ -2241,9 +2241,6 @@ def partial_fit(self, X, y=None, sample_weight=None): """ has_centers = hasattr(self, "cluster_centers_") - if not has_centers: - self._validate_params() - X = self._validate_data( X, accept_sparse="csr", diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py index 46a00ed3f0740..6b0f227d011f9 100644 --- a/sklearn/cluster/_mean_shift.py +++ b/sklearn/cluster/_mean_shift.py @@ -24,6 +24,7 @@ from ..utils.parallel import delayed, Parallel from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors from ..metrics.pairwise import pairwise_distances_argmin from .._config import config_context @@ -435,6 +436,7 @@ def __init__( self.n_jobs = n_jobs self.max_iter = max_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform clustering. @@ -451,7 +453,6 @@ def fit(self, X, y=None): self : object Fitted instance. """ - self._validate_params() X = self._validate_data(X) bandwidth = self.bandwidth if bandwidth is None: diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py index 0f1c66ada2d4e..ca1c74d6f44e7 100755 --- a/sklearn/cluster/_optics.py +++ b/sklearn/cluster/_optics.py @@ -24,6 +24,7 @@ from ..utils.validation import check_memory from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..metrics import pairwise_distances from scipy.sparse import issparse, SparseEfficiencyWarning @@ -288,6 +289,10 @@ def __init__( self.memory = memory self.n_jobs = n_jobs + @_fit_context( + # Optics.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Perform OPTICS clustering. @@ -311,8 +316,6 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() - dtype = bool if self.metric in PAIRWISE_BOOLEAN_FUNCTIONS else float if dtype == bool and X.dtype != bool: msg = ( diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py index e0ab7da938bfd..f72db4b7c1da3 100644 --- a/sklearn/cluster/_spectral.py +++ b/sklearn/cluster/_spectral.py @@ -15,6 +15,7 @@ from scipy.sparse import csc_matrix from ..base import BaseEstimator, ClusterMixin +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils import check_random_state, as_float_array from ..metrics.pairwise import pairwise_kernels, KERNEL_PARAMS @@ -649,6 +650,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Perform spectral clustering from features, or affinity matrix. @@ -671,8 +673,6 @@ def fit(self, X, y=None): self : object A fitted instance of the estimator. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=["csr", "csc", "coo"], diff --git a/sklearn/cluster/tests/test_feature_agglomeration.py b/sklearn/cluster/tests/test_feature_agglomeration.py index 3e4aa816b79c0..3db2862384c74 100644 --- a/sklearn/cluster/tests/test_feature_agglomeration.py +++ b/sklearn/cluster/tests/test_feature_agglomeration.py @@ -2,9 +2,11 @@ Tests for sklearn.cluster._feature_agglomeration """ # Authors: Sergul Aydore 2017 +import warnings import numpy as np from numpy.testing import assert_array_equal +import pytest from sklearn.cluster import FeatureAgglomeration from sklearn.utils._testing import assert_array_almost_equal from sklearn.datasets import make_blobs @@ -53,3 +55,25 @@ def test_feature_agglomeration_feature_names_out(): assert_array_equal( [f"featureagglomeration{i}" for i in range(n_clusters)], names_out ) + + +# TODO(1.5): remove this test +def test_inverse_transform_Xred_deprecation(): + X = np.array([0, 0, 1]).reshape(1, 3) # (n_samples, n_features) + + est = FeatureAgglomeration(n_clusters=1, pooling_func=np.mean) + est.fit(X) + Xt = est.transform(X) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, Xred=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `Xred` was renamed to `Xt`"): + est.inverse_transform(Xred=Xt) diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index aab021c0c8d4f..14349662cfee9 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -14,6 +14,7 @@ from scipy import sparse from ..base import clone, TransformerMixin +from ..base import _fit_context from ..utils._estimator_html_repr import _VisualBlock from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer @@ -701,12 +702,15 @@ def fit(self, X, y=None): self : ColumnTransformer This estimator. """ - self._validate_params() # we use fit_transform to make sure to set sparse_output_ (for which we # need the transformed data) to have consistent output type in predict self.fit_transform(X, y=y) return self + @_fit_context( + # estimators in ColumnTransformer.transformers are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit all transformers, transform the data and concatenate results. @@ -728,7 +732,6 @@ def fit_transform(self, X, y=None): any result is a sparse matrix, everything will be converted to sparse matrices. """ - self._validate_params() self._check_feature_names(X, reset=True) X = _check_X(X) diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py index f31a5a49b641e..e926ed7abe324 100644 --- a/sklearn/compose/_target.py +++ b/sklearn/compose/_target.py @@ -7,6 +7,7 @@ import numpy as np from ..base import BaseEstimator, RegressorMixin, clone +from ..base import _fit_context from ..utils.validation import check_is_fitted from ..utils._tags import _safe_tags from ..utils import check_array, _safe_indexing @@ -197,6 +198,10 @@ def _fit_transformer(self, y): UserWarning, ) + @_fit_context( + # TransformedTargetRegressor.regressor/transformer are not validated yet. 
+ prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the model according to the given training data. @@ -218,7 +223,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() if y is None: raise ValueError( f"This {self.__class__.__name__} estimator " diff --git a/sklearn/covariance/_elliptic_envelope.py b/sklearn/covariance/_elliptic_envelope.py index 1ef0eedd62f64..c99f200592580 100644 --- a/sklearn/covariance/_elliptic_envelope.py +++ b/sklearn/covariance/_elliptic_envelope.py @@ -9,6 +9,7 @@ from ..utils.validation import check_is_fitted from ..metrics import accuracy_score from ..base import OutlierMixin +from ..base import _fit_context class EllipticEnvelope(OutlierMixin, MinCovDet): @@ -162,6 +163,7 @@ def __init__( ) self.contamination = contamination + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the EllipticEnvelope model. @@ -178,7 +180,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - # `_validate_params` is called in `MinCovDet` super().fit(X) self.offset_ = np.percentile(-self.dist_, 100.0 * self.contamination) return self diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py index 7fc23f36d92d3..8083bfd2e1aa1 100644 --- a/sklearn/covariance/_empirical_covariance.py +++ b/sklearn/covariance/_empirical_covariance.py @@ -16,6 +16,7 @@ from .. import config_context from ..base import BaseEstimator +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import validate_params from ..utils.extmath import fast_logdet @@ -218,6 +219,7 @@ def get_precision(self): precision = linalg.pinvh(self.covariance_, check_finite=False) return precision + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the maximum likelihood covariance estimator to X. @@ -235,7 +237,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) if self.assume_centered: self.location_ = np.zeros(X.shape[1]) diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py index afe21fa3a02f1..8575cc4f75801 100644 --- a/sklearn/covariance/_graph_lasso.py +++ b/sklearn/covariance/_graph_lasso.py @@ -16,6 +16,7 @@ from . import empirical_covariance, EmpiricalCovariance, log_likelihood +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils.validation import ( _is_arraylike_not_scalar, @@ -532,6 +533,7 @@ def __init__( self.alpha = alpha self.covariance = covariance + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso model to X. @@ -548,7 +550,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2) @@ -925,6 +926,7 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the GraphicalLasso covariance model to X. @@ -941,7 +943,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() # Covariance does not make sense for a single feature X = self._validate_data(X, ensure_min_features=2) if self.assume_centered: diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py index f3dd6d60badf8..c723bba7a097b 100644 --- a/sklearn/covariance/_robust_covariance.py +++ b/sklearn/covariance/_robust_covariance.py @@ -15,6 +15,7 @@ from scipy.stats import chi2 from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils.extmath import fast_logdet from ..utils import check_random_state, check_array from ..utils._param_validation import Interval @@ -719,6 +720,7 @@ def __init__( self.support_fraction = support_fraction self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit a Minimum Covariance Determinant with the FastMCD algorithm. @@ -736,7 +738,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, ensure_min_samples=2, estimator="MinCovDet") random_state = check_random_state(self.random_state) n_samples, n_features = X.shape diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py index 4bf3d9a490b6b..21d2e034b45d7 100644 --- a/sklearn/covariance/_shrunk_covariance.py +++ b/sklearn/covariance/_shrunk_covariance.py @@ -18,6 +18,7 @@ import numpy as np from . import empirical_covariance, EmpiricalCovariance +from ..base import _fit_context from ..utils import check_array from ..utils._param_validation import Interval, validate_params @@ -237,6 +238,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, shrinkage=0.1 ) self.shrinkage = shrinkage + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the shrunk covariance model to X. @@ -254,7 +256,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) # Not calling the parent object to fit, to avoid a potential # matrix inversion when setting the precision @@ -533,6 +534,7 @@ def __init__(self, *, store_precision=True, assume_centered=False, block_size=10 ) self.block_size = block_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Ledoit-Wolf shrunk covariance model to X. @@ -549,7 +551,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) X = self._validate_data(X) @@ -722,6 +723,7 @@ class OAS(EmpiricalCovariance): 0.0195... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the Oracle Approximating Shrinkage covariance model to X. @@ -738,8 +740,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X) # Not calling the parent object to fit, to avoid computing the # covariance matrix (and potentially the precision) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index a5e5a1ceff09a..da395d8f060fb 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -16,6 +16,7 @@ from ..base import BaseEstimator, RegressorMixin, TransformerMixin from ..base import MultiOutputMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_consistent_length from ..utils.fixes import sp_version from ..utils.fixes import parse_version @@ -208,6 +209,7 @@ def __init__( self.tol = tol self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -226,8 +228,6 @@ def fit(self, X, Y): self : object Fitted model. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 @@ -958,6 +958,7 @@ def __init__(self, n_components=2, *, scale=True, copy=True): self.scale = scale self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, Y): """Fit model to data. @@ -974,8 +975,6 @@ def fit(self, X, Y): self : object Fitted estimator. """ - self._validate_params() - check_consistent_length(X, Y) X = self._validate_data( X, dtype=np.float64, copy=self.copy, ensure_min_samples=2 diff --git a/sklearn/datasets/_arff_parser.py b/sklearn/datasets/_arff_parser.py index 7b2faa4b67f4d..bba06fbb74021 100644 --- a/sklearn/datasets/_arff_parser.py +++ b/sklearn/datasets/_arff_parser.py @@ -204,7 +204,10 @@ def _io_to_generator(gzip_file): if len(dfs) >= 2: dfs[0] = dfs[0].astype(dfs[1].dtypes) - frame = pd.concat(dfs, ignore_index=True) + # liac-arff parser does not depend on NumPy and uses None to represent + # missing values. To be consistent with the pandas parser, we replace + # None with np.nan. + frame = pd.concat(dfs, ignore_index=True).fillna(value=np.nan) del dfs, first_df # cast the columns frame diff --git a/sklearn/datasets/tests/test_openml.py b/sklearn/datasets/tests/test_openml.py index 42f64fba2037b..c13b82dd769d3 100644 --- a/sklearn/datasets/tests/test_openml.py +++ b/sklearn/datasets/tests/test_openml.py @@ -920,9 +920,7 @@ def datasets_missing_values(): (1119, "liac-arff", 9, 6, 0), (1119, "pandas", 9, 0, 6), # miceprotein - # 1 column has only missing values with object dtype - (40966, "liac-arff", 1, 76, 0), - # with casting it will be transformed to either float or Int64 + (40966, "liac-arff", 1, 77, 0), (40966, "pandas", 1, 77, 0), # titanic (40945, "liac-arff", 3, 6, 0), diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index ab2f87de4bb84..54b3590f5b62e 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -1796,6 +1796,7 @@ def fit(self, X, y=None): self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model from data in X and return the transformed data. @@ -1813,8 +1814,6 @@ def fit_transform(self, X, y=None): V : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - _check_positive_coding(method=self.fit_algorithm, positive=self.positive_code) method = "lasso_" + self.fit_algorithm @@ -2435,6 +2434,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Update the model using the data in X as a mini-batch. @@ -2454,9 +2454,6 @@ def partial_fit(self, X, y=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, dtype=[np.float64, np.float32], order="C", reset=not has_components ) diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index a6507d167b9cb..8c3d590b2c814 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -27,6 +27,7 @@ from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval, StrOptions from ..utils.extmath import fast_logdet, randomized_svd, squared_norm @@ -197,6 +198,7 @@ def __init__( self.random_state = random_state self.rotation = rotation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the FactorAnalysis model to X using SVD based approach. @@ -213,8 +215,6 @@ def fit(self, X, y=None): self : object FactorAnalysis class instance. """ - self._validate_params() - X = self._validate_data(X, copy=self.copy, dtype=np.float64) n_samples, n_features = X.shape diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index 680a6cd8bbee1..6dcf62c0ace3b 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -16,6 +16,7 @@ from scipy import linalg from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_array, as_float_array, check_random_state from ..utils.validation import check_is_fitted @@ -672,6 +673,7 @@ def g(x, fun_args): return S + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model and recover the sources from X. @@ -690,10 +692,9 @@ def fit_transform(self, X, y=None): Estimated sources obtained by transforming the data with the estimated unmixing matrix. """ - self._validate_params() - return self._fit_transform(X, compute_sources=True) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to X. @@ -711,8 +712,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - self._fit_transform(X, compute_sources=False) return self diff --git a/sklearn/decomposition/_incremental_pca.py b/sklearn/decomposition/_incremental_pca.py index d98a5f4fb3b7a..5ae5d58b06ca4 100644 --- a/sklearn/decomposition/_incremental_pca.py +++ b/sklearn/decomposition/_incremental_pca.py @@ -9,6 +9,7 @@ from scipy import linalg, sparse from ._base import _BasePCA +from ..base import _fit_context from ..utils import gen_batches from ..utils._param_validation import Interval from ..utils.extmath import svd_flip, _incremental_mean_and_var @@ -192,6 +193,7 @@ def __init__(self, n_components=None, *, whiten=False, copy=True, batch_size=Non self.copy = copy self.batch_size = batch_size + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X, using minibatches of size batch_size. @@ -209,8 +211,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self.components_ = None self.n_samples_seen_ = 0 self.mean_ = 0.0 @@ -243,6 +243,7 @@ def fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, check_input=True): """Incremental fit with X. All of X is processed as a single batch. @@ -265,9 +266,6 @@ def partial_fit(self, X, y=None, check_input=True): """ first_pass = not hasattr(self, "components_") - if first_pass: - self._validate_params() - if check_input: if sparse.issparse(X): raise TypeError( diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index fadcd6f94a2f8..61d502a006c5e 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -19,6 +19,7 @@ from ..utils._param_validation import Interval, StrOptions from ..exceptions import NotFittedError from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import KernelCenterer from ..metrics.pairwise import pairwise_kernels @@ -404,6 +405,7 @@ def _fit_inverse_transform(self, X_transformed, X): self.dual_coef_ = linalg.solve(K, X, assume_a="pos", overwrite_a=True) self.X_transformed_fit_ = X_transformed + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -421,8 +423,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - if self.fit_inverse_transform and self.kernel == "precomputed": raise ValueError("Cannot fit_inverse_transform with a precomputed kernel.") X = self._validate_data(X, accept_sparse="csr", copy=self.copy_X) diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index 21829d4fedab3..ab1ea5ebb5460 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -18,6 +18,7 @@ from joblib import effective_n_jobs from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state, gen_batches, gen_even_slices from ..utils.validation import check_non_negative from ..utils.validation import check_is_fitted @@ -568,6 +569,7 @@ def _check_non_neg_array(self, X, reset_n_features, whom): return X + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online VB with Mini-Batch update. 
@@ -586,9 +588,6 @@ def partial_fit(self, X, y=None): """ first_time = not hasattr(self, "components_") - if first_time: - self._validate_params() - X = self._check_non_neg_array( X, reset_n_features=first_time, whom="LatentDirichletAllocation.partial_fit" ) @@ -618,6 +617,7 @@ def partial_fit(self, X, y=None): return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn model for the data X with variational Bayes method. @@ -637,7 +637,6 @@ def fit(self, X, y=None): self Fitted estimator. """ - self._validate_params() X = self._check_non_neg_array( X, reset_n_features=True, whom="LatentDirichletAllocation.fit" ) diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 67dd0c2ab7b70..d561583dec205 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -19,6 +19,7 @@ from ._cdnmf_fast import _update_cdnmf_fast from .._config import config_context from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state, check_array, gen_batches from ..utils.extmath import randomized_svd, safe_sparse_dot, squared_norm @@ -31,6 +32,7 @@ StrOptions, validate_params, ) +from ..utils import metadata_routing EPSILON = np.finfo(np.float32).eps @@ -1122,6 +1124,11 @@ def non_negative_factorization( class _BaseNMF(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator, ABC): """Base class for NMF and MiniBatchNMF.""" + # This prevents ``set_split_inverse_transform`` to be generated for the + # non-standard ``W`` arg on ``inverse_transform``. + # TODO: remove when W is removed in v1.5 for inverse_transform + __metadata_request__inverse_transform = {"W": metadata_routing.UNUSED} + _parameter_constraints: dict = { "n_components": [Interval(Integral, 1, None, closed="left"), None], "init": [ @@ -1245,23 +1252,44 @@ def fit(self, X, y=None, **params): self.fit_transform(X, **params) return self - def inverse_transform(self, W): + def inverse_transform(self, Xt=None, W=None): """Transform data back to its original space. .. versionadded:: 0.18 Parameters ---------- - W : {ndarray, sparse matrix} of shape (n_samples, n_components) + Xt : {ndarray, sparse matrix} of shape (n_samples, n_components) Transformed data matrix. + W : deprecated + Use `Xt` instead. + + .. deprecated:: 1.3 + Returns ------- X : {ndarray, sparse matrix} of shape (n_samples, n_features) Returns a data matrix of the original shape. """ + if Xt is None and W is None: + raise TypeError("Missing required positional argument: Xt") + + if W is not None and Xt is not None: + raise ValueError("Please provide only `Xt`, and not `W`.") + + if W is not None: + warnings.warn( + ( + "Input argument `W` was renamed to `Xt` in v1.3 and will be removed" + " in v1.5." + ), + FutureWarning, + ) + Xt = W + check_is_fitted(self) - return W @ self.components_ + return Xt @ self.components_ @property def _n_features_out(self): @@ -1539,6 +1567,7 @@ def _check_params(self, X): return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -1566,8 +1595,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. 
""" - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2123,6 +2150,7 @@ def _minibatch_convergence( return False + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, W=None, H=None): """Learn a NMF model for the data X and returns the transformed data. @@ -2149,8 +2177,6 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. """ - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), dtype=[np.float64, np.float32] ) @@ -2288,6 +2314,7 @@ def transform(self, X): return W + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, W=None, H=None): """Update the model using the data in `X` as a mini-batch. @@ -2321,9 +2348,6 @@ def partial_fit(self, X, y=None, W=None, H=None): """ has_components = hasattr(self, "components_") - if not has_components: - self._validate_params() - X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index e8c302fc47129..1d3c0678aca89 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -20,6 +20,7 @@ from scipy.sparse.linalg import svds from ._base import _BasePCA +from ..base import _fit_context from ..utils import check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.deprecation import deprecated @@ -414,6 +415,7 @@ def __init__( def n_features_(self): return self.n_features_in_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -431,11 +433,10 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - self._fit(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit the model with X and apply the dimensionality reduction on X. @@ -458,8 +459,6 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ - self._validate_params() - U, S, Vt = self._fit(X) U = U[:, : self.n_components_] diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index 5974b86381e1a..93e4a2164a87f 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -12,6 +12,7 @@ from ..utils.validation import check_array, check_is_fitted from ..linear_model import ridge_regression from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ._dict_learning import dict_learning, MiniBatchDictionaryLearning @@ -53,6 +54,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -70,7 +72,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() random_state = check_random_state(self.random_state) X = self._validate_data(X) diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 999266a4f3f78..67f5c73028f15 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -12,6 +12,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._arpack import _init_arpack_v0 from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip @@ -200,10 +201,10 @@ def fit(self, X, y=None): self : object Returns the transformer object. """ - # param validation is done in fit_transform self.fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit model to X and perform dimensionality reduction on X. @@ -220,7 +221,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - self._validate_params() X = self._validate_data(X, accept_sparse=["csr", "csc"], ensure_min_features=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/decomposition/tests/test_nmf.py b/sklearn/decomposition/tests/test_nmf.py index 74218b83c6952..2b1ed4d91be5e 100644 --- a/sklearn/decomposition/tests/test_nmf.py +++ b/sklearn/decomposition/tests/test_nmf.py @@ -1,6 +1,7 @@ import re import sys from io import StringIO +import warnings import numpy as np import scipy.sparse as sp @@ -906,3 +907,29 @@ def test_minibatch_nmf_verbose(): nmf.fit(A) finally: sys.stdout = old_stdout + + +# TODO(1.5): remove this test +def test_NMF_inverse_transform_W_deprecation(): + rng = np.random.mtrand.RandomState(42) + A = np.abs(rng.randn(6, 5)) + est = NMF( + n_components=3, + init="random", + random_state=0, + tol=1e-6, + ) + Xt = est.fit_transform(A) + + with pytest.raises(TypeError, match="Missing required positional argument"): + est.inverse_transform() + + with pytest.raises(ValueError, match="Please provide only"): + est.inverse_transform(Xt=Xt, W=Xt) + + with warnings.catch_warnings(record=True): + warnings.simplefilter("error") + est.inverse_transform(Xt) + + with pytest.warns(FutureWarning, match="Input argument `W` was renamed to `Xt`"): + est.inverse_transform(W=Xt) diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index c8c0a656e5784..275f4ae4d3b30 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -17,6 +17,7 @@ from .base import BaseEstimator, TransformerMixin, ClassifierMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .linear_model._base import LinearClassifierMixin from .covariance import ledoit_wolf, empirical_covariance, shrunk_covariance from .utils.multiclass import unique_labels @@ -546,6 +547,10 @@ def _solve_svd(self, X, y): self.coef_ = coef @ self.scalings_.T self.intercept_ -= self.xbar_ @ self.coef_.T + @_fit_context( + # LinearDiscriminantAnalysis.covariance_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the Linear Discriminant Analysis model. @@ -568,8 +573,6 @@ def fit(self, X, y): self : object Fitted estimator. 
""" - self._validate_params() - xp, _ = get_namespace(X) X, y = self._validate_data( @@ -865,6 +868,7 @@ def __init__( self.store_covariance = store_covariance self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -889,7 +893,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() X, y = self._validate_data(X, y) check_classification_targets(y) self.classes_, y = np.unique(y, return_inverse=True) diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 25f910e8419f4..0d8519484d7a5 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -11,6 +11,7 @@ from .base import BaseEstimator, ClassifierMixin, RegressorMixin from .base import MultiOutputMixin +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import StrOptions, Interval from .utils.validation import _num_samples @@ -142,6 +143,7 @@ def __init__(self, *, strategy="prior", random_state=None, constant=None): self.random_state = random_state self.constant = constant + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the baseline classifier. @@ -161,8 +163,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - self._strategy = self.strategy if self._strategy == "uniform" and sp.issparse(y): @@ -518,6 +518,7 @@ def __init__(self, *, strategy="mean", constant=None, quantile=None): self.constant = constant self.quantile = quantile + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the random regressor. @@ -537,8 +538,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - y = check_array(y, ensure_2d=False, input_name="y") if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index bad6dcfb033ec..0354413fdebfe 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -14,6 +14,7 @@ from ._base import BaseEnsemble, _partition_estimators from ..base import ClassifierMixin, RegressorMixin +from ..base import _fit_context from ..metrics import r2_score, accuracy_score from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, column_or_1d @@ -301,6 +302,10 @@ def __init__( self.random_state = random_state self.verbose = verbose + @_fit_context( + # BaseBagging.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a Bagging ensemble of estimators from the training set (X, y). @@ -324,9 +329,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - - self._validate_params() - # Convert data (X is required to be 2d and indexable) X, y = self._validate_data( X, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4d9bf862bd806..e715952947c04 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,7 +50,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import issparse from scipy.sparse import hstack as sparse_hstack -from sklearn.base import is_classifier +from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, @@ -221,6 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], + "store_leaf_values": [bool], } @abstractmethod @@ -240,6 +241,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -257,6 +259,7 @@ def __init__( self.class_weight = class_weight self.max_samples = max_samples self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -333,6 +336,7 @@ def decision_path(self, X): return sparse_hstack(indicators).tocsr(), n_nodes_ptr + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Build a forest of trees from the training set (X, y). @@ -360,8 +364,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - # Validate or convert input data if issparse(y): raise ValueError("sparse multilabel-indicator for y is not supported.") @@ -717,6 +719,139 @@ def _bin_data(self, X, is_training_data): return X_binned + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles) or + (n_samples, n_quantiles, n_outputs) + The predicted values. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + ( + est.leaf_nodes_samples_[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ) + ) + + # get quantiles across all leaf node samples + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. + """ + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _accumulate_prediction(predict, X, out, lock): """ @@ -734,6 +869,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. 
+ """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -759,6 +905,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -774,6 +921,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -1037,6 +1185,7 @@ def __init__( max_samples=None, base_estimator="deprecated", max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -1051,6 +1200,7 @@ def __init__( max_samples=max_samples, base_estimator=base_estimator, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -1515,6 +1665,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1530,6 +1681,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1540,6 +1692,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1858,6 +2011,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1873,6 +2027,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1882,6 +2037,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2210,6 +2366,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2225,6 +2382,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2235,6 +2393,7 @@ def __init__( class_weight=class_weight, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2534,6 +2693,7 @@ def __init__( ccp_alpha=0.0, max_samples=None, max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2549,6 +2709,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2558,6 +2719,7 @@ def __init__( warm_start=warm_start, max_samples=max_samples, max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2783,6 +2945,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2797,6 +2960,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2805,6 +2969,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth @@ -2848,6 +3013,7 @@ def fit(self, X, y=None, sample_weight=None): self.fit_transform(X, y, sample_weight=sample_weight) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, 
X, y=None, sample_weight=None): """ Fit estimator and transform dataset. @@ -2873,8 +3039,6 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - self._validate_params() - rnd = check_random_state(self.random_state) y = rnd.uniform(size=_num_samples(X)) super().fit(X, y, sample_weight=sample_weight) diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index df9904c8a9aa4..8d435873aeb5c 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -28,6 +28,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin from ..base import is_classifier +from ..base import _fit_context from ._gradient_boosting import predict_stages from ._gradient_boosting import predict_stage @@ -146,6 +147,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") @abstractmethod @@ -376,6 +378,10 @@ def _check_initialized(self): """Check that the estimator is initialized, raising an error if not.""" check_is_fitted(self) + @_fit_context( + # GradientBoosting*.init is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, monitor=None): """Fit the gradient boosting model. @@ -412,8 +418,6 @@ def fit(self, X, y, sample_weight=None, monitor=None): self : object Fitted estimator. """ - self._validate_params() - if not self.warm_start: self._clear_state() diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index 976335ea684d0..79b640057abe5 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -18,6 +18,7 @@ PinballLoss, ) from ...base import BaseEstimator, RegressorMixin, ClassifierMixin, is_classifier +from ...base import _fit_context from ...utils import check_random_state, resample, compute_sample_weight from ...utils.validation import ( check_is_fitted, @@ -336,6 +337,7 @@ def _check_interaction_cst(self, n_features): return constraints + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the gradient boosting model. @@ -357,8 +359,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - fit_start_time = time() acc_find_split_time = 0.0 # time spent finding the best splits acc_apply_split_time = 0.0 # time spent splitting nodes diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index bb016fa33185b..048a1d69395e2 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -20,6 +20,7 @@ from ..utils._param_validation import RealNotInt from ..utils.validation import check_is_fitted, _num_samples from ..base import OutlierMixin +from ..base import _fit_context from ._bagging import BaseBagging @@ -265,6 +266,7 @@ def _parallel_args(self): # copies. return {"prefer": "threads"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit estimator. @@ -287,7 +289,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() X = self._validate_data(X, accept_sparse=["csc"], dtype=tree_dtype) if issparse(X): # Pre-sort indices to avoid that each individual tree of the diff --git a/sklearn/ensemble/_stacking.py b/sklearn/ensemble/_stacking.py index 10f7a606f20c9..5b3486edfeb33 100644 --- a/sklearn/ensemble/_stacking.py +++ b/sklearn/ensemble/_stacking.py @@ -13,6 +13,7 @@ from ..base import clone from ..base import ClassifierMixin, RegressorMixin, TransformerMixin from ..base import is_classifier, is_regressor +from ..base import _fit_context from ..exceptions import NotFittedError from ..utils._estimator_html_repr import _VisualBlock @@ -159,6 +160,10 @@ def _method_name(name, estimator, method): return method_name + @_fit_context( + # estimators in Stacking*.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -184,9 +189,6 @@ def fit(self, X, y, sample_weight=None): ------- self : object """ - - self._validate_params() - # all_estimators contains all estimators, the one to be fitted and the # 'drop' string. names, all_estimators = self._validate_estimators() diff --git a/sklearn/ensemble/_voting.py b/sklearn/ensemble/_voting.py index 1c250cbe11a06..f8f4d2c4c197f 100644 --- a/sklearn/ensemble/_voting.py +++ b/sklearn/ensemble/_voting.py @@ -22,6 +22,7 @@ from ..base import RegressorMixin from ..base import TransformerMixin from ..base import clone +from ..base import _fit_context from ._base import _fit_single_estimator from ._base import _BaseHeterogeneousEnsemble from ..preprocessing import LabelEncoder @@ -308,6 +309,10 @@ def __init__( self.flatten_transform = flatten_transform self.verbose = verbose + @_fit_context( + # estimators in VotingClassifier.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -332,7 +337,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() check_classification_targets(y) if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1: raise NotImplementedError( @@ -572,6 +576,10 @@ def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # estimators in VotingRegressor.estimators are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit the estimators. @@ -594,7 +602,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() y = column_or_1d(y, warn=True) return super().fit(X, y, sample_weight) diff --git a/sklearn/ensemble/_weight_boosting.py b/sklearn/ensemble/_weight_boosting.py index b2aff503b0bb0..569609e6326e5 100644 --- a/sklearn/ensemble/_weight_boosting.py +++ b/sklearn/ensemble/_weight_boosting.py @@ -34,7 +34,7 @@ from ._base import BaseEnsemble from ..base import ClassifierMixin, RegressorMixin, is_classifier, is_regressor - +from ..base import _fit_context from ..tree import DecisionTreeClassifier, DecisionTreeRegressor from ..utils import check_random_state, _safe_indexing from ..utils.extmath import softmax @@ -103,6 +103,10 @@ def _check_X(self, X): reset=False, ) + @_fit_context( + # AdaBoost*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Build a boosted classifier/regressor from the training set (X, y). 
@@ -124,8 +128,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index d96f5c76842bf..a78e12a5a5181 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -1984,3 +1984,54 @@ def test_regression_criterion_withbins(name, criterion): criterion, score, ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ diff --git a/sklearn/feature_extraction/_dict_vectorizer.py b/sklearn/feature_extraction/_dict_vectorizer.py index b51ccceaac9d1..60e2cb3b7ad84 100644 --- a/sklearn/feature_extraction/_dict_vectorizer.py +++ b/sklearn/feature_extraction/_dict_vectorizer.py @@ -11,6 +11,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.validation import check_is_fitted @@ -133,6 +134,7 @@ def _add_iterable_element( indices.append(vocab[feature_name]) values.append(self.dtype(vv)) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn a list of feature name -> indices mappings. @@ -153,7 +155,6 @@ def fit(self, X, y=None): self : object DictVectorizer class instance. """ - self._validate_params() feature_names = [] vocab = {} @@ -286,6 +287,7 @@ def _transform(self, X, fitting): return result_matrix + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Learn a list of feature name -> indices mappings and transform X. @@ -309,7 +311,6 @@ def fit_transform(self, X, y=None): Xa : {array, sparse matrix} Feature vectors; always 2-d. 
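
The new `test_multioutput_quantiles` test above is the clearest picture of the API that `store_leaf_values` enables. The following usage sketch is inferred from that test and assumes this fork is installed in place of stock scikit-learn; the keyword and both methods do not exist upstream, and the exact return shapes are read off the test's assertions rather than documented here, for example::

    from sklearn.ensemble import RandomForestRegressor

    X = [[-2, -1], [-1, -1], [1, 1], [2, 1]]
    y = [[0.0, 1.0], [0.0, 1.0], [1.0, 2.0], [1.0, 2.0]]  # two outputs per sample

    est = RandomForestRegressor(
        n_estimators=10,
        random_state=0,
        bootstrap=False,
        store_leaf_values=True,  # new keyword threaded through every forest above
    )
    est.fit(X, y)

    # Shape appears to be (n_samples, n_quantiles, n_outputs), matching the
    # y_pred[:, 1, :] indexing in the test.
    q = est.predict_quantiles([[0, 0]], quantiles=[0.25, 0.5, 0.75])

    # One entry per query sample; each entry's second axis matches n_outputs_,
    # i.e. the training targets stored in the leaf the sample falls into.
    leaves = est.get_leaf_node_samples([[0, 0]])
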
""" - self._validate_params() return self._transform(X, fitting=True) def inverse_transform(self, X, dict_type=dict): diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index 1f2513e70eed5..e1b5e5f2561fe 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -8,6 +8,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ._hashing_fast import transform as _hashing_transform from ..utils._param_validation import Interval, StrOptions @@ -121,6 +122,7 @@ def __init__( self.n_features = n_features self.alternate_sign = alternate_sign + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): """Only validates estimator's parameters. @@ -140,8 +142,6 @@ def fit(self, X=None, y=None): self : object FeatureHasher class instance. """ - # repeat input validation for grid search (which calls set_params) - self._validate_params() return self def transform(self, raw_X): diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py index 89bdd7557f583..beea3e23e0adc 100644 --- a/sklearn/feature_extraction/image.py +++ b/sklearn/feature_extraction/image.py @@ -16,6 +16,7 @@ from numpy.lib.stride_tricks import as_strided from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array, check_random_state from ..utils._param_validation import Hidden, Interval, validate_params from ..utils._param_validation import RealNotInt @@ -561,6 +562,7 @@ def __init__(self, *, patch_size=None, max_patches=None, random_state=None): self.max_patches = max_patches self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validate the parameters of the estimator. @@ -583,7 +585,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() return self def transform(self, X): diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 21863d75eff2f..3201e3a0d51bb 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -25,6 +25,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS @@ -801,6 +802,7 @@ def __init__( self.alternate_sign = alternate_sign self.dtype = dtype + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Only validates estimator's parameters. @@ -820,10 +822,9 @@ def partial_fit(self, X, y=None): self : object HashingVectorizer instance. """ - # TODO: only validate during the first call - self._validate_params() return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -843,8 +844,6 @@ def fit(self, X, y=None): self : object HashingVectorizer instance. """ - self._validate_params() - # triggers a parameter validation if isinstance(X, str): raise ValueError( @@ -1338,6 +1337,7 @@ def fit(self, raw_documents, y=None): self.fit_transform(raw_documents) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, raw_documents, y=None): """Learn the vocabulary dictionary and return document-term matrix. 
@@ -1365,7 +1365,6 @@ def fit_transform(self, raw_documents, y=None): "Iterable over raw text documents expected, string object received." ) - self._validate_params() self._validate_ngram_range() self._warn_for_unused_params() self._validate_vocabulary() @@ -1639,6 +1638,7 @@ def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=Fal self.smooth_idf = smooth_idf self.sublinear_tf = sublinear_tf + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn the idf vector (global term weights). @@ -1655,8 +1655,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - # large sparse data is not supported for 32bit platforms because # _document_frequency uses np.bincount which works on arrays of # dtype NPY_INTP which is int32 for 32bit platforms. See #20923 @@ -2073,6 +2071,7 @@ def _check_params(self): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, raw_documents, y=None): """Learn vocabulary and idf from training set. @@ -2089,7 +2088,6 @@ def fit(self, raw_documents, y=None): self : object Fitted vectorizer. """ - self._validate_params() self._check_params() self._warn_for_unused_params() self._tfidf = TfidfTransformer( diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 7b8de4ae03585..47f98d89e8abe 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -9,6 +9,7 @@ from ._base import SelectorMixin from ._base import _get_feature_importances from ..base import BaseEstimator, clone, MetaEstimatorMixin +from ..base import _fit_context from ..utils._tags import _safe_tags from ..utils.validation import check_is_fitted, check_scalar, _num_features from ..utils._param_validation import HasMethods, Interval, Options @@ -320,6 +321,10 @@ def _check_max_features(self, X): ) self.max_features_ = max_features + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -340,7 +345,6 @@ def fit(self, X, y=None, **fit_params): self : object Fitted estimator. """ - self._validate_params() self._check_max_features(X) if self.prefit: @@ -375,6 +379,10 @@ def threshold_(self): return _calculate_threshold(self.estimator, scores, self.threshold) @available_if(_estimator_has("partial_fit")) + @_fit_context( + # SelectFromModel.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. 
@@ -398,7 +406,6 @@ def partial_fit(self, X, y=None, **fit_params): first_call = not hasattr(self, "estimator_") if first_call: - self._validate_params() self._check_max_features(X) if self.prefit: diff --git a/sklearn/feature_selection/_rfe.py b/sklearn/feature_selection/_rfe.py index 214ac9e0c30cf..932d66449ae22 100644 --- a/sklearn/feature_selection/_rfe.py +++ b/sklearn/feature_selection/_rfe.py @@ -22,6 +22,7 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier +from ..base import _fit_context from ..model_selection import check_cv from ..model_selection._validation import _score from ..metrics import check_scoring @@ -228,6 +229,10 @@ def classes_(self): """ return self.estimator_.classes_ + @_fit_context( + # RFE.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, **fit_params): """Fit the RFE model and then the underlying estimator on the selected features. @@ -248,7 +253,6 @@ def fit(self, X, y, **fit_params): self : object Fitted estimator. """ - self._validate_params() return self._fit(X, y, **fit_params) def _fit(self, X, y, step_score=None, **fit_params): @@ -649,6 +653,10 @@ def __init__( self.n_jobs = n_jobs self.min_features_to_select = min_features_to_select + @_fit_context( + # RFECV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, groups=None): """Fit the RFE model and automatically tune the number of selected features. @@ -674,7 +682,6 @@ def fit(self, X, y, groups=None): self : object Fitted estimator. """ - self._validate_params() tags = self._get_tags() X, y = self._validate_data( X, diff --git a/sklearn/feature_selection/_sequential.py b/sklearn/feature_selection/_sequential.py index 8a61bdee0c554..0fbe91273053b 100644 --- a/sklearn/feature_selection/_sequential.py +++ b/sklearn/feature_selection/_sequential.py @@ -7,6 +7,7 @@ from ._base import SelectorMixin from ..base import BaseEstimator, MetaEstimatorMixin, clone, is_classifier +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils._param_validation import RealNotInt from ..utils._tags import _safe_tags @@ -179,6 +180,10 @@ def __init__( self.cv = cv self.n_jobs = n_jobs + @_fit_context( + # SequentialFeatureSelector.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Learn the features to select from X. @@ -197,8 +202,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - tags = self._get_tags() X = self._validate_data( X, diff --git a/sklearn/feature_selection/_univariate_selection.py b/sklearn/feature_selection/_univariate_selection.py index 18e23d105b8bb..f4355c39f88cd 100644 --- a/sklearn/feature_selection/_univariate_selection.py +++ b/sklearn/feature_selection/_univariate_selection.py @@ -13,6 +13,7 @@ from scipy.sparse import issparse from ..base import BaseEstimator +from ..base import _fit_context from ..preprocessing import LabelBinarizer from ..utils import as_float_array, check_array, check_X_y, safe_sqr, safe_mask from ..utils.extmath import safe_sparse_dot, row_norms @@ -473,6 +474,7 @@ class _BaseFilter(SelectorMixin, BaseEstimator): def __init__(self, score_func): self.score_func = score_func + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Run score function on (X, y) and get the appropriate features. 
@@ -490,8 +492,6 @@ def fit(self, X, y): self : object Returns the instance itself. """ - self._validate_params() - X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], multi_output=True ) diff --git a/sklearn/feature_selection/_variance_threshold.py b/sklearn/feature_selection/_variance_threshold.py index 7c8db9cc7fa55..073a22c6ad92b 100644 --- a/sklearn/feature_selection/_variance_threshold.py +++ b/sklearn/feature_selection/_variance_threshold.py @@ -4,6 +4,7 @@ import numpy as np from ..base import BaseEstimator +from ..base import _fit_context from ._base import SelectorMixin from ..utils.sparsefuncs import mean_variance_axis, min_max_axis from ..utils.validation import check_is_fitted @@ -76,6 +77,7 @@ class VarianceThreshold(SelectorMixin, BaseEstimator): def __init__(self, threshold=0.0): self.threshold = threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Learn empirical variances from X. @@ -94,7 +96,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data( X, accept_sparse=("csr", "csc"), diff --git a/sklearn/gaussian_process/_gpc.py b/sklearn/gaussian_process/_gpc.py index 4a88034768870..50a8739372972 100644 --- a/sklearn/gaussian_process/_gpc.py +++ b/sklearn/gaussian_process/_gpc.py @@ -13,6 +13,7 @@ from scipy.special import erf, expit from ..base import BaseEstimator, ClassifierMixin, clone +from ..base import _fit_context from .kernels import Kernel, RBF, CompoundKernel, ConstantKernel as C from ..utils.validation import check_is_fitted from ..utils import check_random_state @@ -679,6 +680,7 @@ def __init__( self.multi_class = multi_class self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process classification model. @@ -695,8 +697,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - if isinstance(self.kernel, CompoundKernel): raise ValueError("kernel cannot be a CompoundKernel") diff --git a/sklearn/gaussian_process/_gpr.py b/sklearn/gaussian_process/_gpr.py index 9b7141f71b884..49fcab40c25f8 100644 --- a/sklearn/gaussian_process/_gpr.py +++ b/sklearn/gaussian_process/_gpr.py @@ -14,6 +14,7 @@ from ..base import BaseEstimator, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from .kernels import Kernel, RBF, ConstantKernel as C from ..preprocessing._data import _handle_zeros_in_scale from ..utils import check_random_state @@ -214,6 +215,7 @@ def __init__( self.n_targets = n_targets self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit Gaussian process regression model. @@ -230,8 +232,6 @@ def fit(self, X, y): self : object GaussianProcessRegressor class instance. 
""" - self._validate_params() - if self.kernel is None: # Use an RBF kernel as default self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF( 1.0, length_scale_bounds="fixed" diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index b2f296c91740e..37fc43731514a 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -11,6 +11,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions, MissingValues from ..utils.fixes import _mode from ..utils.sparsefuncs import _get_median @@ -348,6 +349,7 @@ def _validate_input(self, X, in_fit): return X + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on `X`. @@ -365,8 +367,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() - X = self._validate_input(X, in_fit=True) # default fill_value is 0 for numerical input and "missing_value" @@ -927,6 +927,7 @@ def _fit(self, X, y=None, precomputed=False): return missing_features_info[0] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the transformer on `X`. @@ -944,7 +945,6 @@ def fit(self, X, y=None): self : object Fitted estimator. """ - self._validate_params() self._fit(X, y) return self @@ -990,6 +990,7 @@ def transform(self, X): return imputer_mask + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Generate missing values indicator for `X`. @@ -1008,7 +1009,6 @@ def fit_transform(self, X, y=None): The missing indicator for input data. The data type of `Xt` will be boolean. """ - self._validate_params() imputer_mask = self._fit(X, y) if self.features_.size < self._n_features: diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 41ed19b7a8948..f977e5bc23e6c 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -7,6 +7,7 @@ import numpy as np from ..base import clone +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import ( @@ -627,7 +628,7 @@ def _initial_imputation(self, X, in_fit=False): strategy=self.initial_strategy, fill_value=self.fill_value, keep_empty_features=self.keep_empty_features, - ) + ).set_output(transform="default") X_filled = self.initial_imputer_.fit_transform(X) else: X_filled = self.initial_imputer_.transform(X) @@ -681,6 +682,10 @@ def _validate_limit(limit, limit_type, n_features): ) return limit + @_fit_context( + # IterativeImputer.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the imputer on `X` and return the transformed `X`. @@ -698,7 +703,6 @@ def fit_transform(self, X, y=None): Xt : array-like, shape (n_samples, n_features) The imputed input data. 
""" - self._validate_params() self.random_state_ = getattr( self, "random_state_", check_random_state(self.random_state) ) diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index 5735709dd7f29..915f8cbdb3fcb 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -6,6 +6,7 @@ import numpy as np from ._base import _BaseImputer +from ..base import _fit_context from ..utils.validation import FLOAT_DTYPES from ..metrics import pairwise_distances_chunked from ..metrics.pairwise import _NAN_METRICS @@ -199,6 +200,7 @@ def _calc_impute(self, dist_pot_donors, n_neighbors, fit_X_col, mask_fit_X_col): return np.ma.average(donors, axis=1, weights=weight_matrix).data + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the imputer on X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object The fitted `KNNImputer` class instance. """ - self._validate_params() # Check data integrity and calling arguments if not is_scalar_nan(self.missing_values): force_all_finite = True diff --git a/sklearn/isotonic.py b/sklearn/isotonic.py index aa1521ab697d0..a1cf95b95591b 100644 --- a/sklearn/isotonic.py +++ b/sklearn/isotonic.py @@ -11,6 +11,7 @@ import math from .base import BaseEstimator, TransformerMixin, RegressorMixin +from .base import _fit_context from .utils import check_array, check_consistent_length from .utils.validation import _check_sample_weight, check_is_fitted from .utils._param_validation import Interval, StrOptions @@ -310,6 +311,7 @@ def _build_y(self, X, y, sample_weight, trim_duplicates=True): # prediction speed). return X, y + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model using X, y as training data. @@ -338,7 +340,6 @@ def fit(self, X, y, sample_weight=None): X is stored for future use, as :meth:`transform` needs X to interpolate new input data. """ - self._validate_params() check_params = dict(accept_sparse=False, ensure_2d=False) X = check_array( X, input_name="X", dtype=[np.float64, np.float32], **check_params diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index faa098e634937..7f190a2b66823 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -23,6 +23,7 @@ from .base import BaseEstimator from .base import TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin +from .base import _fit_context from .utils import check_random_state from .utils import deprecated from .utils.extmath import safe_sparse_dot @@ -139,6 +140,7 @@ def __init__( self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -160,8 +162,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csc") random_state = check_random_state(self.random_state) @@ -338,6 +338,7 @@ def __init__(self, *, gamma=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -358,8 +359,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. 
""" - self._validate_params() - X = self._validate_data(X, accept_sparse="csr") random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -498,6 +497,7 @@ def __init__(self, *, skewedness=1.0, n_components=100, random_state=None): self.n_components = n_components self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model with X. @@ -518,7 +518,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X) random_state = check_random_state(self.random_state) n_features = X.shape[1] @@ -665,6 +664,7 @@ def __init__(self, *, sample_steps=2, sample_interval=None): self.sample_steps = sample_steps self.sample_interval = sample_interval + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -686,7 +686,6 @@ def fit(self, X, y=None): self : object Returns the transformer. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") check_non_negative(X, "X in AdditiveChi2Sampler.fit") @@ -1011,6 +1010,7 @@ def __init__( self.random_state = random_state self.n_jobs = n_jobs + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit estimator to data. @@ -1032,7 +1032,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, accept_sparse="csr") rnd = check_random_state(self.random_state) n_samples = X.shape[0] diff --git a/sklearn/kernel_ridge.py b/sklearn/kernel_ridge.py index 111e62938f096..a7bfeefaef651 100644 --- a/sklearn/kernel_ridge.py +++ b/sklearn/kernel_ridge.py @@ -8,6 +8,7 @@ import numpy as np from .base import BaseEstimator, RegressorMixin, MultiOutputMixin +from .base import _fit_context from .utils._param_validation import Interval, StrOptions from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels from .linear_model._ridge import _solve_cholesky_kernel @@ -170,6 +171,7 @@ def _get_kernel(self, X, Y=None): def _more_tags(self): return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. @@ -190,8 +192,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - # Convert data X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True diff --git a/sklearn/linear_model/_base.py b/sklearn/linear_model/_base.py index 06d8664dc013b..92c067c850225 100644 --- a/sklearn/linear_model/_base.py +++ b/sklearn/linear_model/_base.py @@ -28,6 +28,7 @@ from numbers import Integral from ..base import BaseEstimator, ClassifierMixin, RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..preprocessing._data import _is_constant_feature from ..utils import check_array from ..utils.validation import FLOAT_DTYPES @@ -642,6 +643,7 @@ def __init__( self.n_jobs = n_jobs self.positive = positive + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit linear model. @@ -665,9 +667,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted Estimator. 
""" - - self._validate_params() - n_jobs_ = self.n_jobs accept_sparse = False if self.positive else ["csr", "csc", "coo"] diff --git a/sklearn/linear_model/_bayes.py b/sklearn/linear_model/_bayes.py index 887c6a3ebcbbc..37dc3b81511f5 100644 --- a/sklearn/linear_model/_bayes.py +++ b/sklearn/linear_model/_bayes.py @@ -13,6 +13,7 @@ from ._base import LinearModel, _preprocess_data, _rescale_data from ..base import RegressorMixin +from ..base import _fit_context from ..utils.extmath import fast_logdet from scipy.linalg import pinvh from ..utils.validation import _check_sample_weight @@ -267,6 +268,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model. @@ -288,8 +290,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True) @@ -665,6 +665,7 @@ def __init__( self.verbose = verbose self.n_iter = n_iter + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data and parameters. @@ -683,9 +684,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - - self._validate_params() - max_iter = _deprecate_n_iter(self.n_iter, self.max_iter) X, y = self._validate_data( diff --git a/sklearn/linear_model/_coordinate_descent.py b/sklearn/linear_model/_coordinate_descent.py index ea1ee3115ea93..829c0ab6149f1 100644 --- a/sklearn/linear_model/_coordinate_descent.py +++ b/sklearn/linear_model/_coordinate_descent.py @@ -18,6 +18,7 @@ from ._base import LinearModel, _pre_fit from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ._base import _preprocess_data from ..utils import check_array, check_scalar from ..utils.validation import check_random_state @@ -851,6 +852,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Fit model with coordinate descent. @@ -886,8 +888,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - if self.alpha == 0: warnings.warn( ( @@ -1475,6 +1475,7 @@ def _is_multitask(self): def path(X, y, **kwargs): """Compute path with coordinate descent.""" + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit linear model with coordinate descent. @@ -1502,9 +1503,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns an instance of fitted model. """ - - self._validate_params() - # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a @@ -2343,6 +2341,7 @@ def __init__( self.random_state = random_state self.selection = selection + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent. @@ -2367,8 +2366,6 @@ def fit(self, X, y): To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ - self._validate_params() - # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be csr. check_X_params = dict( diff --git a/sklearn/linear_model/_glm/glm.py b/sklearn/linear_model/_glm/glm.py index caf37a0f473e0..b1bc460f24dff 100644 --- a/sklearn/linear_model/_glm/glm.py +++ b/sklearn/linear_model/_glm/glm.py @@ -20,6 +20,7 @@ HalfTweedieLossIdentity, ) from ...base import BaseEstimator, RegressorMixin +from ...base import _fit_context from ...utils import check_array from ...utils._openmp_helpers import _openmp_effective_n_threads from ...utils._param_validation import Hidden, Interval, StrOptions @@ -168,6 +169,7 @@ def __init__( self.warm_start = warm_start self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit a Generalized Linear Model. @@ -187,8 +189,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted model. """ - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_huber.py b/sklearn/linear_model/_huber.py index a7b848f647560..def2ae273d5c4 100644 --- a/sklearn/linear_model/_huber.py +++ b/sklearn/linear_model/_huber.py @@ -7,6 +7,7 @@ from scipy import optimize from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..utils import axis0_safe_slice from ..utils._param_validation import Interval @@ -273,6 +274,7 @@ def __init__( self.fit_intercept = fit_intercept self.tol = tol + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -293,7 +295,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted `HuberRegressor` estimator. """ - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_least_angle.py b/sklearn/linear_model/_least_angle.py index 4be8bb730a0ae..e6c653eb80bb3 100644 --- a/sklearn/linear_model/_least_angle.py +++ b/sklearn/linear_model/_least_angle.py @@ -20,6 +20,7 @@ from ._base import LinearModel, LinearRegression from ._base import _deprecate_normalize, _preprocess_data from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context # mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs' from ..utils import arrayfuncs, as_float_array # type: ignore @@ -1097,6 +1098,7 @@ def _fit(self, X, y, max_iter, alpha, fit_path, normalize, Xy=None): self._set_intercept(X_offset, y_offset, X_scale) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, Xy=None): """Fit the model using X, y as training data. @@ -1118,8 +1120,6 @@ def fit(self, X, y, Xy=None): self : object Returns an instance of self. """ - self._validate_params() - X, y = self._validate_data(X, y, y_numeric=True, multi_output=True) _normalize = _deprecate_normalize( @@ -1691,6 +1691,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1707,8 +1708,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -2216,6 +2215,7 @@ def __init__( def _more_tags(self): return {"multioutput": False} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, copy_X=None): """Fit the model using X, y as training data. 
@@ -2237,8 +2237,6 @@ def fit(self, X, y, copy_X=None): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 3db27d9cc3163..30a0f40a0f2fd 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -24,6 +24,7 @@ from ._linear_loss import LinearModelLoss from ._sag import sag_solver from ._glm.glm import NewtonCholeskySolver +from ..base import _fit_context from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss from ..preprocessing import LabelEncoder, LabelBinarizer from ..svm._base import _fit_liblinear @@ -1132,6 +1133,7 @@ def __init__( self.n_jobs = n_jobs self.l1_ratio = l1_ratio + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """ Fit the model according to the given training data. @@ -1161,9 +1163,6 @@ def fit(self, X, y, sample_weight=None): ----- The SAGA solver supports both float64 and float32 bit arrays. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty != "elasticnet" and self.l1_ratio is not None: @@ -1745,6 +1744,7 @@ def __init__( self.random_state = random_state self.l1_ratios = l1_ratios + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -1766,9 +1766,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted LogisticRegressionCV estimator. """ - - self._validate_params() - solver = _check_solver(self.solver, self.penalty, self.dual) if self.penalty == "elasticnet": diff --git a/sklearn/linear_model/_omp.py b/sklearn/linear_model/_omp.py index b1dc1e352fd62..df451a99417b0 100644 --- a/sklearn/linear_model/_omp.py +++ b/sklearn/linear_model/_omp.py @@ -15,6 +15,7 @@ from ._base import LinearModel, _pre_fit, _deprecate_normalize from ..base import RegressorMixin, MultiOutputMixin +from ..base import _fit_context from ..utils import as_float_array, check_array from ..utils.parallel import delayed, Parallel from ..utils._param_validation import Hidden, Interval, StrOptions @@ -725,6 +726,7 @@ def __init__( self.normalize = normalize self.precompute = precompute + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -741,8 +743,6 @@ def fit(self, X, y): self : object Returns an instance of self. """ - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) @@ -1042,6 +1042,7 @@ def __init__( self.n_jobs = n_jobs self.verbose = verbose + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model using X, y as training data. @@ -1058,8 +1059,6 @@ def fit(self, X, y): self : object Returns an instance of self. 
""" - self._validate_params() - _normalize = _deprecate_normalize( self.normalize, estimator_name=self.__class__.__name__ ) diff --git a/sklearn/linear_model/_passive_aggressive.py b/sklearn/linear_model/_passive_aggressive.py index 2cacd4f78cc54..a9c81799c8ca3 100644 --- a/sklearn/linear_model/_passive_aggressive.py +++ b/sklearn/linear_model/_passive_aggressive.py @@ -5,6 +5,7 @@ from ._stochastic_gradient import BaseSGDClassifier from ._stochastic_gradient import BaseSGDRegressor from ._stochastic_gradient import DEFAULT_EPSILON +from ..base import _fit_context from ..utils._param_validation import Interval, StrOptions @@ -220,6 +221,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Fit linear model with Passive Aggressive algorithm. @@ -245,7 +247,6 @@ def partial_fit(self, X, y, classes=None): Fitted estimator. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -276,6 +277,7 @@ def partial_fit(self, X, y, classes=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -298,7 +300,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "hinge" else "pa2" @@ -504,6 +505,7 @@ def __init__( self.C = C self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Fit linear model with Passive Aggressive algorithm. @@ -521,7 +523,6 @@ def partial_fit(self, X, y): Fitted estimator. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" @@ -538,6 +539,7 @@ def partial_fit(self, X, y): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None): """Fit linear model with Passive Aggressive algorithm. @@ -560,7 +562,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None): self : object Fitted estimator. """ - self._validate_params() self._more_validate_params() lr = "pa1" if self.loss == "epsilon_insensitive" else "pa2" diff --git a/sklearn/linear_model/_quantile.py b/sklearn/linear_model/_quantile.py index 081e3da5b51b7..b4a5581386a5f 100644 --- a/sklearn/linear_model/_quantile.py +++ b/sklearn/linear_model/_quantile.py @@ -9,6 +9,7 @@ from scipy.optimize import linprog from ..base import BaseEstimator, RegressorMixin +from ..base import _fit_context from ._base import LinearModel from ..exceptions import ConvergenceWarning from ..utils import _safe_indexing @@ -141,6 +142,7 @@ def __init__( self.solver = solver self.solver_options = solver_options + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -160,7 +162,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns self. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/linear_model/_ransac.py b/sklearn/linear_model/_ransac.py index 2474a25f07199..1c12ecc13a258 100644 --- a/sklearn/linear_model/_ransac.py +++ b/sklearn/linear_model/_ransac.py @@ -9,6 +9,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone from ..base import MultiOutputMixin +from ..base import _fit_context from ..utils import check_random_state, check_consistent_length from ..utils.random import sample_without_replacement from ..utils.validation import check_is_fitted, _check_sample_weight @@ -283,6 +284,10 @@ def __init__( self.random_state = random_state self.loss = loss + @_fit_context( + # RansacRegressor.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None): """Fit estimator using RANSAC algorithm. @@ -313,8 +318,6 @@ def fit(self, X, y, sample_weight=None): `is_data_valid` and `is_model_valid` return False for all `max_trials` randomly chosen sub-samples. """ - self._validate_params() - # Need to validate separately here. We can't pass multi_output=True # because that would allow y to be csr. Delay expensive finiteness # check to the estimator's own input validation. diff --git a/sklearn/linear_model/_ridge.py b/sklearn/linear_model/_ridge.py index 28ef7cbd43eb7..893b10d1d93ae 100644 --- a/sklearn/linear_model/_ridge.py +++ b/sklearn/linear_model/_ridge.py @@ -25,6 +25,7 @@ from ._base import _preprocess_data, _rescale_data from ._sag import sag_solver from ..base import MultiOutputMixin, RegressorMixin, is_classifier +from ..base import _fit_context from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_array @@ -1114,6 +1115,7 @@ def __init__( random_state=random_state, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. @@ -1134,8 +1136,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. """ - self._validate_params() - _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver) X, y = self._validate_data( X, @@ -1423,6 +1423,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier model. @@ -1446,8 +1447,6 @@ def fit(self, X, y, sample_weight=None): self : object Instance of the estimator. """ - self._validate_params() - X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver) super().fit(X, Y, sample_weight=sample_weight) @@ -2354,6 +2353,7 @@ class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV): 0.5166... """ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model with cv. @@ -2383,8 +2383,6 @@ def fit(self, X, y, sample_weight=None): cross-validation takes the sample weights into account when computing the validation score. """ - self._validate_params() - super().fit(X, y, sample_weight=sample_weight) return self @@ -2533,6 +2531,7 @@ def __init__( ) self.class_weight = class_weight + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Ridge classifier with cv. @@ -2555,8 +2554,6 @@ def fit(self, X, y, sample_weight=None): self : object Fitted estimator. 
""" - self._validate_params() - # `RidgeClassifier` does not accept "sag" or "saga" solver and thus support # csr, csc, and coo sparse matrices. By using solver="eigen" we force to accept # all sparse format. diff --git a/sklearn/linear_model/_stochastic_gradient.py b/sklearn/linear_model/_stochastic_gradient.py index 2f27bdee7968b..bc8f31016c6f8 100644 --- a/sklearn/linear_model/_stochastic_gradient.py +++ b/sklearn/linear_model/_stochastic_gradient.py @@ -13,6 +13,7 @@ from numbers import Integral, Real from ..base import clone, is_classifier +from ..base import _fit_context from ._base import LinearClassifierMixin, SparseCoefMixin from ._base import make_dataset from ..base import BaseEstimator, RegressorMixin, OutlierMixin @@ -805,6 +806,7 @@ def _fit_multiclass(self, X, y, alpha, C, learning_rate, sample_weight, max_iter self._standard_intercept = np.atleast_1d(self.intercept_) self.intercept_ = self._standard_intercept + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -839,7 +841,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "classes_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) if self.class_weight == "balanced": @@ -869,6 +870,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): intercept_init=None, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -897,7 +899,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Returns an instance of self. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -1470,6 +1471,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, sample_weight=None): """Perform one epoch of stochastic gradient descent on given samples. @@ -1496,7 +1498,6 @@ def partial_fit(self, X, y, sample_weight=None): Returns an instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) return self._partial_fit( @@ -1565,6 +1566,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): """Fit linear model with Stochastic Gradient Descent. @@ -1590,7 +1592,6 @@ def fit(self, X, y, coef_init=None, intercept_init=None, sample_weight=None): self : object Fitted `SGDRegressor` estimator. """ - self._validate_params() self._more_validate_params() return self._fit( @@ -2366,6 +2367,7 @@ def _partial_fit( return self + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. @@ -2386,7 +2388,6 @@ def partial_fit(self, X, y=None, sample_weight=None): Returns a fitted instance of self. """ if not hasattr(self, "coef_"): - self._validate_params() self._more_validate_params(for_partial_fit=True) alpha = self.nu / 2 @@ -2453,6 +2454,7 @@ def _fit( return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): """Fit linear One-Class SVM with Stochastic Gradient Descent. 
@@ -2485,7 +2487,6 @@ def fit(self, X, y=None, coef_init=None, offset_init=None, sample_weight=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._more_validate_params() alpha = self.nu / 2 diff --git a/sklearn/linear_model/_theil_sen.py b/sklearn/linear_model/_theil_sen.py index 67d6ca532a8ab..72c2d897681c4 100644 --- a/sklearn/linear_model/_theil_sen.py +++ b/sklearn/linear_model/_theil_sen.py @@ -19,6 +19,7 @@ from ._base import LinearModel from ..base import RegressorMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._param_validation import Interval from ..utils.parallel import delayed, Parallel @@ -395,6 +396,7 @@ def _check_subparams(self, n_samples, n_features): return n_subsamples, n_subpopulation + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit linear model. @@ -410,7 +412,6 @@ def fit(self, X, y): self : returns an instance of self. Fitted `TheilSenRegressor` estimator. """ - self._validate_params() random_state = check_random_state(self.random_state) X, y = self._validate_data(X, y, y_numeric=True) n_samples, n_features = X.shape diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index 92206721aac15..0917ef7d207bc 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -12,6 +12,7 @@ from scipy.sparse.csgraph import connected_components from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..neighbors import NearestNeighbors, kneighbors_graph from ..neighbors import radius_neighbors_graph from ..utils.validation import check_is_fitted @@ -235,7 +236,7 @@ def _fit_transform(self, X): tol=self.tol, max_iter=self.max_iter, n_jobs=self.n_jobs, - ) + ).set_output(transform="default") if self.n_neighbors is not None: nbg = kneighbors_graph( @@ -332,6 +333,10 @@ def reconstruction_error(self): evals = self.kernel_pca_.eigenvalues_ return np.sqrt(np.sum(G_center**2) - np.sum(evals**2)) / G.shape[0] + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -350,10 +355,13 @@ def fit(self, X, y=None): self : object Returns a fitted instance of self. """ - self._validate_params() self._fit_transform(X) return self + @_fit_context( + # Isomap.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit the model from data in X and transform X. @@ -371,7 +379,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index 10a22b12dfd1d..6f57b0627b8be 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -17,6 +17,7 @@ TransformerMixin, _UnstableArchMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_random_state, check_array from ..utils._arpack import _init_arpack_v0 @@ -759,6 +760,7 @@ def _fit_transform(self, X): ) self._n_features_out = self.embedding_.shape[1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the embedding vectors for data X. @@ -775,10 +777,10 @@ def fit(self, X, y=None): self : object Fitted `LocallyLinearEmbedding` class instance. 
""" - self._validate_params() self._fit_transform(X) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Compute the embedding vectors for data X and transform X. @@ -795,7 +797,6 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) Returns the instance itself. """ - self._validate_params() self._fit_transform(X) return self.embedding_ diff --git a/sklearn/manifold/_mds.py b/sklearn/manifold/_mds.py index 7fc46325a1ae1..6b7a818b94ea8 100644 --- a/sklearn/manifold/_mds.py +++ b/sklearn/manifold/_mds.py @@ -13,6 +13,7 @@ import warnings from ..base import BaseEstimator +from ..base import _fit_context from ..metrics import euclidean_distances from ..utils import check_random_state, check_array, check_symmetric from ..isotonic import IsotonicRegression @@ -569,10 +570,10 @@ def fit(self, X, y=None, init=None): self : object Fitted estimator. """ - # parameter will be validated in `fit_transform` call self.fit_transform(X, init=init) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None, init=None): """ Fit the data from `X`, and returns the embedded coordinates. @@ -597,7 +598,6 @@ def fit_transform(self, X, y=None, init=None): X_new : ndarray of shape (n_samples, n_components) X transformed in the new space. """ - self._validate_params() X = self._validate_data(X) if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed": warnings.warn( diff --git a/sklearn/manifold/_spectral_embedding.py b/sklearn/manifold/_spectral_embedding.py index 8291d8326eb05..af965a1362b8f 100644 --- a/sklearn/manifold/_spectral_embedding.py +++ b/sklearn/manifold/_spectral_embedding.py @@ -17,6 +17,7 @@ from scipy.sparse.csgraph import laplacian as csgraph_laplacian from ..base import BaseEstimator +from ..base import _fit_context from ..utils import ( check_array, check_random_state, @@ -652,6 +653,7 @@ def _get_affinity_matrix(self, X, Y=None): self.affinity_matrix_ = self.affinity(X) return self.affinity_matrix_ + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model from data in X. @@ -674,8 +676,6 @@ def fit(self, X, y=None): self : object Returns the instance itself. """ - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", ensure_min_samples=2) random_state = check_random_state(self.random_state) diff --git a/sklearn/manifold/_t_sne.py b/sklearn/manifold/_t_sne.py index 6ef6ce999cb08..c372ddcca3c2e 100644 --- a/sklearn/manifold/_t_sne.py +++ b/sklearn/manifold/_t_sne.py @@ -17,6 +17,7 @@ from numbers import Integral, Real from ..neighbors import NearestNeighbors from ..base import BaseEstimator, ClassNamePrefixFeaturesOutMixin, TransformerMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils._openmp_helpers import _openmp_effective_n_threads from ..utils.validation import check_non_negative @@ -1078,6 +1079,10 @@ def _tsne( return X_embedded + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None): """Fit X into an embedded space and return that transformed output. @@ -1099,12 +1104,15 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. 
""" - self._validate_params() self._check_params_vs_input(X) embedding = self._fit(X) self.embedding_ = embedding return self.embedding_ + @_fit_context( + # TSNE.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit X into an embedded space. @@ -1126,7 +1134,6 @@ def fit(self, X, y=None): X_new : array of shape (n_samples, n_components) Embedding of the training data in low-dimensional space. """ - self._validate_params() self.fit_transform(X) return self diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 67b04e9382acb..dbe5b76f0f4c9 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -919,8 +919,9 @@ def haversine_distances(X, Y=None): in radians. The dimension of the data must be 2. .. math:: - D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) - + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] + D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x_{lat} - y_{lat}) / 2) + + \\cos(x_{lat})\\cos(y_{lat})\\ + sin^2((x_{lon} - y_{lon}) / 2)}] Parameters ---------- @@ -1220,6 +1221,13 @@ def paired_cosine_distances(X, Y): } +@validate_params( + { + "X": ["array-like"], + "Y": ["array-like"], + "metric": [StrOptions(set(PAIRED_DISTANCES)), callable], + } +) def paired_distances(X, Y, *, metric="euclidean", **kwds): """ Compute the paired distances between X and Y. @@ -1278,8 +1286,6 @@ def paired_distances(X, Y, *, metric="euclidean", **kwds): for i in range(len(X)): distances[i] = metric(X[i], Y[i]) return distances - else: - raise ValueError("Unknown distance %s" % metric) # Kernels diff --git a/sklearn/mixture/_base.py b/sklearn/mixture/_base.py index a298dfec6a0da..fbca4f1d49dcd 100644 --- a/sklearn/mixture/_base.py +++ b/sklearn/mixture/_base.py @@ -16,6 +16,7 @@ from ..cluster import kmeans_plusplus from ..base import BaseEstimator from ..base import DensityMixin +from ..base import _fit_context from ..exceptions import ConvergenceWarning from ..utils import check_random_state from ..utils.validation import check_is_fitted @@ -182,6 +183,7 @@ def fit(self, X, y=None): self.fit_predict(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_predict(self, X, y=None): """Estimate model parameters using X and predict the labels for X. @@ -209,8 +211,6 @@ def fit_predict(self, X, y=None): labels : array, shape (n_samples,) Component labels. """ - self._validate_params() - X = self._validate_data(X, dtype=[np.float64, np.float32], ensure_min_samples=2) if X.shape[0] < self.n_components: raise ValueError( diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py index 76dc02e625408..4a3f5d1e239a8 100644 --- a/sklearn/model_selection/__init__.py +++ b/sklearn/model_selection/__init__.py @@ -33,6 +33,7 @@ from ._search import ParameterSampler from ._plot import LearningCurveDisplay +from ._plot import ValidationCurveDisplay if typing.TYPE_CHECKING: # Avoid errors in type checkers (e.g. mypy) for experimental estimators. @@ -74,6 +75,7 @@ "permutation_test_score", "train_test_split", "validation_curve", + "ValidationCurveDisplay", ] diff --git a/sklearn/model_selection/_plot.py b/sklearn/model_selection/_plot.py index 6a6133a722251..bc5a600e57234 100644 --- a/sklearn/model_selection/_plot.py +++ b/sklearn/model_selection/_plot.py @@ -1,10 +1,140 @@ +import warnings + import numpy as np -from . import learning_curve +from . 
import learning_curve, validation_curve from ..utils import check_matplotlib_support +from ..utils._plotting import _validate_score_name, _interval_max_min_ratio + + +class _BaseCurveDisplay: + def _plot_curve( + self, + x_data, + *, + ax=None, + negate_score=False, + score_name=None, + score_type="test", + log_scale="deprecated", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + check_matplotlib_support(f"{self.__class__.__name__}.plot") + + import matplotlib.pyplot as plt + + if ax is None: + _, ax = plt.subplots() + + if negate_score: + train_scores, test_scores = -self.train_scores, -self.test_scores + else: + train_scores, test_scores = self.train_scores, self.test_scores + + if std_display_style not in ("errorbar", "fill_between", None): + raise ValueError( + f"Unknown std_display_style: {std_display_style}. Should be one of" + " 'errorbar', 'fill_between', or None." + ) + + if score_type not in ("test", "train", "both"): + raise ValueError( + f"Unknown score_type: {score_type}. Should be one of 'test', " + "'train', or 'both'." + ) + + if score_type == "train": + scores = {"Train": train_scores} + elif score_type == "test": + scores = {"Test": test_scores} + else: # score_type == "both" + scores = {"Train": train_scores, "Test": test_scores} + + if std_display_style in ("fill_between", None): + # plot the mean score + if line_kw is None: + line_kw = {} + + self.lines_ = [] + for line_label, score in scores.items(): + self.lines_.append( + *ax.plot( + x_data, + score.mean(axis=1), + label=line_label, + **line_kw, + ) + ) + self.errorbar_ = None + self.fill_between_ = None # overwritten below by fill_between + + if std_display_style == "errorbar": + if errorbar_kw is None: + errorbar_kw = {} + + self.errorbar_ = [] + for line_label, score in scores.items(): + self.errorbar_.append( + ax.errorbar( + x_data, + score.mean(axis=1), + score.std(axis=1), + label=line_label, + **errorbar_kw, + ) + ) + self.lines_, self.fill_between_ = None, None + elif std_display_style == "fill_between": + if fill_between_kw is None: + fill_between_kw = {} + default_fill_between_kw = {"alpha": 0.5} + fill_between_kw = {**default_fill_between_kw, **fill_between_kw} + + self.fill_between_ = [] + for line_label, score in scores.items(): + self.fill_between_.append( + ax.fill_between( + x_data, + score.mean(axis=1) - score.std(axis=1), + score.mean(axis=1) + score.std(axis=1), + **fill_between_kw, + ) + ) + + score_name = self.score_name if score_name is None else score_name + + ax.legend() + # TODO(1.5): to be removed + if log_scale != "deprecated": + warnings.warn( + ( + "The `log_scale` parameter is deprecated as of version 1.3 " + "and will be removed in 1.5. You can use display.ax_.set_xscale " + "and display.ax_.set_yscale instead." + ), + FutureWarning, + ) + xscale = "log" if log_scale else "linear" + else: + # We found that a ratio, smaller or bigger than 5, between the largest and + # smallest gap of the x values is a good indicator to choose between linear + # and log scale. + if _interval_max_min_ratio(x_data) > 5: + xscale = "symlog" if x_data.min() <= 0 else "log" + else: + xscale = "linear" + ax.set_xscale(xscale) + ax.set_ylabel(f"{score_name}") -class LearningCurveDisplay: + self.ax_ = ax + self.figure_ = ax.figure + + +class LearningCurveDisplay(_BaseCurveDisplay): """Learning Curve visualization. It is recommended to use @@ -12,7 +142,10 @@ class LearningCurveDisplay: create a :class:`~sklearn.model_selection.LearningCurveDisplay` instance. 
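The automatic x-scale selection in `_plot_curve` above keys off the ratio between the largest and smallest gap of the x values. A small sketch of that heuristic, assuming `_interval_max_min_ratio` (imported from `sklearn/utils/_plotting.py`) behaves roughly like this:

import numpy as np

def interval_max_min_ratio(data):
    # Ratio between the largest and smallest spacing of the sorted x values.
    gaps = np.diff(np.sort(data))
    return gaps.max() / gaps.min()

x_linear = np.linspace(0.1, 0.9, num=5)   # evenly spaced -> ratio ~1 -> linear axis
x_log = np.logspace(-1, 0, num=5)         # growing gaps -> ratio > 5 -> log axis
print(interval_max_min_ratio(x_linear) > 5)  # False
print(interval_max_min_ratio(x_log) > 5)     # True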
All parameters are stored as attributes. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide ` for general information + about the visualization API and + :ref:`detailed documentation ` regarding the learning + curve visualization. .. versionadded:: 1.2 @@ -29,9 +162,12 @@ class LearningCurveDisplay: Scores on test set. score_name : str, default=None - The name of the score used in `learning_curve`. It will be used to - decorate the y-axis. If `None`, the generic name `"Score"` will be - used. + The name of the score used in `learning_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. Attributes ---------- @@ -89,8 +225,8 @@ def plot( *, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -111,16 +247,25 @@ def plot( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. If - `None`, the generic name "Score" will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. - log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.set_xscale` and `display.ax_.set_yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If None, no standard deviation representation is @@ -143,98 +288,19 @@ def plot( display : :class:`~sklearn.model_selection.LearningCurveDisplay` Object that stores computed values. """ - check_matplotlib_support(f"{self.__class__.__name__}.plot") - - import matplotlib.pyplot as plt - - if ax is None: - _, ax = plt.subplots() - - if negate_score: - train_scores, test_scores = -self.train_scores, -self.test_scores - else: - train_scores, test_scores = self.train_scores, self.test_scores - - if std_display_style not in ("errorbar", "fill_between", None): - raise ValueError( - f"Unknown std_display_style: {std_display_style}. Should be one of" - " 'errorbar', 'fill_between', or None." - ) - - if score_type not in ("test", "train", "both"): - raise ValueError( - f"Unknown score_type: {score_type}. Should be one of 'test', " - "'train', or 'both'." 
- ) - - if score_type == "train": - scores = {"Training metric": train_scores} - elif score_type == "test": - scores = {"Testing metric": test_scores} - else: # score_type == "both" - scores = {"Training metric": train_scores, "Testing metric": test_scores} - - if std_display_style in ("fill_between", None): - # plot the mean score - if line_kw is None: - line_kw = {} - - self.lines_ = [] - for line_label, score in scores.items(): - self.lines_.append( - *ax.plot( - self.train_sizes, - score.mean(axis=1), - label=line_label, - **line_kw, - ) - ) - self.errorbar_ = None - self.fill_between_ = None # overwritten below by fill_between - - if std_display_style == "errorbar": - if errorbar_kw is None: - errorbar_kw = {} - - self.errorbar_ = [] - for line_label, score in scores.items(): - self.errorbar_.append( - ax.errorbar( - self.train_sizes, - score.mean(axis=1), - score.std(axis=1), - label=line_label, - **errorbar_kw, - ) - ) - self.lines_, self.fill_between_ = None, None - elif std_display_style == "fill_between": - if fill_between_kw is None: - fill_between_kw = {} - default_fill_between_kw = {"alpha": 0.5} - fill_between_kw = {**default_fill_between_kw, **fill_between_kw} - - self.fill_between_ = [] - for line_label, score in scores.items(): - self.fill_between_.append( - ax.fill_between( - self.train_sizes, - score.mean(axis=1) - score.std(axis=1), - score.mean(axis=1) + score.std(axis=1), - **fill_between_kw, - ) - ) - - score_name = self.score_name if score_name is None else score_name - - ax.legend() - if log_scale: - ax.set_xscale("log") - ax.set_xlabel("Number of samples in the training set") - ax.set_ylabel(f"{score_name}") - - self.ax_ = ax - self.figure_ = ax.figure + self._plot_curve( + self.train_sizes, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale=log_scale, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel("Number of samples in the training set") return self @classmethod @@ -259,8 +325,8 @@ def from_estimator( ax=None, negate_score=False, score_name=None, - score_type="test", - log_scale=False, + score_type="both", + log_scale="deprecated", std_display_style="fill_between", line_kw=None, fill_between_kw=None, @@ -268,6 +334,11 @@ def from_estimator( ): """Create a learning curve display from an estimator. + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the learning curve + visualization. + Parameters ---------- estimator : object type that implements the "fit" and "predict" methods @@ -368,16 +439,25 @@ def from_estimator( `scikit-learn`. score_name : str, default=None - The name of the score used to decorate the y-axis of the plot. - If `None`, the generic `"Score"` name will be used. - - score_type : {"test", "train", "both"}, default="test" + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" The type of score to plot. Can be one of `"test"`, `"train"`, or `"both"`. 
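The `score_name` inference rules described just above can be summarized in a few lines. A sketch of the behavior the docstring spells out (assumption: this mirrors the private `_validate_score_name` helper from `sklearn/utils/_plotting.py`, which is the authoritative implementation):

def infer_score_name(score_name, scoring, negate_score):
    if score_name is not None:          # an explicit name always wins
        return score_name
    if scoring is None:
        return "Negative score" if negate_score else "Score"
    name = scoring if isinstance(scoring, str) else scoring.__name__
    if name.startswith("neg_"):
        # drop the prefix; spell out "negative" only when scores are not negated
        name = name[len("neg_"):] if negate_score else f"negative_{name[len('neg_'):]}"
    return name.replace("_", " ").capitalize()

print(infer_score_name(None, "neg_mean_squared_error", negate_score=True))
# Mean squared error
print(infer_score_name(None, "neg_mean_squared_error", negate_score=False))
# Negative mean squared error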
- log_scale : bool, default=False + log_scale : bool, default="deprecated" Whether or not to use a logarithmic scale for the x-axis. + .. deprecated:: 1.3 + `log_scale` is deprecated in 1.3 and will be removed in 1.5. + Use `display.ax_.xscale` and `display.ax_.yscale` instead. + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" The style used to display the score standard deviation around the mean score. If `None`, no representation of the standard deviation @@ -414,7 +494,7 @@ def from_estimator( """ check_matplotlib_support(f"{cls.__name__}.from_estimator") - score_name = "Score" if score_name is None else score_name + score_name = _validate_score_name(score_name, scoring, negate_score) train_sizes, train_scores, test_scores = learning_curve( estimator, @@ -451,3 +531,377 @@ def from_estimator( fill_between_kw=fill_between_kw, errorbar_kw=errorbar_kw, ) + + +class ValidationCurveDisplay(_BaseCurveDisplay): + """Validation Curve visualization. + + It is recommended to use + :meth:`~sklearn.model_selection.ValidationCurveDisplay.from_estimator` to + create a :class:`~sklearn.model_selection.ValidationCurveDisplay` instance. + All parameters are stored as attributes. + + Read more in the :ref:`User Guide ` for general information + about the visualization API and :ref:`detailed documentation + ` regarding the validation curve visualization. + + .. versionadded:: 1.3 + + Parameters + ---------- + param_name : str + Name of the parameter that has been varied. + + param_range : ndarray of shape (n_ticks,) + The values of the parameter that have been evaluated. + + train_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : ndarray of shape (n_ticks, n_cv_folds) + Scores on test set. + + score_name : str, default=None + The name of the score used in `validation_curve`. It will override the name + inferred from the `scoring` parameter. If `score` is `None`, we use `"Score"` if + `negate_score` is `False` and `"Negative score"` otherwise. If `scoring` is a + string or a callable, we infer the name. We replace `_` by spaces and capitalize + the first letter. We remove `neg_` and replace it by `"Negative"` if + `negate_score` is `False` or just remove it otherwise. + + Attributes + ---------- + ax_ : matplotlib Axes + Axes with the validation curve. + + figure_ : matplotlib Figure + Figure containing the validation curve. + + errorbar_ : list of matplotlib Artist or None + When the `std_display_style` is `"errorbar"`, this is a list of + `matplotlib.container.ErrorbarContainer` objects. If another style is + used, `errorbar_` is `None`. + + lines_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.lines.Line2D` objects corresponding to the mean train and + test scores. If another style is used, `line_` is `None`. + + fill_between_ : list of matplotlib Artist or None + When the `std_display_style` is `"fill_between"`, this is a list of + `matplotlib.collections.PolyCollection` objects. If another style is + used, `fill_between_` is `None`. + + See Also + -------- + sklearn.model_selection.validation_curve : Compute the validation curve. 
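As the deprecation notes above say, the `log_scale` flag is replaced by setting the scale directly on the returned display's axes (matplotlib's `Axes.set_xscale`). A minimal usage sketch; the dataset and estimator are arbitrary placeholders and matplotlib must be installed:

from sklearn.datasets import load_iris
from sklearn.model_selection import LearningCurveDisplay
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
display = LearningCurveDisplay.from_estimator(
    DecisionTreeClassifier(random_state=0), X, y, train_sizes=[0.3, 0.6, 0.9]
)
display.ax_.set_xscale("log")  # instead of the deprecated log_scale=True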
+ + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay, validation_curve + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> train_scores, test_scores = validation_curve( + ... logistic_regression, X, y, param_name=param_name, param_range=param_range + ... ) + >>> display = ValidationCurveDisplay( + ... param_name=param_name, param_range=param_range, + ... train_scores=train_scores, test_scores=test_scores, score_name="Score" + ... ) + >>> display.plot() + <...> + >>> plt.show() + """ + + def __init__( + self, *, param_name, param_range, train_scores, test_scores, score_name=None + ): + self.param_name = param_name + self.param_range = param_range + self.train_scores = train_scores + self.test_scores = test_scores + self.score_name = score_name + + def plot( + self, + ax=None, + *, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Plot visualization. + + Parameters + ---------- + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If None, no standard deviation representation is + displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. 
+ """ + self._plot_curve( + self.param_range, + ax=ax, + negate_score=negate_score, + score_name=score_name, + score_type=score_type, + log_scale="deprecated", + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) + self.ax_.set_xlabel(f"{self.param_name}") + return self + + @classmethod + def from_estimator( + cls, + estimator, + X, + y, + *, + param_name, + param_range, + groups=None, + cv=None, + scoring=None, + n_jobs=None, + pre_dispatch="all", + verbose=0, + error_score=np.nan, + fit_params=None, + ax=None, + negate_score=False, + score_name=None, + score_type="both", + std_display_style="fill_between", + line_kw=None, + fill_between_kw=None, + errorbar_kw=None, + ): + """Create a validation curve display from an estimator. + + Read more in the :ref:`User Guide ` for general + information about the visualization API and :ref:`detailed + documentation ` regarding the validation curve + visualization. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like of shape (n_samples, n_features) + Training data, where `n_samples` is the number of samples and + `n_features` is the number of features. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) or None + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : str + Name of the parameter that will be varied. + + param_range : array-like of shape (n_values,) + The values of the parameter that will be evaluated. + + groups : array-like of shape (n_samples,), default=None + Group labels for the samples used while splitting the dataset into + train/test set. Only used in conjunction with a "Group" :term:`cv` + instance (e.g., :class:`GroupKFold`). + + cv : int, cross-validation generator or an iterable, default=None + Determines the cross-validation splitting strategy. + Possible inputs for cv are: + + - None, to use the default 5-fold cross validation, + - int, to specify the number of folds in a `(Stratified)KFold`, + - :term:`CV splitter`, + - An iterable yielding (train, test) splits as arrays of indices. + + For int/None inputs, if the estimator is a classifier and `y` is + either binary or multiclass, + :class:`~sklearn.model_selection.StratifiedKFold` is used. In all + other cases, :class:`~sklearn.model_selectionKFold` is used. These + splitters are instantiated with `shuffle=False` so the splits will + be the same across calls. + + Refer :ref:`User Guide ` for the various + cross-validation strategies that can be used here. + + scoring : str or callable, default=None + A string (see :ref:`scoring_parameter`) or + a scorer callable object / function with signature + `scorer(estimator, X, y)` (see :ref:`scoring`). + + n_jobs : int, default=None + Number of jobs to run in parallel. Training the estimator and + computing the score are parallelized over the different training + and test sets. `None` means 1 unless in a + :obj:`joblib.parallel_backend` context. `-1` means using all + processors. See :term:`Glossary ` for more details. + + pre_dispatch : int or str, default='all' + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The str can + be an expression like '2*n_jobs'. + + verbose : int, default=0 + Controls the verbosity: the higher, the more messages. 
+ + error_score : 'raise' or numeric, default=np.nan + Value to assign to the score if an error occurs in estimator + fitting. If set to 'raise', the error is raised. If a numeric value + is given, FitFailedWarning is raised. + + fit_params : dict, default=None + Parameters to pass to the fit method of the estimator. + + ax : matplotlib Axes, default=None + Axes object to plot on. If `None`, a new figure and axes is + created. + + negate_score : bool, default=False + Whether or not to negate the scores obtained through + :func:`~sklearn.model_selection.validation_curve`. This is + particularly useful when using the error denoted by `neg_*` in + `scikit-learn`. + + score_name : str, default=None + The name of the score used to decorate the y-axis of the plot. It will + override the name inferred from the `scoring` parameter. If `score` is + `None`, we use `"Score"` if `negate_score` is `False` and `"Negative score"` + otherwise. If `scoring` is a string or a callable, we infer the name. We + replace `_` by spaces and capitalize the first letter. We remove `neg_` and + replace it by `"Negative"` if `negate_score` is + `False` or just remove it otherwise. + + score_type : {"test", "train", "both"}, default="both" + The type of score to plot. Can be one of `"test"`, `"train"`, or + `"both"`. + + std_display_style : {"errorbar", "fill_between"} or None, default="fill_between" + The style used to display the score standard deviation around the + mean score. If `None`, no representation of the standard deviation + is displayed. + + line_kw : dict, default=None + Additional keyword arguments passed to the `plt.plot` used to draw + the mean score. + + fill_between_kw : dict, default=None + Additional keyword arguments passed to the `plt.fill_between` used + to draw the score standard deviation. + + errorbar_kw : dict, default=None + Additional keyword arguments passed to the `plt.errorbar` used to + draw mean score and standard deviation score. + + Returns + ------- + display : :class:`~sklearn.model_selection.ValidationCurveDisplay` + Object that stores computed values. + + Examples + -------- + >>> import numpy as np + >>> import matplotlib.pyplot as plt + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import ValidationCurveDisplay + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(n_samples=1_000, random_state=0) + >>> logistic_regression = LogisticRegression() + >>> param_name, param_range = "C", np.logspace(-8, 3, 10) + >>> ValidationCurveDisplay.from_estimator( + ... logistic_regression, X, y, param_name=param_name, + ... param_range=param_range, + ... 
) + <...> + >>> plt.show() + """ + check_matplotlib_support(f"{cls.__name__}.from_estimator") + + score_name = _validate_score_name(score_name, scoring, negate_score) + + train_scores, test_scores = validation_curve( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + groups=groups, + cv=cv, + scoring=scoring, + n_jobs=n_jobs, + pre_dispatch=pre_dispatch, + verbose=verbose, + error_score=error_score, + fit_params=fit_params, + ) + + viz = cls( + param_name=param_name, + param_range=param_range, + train_scores=train_scores, + test_scores=test_scores, + score_name=score_name, + ) + return viz.plot( + ax=ax, + negate_score=negate_score, + score_type=score_type, + std_display_style=std_display_style, + line_kw=line_kw, + fill_between_kw=fill_between_kw, + errorbar_kw=errorbar_kw, + ) diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 1621dd324f81c..695614f4e1fa0 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -26,6 +26,7 @@ from ..base import BaseEstimator, is_classifier, clone from ..base import MetaEstimatorMixin +from ..base import _fit_context from ._split import check_cv from ._validation import _fit_and_score from ._validation import _aggregate_score_dicts @@ -753,6 +754,10 @@ def _select_best_index(refit, refit_metric, results): best_index = results[f"rank_test_{refit_metric}"].argmin() return best_index + @_fit_context( + # *SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, *, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -786,7 +791,6 @@ def fit(self, X, y=None, *, groups=None, **fit_params): self : object Instance of fitted estimator. """ - self._validate_params() estimator = self.estimator refit_metric = "score" diff --git a/sklearn/model_selection/_search_successive_halving.py b/sklearn/model_selection/_search_successive_halving.py index 4826e7931d4d6..a061d7283b46d 100644 --- a/sklearn/model_selection/_search_successive_halving.py +++ b/sklearn/model_selection/_search_successive_halving.py @@ -7,6 +7,7 @@ from ._search import BaseSearchCV from . import ParameterGrid, ParameterSampler from ..base import is_classifier +from ..base import _fit_context from ._split import check_cv, _yields_constant_splits from ..metrics._scorer import get_scorer_names from ..utils import resample @@ -211,6 +212,10 @@ def _select_best_index(refit, refit_metric, results): return last_iter_indices[best_idx] + @_fit_context( + # Halving*SearchCV.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, groups=None, **fit_params): """Run fit with all sets of parameters. @@ -238,7 +243,6 @@ def fit(self, X, y=None, groups=None, **fit_params): self : object Instance of fitted estimator. 
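A brief usage sketch of what the search-estimator hunks above imply: the search object's own hyper-parameters are checked when `fit` runs, while the wrapped estimator (marked "not validated yet" in the comments) still validates itself inside its own `fit`. The estimator and grid below are arbitrary placeholders:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
search = GridSearchCV(LogisticRegression(max_iter=1000), {"C": [0.1, 1.0]}, cv=3)
search.fit(X, y)            # parameter validation happens here, not in __init__
print(search.best_params_)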
""" - self._validate_params() self._checked_cv_orig = check_cv( self.cv, y, classifier=is_classifier(self.estimator) ) diff --git a/sklearn/model_selection/tests/test_plot.py b/sklearn/model_selection/tests/test_plot.py index 762af8fe08336..6baa211d2dc6e 100644 --- a/sklearn/model_selection/tests/test_plot.py +++ b/sklearn/model_selection/tests/test_plot.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from sklearn.datasets import load_iris @@ -5,8 +6,8 @@ from sklearn.utils import shuffle from sklearn.utils._testing import assert_allclose, assert_array_equal -from sklearn.model_selection import learning_curve -from sklearn.model_selection import LearningCurveDisplay +from sklearn.model_selection import learning_curve, validation_curve +from sklearn.model_selection import LearningCurveDisplay, ValidationCurveDisplay @pytest.fixture @@ -21,18 +22,22 @@ def data(): ({"score_type": "invalid"}, ValueError, "Unknown score_type:"), ], ) -def test_learning_curve_display_parameters_validation( - pyplot, data, params, err_type, err_msg +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_parameters_validation( + pyplot, data, params, err_type, err_msg, CurveDisplay, specific_params ): """Check that we raise a proper error when passing invalid parameters.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] with pytest.raises(err_type, match=err_msg): - LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, **params - ) + CurveDisplay.from_estimator(estimator, X, y, **specific_params, **params) def test_learning_curve_display_default_usage(pyplot, data): @@ -63,7 +68,7 @@ def test_learning_curve_display_default_usage(pyplot, data): assert display.ax_.get_ylabel() == "Score" _, legend_labels = display.ax_.get_legend_handles_labels() - assert legend_labels == ["Testing metric"] + assert legend_labels == ["Train", "Test"] train_sizes_abs, train_scores, test_scores = learning_curve( estimator, X, y, train_sizes=train_sizes @@ -74,21 +79,63 @@ def test_learning_curve_display_default_usage(pyplot, data): assert_allclose(display.test_scores, test_scores) -def test_learning_curve_display_negate_score(pyplot, data): +def test_validation_curve_display_default_usage(pyplot, data): + """Check the default usage of the ValidationCurveDisplay class.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + param_name, param_range = "max_depth", [1, 3, 5] + display = ValidationCurveDisplay.from_estimator( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + import matplotlib as mpl + + assert display.errorbar_ is None + + assert isinstance(display.lines_, list) + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) + + assert isinstance(display.fill_between_, list) + for fill in display.fill_between_: + assert isinstance(fill, mpl.collections.PolyCollection) + assert fill.get_alpha() == 0.5 + + assert display.score_name == "Score" + assert display.ax_.get_xlabel() == f"{param_name}" + assert display.ax_.get_ylabel() == "Score" + + _, legend_labels = display.ax_.get_legend_handles_labels() + assert legend_labels == ["Train", "Test"] + + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range + ) + + assert display.param_range == param_range + 
assert_array_equal(display.param_range, param_range) + assert_allclose(display.train_scores, train_scores) + assert_allclose(display.test_scores, test_scores) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_negate_score(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the `negate_score` parameter calling `from_estimator` and `plot`. """ X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) positive_scores = display.lines_[0].get_data()[1] @@ -96,22 +143,18 @@ def test_learning_curve_display_negate_score(pyplot, data): assert display.ax_.get_ylabel() == "Score" negate_score = True - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, negate_score=negate_score + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) negative_scores = display.lines_[0].get_data()[1] assert (negative_scores <= 0).all() assert_allclose(negative_scores, -positive_scores) - assert display.ax_.get_ylabel() == "Score" + assert display.ax_.get_ylabel() == "Negative score" negate_score = False - display = LearningCurveDisplay.from_estimator( - estimator, - X, - y, - train_sizes=train_sizes, - negate_score=negate_score, + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, negate_score=negate_score ) assert display.ax_.get_ylabel() == "Score" display.plot(negate_score=not negate_score) @@ -122,23 +165,30 @@ def test_learning_curve_display_negate_score(pyplot, data): @pytest.mark.parametrize( "score_name, ylabel", [(None, "Score"), ("Accuracy", "Accuracy")] ) -def test_learning_curve_display_score_name(pyplot, data, score_name, ylabel): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_score_name( + pyplot, data, score_name, ylabel, CurveDisplay, specific_params +): """Check that we can overwrite the default score name shown on the y-axis.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.ax_.get_ylabel() == ylabel X, y = data estimator = DecisionTreeClassifier(max_depth=1, random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, score_name=score_name + display = CurveDisplay.from_estimator( + estimator, X, y, **specific_params, score_name=score_name ) assert display.score_name == ylabel @@ -166,7 +216,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric"] + assert legend_label == ["Train"] if std_display_style 
is None: assert len(display.lines_) == 1 @@ -191,7 +241,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Testing metric"] + assert legend_label == ["Test"] if std_display_style is None: assert len(display.lines_) == 1 @@ -216,7 +266,7 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): ) _, legend_label = display.ax_.get_legend_handles_labels() - assert legend_label == ["Training metric", "Testing metric"] + assert legend_label == ["Train", "Test"] if std_display_style is None: assert len(display.lines_) == 2 @@ -235,100 +285,220 @@ def test_learning_curve_display_score_type(pyplot, data, std_display_style): assert_allclose(y_data_test, test_scores.mean(axis=1)) -def test_learning_curve_display_log_scale(pyplot, data): - """Check the behaviour of the parameter `log_scale`.""" +@pytest.mark.parametrize("std_display_style", (None, "errorbar")) +def test_validation_curve_display_score_type(pyplot, data, std_display_style): + """Check the behaviour of setting the `score_type` parameter.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=True + param_name, param_range = "max_depth", [1, 3, 5] + train_scores, test_scores = validation_curve( + estimator, X, y, param_name=param_name, param_range=param_range ) - assert display.ax_.get_xscale() == "log" - assert display.ax_.get_yscale() == "linear" + score_type = "train" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) - display = LearningCurveDisplay.from_estimator( - estimator, X, y, train_sizes=train_sizes, log_scale=False + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, train_scores.mean(axis=1)) + + score_type = "test" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, ) - assert display.ax_.get_xscale() == "linear" - assert display.ax_.get_yscale() == "linear" + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Test"] + + if std_display_style is None: + assert len(display.lines_) == 1 + assert display.errorbar_ is None + x_data, y_data = display.lines_[0].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 1 + x_data, y_data = display.errorbar_[0].lines[0].get_data() + + assert_array_equal(x_data, param_range) + assert_allclose(y_data, test_scores.mean(axis=1)) + + score_type = "both" + display = ValidationCurveDisplay.from_estimator( + estimator, + X, + y, + param_name=param_name, + param_range=param_range, + score_type=score_type, + std_display_style=std_display_style, + ) + + _, legend_label = display.ax_.get_legend_handles_labels() + assert legend_label == ["Train", "Test"] + + if std_display_style is 
None: + assert len(display.lines_) == 2 + assert display.errorbar_ is None + x_data_train, y_data_train = display.lines_[0].get_data() + x_data_test, y_data_test = display.lines_[1].get_data() + else: + assert display.lines_ is None + assert len(display.errorbar_) == 2 + x_data_train, y_data_train = display.errorbar_[0].lines[0].get_data() + x_data_test, y_data_test = display.errorbar_[1].lines[0].get_data() + + assert_array_equal(x_data_train, param_range) + assert_allclose(y_data_train, train_scores.mean(axis=1)) + assert_array_equal(x_data_test, param_range) + assert_allclose(y_data_test, test_scores.mean(axis=1)) + + +@pytest.mark.parametrize( + "CurveDisplay, specific_params, expected_xscale", + [ + ( + ValidationCurveDisplay, + {"param_name": "max_depth", "param_range": np.arange(1, 5)}, + "linear", + ), + (LearningCurveDisplay, {"train_sizes": np.linspace(0.1, 0.9, num=5)}, "linear"), + ( + ValidationCurveDisplay, + { + "param_name": "max_depth", + "param_range": np.round(np.logspace(0, 2, num=5)).astype(np.int64), + }, + "log", + ), + (LearningCurveDisplay, {"train_sizes": np.logspace(-1, 0, num=5)}, "log"), + ], +) +def test_curve_display_xscale_auto( + pyplot, data, CurveDisplay, specific_params, expected_xscale +): + """Check the behaviour of the x-axis scaling depending on the data provided.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + display = CurveDisplay.from_estimator(estimator, X, y, **specific_params) + assert display.ax_.get_xscale() == expected_xscale -def test_learning_curve_display_std_display_style(pyplot, data): + +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_std_display_style(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the parameter `std_display_style`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) import matplotlib as mpl - train_sizes = [0.3, 0.6, 0.9] std_display_style = None - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 std_display_style = "fill_between" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) - assert len(display.lines_) == 1 - assert isinstance(display.lines_[0], mpl.lines.Line2D) + assert len(display.lines_) == 2 + for line in display.lines_: + assert isinstance(line, mpl.lines.Line2D) assert display.errorbar_ is None - assert len(display.fill_between_) == 1 - assert isinstance(display.fill_between_[0], mpl.collections.PolyCollection) + assert len(display.fill_between_) == 2 + for fill_between in display.fill_between_: + assert isinstance(fill_between, mpl.collections.PolyCollection) _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 
std_display_style = "errorbar" - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, ) assert display.lines_ is None - assert len(display.errorbar_) == 1 - assert isinstance(display.errorbar_[0], mpl.container.ErrorbarContainer) + assert len(display.errorbar_) == 2 + for errorbar in display.errorbar_: + assert isinstance(errorbar, mpl.container.ErrorbarContainer) assert display.fill_between_ is None _, legend_label = display.ax_.get_legend_handles_labels() - assert len(legend_label) == 1 + assert len(legend_label) == 2 -def test_learning_curve_display_plot_kwargs(pyplot, data): +@pytest.mark.parametrize( + "CurveDisplay, specific_params", + [ + (ValidationCurveDisplay, {"param_name": "max_depth", "param_range": [1, 3, 5]}), + (LearningCurveDisplay, {"train_sizes": [0.3, 0.6, 0.9]}), + ], +) +def test_curve_display_plot_kwargs(pyplot, data, CurveDisplay, specific_params): """Check the behaviour of the different plotting keyword arguments: `line_kw`, `fill_between_kw`, and `errorbar_kw`.""" X, y = data estimator = DecisionTreeClassifier(random_state=0) - train_sizes = [0.3, 0.6, 0.9] std_display_style = "fill_between" line_kw = {"color": "red"} fill_between_kw = {"color": "red", "alpha": 1.0} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, line_kw=line_kw, fill_between_kw=fill_between_kw, @@ -342,13 +512,36 @@ def test_learning_curve_display_plot_kwargs(pyplot, data): std_display_style = "errorbar" errorbar_kw = {"color": "red"} - display = LearningCurveDisplay.from_estimator( + display = CurveDisplay.from_estimator( estimator, X, y, - train_sizes=train_sizes, + **specific_params, std_display_style=std_display_style, errorbar_kw=errorbar_kw, ) assert display.errorbar_[0].lines[0].get_color() == "red" + + +# TODO(1.5): to be removed +def test_learning_curve_display_deprecate_log_scale(data, pyplot): + """Check that we warn for the deprecated parameter `log_scale`.""" + X, y = data + estimator = DecisionTreeClassifier(random_state=0) + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=True + ) + + assert display.ax_.get_xscale() == "log" + assert display.ax_.get_yscale() == "linear" + + with pytest.warns(FutureWarning, match="`log_scale` parameter is deprecated"): + display = LearningCurveDisplay.from_estimator( + estimator, X, y, train_sizes=[0.3, 0.6, 0.9], log_scale=False + ) + + assert display.ax_.get_xscale() == "linear" + assert display.ax_.get_yscale() == "linear" diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 74684e608d3c1..4c30bcdb6cac3 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -43,6 +43,7 @@ from .base import BaseEstimator, ClassifierMixin, clone, is_classifier from .base import MultiOutputMixin from .base import MetaEstimatorMixin, is_regressor +from .base import _fit_context from .preprocessing import LabelBinarizer from .metrics.pairwise import pairwise_distances_argmin from .utils import check_random_state @@ -296,6 +297,10 @@ def __init__(self, estimator, *, n_jobs=None, verbose=0): self.n_jobs = n_jobs self.verbose = verbose + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + 
prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -313,8 +318,6 @@ def fit(self, X, y): self : object Instance of fitted estimator. """ - self._validate_params() - # A sparse LabelBinarizer, with sparse_output=True, has been shown to # outperform or match a dense label binarizer in all cases and has also # resulted in less or equal memory consumption in the fit_ovr function @@ -348,6 +351,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsRestClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -376,8 +383,6 @@ def partial_fit(self, X, y, classes=None): Instance of partially fitted estimator. """ if _check_partial_fit_first_call(self, classes): - self._validate_params() - if not hasattr(self.estimator, "partial_fit"): raise ValueError( ("Base estimator {0}, doesn't have partial_fit method").format( @@ -655,6 +660,10 @@ def __init__(self, estimator, *, n_jobs=None): self.estimator = estimator self.n_jobs = n_jobs + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -671,7 +680,6 @@ def fit(self, X, y): self : object The fitted underlying estimator. """ - self._validate_params() # We need to validate the data because we do a safe_indexing later. X, y = self._validate_data( X, y, accept_sparse=["csr", "csc"], force_all_finite=False @@ -706,6 +714,10 @@ def fit(self, X, y): return self @available_if(_estimators_has("partial_fit")) + @_fit_context( + # OneVsOneClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators. @@ -735,8 +747,6 @@ def partial_fit(self, X, y, classes=None): """ first_call = _check_partial_fit_first_call(self, classes) if first_call: - self._validate_params() - self.estimators_ = [ clone(self.estimator) for _ in range(self.n_classes_ * (self.n_classes_ - 1) // 2) @@ -968,6 +978,10 @@ def __init__(self, estimator, *, code_size=1.5, random_state=None, n_jobs=None): self.random_state = random_state self.n_jobs = n_jobs + @_fit_context( + # OutputCodeClassifier.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit underlying estimators. @@ -984,7 +998,6 @@ def fit(self, X, y): self : object Returns a fitted instance of self. """ - self._validate_params() y = self._validate_data(X="no_validation", y=y) random_state = check_random_state(self.random_state) diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 90c1f04f7e46a..8bb954e976f4c 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -28,6 +28,7 @@ RegressorMixin, clone, is_classifier, + _fit_context, ) from .model_selection import cross_val_predict from .utils import _print_elapsed_time, check_random_state, Bunch @@ -104,6 +105,10 @@ def __init__(self, estimator, *, n_jobs=None): self.n_jobs = n_jobs @_available_if_estimator_has("partial_fit") + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_params): """Incrementally fit a separate model for each class output. 
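For the meta-estimators above (`OneVsRestClassifier`, `OneVsOneClassifier`, `OutputCodeClassifier`, the `MultiOutput*` wrappers), the decorator replaces the explicit `self._validate_params()` calls, so invalid hyper-parameters surface when `fit` is called rather than at construction. A hedged usage sketch; the bad `n_jobs` value is only an illustration:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

X, y = make_classification(n_samples=60, n_classes=3, n_informative=4, random_state=0)
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000), n_jobs="two")  # no error yet
try:
    ovr.fit(X, y)          # hyper-parameters are validated here
except ValueError as exc:  # InvalidParameterError is a ValueError subclass
    print(exc)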
@@ -151,9 +156,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para first_time = not hasattr(self, "estimators_") - if first_time: - self._validate_params() - y = self._validate_data(X="no_validation", y=y, multi_output=True) if y.ndim == 1: @@ -203,6 +205,10 @@ def partial_fit(self, X, y, classes=None, sample_weight=None, **partial_fit_para return self + @_fit_context( + # MultiOutput*.estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y, sample_weight=None, **fit_params): """Fit the model to data, separately for each output variable. @@ -230,8 +236,6 @@ def fit(self, X, y, sample_weight=None, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - if not hasattr(self.estimator, "fit"): raise ValueError("The base estimator should implement a fit method") @@ -887,6 +891,10 @@ class labels for each estimator in the chain. [0.0321..., 0.9935..., 0.0625...]]) """ + @_fit_context( + # ClassifierChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -917,8 +925,6 @@ def fit(self, X, Y, **fit_params): "See the User Guide for more information." ) - self._validate_params() - super().fit(X, Y, **fit_params) self.classes_ = [ estimator.classes_ for chain_idx, estimator in enumerate(self.estimators_) @@ -1109,6 +1115,10 @@ class RegressorChain(MetaEstimatorMixin, RegressorMixin, _BaseChain): [2., 0.]]) """ + @_fit_context( + # RegressorChain.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, Y, **fit_params): """Fit the model to data matrix X and targets Y. @@ -1131,8 +1141,6 @@ def fit(self, X, Y, **fit_params): self : object Returns a fitted instance. """ - self._validate_params() - super().fit(X, Y, **fit_params) return self diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 20858ac8b5577..76d7189385828 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -22,6 +22,7 @@ from scipy.special import logsumexp from .base import BaseEstimator, ClassifierMixin +from .base import _fit_context from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize @@ -239,6 +240,7 @@ def __init__(self, *, priors=None, var_smoothing=1e-9): self.priors = priors self.var_smoothing = var_smoothing + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Gaussian Naive Bayes according to X, y. @@ -262,7 +264,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() y = self._validate_data(y=y) return self._partial_fit( X, y, np.unique(y), _refit=True, sample_weight=sample_weight @@ -346,6 +347,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): return total_mu, total_var + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -386,8 +388,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self : object Returns the instance itself. 
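A small usage sketch of the incremental naive Bayes API touched here; with the decorator in place, hyper-parameters are validated on the call to `fit`/`partial_fit` itself. The toy data is arbitrary:

import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [1.0], [0.2], [1.2]])
y = np.array([0, 1, 0, 1])

clf = GaussianNB()
clf.partial_fit(X[:2], y[:2], classes=[0, 1])  # first batch must list all classes
clf.partial_fit(X[2:], y[2:])                  # later batches reuse them
print(clf.predict([[0.1], [1.1]]))             # [0 1]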
""" - self._validate_params() - return self._partial_fit( X, y, classes, _refit=False, sample_weight=sample_weight ) @@ -643,6 +643,7 @@ def _check_alpha(self): return np.maximum(alpha, alpha_lower_bound) return alpha + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None, sample_weight=None): """Incremental fit on a batch of samples. @@ -682,9 +683,6 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): """ first_call = not hasattr(self, "classes_") - if first_call: - self._validate_params() - X, y = self._check_X_y(X, y, reset=first_call) _, n_features = X.shape @@ -728,6 +726,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): self._update_class_log_prior(class_prior=class_prior) return self + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Naive Bayes classifier according to X, y. @@ -748,7 +747,6 @@ def fit(self, X, y, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X, y = self._check_X_y(X, y) _, n_features = X.shape diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py index dbc070987d5d0..e3e2049a8f8e5 100644 --- a/sklearn/neighbors/_classification.py +++ b/sklearn/neighbors/_classification.py @@ -18,6 +18,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import ClassifierMixin +from ..base import _fit_context from ..metrics._pairwise_distances_reduction import ArgKminClassMode from ..utils._param_validation import StrOptions from sklearn.neighbors._base import _check_precomputed @@ -203,6 +204,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # KNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors classifier from the training dataset. @@ -221,8 +226,6 @@ def fit(self, X, y): self : KNeighborsClassifier The fitted k-nearest neighbors classifier. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -572,6 +575,10 @@ def __init__( self.weights = weights self.outlier_label = outlier_label + @_fit_context( + # RadiusNeighborsClassifier.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors classifier from the training dataset. @@ -590,7 +597,6 @@ def fit(self, X, y): self : RadiusNeighborsClassifier The fitted radius neighbors classifier. """ - self._validate_params() self._fit(X, y) classes_ = self.classes_ diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index 418761c2d21ee..e815d12e293c9 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._unsupervised import NearestNeighbors from ..base import TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions from ..utils.validation import check_is_fitted @@ -372,6 +373,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # KNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the k-nearest neighbors transformer from the training dataset. @@ -388,7 +393,6 @@ def fit(self, X, y=None): self : KNeighborsTransformer The fitted k-nearest neighbors transformer. 
""" - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self @@ -600,6 +604,10 @@ def __init__( ) self.mode = mode + @_fit_context( + # RadiusNeighborsTransformer.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the radius neighbors transformer from the training dataset. @@ -617,7 +625,6 @@ def fit(self, X, y=None): self : RadiusNeighborsTransformer The fitted radius neighbors transformer. """ - self._validate_params() self._fit(X) self._n_features_out = self.n_samples_fit_ return self diff --git a/sklearn/neighbors/_kde.py b/sklearn/neighbors/_kde.py index f285b03403b5f..7f7b38497d209 100644 --- a/sklearn/neighbors/_kde.py +++ b/sklearn/neighbors/_kde.py @@ -10,6 +10,7 @@ from scipy.special import gammainc from ..base import BaseEstimator +from ..base import _fit_context from ..neighbors._base import VALID_METRICS from ..utils import check_random_state from ..utils.validation import _check_sample_weight, check_is_fitted @@ -185,6 +186,10 @@ def _choose_algorithm(self, algorithm, metric): ) return algorithm + @_fit_context( + # KernelDensity.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, sample_weight=None): """Fit the Kernel Density model on the data. @@ -208,8 +213,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() - algorithm = self._choose_algorithm(self.algorithm, self.metric) if isinstance(self.bandwidth, str): diff --git a/sklearn/neighbors/_lof.py b/sklearn/neighbors/_lof.py index 90b3b0aa3d8ce..40cdc9ab5fb9d 100644 --- a/sklearn/neighbors/_lof.py +++ b/sklearn/neighbors/_lof.py @@ -8,6 +8,7 @@ from ._base import NeighborsBase from ._base import KNeighborsMixin from ..base import OutlierMixin +from ..base import _fit_context from numbers import Real from ..utils._param_validation import Interval, StrOptions @@ -256,6 +257,10 @@ def fit_predict(self, X, y=None): return self.fit(X)._predict() + @_fit_context( + # LocalOutlierFactor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the local outlier factor detector from the training dataset. @@ -273,8 +278,6 @@ def fit(self, X, y=None): self : LocalOutlierFactor The fitted local outlier factor detector. """ - self._validate_params() - self._fit(X) n_samples = self.n_samples_fit_ diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 4a83fcc7bc080..246f0adcb36ad 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -15,6 +15,7 @@ from ..utils.extmath import softmax from ..metrics import pairwise_distances from ..base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..decomposition import PCA from ..utils.multiclass import check_classification_targets @@ -215,6 +216,7 @@ def __init__( self.verbose = verbose self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model according to the given training data. @@ -231,8 +233,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # Validate the inputs X and y, and converts y to numerical classes. 
X, y = self._validate_data(X, y, ensure_min_samples=2) check_classification_targets(y) diff --git a/sklearn/neighbors/_nearest_centroid.py b/sklearn/neighbors/_nearest_centroid.py index 7b9c2479747d3..315393bf597e4 100644 --- a/sklearn/neighbors/_nearest_centroid.py +++ b/sklearn/neighbors/_nearest_centroid.py @@ -13,6 +13,7 @@ from scipy import sparse as sp from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import pairwise_distances_argmin from ..preprocessing import LabelEncoder from ..utils.validation import check_is_fitted @@ -122,6 +123,7 @@ def __init__(self, metric="euclidean", *, shrink_threshold=None): self.metric = metric self.shrink_threshold = shrink_threshold + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """ Fit the NearestCentroid model according to the given training data. @@ -140,8 +142,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - if isinstance(self.metric, str) and self.metric not in ( "manhattan", "euclidean", diff --git a/sklearn/neighbors/_regression.py b/sklearn/neighbors/_regression.py index 003b534074ecd..b2050345c9833 100644 --- a/sklearn/neighbors/_regression.py +++ b/sklearn/neighbors/_regression.py @@ -17,6 +17,7 @@ from ._base import _get_weights from ._base import NeighborsBase, KNeighborsMixin, RadiusNeighborsMixin from ..base import RegressorMixin +from ..base import _fit_context from ..utils._param_validation import StrOptions @@ -194,6 +195,10 @@ def _more_tags(self): # For cross-validation routines to split data correctly return {"pairwise": self.metric == "precomputed"} + @_fit_context( + # KNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the k-nearest neighbors regressor from the training dataset. @@ -212,8 +217,6 @@ def fit(self, X, y): self : KNeighborsRegressor The fitted k-nearest neighbors regressor. """ - self._validate_params() - return self._fit(X, y) def predict(self, X): @@ -422,6 +425,10 @@ def __init__( ) self.weights = weights + @_fit_context( + # RadiusNeighborsRegressor.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """Fit the radius neighbors regressor from the training dataset. @@ -440,7 +447,6 @@ def fit(self, X, y): self : RadiusNeighborsRegressor The fitted radius neighbors regressor. """ - self._validate_params() return self._fit(X, y) def predict(self, X): diff --git a/sklearn/neighbors/_unsupervised.py b/sklearn/neighbors/_unsupervised.py index 53e69495b9ed4..05607f0bd0c71 100644 --- a/sklearn/neighbors/_unsupervised.py +++ b/sklearn/neighbors/_unsupervised.py @@ -1,4 +1,5 @@ """Unsupervised nearest neighbors learner""" +from ..base import _fit_context from ._base import NeighborsBase from ._base import KNeighborsMixin from ._base import RadiusNeighborsMixin @@ -155,6 +156,10 @@ def __init__( n_jobs=n_jobs, ) + @_fit_context( + # NearestNeighbors.metric is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None): """Fit the nearest neighbors estimator from the training dataset. @@ -172,5 +177,4 @@ def fit(self, X, y=None): self : NearestNeighbors The fitted nearest neighbors estimator. 
""" - self._validate_params() return self._fit(X) diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 5c4bc5a39aa2d..fb8eab2f1776d 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -21,6 +21,7 @@ RegressorMixin, ) from ..base import is_classifier +from ..base import _fit_context from ._base import ACTIVATIONS, DERIVATIVES, LOSS_FUNCTIONS from ._stochastic_optimizers import SGDOptimizer, AdamOptimizer from ..metrics import accuracy_score, r2_score @@ -727,6 +728,7 @@ def _update_no_improvement_count(self, early_stopping, X_val, y_val): if self.loss_curve_[-1] < self.best_loss_: self.best_loss_ = self.loss_curve_[-1] + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the model to data matrix X and target(s) y. @@ -744,8 +746,6 @@ def fit(self, X, y): self : object Returns a trained MLP model. """ - self._validate_params() - return self._fit(X, y, incremental=False) def _check_solver(self): @@ -1170,6 +1170,7 @@ def _score(self, X, y): return accuracy_score(y, self._predict(X, check_input=False)) @available_if(lambda est: est._check_solver()) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y, classes=None): """Update the model with a single iteration over the given data. @@ -1194,9 +1195,6 @@ def partial_fit(self, X, y, classes=None): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - if _check_partial_fit_first_call(self, classes): self._label_binarizer = LabelBinarizer() if type_of_target(y).startswith("multilabel"): @@ -1624,6 +1622,7 @@ def _validate_input(self, X, y, incremental, reset): return X, y @available_if(lambda est: est._check_solver) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y): """Update the model with a single iteration over the given data. @@ -1640,7 +1639,4 @@ def partial_fit(self, X, y): self : object Trained MLP model. """ - if not hasattr(self, "coefs_"): - self._validate_params() - return self._fit(X, y, incremental=True) diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index 0624145116180..2ded6533d8d96 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -17,6 +17,7 @@ from ..base import BaseEstimator from ..base import TransformerMixin from ..base import ClassNamePrefixFeaturesOutMixin +from ..base import _fit_context from ..utils import check_random_state from ..utils import gen_even_slices from ..utils.extmath import safe_sparse_dot @@ -269,6 +270,7 @@ def gibbs(self, v): return v_ + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Fit the model to the partial segment of the data X. @@ -285,9 +287,6 @@ def partial_fit(self, X, y=None): self : BernoulliRBM The fitted model. """ - - self._validate_params() - first_pass = not hasattr(self, "components_") X = self._validate_data( X, accept_sparse="csr", dtype=np.float64, reset=first_pass @@ -380,6 +379,7 @@ def score_samples(self, X): fe_ = self._free_energy(v_) return v.shape[1] * log_logistic(fe_ - fe) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit the model to the data X. @@ -396,9 +396,6 @@ def fit(self, X, y=None): self : BernoulliRBM The fitted model. 
""" - - self._validate_params() - X = self._validate_data(X, accept_sparse="csr", dtype=(np.float64, np.float32)) n_samples = X.shape[0] rng = check_random_state(self.random_state) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8c5dc3bd82917..43b6b7eb0c939 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from .base import clone, TransformerMixin +from .base import _fit_context from .preprocessing import FunctionTransformer from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import available_if @@ -385,6 +386,10 @@ def _fit(self, X, y=None, **fit_params_steps): self.steps[step_idx] = (name, fitted_transformer) return X + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y=None, **fit_params): """Fit the model. @@ -411,7 +416,6 @@ def fit(self, X, y=None, **fit_params): self : object Pipeline with fitted steps. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)): @@ -429,6 +433,10 @@ def _can_fit_transform(self): ) @available_if(_can_fit_transform) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator. @@ -456,7 +464,6 @@ def fit_transform(self, X, y=None, **fit_params): Xt : ndarray of shape (n_samples, n_transformed_features) Transformed samples. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) @@ -505,6 +512,10 @@ def predict(self, X, **predict_params): return self.steps[-1][1].predict(Xt, **predict_params) @available_if(_final_estimator_has("fit_predict")) + @_fit_context( + # estimators in Pipeline.steps are not validated yet + prefer_skip_nested_validation=False + ) def fit_predict(self, X, y=None, **fit_params): """Transform the data, and apply `fit_predict` with the final estimator. @@ -533,7 +544,6 @@ def fit_predict(self, X, y=None, **fit_params): y_pred : ndarray Result of calling `fit_predict` on the final estimator. """ - self._validate_params() fit_params_steps = self._check_fit_params(**fit_params) Xt = self._fit(X, y, **fit_params_steps) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 013f1f57e9373..139022a9897e6 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -22,6 +22,7 @@ TransformerMixin, OneToOneFeatureMixin, ClassNamePrefixFeaturesOutMixin, + _fit_context, ) from ..utils import check_array from ..utils._param_validation import Interval, Options, StrOptions, validate_params @@ -435,6 +436,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of min and max on X for later scaling. @@ -456,8 +458,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. 
""" - self._validate_params() - feature_range = self.feature_range if feature_range[0] >= feature_range[1]: raise ValueError( @@ -838,6 +838,7 @@ def fit(self, X, y=None, sample_weight=None): self._reset() return self.partial_fit(X, y, sample_weight) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None, sample_weight=None): """Online computation of mean and std on X for later scaling. @@ -870,8 +871,6 @@ def partial_fit(self, X, y=None, sample_weight=None): self : object Fitted scaler. """ - self._validate_params() - first_call = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1183,6 +1182,7 @@ def fit(self, X, y=None): self._reset() return self.partial_fit(X, y) + @_fit_context(prefer_skip_nested_validation=True) def partial_fit(self, X, y=None): """Online computation of max absolute value of X for later scaling. @@ -1204,8 +1204,6 @@ def partial_fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - first_pass = not hasattr(self, "n_samples_seen_") X = self._validate_data( X, @@ -1514,6 +1512,7 @@ def __init__( self.unit_variance = unit_variance self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the median and quantiles to be used for scaling. @@ -1531,8 +1530,6 @@ def fit(self, X, y=None): self : object Fitted scaler. """ - self._validate_params() - # at fit, convert sparse matrices to csc for optimized computation of # the quantiles X = self._validate_data( @@ -1972,6 +1969,7 @@ def __init__(self, norm="l2", *, copy=True): self.norm = norm self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -1991,7 +1989,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2155,6 +2152,7 @@ def __init__(self, *, threshold=0.0, copy=True): self.threshold = threshold self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Only validates estimator's parameters. @@ -2174,7 +2172,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._validate_data(X, accept_sparse="csr") return self @@ -2634,6 +2631,7 @@ def _sparse_fit(self, X, random_state): # https://github.com/numpy/numpy/issues/14685 self.quantiles_ = np.maximum.accumulate(self.quantiles_) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Compute the quantiles used for transforming. @@ -2653,8 +2651,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() - if self.n_quantiles > self.subsample: raise ValueError( "The number of quantiles cannot be greater than" @@ -3101,6 +3097,7 @@ def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): self.standardize = standardize self.copy = copy + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Estimate the optimal parameter lambda for each feature. @@ -3120,10 +3117,10 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() self._fit(X, y=y, force_transform=False) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y=None): """Fit `PowerTransformer` to `X`, then transform `X`. @@ -3141,7 +3138,6 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_features) Transformed data. 
""" - self._validate_params() return self._fit(X, y, force_transform=True) def _fit(self, X, y=None, force_transform=False): @@ -3150,24 +3146,37 @@ def _fit(self, X, y=None, force_transform=False): if not self.copy and not force_transform: # if call from fit() X = X.copy() # force copy so that fit does not change X inplace + n_samples = X.shape[0] + mean = np.mean(X, axis=0, dtype=np.float64) + var = np.var(X, axis=0, dtype=np.float64) + optim_function = { "box-cox": self._box_cox_optimize, "yeo-johnson": self._yeo_johnson_optimize, }[self.method] + + transform_function = { + "box-cox": boxcox, + "yeo-johnson": self._yeo_johnson_transform, + }[self.method] + with np.errstate(invalid="ignore"): # hide NaN warnings - self.lambdas_ = np.array([optim_function(col) for col in X.T]) + self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) + for i, col in enumerate(X.T): + # For yeo-johnson, leave constant features unchanged + # lambda=1 corresponds to the identity transformation + is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) + if self.method == "yeo-johnson" and is_constant_feature: + self.lambdas_[i] = 1.0 + continue + + self.lambdas_[i] = optim_function(col) - if self.standardize or force_transform: - transform_function = { - "box-cox": boxcox, - "yeo-johnson": self._yeo_johnson_transform, - }[self.method] - for i, lmbda in enumerate(self.lambdas_): - with np.errstate(invalid="ignore"): # hide NaN warnings - X[:, i] = transform_function(X[:, i], lmbda) + if self.standardize or force_transform: + X[:, i] = transform_function(X[:, i], self.lambdas_[i]) if self.standardize: - self._scaler = StandardScaler(copy=False) + self._scaler = StandardScaler(copy=False).set_output(transform="default") if force_transform: X = self._scaler.fit_transform(X) else: diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index 220950586a6ef..ac7432027f462 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -11,6 +11,7 @@ from . import OneHotEncoder from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils._param_validation import Hidden, Interval, StrOptions, Options from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -192,6 +193,7 @@ def __init__( self.subsample = subsample self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """ Fit the estimator. @@ -216,7 +218,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Returns the instance itself. """ - self._validate_params() X = self._validate_data(X, dtype="numeric") if self.dtype in (np.float64, np.float32): diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1fc4b16a52467..de3f983d7ae6f 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,6 +10,7 @@ from scipy import sparse from ..base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin +from ..base import _fit_context from ..utils import check_array, is_scalar_nan, _safe_indexing from ..utils.validation import check_is_fitted from ..utils.validation import _check_feature_names_in @@ -953,6 +954,7 @@ def _compute_n_features_outs(self): return output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit OneHotEncoder to X. @@ -971,8 +973,6 @@ def fit(self, X, y=None): self Fitted encoder. 
""" - self._validate_params() - if self.sparse != "deprecated": warnings.warn( ( @@ -1446,6 +1446,7 @@ def __init__( self.min_frequency = min_frequency self.max_categories = max_categories + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Fit the OrdinalEncoder to X. @@ -1464,8 +1465,6 @@ def fit(self, X, y=None): self : object Fitted encoder. """ - self._validate_params() - if self.handle_unknown == "use_encoded_value": if is_scalar_nan(self.unknown_value): if np.dtype(self.dtype).kind != "f": diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index c250c5cd0226e..d7bf1810e61c0 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -3,6 +3,7 @@ import numpy as np from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, @@ -197,6 +198,7 @@ def _check_inverse_transform(self, X): UserWarning, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Fit transformer by checking X. @@ -216,7 +218,6 @@ def fit(self, X, y=None): self : object FunctionTransformer class instance. """ - self._validate_params() X = self._check_input(X, reset=True) if self.check_inverse and not (self.func is None or self.inverse_func is None): self._check_inverse_transform(X) diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py index ca8607b06c2e2..f656329607ee3 100644 --- a/sklearn/preprocessing/_label.py +++ b/sklearn/preprocessing/_label.py @@ -16,7 +16,7 @@ import scipy.sparse as sp from ..base import BaseEstimator, TransformerMixin - +from ..base import _fit_context from ..utils.sparsefuncs import min_max_axis from ..utils._param_validation import Interval, validate_params from ..utils import column_or_1d @@ -268,6 +268,7 @@ def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False): self.pos_label = pos_label self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit label binarizer. @@ -282,9 +283,6 @@ def fit(self, y): self : object Returns the instance itself. """ - - self._validate_params() - if self.neg_label >= self.pos_label: raise ValueError( f"neg_label={self.neg_label} must be strictly less than " @@ -761,6 +759,7 @@ def __init__(self, *, classes=None, sparse_output=False): self.classes = classes self.sparse_output = sparse_output + @_fit_context(prefer_skip_nested_validation=True) def fit(self, y): """Fit the label sets binarizer, storing :term:`classes_`. @@ -776,7 +775,6 @@ def fit(self, y): self : object Fitted estimator. """ - self._validate_params() self._cached_dict = None if self.classes is None: @@ -794,6 +792,7 @@ def fit(self, y): self.classes_[:] = classes return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, y): """Fit the label sets binarizer and transform the given label sets. 
@@ -814,7 +813,6 @@ def fit_transform(self, y): if self.classes is not None: return self.fit(y).transform(y) - self._validate_params() self._cached_dict = None # Automatically increment on new class diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 08ccf6355fc4e..1dfee8a088114 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -12,6 +12,7 @@ from scipy.special import comb from ..base import BaseEstimator, TransformerMixin +from ..base import _fit_context from ..utils import check_array from ..utils.fixes import sp_version, parse_version from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight @@ -299,6 +300,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(name) return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """ Compute number of output features. @@ -316,7 +318,6 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - self._validate_params() _, n_features = self._validate_data(X, accept_sparse=True).shape if isinstance(self.degree, Integral): @@ -802,6 +803,7 @@ def get_feature_names_out(self, input_features=None): feature_names.append(f"{input_features[i]}_sp_{j}") return np.asarray(feature_names, dtype=object) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None, sample_weight=None): """Compute knot positions of splines. @@ -823,8 +825,6 @@ def fit(self, X, y=None, sample_weight=None): self : object Fitted transformer. """ - self._validate_params() - X = self._validate_data( X, reset=True, diff --git a/sklearn/preprocessing/_target_encoder.py b/sklearn/preprocessing/_target_encoder.py index 9100d72194a32..9dd33ddfa3cce 100644 --- a/sklearn/preprocessing/_target_encoder.py +++ b/sklearn/preprocessing/_target_encoder.py @@ -4,6 +4,7 @@ from ._encoders import _BaseEncoder from ..base import OneToOneFeatureMixin +from ..base import _fit_context from ._target_encoder_fast import _fit_encoding_fast from ._target_encoder_fast import _fit_encoding_fast_auto_smooth from ..utils.validation import _check_y, check_consistent_length @@ -176,6 +177,7 @@ def __init__( self.shuffle = shuffle self.random_state = random_state + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit the :class:`TargetEncoder` to X and y. @@ -192,10 +194,10 @@ def fit(self, X, y): self : object Fitted encoder. """ - self._validate_params() self._fit_encodings_all(X, y) return self + @_fit_context(prefer_skip_nested_validation=True) def fit_transform(self, X, y): """Fit :class:`TargetEncoder` and transform X with the target encoding. 
@@ -219,7 +221,6 @@ def fit_transform(self, X, y): """ from ..model_selection import KFold, StratifiedKFold # avoid circular import - self._validate_params() X_ordinal, X_known_mask, y, n_categories = self._fit_encodings_all(X, y) # The cv splitter is voluntarily restricted to *KFold to enforce non diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 2e6fd810fedac..c00de906a7dbb 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2669,3 +2669,22 @@ def test_kernel_centerer_feature_names_out(): names_out = centerer.get_feature_names_out() samples_out2 = X_pairwise.shape[1] assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)]) + + +@pytest.mark.parametrize("standardize", [True, False]) +def test_power_transformer_constant_feature(standardize): + """Check that PowerTransfomer leaves constant features unchanged.""" + X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]] + + pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X) + + assert_allclose(pt.lambdas_, [1, 1, 1]) + + Xft = pt.fit_transform(X) + Xt = pt.transform(X) + + for Xt_ in [Xft, Xt]: + if standardize: + assert_allclose(Xt_, np.zeros_like(X)) + else: + assert_allclose(Xt_, X) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 9e9620e089521..ca0ee41784ab5 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -36,7 +36,7 @@ from .base import BaseEstimator, TransformerMixin from .base import ClassNamePrefixFeaturesOutMixin - +from .base import _fit_context from .utils import check_random_state from .utils._param_validation import Interval, StrOptions, validate_params from .utils.extmath import safe_sparse_dot @@ -356,6 +356,7 @@ def _compute_inverse_components(self): components = components.toarray() return linalg.pinv(components, check_finite=False) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y=None): """Generate a sparse random projection matrix. @@ -374,7 +375,6 @@ def fit(self, X, y=None): self : object BaseRandomProjection class instance. """ - self._validate_params() X = self._validate_data( X, accept_sparse=["csr", "csc"], dtype=[np.float64, np.float32] ) diff --git a/sklearn/semi_supervised/_label_propagation.py b/sklearn/semi_supervised/_label_propagation.py index 95fad0713d558..9d7786bc1d67e 100644 --- a/sklearn/semi_supervised/_label_propagation.py +++ b/sklearn/semi_supervised/_label_propagation.py @@ -64,6 +64,7 @@ from scipy.sparse import csgraph from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..metrics.pairwise import rbf_kernel from ..neighbors import NearestNeighbors from ..utils.extmath import safe_sparse_dot @@ -230,6 +231,7 @@ class labels. probabilities /= normalizer return probabilities + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit a semi-supervised label propagation model to X. @@ -254,7 +256,6 @@ def fit(self, X, y): self : object Returns the instance itself. 
""" - self._validate_params() X, y = self._validate_data( X, y, diff --git a/sklearn/semi_supervised/_self_training.py b/sklearn/semi_supervised/_self_training.py index 2438658ed89c8..c4706df1754da 100644 --- a/sklearn/semi_supervised/_self_training.py +++ b/sklearn/semi_supervised/_self_training.py @@ -4,6 +4,7 @@ import numpy as np from ..base import MetaEstimatorMixin, clone, BaseEstimator +from ..base import _fit_context from ..utils._param_validation import HasMethods, Interval, StrOptions from ..utils.validation import check_is_fitted from ..utils.metaestimators import available_if @@ -171,6 +172,10 @@ def __init__( self.max_iter = max_iter self.verbose = verbose + @_fit_context( + # SelfTrainingClassifier.base_estimator is not validated yet + prefer_skip_nested_validation=False + ) def fit(self, X, y): """ Fit self-training classifier using `X`, `y` as training data. @@ -189,8 +194,6 @@ def fit(self, X, y): self : object Fitted estimator. """ - self._validate_params() - # we need row slicing support for sparce matrices, but costly finiteness check # can be delegated to the base estimator. X, y = self._validate_data( diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 55919099e027c..a54c31cecb6e1 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -11,6 +11,7 @@ from . import _liblinear as liblinear # type: ignore from . import _libsvm_sparse as libsvm_sparse # type: ignore from ..base import BaseEstimator, ClassifierMixin +from ..base import _fit_context from ..preprocessing import LabelEncoder from ..utils.multiclass import _ovr_decision_function from ..utils import check_array, check_random_state @@ -143,6 +144,7 @@ def _more_tags(self): # Used by cross_val_score. return {"pairwise": self.kernel == "precomputed"} + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the SVM model according to the given training data. @@ -176,8 +178,6 @@ def fit(self, X, y, sample_weight=None): If X is a dense array, then the other methods will not support sparse matrices as input. """ - self._validate_params() - rnd = check_random_state(self.random_state) sparse = sp.isspmatrix(X) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index e035e74a05e2c..a438d007da970 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -5,6 +5,7 @@ from ._base import _fit_liblinear, _get_liblinear_solver_type, BaseSVC, BaseLibSVM from ..base import BaseEstimator, RegressorMixin, OutlierMixin +from ..base import _fit_context from ..linear_model._base import LinearClassifierMixin, SparseCoefMixin, LinearModel from ..utils import deprecated from ..utils.validation import _num_samples @@ -272,6 +273,7 @@ def __init__( self.penalty = penalty self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -296,8 +298,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. """ - self._validate_params() - X, y = self._validate_data( X, y, @@ -529,6 +529,7 @@ def __init__( self.dual = dual self.loss = loss + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit the model according to the given training data. @@ -553,8 +554,6 @@ def fit(self, X, y, sample_weight=None): self : object An instance of the estimator. 
""" - self._validate_params() - X, y = self._validate_data( X, y, diff --git a/sklearn/tests/test_metadata_routing.py b/sklearn/tests/test_metadata_routing.py index 3b00b5a244ee8..a6e74c12f6e45 100644 --- a/sklearn/tests/test_metadata_routing.py +++ b/sklearn/tests/test_metadata_routing.py @@ -653,6 +653,21 @@ def fit(self, X, y, prop=None, **kwargs): Klass().fit(None, None) # for coverage +def test_removing_non_existing_param_raises(): + """Test that removing a metadata using UNUSED which doesn't exist raises.""" + + class InvalidRequestRemoval(BaseEstimator): + # `fit` (in this class or a parent) requests `prop`, but we don't want + # it requested at all. + __metadata_request__fit = {"prop": metadata_routing.UNUSED} + + def fit(self, X, y, **kwargs): + return self + + with pytest.raises(ValueError, match="Trying to remove parameter"): + InvalidRequestRemoval().get_metadata_routing() + + def test_method_metadata_request(): mmr = MethodMetadataRequest(owner="test", method="fit") diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py index 3157e344cbef3..99f7f22d92e3d 100644 --- a/sklearn/tests/test_public_functions.py +++ b/sklearn/tests/test_public_functions.py @@ -241,6 +241,7 @@ def _check_function_param_validation( "sklearn.metrics.pairwise.manhattan_distances", "sklearn.metrics.pairwise.nan_euclidean_distances", "sklearn.metrics.pairwise.paired_cosine_distances", + "sklearn.metrics.pairwise.paired_distances", "sklearn.metrics.pairwise.paired_euclidean_distances", "sklearn.metrics.pairwise.paired_manhattan_distances", "sklearn.metrics.pairwise.polynomial_kernel", diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 4fdd8f27cd652..64a444db0b228 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -31,6 +31,7 @@ from sklearn.base import RegressorMixin from sklearn.base import is_classifier from sklearn.base import MultiOutputMixin +from sklearn.base import _fit_context from sklearn.utils import Bunch from sklearn.utils import check_random_state from sklearn.utils.validation import _check_sample_weight @@ -120,6 +121,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": [bool], } @abstractmethod @@ -138,6 +140,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): self.criterion = criterion self.splitter = splitter @@ -151,6 +154,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values def get_depth(self): """Return the depth of the decision tree. @@ -180,7 +184,7 @@ def get_n_leaves(self): def _support_missing_values(self, X): return not issparse(X) and self._get_tags()["allow_nan"] - def _compute_feature_has_missing(self, X): + def _compute_missing_values_in_feature_mask(self, X): """Return boolean mask denoting if there are missing values for each feature. This method also ensures that X is finite. @@ -192,7 +196,7 @@ def _compute_feature_has_missing(self, X): Returns ------- - feature_has_missing : ndarray of shape (n_features,), or None + missing_values_in_feature_mask : ndarray of shape (n_features,), or None Missing value mask. 
If missing values are not supported or there are no missing values, return None. """ @@ -213,13 +217,17 @@ def _compute_feature_has_missing(self, X): if not np.isnan(overall_sum): return None - feature_has_missing = _any_isnan_axis0(X) - return feature_has_missing + missing_values_in_feature_mask = _any_isnan_axis0(X) + return missing_values_in_feature_mask def _fit( - self, X, y, sample_weight=None, check_input=True, feature_has_missing=None + self, + X, + y, + sample_weight=None, + check_input=True, + missing_values_in_feature_mask=None, ): - self._validate_params() random_state = check_random_state(self.random_state) if check_input: @@ -227,7 +235,7 @@ def _fit( # We can't pass multi_output=True because that would allow y to be # csr. - # _compute_feature_has_missing will check for finite values and + # _compute_missing_values_in_feature_mask will check for finite values and # compute the missing mask if the tree supports missing values check_X_params = dict( dtype=DTYPE, accept_sparse="csc", force_all_finite=False @@ -240,7 +248,9 @@ def _fit( else: X = self._validate_data(X, **check_X_params) - feature_has_missing = self._compute_feature_has_missing(X) + missing_values_in_feature_mask = ( + self._compute_missing_values_in_feature_mask(X) + ) if issparse(X): X.sort_indices() @@ -388,7 +398,7 @@ def _fit( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -397,6 +407,9 @@ def _fit( random_state, ) + if self.store_leaf_values: + self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples + return self def _build_tree( @@ -404,7 +417,7 @@ def _build_tree( X, y, sample_weight, - feature_has_missing, + missing_values_in_feature_mask, min_samples_leaf, min_weight_leaf, max_leaf_nodes, @@ -483,6 +496,7 @@ def _build_tree( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -493,9 +507,10 @@ def _build_tree( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, feature_has_missing) + builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] @@ -551,6 +566,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -577,6 +595,128 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_nodes_samples : a list of array-like of shape + (n_leaf_node_samples, n_outputs) + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -851,6 +991,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -896,6 +1046,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + leaf_nodes_samples_ : dict + A dictionary of leaf node index and the y_train samples in that leaf. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -965,6 +1118,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -979,8 +1133,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree classifier from the training set (X, y). @@ -1327,6 +1483,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1340,8 +1497,10 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Build a decision tree regressor from the training set (X, y). @@ -1653,6 +1812,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1667,6 +1827,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @@ -1880,6 +2041,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1893,4 +2055,5 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index 721b475f40436..31c10ccfe4f93 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -92,7 +92,7 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil cdef class ClassificationCriterion(Criterion): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index c3f08ec859bee..dfa64c1184df5 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -258,9 +258,17 @@ cdef class Criterion(BaseCriterion): cdef void node_samples( self, - vector[vector[DOUBLE_t]]* dest + vector[vector[DOUBLE_t]]& dest ) noexcept nogil: - cdef SIZE_t i, j + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. 
+ """ + cdef SIZE_t i, j, k # Resize the destination vector of vectors dest.resize(self.n_node_samples) @@ -272,7 +280,8 @@ cdef class Criterion(BaseCriterion): # Get the sample values for each output for k in range(self.n_outputs): - dest[i][k].push_back(self.y[j, k]) + dest[i].push_back(self.y[j, k]) + cdef inline void _move_sums_classification( ClassificationCriterion criterion, diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index fb21f676e66cc..915b2baa30e94 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -104,10 +104,10 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1 - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil # Methods that allow modifications to stopping conditions cdef bint check_presplit_conditions( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7f21d5da545fb..1f3d164370b95 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -168,7 +168,7 @@ cdef class Splitter(BaseSplitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: """Initialize the splitter. @@ -245,7 +245,7 @@ cdef class Splitter(BaseSplitter): self.end ) - if feature_has_missing is not None: + if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() return 0 @@ -280,7 +280,7 @@ cdef class Splitter(BaseSplitter): self.criterion.node_value(dest) - cdef void node_samples(self, vector[vector[DOUBLE_t]]* dest) noexcept nogil: + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: """Copy the samples[start:end] into dest.""" self.criterion.node_samples(dest) @@ -903,19 +903,19 @@ cdef class DensePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask def __init__( self, const DTYPE_t[:, :] X, SIZE_t[::1] samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): self.X = X self.samples = samples self.feature_values = feature_values - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -938,13 +938,13 @@ cdef class DensePartitioner: const DTYPE_t[:, :] X = self.X SIZE_t[::1] samples = self.samples SIZE_t n_missing = 0 - const unsigned char[::1] feature_has_missing = self.feature_has_missing + const unsigned char[::1] missing_values_in_feature_mask = self.missing_values_in_feature_mask # Sort samples along that feature; by # copying the values into an array and # sorting the array in a manner which utilizes the cache more # effectively. - if feature_has_missing is not None and feature_has_missing[current_feature]: + if missing_values_in_feature_mask is not None and missing_values_in_feature_mask[current_feature]: i, current_end = self.start, self.end - 1 # Missing values are placed at the end and do not participate in the sorting. 
while i <= current_end: @@ -1113,7 +1113,7 @@ cdef class SparsePartitioner: cdef SIZE_t start cdef SIZE_t end cdef SIZE_t n_missing - cdef const unsigned char[::1] feature_has_missing + cdef const unsigned char[::1] missing_values_in_feature_mask cdef const DTYPE_t[::1] X_data cdef const INT32_t[::1] X_indices @@ -1134,7 +1134,7 @@ cdef class SparsePartitioner: SIZE_t[::1] samples, SIZE_t n_samples, DTYPE_t[::1] feature_values, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ): if not isspmatrix_csc(X): raise ValueError("X should be in csc format") @@ -1158,7 +1158,7 @@ cdef class SparsePartitioner: for p in range(n_samples): self.index_to_samples[samples[p]] = p - self.feature_has_missing = feature_has_missing + self.missing_values_in_feature_mask = missing_values_in_feature_mask cdef inline void init_node_split(self, SIZE_t start, SIZE_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -1529,11 +1529,11 @@ cdef class BestSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1555,11 +1555,11 @@ cdef class BestSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1581,11 +1581,11 @@ cdef class RandomSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, feature_has_missing + X, self.samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, @@ -1607,11 +1607,11 @@ cdef class RandomSparseSplitter(Splitter): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, - const unsigned char[::1] feature_has_missing, + const unsigned char[::1] missing_values_in_feature_mask, ) except -1: - Splitter.init(self, X, y, sample_weight, feature_has_missing) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.partitioner = SparsePartitioner( - X, self.samples, self.n_samples, self.feature_values, feature_has_missing + X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) cdef int node_split(self, double impurity, SplitRecord* split, 
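Across all splitter and partitioner variants above, `feature_has_missing` is renamed to `missing_values_in_feature_mask`; it is the same per-feature boolean mask, computed once at the Python level before building. A plain-NumPy equivalent of that computation, for illustration only (the actual helper is the Cython `_any_isnan_axis0`, guarded by a finiteness check):

import numpy as np

def missing_values_in_feature_mask(X):
    # Illustrative NumPy sketch of the mask threaded through the splitters:
    # None when nothing is missing, otherwise True for columns with any NaN.
    X = np.asarray(X, dtype=np.float32)
    if not np.isnan(X.sum()):  # cheap overall check first
        return None
    return np.isnan(X).any(axis=0)

X = np.array([[1.0, np.nan, 3.0],
              [4.0, 5.0, 6.0]])
print(missing_values_in_feature_mask(X))  # [False  True False]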
diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 94714cc33400c..828c99a2f4ea1 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -49,13 +49,6 @@ cdef class BaseTree: cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample cdef double* value # Array of values prediction values for each node - # Enables the use of tree to store distributions of the output to allow - # arbitrary usage of the the leaves. This is used in the quantile - # estimators for example. - # for storing samples at each leaf node with leaf's node ID as the key and - # the sample values as the value - cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples - # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil @@ -121,9 +114,18 @@ cdef class Tree(BaseTree): cdef public SIZE_t n_outputs # Number of outputs in y cdef public SIZE_t max_n_classes # max(n_classes) + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. + # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + # Methods cdef cnp.ndarray _get_value_ndarray(self) cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) cpdef cnp.ndarray predict(self, object X) @@ -146,7 +148,7 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping - cdef unsigned char store_leaf_values # Whether to store leaf values + cdef unsigned char store_leaf_values # Whether to store leaf values cpdef build( self, @@ -154,7 +156,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=*, - const unsigned char[::1] feature_has_missing=*, + const unsigned char[::1] missing_values_in_feature_mask=*, ) cdef _check_input( diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 8ca98a64b42ab..1565ab441969d 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -100,7 +100,7 @@ cdef class TreeBuilder: object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" pass @@ -182,7 +182,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -208,7 +208,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef double min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef SIZE_t start cdef SIZE_t end @@ -229,8 +229,6 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_depth_seen = -1 cdef int rc = 0 - cdef int node_idx - cdef stack[StackRecord] builder_stack cdef StackRecord stack_record @@ -319,11 +317,8 @@ 
cdef class DepthFirstTreeBuilder(TreeBuilder): "impurity": split.impurity_left, "n_constant_features": n_constant_features}) elif self.store_leaf_values and is_leaf: - with gil: - print('Storing leaf values...') - # copy leaf values to leaf_values array - splitter.node_samples(&tree.value_samples[node_id]) + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -406,7 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): object X, const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight=None, - const unsigned char[::1] feature_has_missing=None, + const unsigned char[::1] missing_values_in_feature_mask=None, ): """Build a decision tree from the training set (X, y).""" @@ -418,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef SIZE_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, feature_has_missing) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask) cdef vector[FrontierRecord] frontier cdef FrontierRecord record @@ -459,6 +454,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -1321,6 +1319,14 @@ cdef class Tree(BaseTree): def value(self): return self._get_value_ndarray()[:self.node_count] + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): @@ -1374,6 +1380,7 @@ cdef class Tree(BaseTree): d["node_count"] = self.node_count d["nodes"] = self._get_node_ndarray() d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples return d def __setstate__(self, d): @@ -1407,6 +1414,35 @@ cdef class Tree(BaseTree): memcpy(self.value, cnp.PyArray_DATA(value_ndarray), self.capacity * self.value_stride * sizeof(double)) + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + cdef cnp.ndarray _get_value_ndarray(self): """Wraps value as 
a 3-d NumPy array. diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index eefae6cdaa3f6..44a19b3dc0520 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -890,7 +890,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -909,6 +909,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -923,14 +924,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2634,3 +2646,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + 
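A rough end-to-end sketch of what these tests exercise (``store_leaf_values``, the ``tree_.leaf_nodes_samples`` mapping and its pickling support are this fork's additions; everything else is stock scikit-learn):

    import pickle
    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    X, y = load_diabetes(return_X_y=True)
    est = DecisionTreeRegressor(max_depth=3, store_leaf_values=True, random_state=0)
    est.fit(X, y)

    # one entry of stored training targets per leaf, keyed by the leaf's node id
    leaf_ids = est.apply(X)
    assert set(np.unique(leaf_ids)) == set(est.tree_.leaf_nodes_samples.keys())

    # the stored samples round-trip through pickle together with the tree
    est2 = pickle.loads(pickle.dumps(est))
    for node_id, arr in est.tree_.leaf_nodes_samples.items():
        np.testing.assert_array_equal(arr, est2.tree_.leaf_nodes_samples[node_id])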
assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) + assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) diff --git a/sklearn/utils/_metadata_requests.py b/sklearn/utils/_metadata_requests.py index 82b3eec69b461..a1cd934c13756 100644 --- a/sklearn/utils/_metadata_requests.py +++ b/sklearn/utils/_metadata_requests.py @@ -241,8 +241,14 @@ def add_request( if alias == param: alias = True - if alias == UNUSED and param in self._requests: - del self._requests[param] + if alias == UNUSED: + if param in self._requests: + del self._requests[param] + else: + raise ValueError( + f"Trying to remove parameter {param} with UNUSED which doesn't" + " exist." 
+ ) else: self._requests[param] = alias @@ -1155,7 +1161,7 @@ def _build_request_for_signature(cls, router, method): # ignore the first parameter of the method, which is usually "self" params = list(inspect.signature(getattr(cls, method)).parameters.items())[1:] for pname, param in params: - if pname in {"X", "y", "Y"}: + if pname in {"X", "y", "Y", "Xt", "yt"}: continue if param.kind in {param.VAR_POSITIONAL, param.VAR_KEYWORD}: continue diff --git a/sklearn/utils/_plotting.py b/sklearn/utils/_plotting.py index cc301b509e386..c0671046c9cd4 100644 --- a/sklearn/utils/_plotting.py +++ b/sklearn/utils/_plotting.py @@ -1,3 +1,5 @@ +import numpy as np + from . import check_consistent_length, check_matplotlib_support from .multiclass import type_of_target from .validation import _check_pos_label_consistency @@ -56,3 +58,41 @@ def _validate_from_predictions_params( name = name if name is not None else "Classifier" return pos_label, name + + +def _validate_score_name(score_name, scoring, negate_score): + """Validate the `score_name` parameter. + + If `score_name` is provided, we just return it as-is. + If `score_name` is `None`, we use `Score` if `negate_score` is `False` and + `Negative score` otherwise. + If `score_name` is a string or a callable, we infer the name. We replace `_` by + spaces and capitalize the first letter. We remove `neg_` and replace it by + `"Negative"` if `negate_score` is `False` or just remove it otherwise. + """ + if score_name is not None: + return score_name + elif scoring is None: + return "Negative score" if negate_score else "Score" + else: + score_name = scoring.__name__ if callable(scoring) else scoring + if negate_score: + if score_name.startswith("neg_"): + score_name = score_name[4:] + else: + score_name = f"Negative {score_name}" + elif score_name.startswith("neg_"): + score_name = f"Negative {score_name[4:]}" + score_name = score_name.replace("_", " ") + return score_name.capitalize() + + +def _interval_max_min_ratio(data): + """Compute the ratio between the largest and smallest inter-point distances. + + A value larger than 5 typically indicates that the parameter range would + better be displayed with a log scale while a linear scale would be more + suitable otherwise. + """ + diff = np.diff(np.sort(data)) + return diff.max() / diff.min() diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index cb1e0f2b1fa4d..7d8e673210ff7 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -4424,7 +4424,7 @@ def _output_from_fit_transform(transformer, name, X, df, y): return outputs -def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): +def _check_generated_dataframe(name, case, index, outputs_default, outputs_pandas): import pandas as pd X_trans, feature_names_default = outputs_default @@ -4434,7 +4434,12 @@ def _check_generated_dataframe(name, case, outputs_default, outputs_pandas): # We always rely on the output of `get_feature_names_out` of the # transformer used to generate the dataframe as a ground-truth of the # columns. 
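The axis-scale heuristic added to ``_plotting.py`` above can be reproduced by hand; a small sketch of the computation it performs (the helper itself is the private ``_interval_max_min_ratio``, the name below is only illustrative):

    import numpy as np

    def max_min_gap_ratio(data):
        # ratio between the largest and smallest gap of the sorted values
        gaps = np.diff(np.sort(data))
        return gaps.max() / gaps.min()

    max_min_gap_ratio(np.linspace(0, 1, 5))     # == 1.0, a linear scale is fine
    max_min_gap_ratio(np.geomspace(0.1, 1, 5))  # ~ 5.6, a log scale reads better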
- expected_dataframe = pd.DataFrame(X_trans, columns=feature_names_pandas, copy=False) + # If a dataframe is passed into transform, then the output should have the same + # index + expected_index = index if case.endswith("df") else None + expected_dataframe = pd.DataFrame( + X_trans, columns=feature_names_pandas, copy=False, index=expected_index + ) try: pd.testing.assert_frame_equal(df_trans, expected_dataframe) @@ -4469,7 +4474,8 @@ def check_set_output_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4483,7 +4489,7 @@ def check_set_output_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) @@ -4511,7 +4517,8 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): set_random_state(transformer) feature_names_in = [f"col{i}" for i in range(X.shape[1])] - df = pd.DataFrame(X, columns=feature_names_in, copy=False) + index = [f"index{i}" for i in range(X.shape[0])] + df = pd.DataFrame(X, columns=feature_names_in, copy=False, index=index) transformer_default = clone(transformer).set_output(transform="default") outputs_default = _output_from_fit_transform(transformer_default, name, X, df, y) @@ -4528,5 +4535,5 @@ def check_global_ouptut_transform_pandas(name, transformer_orig): for case in outputs_default: _check_generated_dataframe( - name, case, outputs_default[case], outputs_pandas[case] + name, case, index, outputs_default[case], outputs_pandas[case] ) diff --git a/sklearn/utils/tests/test_param_validation.py b/sklearn/utils/tests/test_param_validation.py index 528a667a3f58e..022f9f373a049 100644 --- a/sklearn/utils/tests/test_param_validation.py +++ b/sklearn/utils/tests/test_param_validation.py @@ -6,6 +6,7 @@ from sklearn._config import config_context, get_config from sklearn.base import BaseEstimator +from sklearn.base import _fit_context from sklearn.model_selection import LeaveOneOut from sklearn.utils import deprecated from sklearn.utils._param_validation import Hidden @@ -60,8 +61,9 @@ class _Estimator(BaseEstimator): def __init__(self, a): self.a = a + @_fit_context(prefer_skip_nested_validation=True) def fit(self, X=None, y=None): - self._validate_params() + pass @pytest.mark.parametrize("interval_type", [Integral, Real]) diff --git a/sklearn/utils/tests/test_plotting.py b/sklearn/utils/tests/test_plotting.py new file mode 100644 index 0000000000000..00b1f7f74fcd0 --- /dev/null +++ b/sklearn/utils/tests/test_plotting.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from sklearn.utils._plotting import _validate_score_name, _interval_max_min_ratio + + +def metric(): + pass # pragma: no cover + + +def neg_metric(): + pass # pragma: no cover + + +@pytest.mark.parametrize( + "score_name, scoring, negate_score, expected_score_name", + [ + ("accuracy", None, False, "accuracy"), # do not transform the name + (None, "accuracy", False, "Accuracy"), # capitalize the name + (None, "accuracy", True, "Negative accuracy"), # add "Negative" + (None, "neg_mean_absolute_error", False, 
"Negative mean absolute error"), + (None, "neg_mean_absolute_error", True, "Mean absolute error"), # remove "neg_" + ("MAE", "neg_mean_absolute_error", True, "MAE"), # keep score_name + (None, None, False, "Score"), # default name + (None, None, True, "Negative score"), # default name but negated + ("Some metric", metric, False, "Some metric"), # do not transform the name + ("Some metric", metric, True, "Some metric"), # do not transform the name + (None, metric, False, "Metric"), # default name + (None, metric, True, "Negative metric"), # default name but negated + ("Some metric", neg_metric, False, "Some metric"), # do not transform the name + ("Some metric", neg_metric, True, "Some metric"), # do not transform the name + (None, neg_metric, False, "Negative metric"), # default name + (None, neg_metric, True, "Metric"), # default name but negated + ], +) +def test_validate_score_name(score_name, scoring, negate_score, expected_score_name): + """Check that we return the right score name.""" + assert ( + _validate_score_name(score_name, scoring, negate_score) == expected_score_name + ) + + +# In the following test, we check the value of the max to min ratio +# for parameter value intervals to check that using a decision threshold +# of 5. is a good heuristic to decide between linear and log scales on +# common ranges of parameter values. +@pytest.mark.parametrize( + "data, lower_bound, upper_bound", + [ + # Such a range could be clearly displayed with either log scale or linear + # scale. + (np.geomspace(0.1, 1, 5), 5, 6), + # Checking that the ratio is still positive on a negative log scale. + (-np.geomspace(0.1, 1, 10), 7, 8), + # Evenly spaced parameter values lead to a ratio of 1. + (np.linspace(0, 1, 5), 0.9, 1.1), + # This is not exactly spaced on a log scale but we will benefit from treating + # it as such for visualization. 
+ ([1, 2, 5, 10, 20, 50], 20, 40), + ], +) +def test_inverval_max_min_ratio(data, lower_bound, upper_bound): + assert lower_bound < _interval_max_min_ratio(data) < upper_bound diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py index 4a765d1404794..2d39279f81745 100644 --- a/sklearn/utils/tests/test_validation.py +++ b/sklearn/utils/tests/test_validation.py @@ -42,6 +42,7 @@ from sklearn.utils import _safe_indexing from sklearn.utils.validation import ( has_fit_parameter, + _is_fitted, check_is_fitted, check_consistent_length, assert_all_finite, @@ -848,23 +849,32 @@ def fit(self, X, y): msg = "not fitted" est = MyEstimator() + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.a_ = "a" + assert not _is_fitted(est, attributes=["a_", "b_"]) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"]) + assert not _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) with pytest.raises(NotFittedError, match=msg): check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) est.b_ = "b" + assert _is_fitted(est, attributes=["a_", "b_"]) check_is_fitted(est, attributes=["a_", "b_"]) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=all) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=all) + assert _is_fitted(est, attributes=["a_", "b_"], all_or_any=any) check_is_fitted(est, attributes=["a_", "b_"], all_or_any=any) diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 6179d91c2a491..8ceef15986567 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -1369,6 +1369,44 @@ def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=Fal return array +def _is_fitted(estimator, attributes=None, all_or_any=all): + """Determine if an estimator is fitted + + Parameters + ---------- + estimator : estimator instance + Estimator instance for which the check is performed. + + attributes : str, list or tuple of str, default=None + Attribute name(s) given as string or a list/tuple of strings + Eg.: ``["coef_", "estimator_", ...], "coef_"`` + + If `None`, `estimator` is considered fitted if there exist an + attribute that ends with a underscore and does not start with double + underscore. + + all_or_any : callable, {all, any}, default=all + Specify whether all or any of the given attributes must exist. + + Returns + ------- + fitted : bool + Whether the estimator is fitted. 
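A minimal sketch of how the new private helper behaves, following the precedence in the implementation below (explicit ``attributes`` first, then ``__sklearn_is_fitted__``, then the trailing-underscore convention):

    from sklearn.linear_model import LogisticRegression
    from sklearn.utils.validation import _is_fitted

    lr = LogisticRegression()
    assert not _is_fitted(lr)                 # no fitted attributes yet
    lr.fit([[0.0], [1.0], [2.0]], [0, 1, 1])
    assert _is_fitted(lr)                     # coef_, classes_, ... now exist
    assert _is_fitted(lr, attributes=["coef_", "intercept_"])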
+ """ + if attributes is not None: + if not isinstance(attributes, (list, tuple)): + attributes = [attributes] + return all_or_any([hasattr(estimator, attr) for attr in attributes]) + + if hasattr(estimator, "__sklearn_is_fitted__"): + return estimator.__sklearn_is_fitted__() + + fitted_attrs = [ + v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") + ] + return len(fitted_attrs) > 0 + + def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): """Perform is_fitted validation for estimator. @@ -1425,18 +1463,7 @@ def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all): if not hasattr(estimator, "fit"): raise TypeError("%s is not an estimator instance." % (estimator)) - if attributes is not None: - if not isinstance(attributes, (list, tuple)): - attributes = [attributes] - fitted = all_or_any([hasattr(estimator, attr) for attr in attributes]) - elif hasattr(estimator, "__sklearn_is_fitted__"): - fitted = estimator.__sklearn_is_fitted__() - else: - fitted = [ - v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") - ] - - if not fitted: + if not _is_fitted(estimator, attributes, all_or_any): raise NotFittedError(msg % {"name": type(estimator).__name__}) From 855ee192407d19b51adb4f50a49c6752ee80c820 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 15 Jun 2023 20:32:20 -0400 Subject: [PATCH 15/28] Add quantile Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index e715952947c04..b43bbeaf0b435 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -730,7 +730,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 64a444db0b228..d7d8cedb63696 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -641,7 +641,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True The quantiles at which to evaluate, by default 0.5 (median). method : str, optional The method to interpolate, by default 'linear'. Can be any keyword - argument accepted by :func:`np.quantile`. + argument accepted by :func:`~np.quantile`. check_input : bool, optional Whether or not to check input, by default True. From 3f5cb6597e36a08f651f8f0eb7324e9658a14bea Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 11:05:43 -0400 Subject: [PATCH 16/28] Add check input Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 -- sklearn/tree/_classes.py | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b43bbeaf0b435..c51c489dbd5dd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -731,8 +731,6 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): method : str, optional The method to interpolate, by default 'linear'. Can be any keyword argument accepted by :func:`~np.quantile`. - check_input : bool, optional - Whether or not to check input, by default True. 
Returns ------- diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index d7d8cedb63696..78454b8854d26 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -602,6 +602,8 @@ def get_leaf_node_samples(self, X, check_input=True): ---------- X : array-like of shape (n_samples, n_features) Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. Returns ------- From 7401ddcb19a42132cf46e79a14b22a2bdfb8519c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:35:39 -0400 Subject: [PATCH 17/28] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 78454b8854d26..c75c933c49b39 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -607,12 +607,11 @@ def get_leaf_node_samples(self, X, check_input=True): Returns ------- - leaf_nodes_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_nodes_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array has shape (n_leaf_node_samples, n_outputs). """ if not self.store_leaf_values: raise RuntimeError( From 13e29135bd0b640f3bf325ec40a22a879096b719 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 16 Jun 2023 18:41:17 -0400 Subject: [PATCH 18/28] Try to fix docstring Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index c75c933c49b39..2d83a94dc8ec1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1167,7 +1167,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, From 43aa3ef51ca96b58b00a178954d033579db09de9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 10:41:44 -0400 Subject: [PATCH 19/28] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index c51c489dbd5dd..5482ebcaf1d41 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -817,12 +817,11 @@ def get_leaf_node_samples(self, X): Returns ------- - leaf_node_samples : a list of array-like of shape - (n_leaf_node_samples, n_outputs) + leaf_node_samples : a list of array-like Each sample is represented by the indices of the training samples that reached the leaf node. The ``n_leaf_node_samples`` may vary between samples, since the number of samples that fall in a leaf node is - variable. + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
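A short sketch of the ragged structure documented above (shapes follow this docstring; ``store_leaf_values`` and ``get_leaf_node_samples`` are additions from this fork):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor

    X = np.arange(30, dtype=float).reshape(-1, 1)
    y = X.ravel() ** 2
    rf = RandomForestRegressor(n_estimators=5, max_depth=2,
                               store_leaf_values=True, random_state=0).fit(X, y)

    per_sample = rf.get_leaf_node_samples(X[:3])
    assert len(per_sample) == 3               # one array per query sample
    for arr in per_sample:
        # training targets collected from the leaves this query sample reaches;
        # the first dimension varies from sample to sample
        assert arr.shape[1] == 1              # n_outputs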
""" check_is_fitted(self) # Check data From fe3072f4ee28f49d590e7b437bf01bffd61ab917 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Sat, 17 Jun 2023 11:01:09 -0400 Subject: [PATCH 20/28] Fix docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 5482ebcaf1d41..9fd3af21b1fd9 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -696,7 +696,6 @@ def _bin_data(self, X, is_training_data): If is_training_data, then fit the _bin_mapper attribute. Else, the binned data is converted to a C-contiguous array. """ - description = "training" if is_training_data else "validation" if self.verbose: print( From 2d4de9aff7567bf796626aed4f27149f6ccf399c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:33:55 -0400 Subject: [PATCH 21/28] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9fd3af21b1fd9..f85efb0b0a43b 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,9 +733,9 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles) or - (n_samples, n_quantiles, n_outputs) - The predicted values. + y : ndarray of shape (n_samples, n_quantiles, [n_output]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. """ if not self.store_leaf_values: raise RuntimeError( From 1c1ec8cff3a181b7a86a4df8a2aeb01fa7cdbe6a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 19 Jun 2023 21:35:33 -0400 Subject: [PATCH 22/28] Fix the predict quantiles docstring Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f85efb0b0a43b..3eb61c9497918 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -733,7 +733,7 @@ def predict_quantiles(self, X, quantiles=0.5, method="nearest"): Returns ------- - y : ndarray of shape (n_samples, n_quantiles, [n_output]) + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) The predicted values. The ``n_outputs`` dimension is present only for multi-output regressors. 
""" From 4bc651dd7916d7c267690ef0c9705b3f2d69c9d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 12:02:45 -0400 Subject: [PATCH 23/28] Remove some diff Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 1 - sklearn/tree/_criterion.pyx | 18 ++++++++++++++++++ sklearn/tree/_tree.pxd | 3 ++- sklearn/tree/_tree.pyx | 2 -- sklearn/tree/tests/test_tree.py | 8 +++++--- 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 74e60c64ce85f..e61f674d300c9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -511,7 +511,6 @@ def _build_tree( self.min_impurity_decrease, self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 178a9adee9e80..2ddc02194c490 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -155,8 +155,10 @@ cdef class BaseCriterion: This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: + N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) + where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, @@ -165,8 +167,10 @@ cdef class BaseCriterion: ---------- impurity_parent : double The initial impurity of the parent node before the split + impurity_left : double The impurity of the left child + impurity_right : double The impurity of the right child @@ -611,10 +615,13 @@ cdef class Entropy(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let + count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) + be the proportion of class k observations in node m. The cross-entropy is then defined as + cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) """ @@ -1058,10 +1065,14 @@ cdef class MSE(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The MSE proxy is derived from + sum_{i left}(y_i - y_pred_L)^2 + sum_{i right}(y_i - y_pred_R)^2 = sum(y_i^2) - n_L * mean_{i left}(y_i)^2 - n_R * mean_{i right}(y_i)^2 + Neglecting constant terms, this gives: + - 1/n_L * sum_{i left}(y_i)^2 - 1/n_R * sum_{i right}(y_i)^2 """ cdef SIZE_t k @@ -1139,6 +1150,7 @@ cdef class MAE(RegressionCriterion): ---------- n_outputs : SIZE_t The number of targets to be predicted + n_samples : SIZE_t The total number of samples to fit on """ @@ -1429,6 +1441,7 @@ cdef class FriedmanMSE(MSE): """Mean squared error impurity criterion with improvement score by Friedman. Uses the formula (35) in Friedman's original Gradient Boosting paper: + diff = mean_left - mean_right improvement = n_left * n_right * diff^2 / (n_left + n_right) """ @@ -1483,6 +1496,7 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): @@ -1519,12 +1533,16 @@ cdef class Poisson(RegressionCriterion): The absolute impurity improvement is only computed by the impurity_improvement method once the best split has been found. + The Poisson proxy is derived from: + sum_{i left }(y_i * log(y_i / y_pred_L)) + sum_{i right}(y_i * log(y_i / y_pred_R)) = sum(y_i * log(y_i) - n_L * mean_{i left}(y_i) * log(mean_{i left}(y_i)) - n_R * mean_{i right}(y_i) * log(mean_{i right}(y_i)) + Neglecting constant terms, this gives + - sum{i left }(y_i) * log(mean{i left}(y_i)) - sum{i right}(y_i) * log(mean{i right}(y_i)) """ diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 7b933d905c79a..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -141,7 +141,8 @@ cdef class TreeBuilder: # This class controls the various stopping criteria and the node splitting # evaluation order, e.g. depth-first or best-first. - cdef Splitter splitter + cdef Splitter splitter # Splitting algorithm + cdef SIZE_t min_samples_split # Minimum number of samples in an internal node cdef SIZE_t min_samples_leaf # Minimum number of samples in a leaf cdef double min_weight_leaf # Minimum weight in a leaf diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 24b01b96aa726..c44022f54d3a5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -61,7 +61,6 @@ cdef extern from "" namespace "std" nogil: from numpy import float32 as DTYPE from numpy import float64 as DOUBLE - cdef double INFINITY = np.inf cdef double EPSILON = np.finfo('double').eps @@ -87,7 +86,6 @@ NODE_DTYPE = np.asarray((&dummy)).dtype # TreeBuilder # ============================================================================= - cdef class TreeBuilder: """Interface for different tree building strategies.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 792ba44b1302e..9be3dbd6f549e 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -33,13 +33,15 @@ DENSE_SPLITTERS, SPARSE_SPLITTERS, ) -from sklearn.tree._tree import NODE_DTYPE, TREE_LEAF, TREE_UNDEFINED -from sklearn.tree._tree import Tree as CythonTree from sklearn.tree._tree import ( + NODE_DTYPE, + TREE_LEAF, + TREE_UNDEFINED, _check_n_classes, _check_node_ndarray, _check_value_ndarray, ) +from sklearn.tree._tree import Tree as CythonTree from sklearn.utils import _IS_32BIT, compute_sample_weight from sklearn.utils._testing import ( assert_almost_equal, @@ -2424,7 +2426,7 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) dtc.fit(X, y) # Goes to right node because it has the most data points From cc035d04b9784e6facb7096a56c9c81801d819ec Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 15:42:08 -0400 Subject: [PATCH 24/28] Fix regression error Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 18 +++++++++--------- sklearn/tree/_criterion.pyx | 3 ++- sklearn/tree/_splitter.pyx | 6 ++++++ sklearn/tree/tests/test_tree.py | 4 +++- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index d8a94940799c0..f2e0201d534cd 100644 --- a/sklearn/ensemble/_forest.py +++ 
b/sklearn/ensemble/_forest.py @@ -40,27 +40,28 @@ class calls the ``fit`` method of each sub-estimator on random samples # License: BSD 3 clause -from time import time import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from sklearn.base import is_classifier, _fit_context from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, TransformerMixin, + _fit_context, + is_classifier, ) - -from sklearn.metrics import accuracy_score, r2_score -from sklearn.preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder from sklearn.tree import ( BaseDecisionTree, @@ -69,8 +70,8 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeClassifier, ExtraTreeRegressor, ) -from ..tree._tree import DOUBLE, DTYPE from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions from sklearn.utils.multiclass import check_classification_targets, type_of_target from sklearn.utils.parallel import Parallel, delayed @@ -80,9 +81,8 @@ class calls the ``fit`` method of each sub-estimator on random samples _num_samples, check_is_fitted, ) -from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.utils._openmp_helpers import _openmp_effective_n_threads -from sklearn.ensemble._base import BaseEnsemble, _partition_estimators + +from ..tree._tree import DOUBLE, DTYPE __all__ = [ "RandomForestClassifier", diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 2ddc02194c490..bd1bdef0a6a93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1496,10 +1496,11 @@ cdef class Poisson(RegressionCriterion): """Half Poisson deviance as impurity criterion. Poisson deviance = 2/n * sum(y_true * log(y_true/y_pred) + y_pred - y_true) - + Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. 
It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 007d55a589df7..bca38d5f04374 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -507,6 +507,12 @@ cdef inline int node_split_best( current_split.pos = p # Reject if min_samples_leaf is not guaranteed + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9be3dbd6f549e..0ce7a548c7bdb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2426,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points From 4840d4e3e3ef6175c4e1197c87c77f8fe06f10cf Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 23 Jun 2023 18:26:04 -0400 Subject: [PATCH 25/28] Fix boolean Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 2 +- sklearn/tree/_classes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f2e0201d534cd..b3feec10a3072 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -221,7 +221,7 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): None, Interval(Integral, 1, None, closed="left"), ], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index e61f674d300c9..6825c36df155c 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -123,7 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], - "store_leaf_values": [bool], + "store_leaf_values": ["boolean"], } @abstractmethod From fdf2e2dbe1e1c316a1e2987aea31da26ebbec2cd Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:49:16 -0700 Subject: [PATCH 26/28] Added doc to store_leaf_values Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b3feec10a3072..34bebab399566 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -822,6 +822,11 @@ def get_leaf_node_samples(self, X): samples, since the number of samples that fall in a leaf node is variable. Each array-like has shape (n_leaf_node_samples, n_outputs). 
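The pre-split check above counts the candidate children before testing ``min_samples_leaf``; a plain-Python sketch of that bookkeeping (the Cython hunk is the authoritative version, the helper name here is illustrative):

    def candidate_child_sizes(pos, start, end_non_missing, n_missing,
                              missing_go_to_left):
        # samples with missing values are appended to whichever side they are
        # routed to, so the left/right counts must include them explicitly
        if missing_go_to_left:
            n_left = pos - start + n_missing
            n_right = end_non_missing - pos
        else:
            n_left = pos - start
            n_right = end_non_missing - pos + n_missing
        return n_left, n_right

    # e.g. 10 non-missing samples split after the 4th, plus 2 missing routed left
    assert candidate_child_sizes(4, 0, 10, 2, True) == (6, 6)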
""" + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + check_is_fitted(self) # Check data X = self._validate_X_predict(X) @@ -1520,6 +1525,9 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1879,6 +1887,9 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -2232,6 +2243,9 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2576,6 +2590,9 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` From 5b7ce7e1c6842aac174ebc4b1b2a68a1f1e25a7d Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 12:51:20 -0700 Subject: [PATCH 27/28] Merging main Signed-off-by: Adam Li --- sklearn/tree/_classes.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 6825c36df155c..200f87b0b9ef3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1386,6 +1386,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1713,6 +1723,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1959,6 +1979,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. 
+ + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- max_features_ : int From 9655d013870e3007d5c5a1898212a9d0eeea0968 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 30 Jun 2023 13:03:26 -0700 Subject: [PATCH 28/28] Fix now Signed-off-by: Adam Li --- sklearn/ensemble/_forest.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 34bebab399566..768eeeaf1959f 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -63,13 +63,6 @@ class calls the ``fit`` method of each sub-estimator on random samples from sklearn.exceptions import DataConversionWarning from sklearn.metrics import accuracy_score, r2_score from sklearn.preprocessing import OneHotEncoder -from sklearn.tree import ( - BaseDecisionTree, - DecisionTreeClassifier, - DecisionTreeRegressor, - ExtraTreeClassifier, - ExtraTreeRegressor, -) from sklearn.utils import check_random_state, compute_sample_weight from sklearn.utils._openmp_helpers import _openmp_effective_n_threads from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions @@ -82,6 +75,13 @@ class calls the ``fit`` method of each sub-estimator on random samples check_is_fitted, ) +from ..tree import ( + BaseDecisionTree, + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ..tree._tree import DOUBLE, DTYPE __all__ = [