diff --git a/.circleci/config.yml b/.circleci/config.yml index eedc286a5a5f2..f58bbb0a42487 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -89,22 +89,23 @@ jobs: root: doc/_build/html paths: . - deploy: - docker: - - image: cimg/python:3.8.12 - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - # Attach documentation generated in the 'doc' step so that it can be - # deployed. - - attach_workspace: - at: doc/_build/html - - run: ls -ltrh doc/_build/html/stable - - deploy: - command: | - if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then - bash build_tools/circle/push_doc.sh doc/_build/html/stable - fi + # XXX: in order to make sure our fork passes all the CIs and not remove too many LOC, we don't want to deploy + # deploy: + # docker: + # - image: cimg/python:3.8.12 + # steps: + # - checkout + # - run: ./build_tools/circle/checkout_merge_commit.sh + # # Attach documentation generated in the 'doc' step so that it can be + # # deployed. + # - attach_workspace: + # at: doc/_build/html + # - run: ls -ltrh doc/_build/html/stable + # - deploy: + # command: | + # if [[ "${CIRCLE_BRANCH}" =~ ^main$|^[0-9]+\.[0-9]+\.X$ ]]; then + # bash build_tools/circle/push_doc.sh doc/_build/html/stable + # fi workflows: version: 2 diff --git a/.cirrus.star b/.cirrus.star index 8b3de0d10c532..2dd1e50144987 100644 --- a/.cirrus.star +++ b/.cirrus.star @@ -4,9 +4,9 @@ load("cirrus", "env", "fs", "http") def main(ctx): - # Only run for scikit-learn/scikit-learn. For debugging on a fork, you can + # Only run for neurodata/scikit-learn. For debugging on a fork, you can # comment out the following condition. - if env.get("CIRRUS_REPO_FULL_NAME") != "scikit-learn/scikit-learn": + if env.get("CIRRUS_REPO_FULL_NAME") != "neurodata/scikit-learn": return [] arm_wheel_yaml = "build_tools/cirrus/arm_wheel.yml" diff --git a/.github/workflows/check-changelog.yml b/.github/workflows/check-changelog.yml index d5bfc8ef0f430..53f64ba5c886b 100644 --- a/.github/workflows/check-changelog.yml +++ b/.github/workflows/check-changelog.yml @@ -10,12 +10,13 @@ jobs: check: name: A reviewer will let you know if it is required or can be bypassed runs-on: ubuntu-latest - if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 }} + if: ${{ contains(github.event.pull_request.labels.*.name, 'No Changelog Needed') == 0 && github.repository == 'scikit-learn/scikit-learn' }} steps: - name: Get PR number and milestone run: | echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV echo "TAGGED_MILESTONE=${{ github.event.pull_request.milestone.title }}" >> $GITHUB_ENV + echo "${{ github.repository }}" - uses: actions/checkout@v3 with: fetch-depth: '0' diff --git a/.github/workflows/check-manifest.yml b/.github/workflows/check-manifest.yml index 004cc452e385e..5ef9ce2213e90 100644 --- a/.github/workflows/check-manifest.yml +++ b/.github/workflows/check-manifest.yml @@ -7,7 +7,7 @@ on: jobs: check-manifest: # Don't run on forks - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/check-upstream.yml b/.github/workflows/check-upstream.yml new file mode 100644 index 0000000000000..80e8ace610607 --- /dev/null +++ b/.github/workflows/check-upstream.yml @@ -0,0 +1,27 @@ +# Create Github Actions workflow that checks upstream scikit-learn 'main' branch and +# creates or updates +# an existing pull request to 
https://github.com/neurodata/scikit-learn:fork. +# Runs the check weekly. +# Creates a pull request if there are changes. + +# name: Check upstream scikit-learn + +# on: +# schedule: +# - cron: '0 0 * * 0' + +# jobs: +# check-upstream: +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v2 +# - name: Check upstream scikit-learn +# uses: neurodata/check-upstream@main +# with: +# upstream: scikit-learn/scikit-learn +# fork: neurodata/scikit-learn +# branch: fork +# token: ${{ secrets.GITHUB_TOKEN }} + +# # Creates a pull request if there are changes. + diff --git a/.github/workflows/labeler-module.yml b/.github/workflows/labeler-module.yml index 468d3282903f2..8b4f39461b8da 100644 --- a/.github/workflows/labeler-module.yml +++ b/.github/workflows/labeler-module.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.1 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" max-labels: "3" @@ -27,7 +27,7 @@ jobs: steps: - uses: thomasjpfan/labeler@v2.5.1 continue-on-error: true - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' with: repo-token: "${{ secrets.GITHUB_TOKEN }}" configuration-path: ".github/labeler-file-extensions.yml" diff --git a/.github/workflows/update_tracking_issue.yml b/.github/workflows/update_tracking_issue.yml index 124ea1e8c6ac4..c176ce356a4cf 100644 --- a/.github/workflows/update_tracking_issue.yml +++ b/.github/workflows/update_tracking_issue.yml @@ -24,7 +24,7 @@ on: jobs: update_tracking_issue: runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' && github.event_name == 'schedule' + if: github.repository == 'neurodata/scikit-learn' && github.event_name == 'schedule' steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index b43f29ffa4f7f..4ab75fd361586 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -7,12 +7,12 @@ on: - cron: "42 3 */1 * *" push: branches: - - main + - fork # Release branches - "[0-9]+.[0-9]+.X" pull_request: branches: - - main + - fork - "[0-9]+.[0-9]+.X" # Manual run workflow_dispatch: @@ -26,7 +26,7 @@ jobs: check_build_trigger: name: Check build trigger runs-on: ubuntu-latest - if: github.repository == 'scikit-learn/scikit-learn' + if: github.repository == 'neurodata/scikit-learn' outputs: build: ${{ steps.check_build_trigger.outputs.build }} @@ -178,31 +178,8 @@ jobs: with: path: dist/*.tar.gz - # Upload the wheels and the source distribution - upload_anaconda: - name: Upload to Anaconda - runs-on: ubuntu-latest - needs: [build_wheels, build_sdist] - # The artifacts cannot be uploaded on PRs - if: github.event_name != 'pull_request' - - steps: - - name: Checkout scikit-learn - uses: actions/checkout@v3 - - - name: Download artifacts - uses: actions/download-artifact@v3 + - uses: actions/upload-artifact@v3 with: path: dist + name: ${{ matrix.python[0] }}-${{ matrix.os[1] }} - - name: Setup Python - uses: actions/setup-python@v4 - - - name: Upload artifacts - env: - # Secret variables need to be mapped to environment variables explicitly - SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_NIGHTLY_UPLOAD_TOKEN }} - SCIKIT_LEARN_STAGING_UPLOAD_TOKEN: ${{ secrets.SCIKIT_LEARN_STAGING_UPLOAD_TOKEN }} - ARTIFACTS_PATH: dist/artifact - # Force a replacement if the remote file already exists - run: 
bash build_tools/github/upload_anaconda.sh diff --git a/.gitignore b/.gitignore index f4601a15655a5..5296f46280e4d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ build sklearn/datasets/__config__.py sklearn/**/*.html +scikit_learn_tree.egg-info/* dist/ MANIFEST diff --git a/Makefile b/Makefile index e2ae6aa75ca94..4e685872a4c61 100644 --- a/Makefile +++ b/Makefile @@ -62,3 +62,6 @@ doc-noplot: inplace code-analysis: build_tools/linting.sh + +build-dev: + pip install --verbose --no-build-isolation --editable . diff --git a/README.rst b/README.rst index 80de41a8890a1..4d1b135400c3e 100644 --- a/README.rst +++ b/README.rst @@ -44,20 +44,40 @@ .. |PytestMinVersion| replace:: 7.1.2 .. |PlotlyMinVersion| replace:: 5.14.0 -.. image:: https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/doc/logos/scikit-learn-logo.png - :target: https://scikit-learn.org/ +================= +Scikit-learn-tree +================= -**scikit-learn** is a Python module for machine learning built on top of -SciPy and is distributed under the 3-Clause BSD license. - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. +``scikit-learn-tree`` is an alias of scikit-learn. It is a maintained fork of scikit-learn, which advances the tree submodule while staying in line +with changes from upstream scikit-learn. It is an exact stand-in for ``sklearn`` in package imports, but is +released under the name ``scikit-learn-tree`` to avoid confusion. It is currently maintained by a team of volunteers. -Website: https://scikit-learn.org +The upstream package **scikit-learn** is a Python module for machine learning built on top of +SciPy and is distributed under the 3-Clause BSD license. Refer to their website for all documentation +needs: https://scikit-learn.org. + +Why a fork? +----------- +Currently, the scikit-learn tree submodule is difficult to extend. Requests to modularize +and improve the extensibility of the code are currently unsupported, or may take a long time. +Advanced tree models that also leverage the robustness of scikit-learn are therefore desirable. + +However, "hard-forking" via copy/pasting the explicit Python/Cython code into another tree package +altogether is undesirable because it results in a tree codebase that is inherently different +and not compatible with ``scikit-learn``. For example, `quantile-forests `_, +and `EconML `_ do this, and their current tree submodules +cannot take advantage of improvements made in upstream ``scikit-learn``. + +An example of seamless integration would be `scikit-survival `_, which +only needs to implement a subclass of the Cython ``Criterion`` object in their code to enable survival trees. + +Maintaining a "soft-fork" of ``scikit-learn`` in the form of a repository fork allows us to develop +a separate package that serves as a stand-in for ``sklearn`` in any package, extends the tree submodule +and can also be synced with upstream changes in ``scikit-learn``. This enables the fork to always +take advantage of improvements made in ``scikit-learn`` main upstream, while providing a customizable +tree API. 
Installation ------------ @@ -65,7 +85,7 @@ Installation Dependencies ~~~~~~~~~~~~ -scikit-learn requires: +scikit-learn-tree requires: - Python (>= |PythonMinVersion|) - NumPy (>= |NumPyMinVersion|) @@ -73,132 +93,193 @@ scikit-learn requires: - joblib (>= |JoblibMinVersion|) - threadpoolctl (>= |ThreadpoolctlMinVersion|) -======= +============================ +Installing scikit-learn-tree +============================ + +Scikit-learn-tree is a maintained fork of scikit-learn, which extends the +tree submodule in a few ways documented in `fork_changelog`_. -**Scikit-learn 0.20 was the last version to support Python 2.7 and Python 3.4.** -scikit-learn 1.0 and later require Python 3.7 or newer. -scikit-learn 1.1 and later require Python 3.8 or newer. +We release versions of scikit-learn-tree in an analogous fashion to +scikit-learn main. Due to maintenance resources, we only release on PyPI +and therefore recommend installing with ``pip``. -Scikit-learn plotting capabilities (i.e., functions start with ``plot_`` and -classes end with "Display") require Matplotlib (>= |MatplotlibMinVersion|). -For running the examples Matplotlib >= |MatplotlibMinVersion| is required. -A few examples require scikit-image >= |Scikit-ImageMinVersion|, a few examples -require pandas >= |PandasMinVersion|, some examples require seaborn >= -|SeabornMinVersion| and plotly >= |PlotlyMinVersion|. +There are different ways to install scikit-learn-tree: -User installation -~~~~~~~~~~~~~~~~~ + * Install the latest official release `install_fork_release`_. This + is the best approach for most users. It will provide a stable version + and pre-built packages are available for most platforms. + + * Build the package from source `install_source`_. This is best for users who want the + latest-and-greatest features and aren't afraid of running + brand-new code. This is also needed for users who wish to contribute to the + project. -If you already have a working installation of numpy and scipy, -the easiest way to install scikit-learn is using ``pip``:: +.. _install_fork_release: - pip install -U scikit-learn +Installing the latest release +----------------------------- +We release wheels for common distributions, so the package is installable via pip. -or ``conda``:: + pip install scikit-learn-tree - conda install -c conda-forge scikit-learn +This will install ``scikit-learn-tree`` under the namespace of ``sklearn``, which can then +be used as a stand-in in any package that relies on the public API of ``sklearn``. -The documentation includes more detailed `installation instructions `_. +For example, any usage of ``scikit-learn`` is preserved with ``scikit-learn-tree`` + >>> # the sklearn installed is that of scikit-learn-tree and is equivalent to scikit-learn + >>> from sklearn.ensemble import RandomForestClassifier + >>> clf = RandomForestClassifier(random_state=0) + >>> X = [[ 1, 2, 3], # 2 samples, 3 features + ... [11, 12, 13]] + >>> y = [0, 1] # classes of each sample + >>> clf.fit(X, y) + RandomForestClassifier(random_state=0) -Changelog ---------- +.. _install_source: -See the `changelog `__ -for a history of notable changes to scikit-learn. +Building from source +-------------------- +If you are a developer interested in helping maintain the fork or in adding new +features, the building-from-source instructions are exactly the same +as those of scikit-learn main, so please refer to the `scikit-learn documentation `_ +for instructions on building from source. 
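However you install it (pre-built wheel or built from source), a quick way to confirm that the ``sklearn`` you import is this fork rather than upstream scikit-learn is to look for one of the constructor arguments that only this fork adds to the forest estimators (``max_bins`` and ``store_leaf_values``; see the fork changelog below). This is a minimal, unofficial smoke test, not a supported API check:

    >>> import inspect
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> params = inspect.signature(RandomForestClassifier).parameters
    >>> "max_bins" in params and "store_leaf_values" in params
    True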
+ +=========== Development ----------- -We welcome new contributors of all experience levels. The scikit-learn -community goals are to be helpful, welcoming, and effective. The +We welcome new contributors of all experience levels, especially to help maintain the fork. +Any contributions that keep our fork better aligned with scikit-learn upstream, +or improve the tree submodule in any way, will be appreciated. + +The scikit-learn community goals are to be helpful, welcoming, and effective. The `Development Guide `_ has detailed information about contributing code, documentation, tests, and more. We've included some basic information in this README. -Important links -~~~~~~~~~~~~~~~ - -- Official source code repo: https://github.com/scikit-learn/scikit-learn - Download releases: https://pypi.org/project/scikit-learn/ -- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues - -Source code -~~~~~~~~~~~ - -You can check the latest sources with the command:: - - git clone https://github.com/scikit-learn/scikit-learn.git - -Contributing -~~~~~~~~~~~~ - -To learn more about making a contribution to scikit-learn, please see our -`Contributing guide -`_. - -Testing -~~~~~~~ - -After installation, you can launch the test suite from outside the source -directory (you will need to have ``pytest`` >= |PyTestMinVersion| installed):: - - pytest sklearn - -See the web page https://scikit-learn.org/dev/developers/contributing.html#testing-and-improving-test-coverage -for more information. - - Random number generation can be controlled during testing by setting - the ``SKLEARN_SEED`` environment variable. - -Submitting a Pull Request -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Before opening a Pull Request, have a look at the -full Contributing page to make sure your code complies -with our guidelines: https://scikit-learn.org/stable/developers/index.html - -Project History ---------------- - -The project was started in 2007 by David Cournapeau as a Google Summer -of Code project, and since then many volunteers have contributed. See -the `About us `__ page -for a list of core contributors. - -The project is currently maintained by a team of volunteers. - -**Note**: `scikit-learn` was previously referred to as `scikits.learn`. 
- -Help and Support ---------------- - -Documentation -~~~~~~~~~~~~~ - -- HTML documentation (stable release): https://scikit-learn.org -- HTML documentation (development version): https://scikit-learn.org/dev/ -- FAQ: https://scikit-learn.org/stable/faq.html - -Communication -~~~~~~~~~~~~~ - -- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn -- Gitter: https://gitter.im/scikit-learn/scikit-learn -- Logos & Branding: https://github.com/scikit-learn/scikit-learn/tree/main/doc/logos -- Blog: https://blog.scikit-learn.org -- Calendar: https://blog.scikit-learn.org/calendar/ -- Twitter: https://twitter.com/scikit_learn -- Stack Overflow: https://stackoverflow.com/questions/tagged/scikit-learn -- Github Discussions: https://github.com/scikit-learn/scikit-learn/discussions -- Website: https://scikit-learn.org -- LinkedIn: https://www.linkedin.com/company/scikit-learn -- YouTube: https://www.youtube.com/channel/UCJosFjYm0ZYVUARxuOZqnnw/playlists -- Facebook: https://www.facebook.com/scikitlearnofficial/ -- Instagram: https://www.instagram.com/scikitlearnofficial/ -- TikTok: https://www.tiktok.com/@scikit.learn - -Citation -~~~~~~~~ - -If you use scikit-learn in a scientific publication, we would appreciate citations: https://scikit-learn.org/stable/about.html#citing-scikit-learn +.. _fork_changelog: + +Major Changes of the Fork +------------------------- + +The purpose of this page is to illustrate some of the main features that +``scikit-learn-tree`` provides compared to ``scikit-learn``. It assumes +an understanding of the core ``scikit-learn`` package and of decision tree +models. Please refer to our installation instructions `install_fork_release`_ for installing ``scikit-learn-tree``. + +Scikit-learn-tree operates as a stand-in for upstream ``scikit-learn``. +It is used in packages exactly the same way and will support all features +in the corresponding version of ``scikit-learn``. For example, if you +are interested in features of ``scikit-learn`` v1.2.2 for the ``NearestNeighbors`` algorithm, +and ``scikit-learn-tree`` has a v1.2.2 release, then it will have +all those features. + +The breaking API changes are limited to the ``tree`` submodule +and related Forest ensemble models. See below for a detailed list of breaking changes. + +See: https://scikit-learn.org/ for documentation on scikit-learn main. + +Our Philosophy +-------------- +Our design philosophy with this fork of ``scikit-learn`` is to maintain as few changes +as possible, such that incorporating upstream changes into the fork requires minimal effort. + +Candidate changes and PRs accepted into the fork are those that: + +- improve compatibility with upstream ``scikit-learn`` main +- enable improved extensibility of tree models + +Decision tree generalizations +----------------------------- + +``Scikit-learn`` provides an axis-aligned `sklearn.tree.DecisionTreeClassifier `_ +decision tree model (classifier and regressor), which has a few fundamental limitations +that prevent 3rd parties from utilizing the existing class without forking a large +amount of copy/pasted Python and Cython code. We highlight those limitations here +and then describe how we address each of them. + +Cython Internal Private API: + +Note that the Cython API for scikit-learn is still not a publicly supported API, so it may +change without warning. + +- leaf and split nodes: These nodes are treated the same way and there is no internal + API for setting them differently. 
Quantile trees and causal trees inherently generalize + how leaf nodes are set. +- Criterion class: The criterion class currently assumes a supervised learning interface. + - Our fix: We implement a ``BaseCriterion`` object that provides an abstract API for unsupervised criteria. +- Splitter class: The splitter class currently assumes a supervised learning interface and + does not provide a way of generalizing the way split candidates are proposed. + - Our fix: We implement a ``BaseSplitter`` object that provides an abstract API for unsupervised splitters and also implement an API to allow generalizations of the ``SplitRecord`` struct and ``Splitter.node_split`` function. For example, this enables oblique splits to be considered. +- Tree class: The tree class currently assumes a supervised learning interface and does not + provide a way of generalizing the type of tree. + - Our fix: We implement a ``BaseTree`` object that provides an abstract API for general tree models and also implement an API that allows generalization of the type of tree. For example, oblique trees are trivially implementable as an extension now. +- stopping conditions for splitter: Currently, the ``Splitter.node_split`` function has various + stopping conditions for the splitter based on hyperparameters. It is plausible that these conditions + may be extended. For example, in causal trees, one may want the splitter to also account for + a minimal degree of heterogeneity (i.e. variance) in its child nodes. + +Python API: + +- ``sklearn.tree.BaseDecisionTree`` assumes the underlying tree model is supervised: The ``y`` + parameter is required to be passed in, which is not necessary for general tree-based models. + For example, an unsupervised tree may pass in ``y=None``. + - Our fix: We fix this API, so the ``BaseDecisionTree`` is subclassable by unsupervised tree models that do not require ``y`` to be defined. +- ``sklearn.tree.BaseDecisionTree`` does not provide a way to generalize the ``Criterion``, ``Splitter`` + and ``Tree`` Cython classes used: The current codebase requires users to define custom + criterion and/or splitters outside the instantiation of the ``BaseDecisionTree``. This prevents + users from generalizing the ``Criterion`` and ``Splitter`` and creating a neat Python API wrapper. + Moreover, the ``Tree`` class is not customizable. + - Our fix: We internally implement a private function to actually build the entire tree, ``BaseDecisionTree._build_tree``, which can be overridden in subclasses that customize the criterion, splitter, or tree, or any combination of them. +- ``sklearn.ensemble.BaseForest`` and its subclass algorithms are slow when ``n_samples`` is very high. Binning + features into a histogram, which is the basis of "LightGBM" and "HistGradientBoostingClassifier", is a computational + trick that can both significantly increase runtime efficiency and help prevent overfitting in trees, since + the sorting in "BestSplitter" is done on bins rather than the continuous feature values. This would enable + random forests and their variants to scale to millions of samples. + - Our fix: We added a ``max_bins=None`` keyword argument to the ``BaseForest`` class and all its subclasses. The default behavior is no binning. A usage sketch follows this list. The current implementation is not necessarily efficient. There are several improvements to be made. See below. 
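To make the ``max_bins`` addition in the last item concrete, and to show the related ``store_leaf_values`` flag that this patch also adds to the forest estimators, here is a minimal usage sketch. It only uses calls that the new tests in this patch (``test_classification_toy_withbins`` and ``test_multioutput_quantiles``) also exercise, and it assumes the fork is installed as ``sklearn``; treat it as an illustration of the intended API rather than a stability guarantee::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    # 1) Histogram binning: max_bins=None (the default) keeps upstream behaviour;
    #    an integer value bins X with the _BinMapper used by HistGradientBoosting*
    #    before the trees are grown, and X is re-binned internally at predict time.
    X, y = make_classification(n_samples=5000, n_features=20, random_state=0)
    clf = RandomForestClassifier(n_estimators=100, max_bins=255, random_state=0)
    clf.fit(X, y)
    print(clf.predict_proba(X[:5]).shape)  # (5, 2)

    # 2) Leaf-node storage and quantile prediction (multi-output regression,
    #    mirroring the new test_multioutput_quantiles test in this patch).
    X_train = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
    y_train = np.array([[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1]])
    reg = RandomForestRegressor(
        n_estimators=10, bootstrap=False, store_leaf_values=True, random_state=0
    )
    reg.fit(X_train, y_train)
    y_quant = reg.predict_quantiles(X_train[:2], quantiles=[0.25, 0.5, 0.75])
    print(y_quant.shape)  # (2, 3, 2): samples x quantiles x outputs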
+ +Overall, the existing tree models, such as `sklearn.tree.DecisionTreeClassifier `_ +and `sklearn.ensemble.RandomForestClassifier `_, all work exactly the same as they +would in ``scikit-learn`` main, but these extensions enable 3rd-party packages to extend +the Cython/Python API easily. + +Roadmap +------- +There are several improvements that can be made in this fork. Primarily, the binning feature +promises to make Random Forests and their variants ultra-fast. However, the binning needs +to be implemented in a similar fashion to ``HistGradientBoostingClassifier``, which passes +in the binning thresholds throughout the tree construction step, such that the split nodes +store the actual numerical value of the bin rather than the "bin index". This requires +modifying the tree Cython code to take in a ``binning_thresholds`` parameter that is part +of the ``_BinMapper`` fitted class. This also allows us not to do any binning during prediction/apply +time because the tree already stores the "numerical" threshold value we would want to apply +to any incoming ``X`` that is not binned. + +Besides that modification, the tree and splitter need to be able to handle not just ``np.float32`` +data (the type for X normally in Random Forests), but also ``uint8`` data (the type for X when it +is binned into e.g. 255 bins). This would not only save RAM, since ``uint8`` storage of millions +of samples would result in many GB saved, but also improve runtime. + +So in summary, the Cython code of the tree submodule needs to take in an extra parameter for +the binning thresholds if binning occurs and also be able to handle ``X`` being of dtype ``uint8``. +Afterwards, Random Forests will have fully leveraged the binning feature. + +Something to keep in mind is that upstream scikit-learn is actively working on incorporating +missing-value handling and categorical handling into Random Forests. + +Next steps +---------- + +We have briefly covered how the tree submodule has changed with respect to ``scikit-learn``. +This enables packages to leverage these changes in developing more complex tree models +that may, or may not, eventually be PRed into ``scikit-learn``. For example, + +- `scikit-tree `_ is a scikit-learn + compatible package for more complex and advanced tree models. + +If you are developing tree models, we encourage you to take a look at that package, or +if you have suggestions to make the tree submodule of our fork, ``scikit-learn-tree`` +more diff --git a/build_tools/azure/install_win.sh b/build_tools/azure/install_win.sh index ab559a1878971..011e962885d45 100755 --- a/build_tools/azure/install_win.sh +++ b/build_tools/azure/install_win.sh @@ -22,4 +22,4 @@ show_installed_libraries python setup.py bdist_wheel # Install the generated wheel package to test it -pip install --pre --no-index --find-links dist scikit-learn +pip install --pre --no-index --find-links dist scikit-learn-tree diff --git a/build_tools/github/repair_windows_wheels.sh b/build_tools/github/repair_windows_wheels.sh index cdd0c0c79d8c4..a857e61067960 100755 --- a/build_tools/github/repair_windows_wheels.sh +++ b/build_tools/github/repair_windows_wheels.sh @@ -9,7 +9,7 @@ DEST_DIR=$2 # By default, the Windows wheels are not repaired. 
# In this case, we need to vendor VCRUNTIME140.dll wheel unpack "$WHEEL" -WHEEL_DIRNAME=$(ls -d scikit_learn-*) +WHEEL_DIRNAME=$(ls -d scikit_learn_tree-*) python build_tools/github/vendor.py "$WHEEL_DIRNAME" wheel pack "$WHEEL_DIRNAME" -d "$DEST_DIR" rm -rf "$WHEEL_DIRNAME" diff --git a/doc/conf.py b/doc/conf.py index db69cfedd48a3..7c52a20014d1b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -124,7 +124,8 @@ # source_encoding = 'utf-8' # The main toctree document. -root_doc = "contents" +# root_doc = "contents" +root_doc = "index" # General information about the project. project = "scikit-learn" diff --git a/doc/modules/tree.rst b/doc/modules/tree.rst index f7d43c5a3d7da..b6c76309108ad 100644 --- a/doc/modules/tree.rst +++ b/doc/modules/tree.rst @@ -141,7 +141,7 @@ Once trained, you can plot the tree with the :func:`plot_tree` function:: >>> tree.plot_tree(clf) [...] -.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_002.png +.. figure:: ../auto_examples/tree/images/sphx_glr_plot_iris_dtc_003.png :target: ../auto_examples/tree/plot_iris_dtc.html :scale: 75 :align: center @@ -331,6 +331,8 @@ total cost over the entire trees (by summing the cost at each node) of :math:`O(n_{features}n_{samples}^{2}\log(n_{samples}))`. +.. _tree_tips_usage: + Tips on practical use ===================== @@ -671,11 +673,66 @@ be pruned. This process stops when the pruned tree's minimal * :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` +Classification, regression and multi-output problems +---------------------------------------------------- + +OTs can be used for both classification and regression, and can handle multi-output +problems in the same manner as DTs. + +Complexity +---------- + +The runtime cost to construct an OT will be similar to that of a DT, with the +added complexity of a (possibly sparse) matrix multiplication to combine random +data columns into candidate split values. The cost at each node is +:math:`O(n_{features}n_{samples}\log(n_{samples}) + n_{features}n_{samples}max\_features \lambda)` +where the additional :math:`n_{features}n_{samples}max\_features \lambda` term +comes from the (possibly sparse) matrix multiplication controlled by both the +number of candidate splits to generate ("max_features") and the sparsity of +the projection matrix that combines the data features (":math:`\lambda`"). + +Another consideration is space complexity. + +Storing the OT pickled on disk also takes more space: OTs +at every node need to store an additional vector of feature indices and a vector of +feature weights that are used together to form the candidate splits. + +Tips on practical use +--------------------- + +Similar to DTs, the intuition for most parameters is the same. Therefore refer +to :ref:`tips for using decision trees ` for information on standard +tree parameters. Specific parameters, such as ``max_features`` and +``feature_combinations``, are different or specific to OTs. + + * As specified earlier, ``max_features`` is not constrained to ``n_features`` + as it is in DTs. Setting ``max_features`` higher requires more computation time because + the algorithm needs to sample more candidate splits at every node. However, it also potentially + lets the user sample more informative splits, thereby improving the model fit. This + presents a tradeoff between runtime resources and improvements to the model. 
In practice, + we found that sampling more splits, say up to ``max_features=n_features**2``, is desirable + if one is willing to spend the computational resources. + + * ``feature_combinations`` is the :math:`\lambda` term presented in the complexity + analysis, which specifies how sparse our combination of features is. If + ``feature_combinations=n_features``, then OT is the ``Forest-RC`` version. However, + in practice, ``feature_combinations`` can be set much lower, thereby improving runtime + and reducing storage costs. + +Finally, when asking when to use OTs vs. DTs, scikit-learn recommends +always trying both models using some type of cross-validation procedure and hyperparameter +optimization (e.g. `GridSearchCV`). If one has prior knowledge about how the data is +distributed along its features, such as data being axis-aligned, then one might use a DT. +Other considerations are runtime and space complexity. + .. topic:: References: .. [BRE] L. Breiman, J. Friedman, R. Olshen, and C. Stone. Classification and Regression Trees. Wadsworth, Belmont, CA, 1984. - + + .. [RF] L. Breiman. Random Forests. Machine Learning 45, 5–32 (2001). + https://doi.org/10.1023/A:1010933404324. + * https://en.wikipedia.org/wiki/Decision_tree_learning * https://en.wikipedia.org/wiki/Predictive_analytics diff --git a/examples/tree/plot_iris_dtc.py b/examples/tree/plot_iris_dtc.py index b3d834da5d067..99b9e6b18b109 100644 --- a/examples/tree/plot_iris_dtc.py +++ b/examples/tree/plot_iris_dtc.py @@ -2,16 +2,12 @@ ======================================================================= Plot the decision surface of decision trees trained on the iris dataset ======================================================================= - Plot the decision surface of a decision tree trained on pairs of features of the iris dataset. - See :ref:`decision tree ` for more information on the estimator. - For each pair of iris features, the decision tree learns decision boundaries made of combinations of simple thresholding rules inferred from the training samples. - We also show the tree structure of a model built on all of the features. """ # %% diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 5af738f5f841f..c41883aa5c37a --- a/setup.py +++ b/setup.py @@ -29,19 +29,19 @@ builtins.__SKLEARN_SETUP__ = True -DISTNAME = "scikit-learn" -DESCRIPTION = "A set of python modules for machine learning and data mining" +DISTNAME = "scikit-learn-tree" +DESCRIPTION = "A maintained fork of scikit-learn that extends the tree submodule." 
with open("README.rst") as f: LONG_DESCRIPTION = f.read() -MAINTAINER = "Andreas Mueller" -MAINTAINER_EMAIL = "amueller@ais.uni-bonn.de" +MAINTAINER = "Adam Li" +MAINTAINER_EMAIL = "adam.li@columbia.edu" URL = "http://scikit-learn.org" -DOWNLOAD_URL = "https://pypi.org/project/scikit-learn/#files" +DOWNLOAD_URL = "https://pypi.org/project/scikit-learn-tree/#files" LICENSE = "new BSD" PROJECT_URLS = { - "Bug Tracker": "https://github.com/scikit-learn/scikit-learn/issues", + "Bug Tracker": "https://github.com/neurodata/scikit-learn/issues", "Documentation": "https://scikit-learn.org/stable/documentation.html", - "Source Code": "https://github.com/scikit-learn/scikit-learn", + "Source Code": "https://github.com/neurodata/scikit-learn", } # We can actually import a restricted version of sklearn that @@ -169,11 +169,11 @@ def check_package_status(package, min_version): package_status["up_to_date"] = False package_status["version"] = "" - req_str = "scikit-learn requires {} >= {}.\n".format(package, min_version) + req_str = "scikit-learn-tree requires {} >= {}.\n".format(package, min_version) instructions = ( "Installation instructions are available on the " - "scikit-learn website: " + "scikit-learn-tree website: " "http://scikit-learn.org/stable/install.html\n" ) @@ -225,10 +225,10 @@ def check_package_status(package, min_version): {"sources": ["_cdnmf_fast.pyx"], "include_np": True}, ], "ensemble": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, ], "ensemble._hist_gradient_boosting": [ - {"sources": ["_gradient_boosting.pyx"], "include_np": True}, + {"sources": ["_gradient_boosting.pyx"], "language": "c++", "include_np": True}, {"sources": ["histogram.pyx"], "include_np": True}, {"sources": ["splitting.pyx"], "include_np": True}, {"sources": ["_binning.pyx"], "include_np": True}, @@ -309,7 +309,7 @@ def check_package_status(package, min_version): {"sources": ["_ball_tree.pyx"], "include_np": True}, {"sources": ["_kd_tree.pyx"], "include_np": True}, {"sources": ["_partition_nodes.pyx"], "language": "c++", "include_np": True}, - {"sources": ["_quad_tree.pyx"], "include_np": True}, + {"sources": ["_quad_tree.pyx"], "language": "c++", "include_np": True}, ], "svm": [ { @@ -377,9 +377,24 @@ def check_package_status(package, min_version): "include_np": True, "optimization_level": "O3", }, - {"sources": ["_splitter.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_criterion.pyx"], "include_np": True, "optimization_level": "O3"}, - {"sources": ["_utils.pyx"], "include_np": True, "optimization_level": "O3"}, + { + "sources": ["_splitter.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_criterion.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, + { + "sources": ["_utils.pyx"], + "include_np": True, + "language": "c++", + "optimization_level": "O3", + }, ], "utils": [ {"sources": ["sparsefuncs_fast.pyx"], "include_np": True}, diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index df8ecc974dd34..768eeeaf1959f 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -43,13 +43,14 @@ class calls the ``fit`` method of each sub-estimator on random samples import threading from abc import ABCMeta, abstractmethod from numbers import Integral, Real +from time import time from warnings import catch_warnings, simplefilter, warn import numpy as np from 
scipy.sparse import hstack as sparse_hstack from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( ClassifierMixin, MultiOutputMixin, RegressorMixin, @@ -57,9 +58,23 @@ class calls the ``fit`` method of each sub-estimator on random samples _fit_context, is_classifier, ) -from ..exceptions import DataConversionWarning -from ..metrics import accuracy_score, r2_score -from ..preprocessing import OneHotEncoder +from sklearn.ensemble._base import BaseEnsemble, _partition_estimators +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper +from sklearn.exceptions import DataConversionWarning +from sklearn.metrics import accuracy_score, r2_score +from sklearn.preprocessing import OneHotEncoder +from sklearn.utils import check_random_state, compute_sample_weight +from sklearn.utils._openmp_helpers import _openmp_effective_n_threads +from sklearn.utils._param_validation import Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.parallel import Parallel, delayed +from sklearn.utils.validation import ( + _check_feature_names_in, + _check_sample_weight, + _num_samples, + check_is_fitted, +) + from ..tree import ( BaseDecisionTree, DecisionTreeClassifier, @@ -68,17 +83,6 @@ class calls the ``fit`` method of each sub-estimator on random samples ExtraTreeRegressor, ) from ..tree._tree import DOUBLE, DTYPE -from ..utils import check_random_state, compute_sample_weight -from ..utils._param_validation import Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets, type_of_target -from ..utils.parallel import Parallel, delayed -from ..utils.validation import ( - _check_feature_names_in, - _check_sample_weight, - _num_samples, - check_is_fitted, -) -from ._base import BaseEnsemble, _partition_estimators __all__ = [ "RandomForestClassifier", @@ -213,6 +217,11 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): Interval(RealNotInt, 0.0, 1.0, closed="right"), Interval(Integral, 1, None, closed="left"), ], + "max_bins": [ + None, + Interval(Integral, 1, None, closed="left"), + ], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -231,6 +240,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -247,6 +258,8 @@ def __init__( self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.max_bins = max_bins + self.store_leaf_values = store_leaf_values def apply(self, X): """ @@ -266,6 +279,15 @@ def apply(self, X): return the index of the leaf x ends up in. """ X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + results = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, @@ -424,6 +446,38 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) + if self.max_bins is not None: + # `_openmp_effective_n_threads` is used to take cgroups CPU quotes + # into account when determine the maximum number of threads to use. + n_threads = _openmp_effective_n_threads() + + # Bin the data + # For ease of use of the API, the user-facing GBDT classes accept the + # parameter max_bins, which doesn't take into account the bin for + # missing values (which is always allocated). However, since max_bins + # isn't the true maximal number of bins, all other private classes + # (binmapper, histbuilder...) accept n_bins instead, which is the + # actual total number of bins. Everywhere in the code, the + # convention is that n_bins == max_bins + 1 + n_bins = self.max_bins + 1 # + 1 for missing values + self._bin_mapper = _BinMapper( + n_bins=n_bins, + # is_categorical=self.is_categorical_, + known_categories=None, + random_state=random_state, + n_threads=n_threads, + ) + + # XXX: in order for this to work with the underlying tree submodule's Cython + # code, we need to convert this into the original data's DTYPE because + # the Cython code assumes that `DTYPE` is used. + # The proper implementation will be a lot more complicated and should be + # tackled once scikit-learn has finalized their inclusion of missing data + # and categorical support for decision trees + X = self._bin_data(X, is_training_data=True) # .astype(DTYPE) + else: + self._bin_mapper = None + if n_more_estimators < 0: raise ValueError( "n_estimators=%d must be larger or equal to " @@ -636,6 +690,169 @@ def feature_importances_(self): all_importances = np.mean(all_importances, axis=0, dtype=np.float64) return all_importances / np.sum(all_importances) + def _bin_data(self, X, is_training_data): + """Bin data X. + + If is_training_data, then fit the _bin_mapper attribute. + Else, the binned data is converted to a C-contiguous array. + """ + description = "training" if is_training_data else "validation" + if self.verbose: + print( + "Binning {:.3f} GB of {} data: ".format(X.nbytes / 1e9, description), + end="", + flush=True, + ) + tic = time() + if is_training_data: + X_binned = self._bin_mapper.fit_transform(X) # F-aligned array + else: + X_binned = self._bin_mapper.transform(X) # F-aligned array + # We convert the array to C-contiguous since predicting is faster + # with this layout (training is faster on F-arrays though) + X_binned = np.ascontiguousarray(X_binned) + toc = time() + if self.verbose: + duration = toc - tic + print("{:.3f} s".format(duration)) + + return X_binned + + def predict_quantiles(self, X, quantiles=0.5, method="nearest"): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + + Returns + ------- + y : ndarray of shape (n_samples, n_quantiles, [n_outputs]) + The predicted values. The ``n_outputs`` dimension is present only + for multi-output regressors. 
+ """ + if not self.store_leaf_values: + raise RuntimeError( + "Quantile prediction is not available when store_leaf_values=False" + ) + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + # n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + if self.n_outputs_ > 1: + y_hat = np.zeros( + (X.shape[0], len(quantiles), self.n_outputs_), dtype=np.float64 + ) + else: + y_hat = np.zeros((X.shape[0], len(quantiles)), dtype=np.float64) + + # get (n_samples, n_estimators) indicator of leaf nodes + X_leaves = self.apply(X) + + # we now want to aggregate all leaf samples across all trees for each sample + for idx in range(X.shape[0]): + # get leaf nodes for this sample + leaf_nodes = X_leaves[idx, :] + + # (n_total_leaf_samples, n_outputs) + leaf_node_samples = np.vstack( + ( + est.leaf_nodes_samples_[leaf_nodes[jdx]] + for jdx, est in enumerate(self.estimators_) + ) + ) + + # get quantiles across all leaf node samples + y_hat[idx, ...] = np.quantile( + leaf_node_samples, quantiles, axis=0, interpolation=method + ) + + if is_classifier(self): + if self.n_outputs_ == 1: + for i in range(len(quantiles)): + class_pred_per_sample = y_hat[idx, i, :].squeeze().astype(int) + y_hat[idx, ...] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + else: + for k in range(self.n_outputs_): + for i in range(len(quantiles)): + class_pred_per_sample = ( + y_hat[idx, i, k].squeeze().astype(int) + ) + y_hat[idx, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + return y_hat + + def get_leaf_node_samples(self, X): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + + Returns + ------- + leaf_node_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array-like has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "Leaf node samples are not available when store_leaf_values=False" + ) + + check_is_fitted(self) + # Check data + X = self._validate_X_predict(X) + + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. 
+ if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + result = Parallel(n_jobs=n_jobs, verbose=self.verbose)( + delayed(_accumulate_leaf_nodes_samples)(e.get_leaf_node_samples, X) + for e in self.estimators_ + ) + leaf_nodes_samples = result[0] + for result_ in result[1:]: + for i, node_samples in enumerate(result_): + leaf_nodes_samples[i] = np.vstack((leaf_nodes_samples[i], node_samples)) + return leaf_nodes_samples + def _accumulate_prediction(predict, X, out, lock): """ @@ -653,6 +870,17 @@ def _accumulate_prediction(predict, X, out, lock): out[i] += prediction[i] +def _accumulate_leaf_nodes_samples(func, X): + """ + This is a utility function for joblib's Parallel. + + It can't go locally in ForestClassifier or ForestRegressor, because joblib + complains that it cannot pickle it when placed there. + """ + leaf_nodes_samples = func(X, check_input=False) + return leaf_nodes_samples + + class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ Base class for forest of trees-based classifiers. @@ -677,6 +905,8 @@ def __init__( class_weight=None, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=estimator, @@ -691,6 +921,8 @@ def __init__( class_weight=class_weight, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) @staticmethod @@ -864,6 +1096,14 @@ def predict_proba(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -945,6 +1185,8 @@ def __init__( warm_start=False, max_samples=None, base_estimator="deprecated", + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator, @@ -958,6 +1200,8 @@ def __init__( warm_start=warm_start, max_samples=max_samples, base_estimator=base_estimator, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) def predict(self, X): @@ -983,6 +1227,14 @@ def predict(self, X): # Check data X = self._validate_X_predict(X) + # if we trained a binning tree, then we should re-bin the data + # XXX: this is inefficient and should be improved to be in line with what + # the Histogram Gradient Boosting Tree does, where the binning thresholds + # are passed into the tree itself, thus allowing us to set the node feature + # value thresholds within the tree itself. + if self.max_bins is not None: + X = self._bin_data(X, is_training_data=False).astype(DTYPE) + # Assign chunk of trees to jobs n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) @@ -1273,6 +1525,9 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
+ Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier` @@ -1413,6 +1668,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeClassifier(), @@ -1428,6 +1685,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1437,6 +1695,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1627,6 +1887,9 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor` @@ -1754,6 +2017,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=DecisionTreeRegressor(), @@ -1769,6 +2034,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -1777,6 +2043,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -1975,6 +2243,9 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. + Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreesClassifier` @@ -2104,6 +2375,8 @@ def __init__( class_weight=None, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeClassifier(), @@ -2119,6 +2392,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2128,6 +2402,8 @@ def __init__( warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2314,6 +2590,9 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the leaf values in the ``get_leaf_node_samples`` function. 
+ Attributes ---------- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` @@ -2426,6 +2705,8 @@ def __init__( warm_start=False, ccp_alpha=0.0, max_samples=None, + max_bins=None, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2441,6 +2722,7 @@ def __init__( "min_impurity_decrease", "random_state", "ccp_alpha", + "store_leaf_values", ), bootstrap=bootstrap, oob_score=oob_score, @@ -2449,6 +2731,8 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=max_samples, + max_bins=max_bins, + store_leaf_values=store_leaf_values, ) self.criterion = criterion @@ -2674,6 +2958,7 @@ def __init__( random_state=None, verbose=0, warm_start=False, + store_leaf_values=False, ): super().__init__( estimator=ExtraTreeRegressor(), @@ -2688,6 +2973,7 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", + "store_leaf_values", ), bootstrap=False, oob_score=False, @@ -2696,6 +2982,7 @@ def __init__( verbose=verbose, warm_start=warm_start, max_samples=None, + store_leaf_values=store_leaf_values, ) self.max_depth = max_depth diff --git a/sklearn/ensemble/_gb.py b/sklearn/ensemble/_gb.py index 777e1a18d8396..21acb6bfe7693 100644 --- a/sklearn/ensemble/_gb.py +++ b/sklearn/ensemble/_gb.py @@ -136,6 +136,7 @@ class BaseGradientBoosting(BaseEnsemble, metaclass=ABCMeta): "n_iter_no_change": [Interval(Integral, 1, None, closed="left"), None], "tol": [Interval(Real, 0.0, None, closed="left")], } + _parameter_constraints.pop("store_leaf_values") _parameter_constraints.pop("splitter") @abstractmethod diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 15d2999b5ef4d..9291b6982a923 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -115,6 +115,120 @@ FOREST_CLASSIFIERS_REGRESSORS.update(FOREST_REGRESSORS) +def _sparse_parity(n, p=20, p_star=3, random_state=None): + """Generate sparse parity dataset. + + Sparse parity is a multivariate generalization of the + XOR problem. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset, by default 20 + p_star : int, optional + The number of informative dimensions, by default 3. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Sparse parity dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + y[i] = sum(X[i, :p_star] > 0) % 2 + + return X, y + + +def _orthant(n, p=6, random_state=None): + """Generate orthant dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 6. + rec : int, optional + _description_, by default 1 + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Orthant dataset as a dense array. 
+ y : np.ndarray of shape (n,) + Labels of the dataset + """ + rng = np.random.RandomState(seed=random_state) + orth_labels = np.asarray([2**i for i in range(0, p)][::-1]) + + X = rng.uniform(-1, 1, (n, p)) + y = np.zeros(n) + + for i in range(0, n): + idx = np.where(X[i, :] > 0)[0] + y[i] = sum(orth_labels[idx]) + + if len(np.unique(y)) < 2**p: + raise RuntimeError("Increase sample size to get a label in each orthant.") + + return X, y + + +def _trunk(n, p=10, random_state=None): + """Generate trunk dataset. + + Parameters + ---------- + n : int + Number of sample to generate. + p : int, optional + The dimensionality of the dataset and the number of + unique labels, by default 10. + random_state : Random State, optional + Random state, by default None. + + Returns + ------- + X : np.ndarray of shape (n, p) + Trunk dataset as a dense array. + y : np.ndarray of shape (n,) + Labels of the dataset + + References + ---------- + [1] Gerard V. Trunk. A problem of dimensionality: A + simple example. IEEE Transactions on Pattern Analysis + and Machine Intelligence, 1(3):306–307, 1979. + """ + rng = np.random.RandomState(seed=random_state) + + mu_1 = np.array([1 / i for i in range(1, p + 1)]) + mu_0 = -1 * mu_1 + cov = np.identity(p) + + X = np.vstack( + ( + rng.multivariate_normal(mu_0, cov, int(n / 2)), + rng.multivariate_normal(mu_1, cov, int(n / 2)), + ) + ) + y = np.concatenate((np.zeros(int(n / 2)), np.ones(int(n / 2)))) + return X, y + + def check_classification_toy(name): """Check classification on a toy dataset.""" ForestClassifier = FOREST_CLASSIFIERS[name] @@ -1809,3 +1923,111 @@ def test_round_samples_to_one_when_samples_too_low(class_weight): n_estimators=10, max_samples=1e-4, class_weight=class_weight, random_state=0 ) forest.fit(X, y) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classification_toy_withbins(name): + """Check classification on a toy dataset.""" + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier(n_estimators=10, random_state=1, max_bins=255) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + clf = ForestClassifier( + n_estimators=10, max_features=1, random_state=1, max_bins=255 + ) + clf.fit(X, y) + assert_array_equal(clf.predict(T), true_result) + assert 10 == len(clf) + + # also test apply + leaf_indices = clf.apply(X) + assert leaf_indices.shape == (len(X), clf.n_estimators) + + +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) +def test_regression_criterion_withbins(name, criterion): + # Check consistency on regression dataset. + ForestRegressor = FOREST_REGRESSORS[name] + + reg = ForestRegressor( + n_estimators=5, criterion=criterion, random_state=1, max_bins=250 + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s and score = %f" % ( + criterion, + score, + ) + + reg = ForestRegressor( + n_estimators=5, + criterion=criterion, + max_features=6, + random_state=1, + max_bins=250, + ) + reg.fit(X_reg, y_reg) + score = reg.score(X_reg, y_reg) + assert score > 0.92, "Failed with max_features=6, criterion %s and score = %f" % ( + criterion, + score, + ) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_multioutput_quantiles(name): + # Check estimators on multi-output problems. 
+ X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + est = FOREST_ESTIMATORS[name]( + random_state=0, bootstrap=False, store_leaf_values=True + ) + est.fit(X_train, y_train) + + y_pred = est.predict_quantiles(X_test, quantiles=[0.25, 0.5, 0.75]) + assert_array_almost_equal(y_pred[:, 1, :], y_test) + assert_array_almost_equal(y_pred[:, 0, :], y_test) + assert_array_almost_equal(y_pred[:, 2, :], y_test) + + # test the leaf nodes samples + leaf_nodes_samples = est.get_leaf_node_samples(X_test) + assert len(leaf_nodes_samples) == len(X_test) + for node_samples in leaf_nodes_samples: + assert node_samples.shape[1] == est.n_outputs_ diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 1721cd891c302..200f87b0b9ef3 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -24,7 +24,7 @@ import numpy as np from scipy.sparse import issparse -from ..base import ( +from sklearn.base import ( BaseEstimator, ClassifierMixin, MultiOutputMixin, @@ -33,18 +33,19 @@ clone, is_classifier, ) -from ..utils import Bunch, check_random_state, compute_sample_weight -from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions -from ..utils.multiclass import check_classification_targets -from ..utils.validation import ( +from sklearn.utils import Bunch, check_random_state, compute_sample_weight +from sklearn.utils._param_validation import Hidden, Interval, RealNotInt, StrOptions +from sklearn.utils.multiclass import check_classification_targets +from sklearn.utils.validation import ( _assert_all_finite_element_wise, _check_sample_weight, assert_all_finite, check_is_fitted, ) + from . import _criterion, _splitter, _tree -from ._criterion import Criterion -from ._splitter import Splitter +from ._criterion import BaseCriterion +from ._splitter import BaseSplitter from ._tree import ( BestFirstTreeBuilder, DepthFirstTreeBuilder, @@ -122,6 +123,7 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None], "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], + "store_leaf_values": ["boolean"], } @abstractmethod @@ -140,6 +142,7 @@ def __init__( min_impurity_decrease, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): self.criterion = criterion self.splitter = splitter @@ -153,6 +156,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease self.class_weight = class_weight self.ccp_alpha = ccp_alpha + self.store_leaf_values = store_leaf_values def get_depth(self): """Return the depth of the decision tree. 
@@ -239,9 +243,12 @@ def _fit( dtype=DTYPE, accept_sparse="csc", force_all_finite=False ) check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data( - X, y, validate_separately=(check_X_params, check_y_params) - ) + if y is not None or self._get_tags()["requires_y"]: + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) + else: + X = self._validate_data(X, **check_X_params) missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) @@ -254,7 +261,7 @@ def _fit( "No support for np.int64 index based sparse matrices" ) - if self.criterion == "poisson": + if y is not None and self.criterion == "poisson": if np.any(y < 0): raise ValueError( "Some value(s) of y are negative which is" @@ -268,45 +275,56 @@ def _fit( # Determine output settings n_samples, self.n_features_in_ = X.shape - is_classification = is_classifier(self) - y = np.atleast_1d(y) - expanded_class_weight = None + # Do preprocessing if 'y' is passed + is_classification = False + if y is not None: + is_classification = is_classifier(self) - if y.ndim == 1: - # reshape is necessary to preserve the data contiguity against vs - # [:, np.newaxis] that does not. - y = np.reshape(y, (-1, 1)) + y = np.atleast_1d(y) + expanded_class_weight = None - self.n_outputs_ = y.shape[1] + if y.ndim == 1: + # reshape is necessary to preserve the data contiguity against vs + # [:, np.newaxis] that does not. + y = np.reshape(y, (-1, 1)) - if is_classification: - check_classification_targets(y) - y = np.copy(y) + self.n_outputs_ = y.shape[1] - self.classes_ = [] - self.n_classes_ = [] + if is_classification: + check_classification_targets(y) + y = np.copy(y) - if self.class_weight is not None: - y_original = np.copy(y) + self.classes_ = [] + self.n_classes_ = [] - y_encoded = np.zeros(y.shape, dtype=int) - for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) - self.classes_.append(classes_k) - self.n_classes_.append(classes_k.shape[0]) - y = y_encoded - - if self.class_weight is not None: - expanded_class_weight = compute_sample_weight( - self.class_weight, y_original - ) + if self.class_weight is not None: + y_original = np.copy(y) - self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + y_encoded = np.zeros(y.shape, dtype=int) + for k in range(self.n_outputs_): + classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True) + self.classes_.append(classes_k) + self.n_classes_.append(classes_k.shape[0]) + y = y_encoded + + if self.class_weight is not None: + expanded_class_weight = compute_sample_weight( + self.class_weight, y_original + ) - if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: - y = np.ascontiguousarray(y, dtype=DOUBLE) + self.n_classes_ = np.array(self.n_classes_, dtype=np.intp) + if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: + y = np.ascontiguousarray(y, dtype=DOUBLE) + + if len(y) != n_samples: + raise ValueError( + "Number of labels=%d does not match number of samples=%d" + % (len(y), n_samples) + ) + + # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth if isinstance(self.min_samples_leaf, numbers.Integral): @@ -362,16 +380,10 @@ def _fit( max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes - if len(y) != n_samples: - raise ValueError( - "Number of labels=%d does not match number of samples=%d" - % (len(y), n_samples) - ) - if sample_weight is not None: 
sample_weight = _check_sample_weight(sample_weight, X, DOUBLE) - if expanded_class_weight is not None: + if y is not None and expanded_class_weight is not None: if sample_weight is not None: sample_weight = sample_weight * expanded_class_weight else: @@ -383,10 +395,68 @@ def _fit( else: min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight) + # build the actual tree now with the parameters + self._build_tree( + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ) + + if self.store_leaf_values: + self.leaf_nodes_samples_ = self.tree_.leaf_nodes_samples + + return self + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : Array-like + X dataset. + y : Array-like + Y targets. + sample_weight : Array-like + Sample weights + min_samples_leaf : float + Number of samples required to be a leaf. + min_weight_leaf : float + Weight of samples required to be a leaf. + max_leaf_nodes : float + Maximum number of leaf nodes allowed in tree. + min_samples_split : float + Minimum number of samples to split on. + max_depth : int + The maximum depth of any tree. + random_state : int + Random seed. + """ + + n_samples = X.shape[0] + # Build tree criterion = self.criterion - if not isinstance(criterion, Criterion): - if is_classification: + if not isinstance(criterion, BaseCriterion): + if is_classifier(self): criterion = CRITERIA_CLF[self.criterion]( self.n_outputs_, self.n_classes_ ) @@ -400,7 +470,7 @@ def _fit( SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS splitter = self.splitter - if not isinstance(self.splitter, Splitter): + if not isinstance(self.splitter, BaseSplitter): splitter = SPLITTERS[self.splitter]( criterion, self.max_features_, @@ -428,6 +498,7 @@ def _fit( min_weight_leaf, max_depth, self.min_impurity_decrease, + self.store_leaf_values, ) else: builder = BestFirstTreeBuilder( @@ -438,8 +509,8 @@ def _fit( max_depth, max_leaf_nodes, self.min_impurity_decrease, + self.store_leaf_values, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) if self.n_outputs_ == 1 and is_classifier(self): @@ -448,8 +519,6 @@ def _fit( self._prune_tree() - return self - def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -498,6 +567,9 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) + + # proba is a count matrix of leaves that fall into + # (n_samples, n_outputs, max_n_classes) array proba = self.tree_.predict(X) n_samples = X.shape[0] @@ -524,6 +596,129 @@ def predict(self, X, check_input=True): else: return proba[:, :, 0] + def get_leaf_node_samples(self, X, check_input=True): + """For each datapoint x in X, get the training samples in the leaf node. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + Dataset to apply the forest to. + check_input : bool, default=True + Allow to bypass several input checking. + + Returns + ------- + leaf_nodes_samples : a list of array-like + Each sample is represented by the indices of the training samples that + reached the leaf node. 
The ``n_leaf_node_samples`` may vary between + samples, since the number of samples that fall in a leaf node is + variable. Each array has shape (n_leaf_node_samples, n_outputs). + """ + if not self.store_leaf_values: + raise RuntimeError( + "leaf node samples are not stored when store_leaf_values=False" + ) + + # get indices of leaves per sample (n_samples,) + X_leaves = self.apply(X, check_input=check_input) + n_samples = X_leaves.shape[0] + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + leaf_nodes_samples = [] + for idx in range(n_samples): + leaf_id = X_leaves[idx] + leaf_nodes_samples.append(leaf_samples[leaf_id]) + return leaf_nodes_samples + + def predict_quantiles(self, X, quantiles=0.5, method="nearest", check_input=True): + """Predict class or regression value for X at given quantiles. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + Input data. + quantiles : float, optional + The quantiles at which to evaluate, by default 0.5 (median). + method : str, optional + The method to interpolate, by default 'linear'. Can be any keyword + argument accepted by :func:`~np.quantile`. + check_input : bool, optional + Whether or not to check input, by default True. + + Returns + ------- + predictions : array-like of shape (n_samples, n_outputs, len(quantiles)) + The predicted quantiles. + """ + if not self.store_leaf_values: + raise RuntimeError( + "Predicting quantiles requires that the tree stores leaf node samples." + ) + + check_is_fitted(self) + + # Check data + X = self._validate_X_predict(X, check_input) + + if not isinstance(quantiles, (np.ndarray, list)): + quantiles = np.array([quantiles]) + + # get indices of leaves per sample + X_leaves = self.apply(X) + + # get array of samples per leaf (n_node_samples, n_outputs) + leaf_samples = self.tree_.leaf_nodes_samples + + # compute quantiles (n_samples, n_quantiles, n_outputs) + n_samples = X.shape[0] + n_quantiles = len(quantiles) + proba = np.zeros((n_samples, n_quantiles, self.n_outputs_)) + for idx, leaf_id in enumerate(X_leaves): + # predict by taking the quantile across the samples in the leaf for + # each output + proba[idx, ...] = np.quantile( + leaf_samples[leaf_id], quantiles, axis=0, interpolation=method + ) + + # Classification + if is_classifier(self): + if self.n_outputs_ == 1: + # return the class with the highest probability for each quantile + # (n_samples, n_quantiles) + class_preds = np.zeros( + (n_samples, n_quantiles), dtype=self.classes_.dtype + ) + for i in range(n_quantiles): + class_pred_per_sample = ( + proba[:, i, :].squeeze().astype(self.classes_.dtype) + ) + class_preds[:, i] = self.classes_.take( + class_pred_per_sample, axis=0 + ) + return class_preds + else: + class_type = self.classes_[0].dtype + predictions = np.zeros( + (n_samples, n_quantiles, self.n_outputs_), dtype=class_type + ) + for k in range(self.n_outputs_): + for i in range(n_quantiles): + class_pred_per_sample = proba[:, i, k].squeeze().astype(int) + predictions[:, i, k] = self.classes_[k].take( + class_pred_per_sample, axis=0 + ) + + return predictions + # Regression + else: + if self.n_outputs_ == 1: + return proba[:, :, 0] + + else: + return proba + def apply(self, X, check_input=True): """Return the index of the leaf that each sample is predicted as. @@ -798,6 +993,16 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. 
versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -843,6 +1048,9 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py` for basic usage of these attributes. + leaf_nodes_samples_ : dict + A dictionary of leaf node index and the y_train samples in that leaf. + See Also -------- DecisionTreeRegressor : A decision tree regressor. @@ -890,7 +1098,10 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): _parameter_constraints: dict = { **BaseDecisionTree._parameter_constraints, - "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)], + "criterion": [ + StrOptions({"gini", "entropy", "log_loss"}), + Hidden(BaseCriterion), + ], "class_weight": [dict, list, StrOptions({"balanced"}), None], } @@ -909,6 +1120,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -923,6 +1135,7 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) @@ -955,7 +1168,6 @@ def fit(self, X, y, sample_weight=None, check_input=True): self : DecisionTreeClassifier Fitted estimator. """ - super()._fit( X, y, @@ -1174,6 +1386,16 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- feature_importances_ : ndarray of shape (n_features,) @@ -1254,7 +1476,7 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): **BaseDecisionTree._parameter_constraints, "criterion": [ StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}), - Hidden(Criterion), + Hidden(BaseCriterion), ], } @@ -1272,6 +1494,7 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1285,6 +1508,7 @@ def __init__( random_state=random_state, min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1499,6 +1723,16 @@ class ExtraTreeClassifier(DecisionTreeClassifier): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. 
+ Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1599,6 +1833,7 @@ def __init__( min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1613,6 +1848,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) @@ -1743,6 +1979,16 @@ class ExtraTreeRegressor(DecisionTreeRegressor): .. versionadded:: 0.22 + store_leaf_values : bool, default=False + Whether to store the samples that fall into leaves in the ``tree_`` attribute. + Each leaf will store a 2D array corresponding to the samples that fall into it + keyed by node_id. + + XXX: This is currently experimental and may change without notice. + Moreover, it can be improved upon since storing the samples twice is not ideal. + One could instead store the indices in ``y_train`` that fall into each leaf, + which would lower RAM/diskspace usage. + Attributes ---------- max_features_ : int @@ -1826,6 +2072,7 @@ def __init__( min_impurity_decrease=0.0, max_leaf_nodes=None, ccp_alpha=0.0, + store_leaf_values=False, ): super().__init__( criterion=criterion, @@ -1839,4 +2086,5 @@ def __init__( min_impurity_decrease=min_impurity_decrease, random_state=random_state, ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, ) diff --git a/sklearn/tree/_criterion.pxd b/sklearn/tree/_criterion.pxd index a0a357a700fb4..ecbf56e5f6016 100644 --- a/sklearn/tree/_criterion.pxd +++ b/sklearn/tree/_criterion.pxd @@ -4,32 +4,32 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _criterion.pyx for implementation details. -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer +from libcpp.vector cimport vector -cdef class Criterion: - # The criterion computes the impurity of a node and the reduction of - # impurity of a split on that node. It also computes the output statistics - # such as the mean in regression and class probabilities in classification. 
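For orientation, a minimal usage sketch of the estimator-level options added in this patch (store_leaf_values, get_leaf_node_samples, predict_quantiles, and the forest-level max_bins). The data, estimator choices, and hyperparameter values below are illustrative only and assume this fork's API behaves as documented in the surrounding docstrings and exercised in the tests above:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.uniform(-1, 1, size=(300, 4))
y = X[:, 0] + 0.1 * rng.standard_normal(300)

# Single tree: keep the training targets that fall into each leaf.
tree = DecisionTreeRegressor(random_state=0, store_leaf_values=True).fit(X, y)

# One array per query point, each of shape (n_leaf_node_samples, n_outputs).
leaf_samples = tree.get_leaf_node_samples(X[:5])

# Quantile predictions computed from the stored leaf samples; for a
# single-output regressor the result has shape (n_samples, n_quantiles).
q_tree = tree.predict_quantiles(X[:5], quantiles=[0.1, 0.5, 0.9])

# Forest level (assumes RandomForestRegressor exposes the same options in this
# fork, as the new forest tests do): max_bins and store_leaf_values are
# forwarded to the underlying trees.
forest = RandomForestRegressor(
    n_estimators=25, max_bins=255, store_leaf_values=True, random_state=0
).fit(X, y)
q_forest = forest.predict_quantiles(X[:5], quantiles=[0.25, 0.5, 0.75])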
+from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer + + +cdef class BaseCriterion: + """Abstract interface for criterion.""" # Internal structures - cdef const DOUBLE_t[:, ::1] y # Values of y cdef const DOUBLE_t[:] sample_weight # Sample weights cdef const SIZE_t[:] sample_indices # Sample indices in X, y cdef SIZE_t start # samples[start:pos] are the samples in the left node cdef SIZE_t pos # samples[pos:end] are the samples in the right node cdef SIZE_t end - cdef SIZE_t n_missing # Number of missing values for the feature being evaluated - cdef bint missing_go_to_left # Whether missing values go to the left node cdef SIZE_t n_outputs # Number of outputs cdef SIZE_t n_samples # Number of samples @@ -40,21 +40,11 @@ cdef class Criterion: cdef double weighted_n_right # Weighted number of samples in the right node cdef double weighted_n_missing # Weighted number of samples that are missing + # Core methods that criterion class _must_ implement. # The criterion object is maintained such that left and right collected # statistics correspond to samples[start:pos] and samples[pos:end]. # Methods - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end - ) except -1 nogil - cdef void init_sum_missing(self) - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil cdef int reset(self) except -1 nogil cdef int reverse_reset(self) except -1 nogil cdef int update(self, SIZE_t new_pos) except -1 nogil @@ -76,6 +66,35 @@ cdef class Criterion: ) noexcept nogil cdef double proxy_impurity_improvement(self) noexcept nogil + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil + + +cdef class Criterion(BaseCriterion): + """Abstract interface for supervised impurity criteria.""" + + cdef const DOUBLE_t[:, ::1] y # Values of y + cdef SIZE_t n_missing # Number of missing values for the feature being evaluated + cdef bint missing_go_to_left # Whether missing values go to the left node + + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil + cdef void init_sum_missing(self) + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil + + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil + cdef class ClassificationCriterion(Criterion): """Abstract criterion for classification.""" diff --git a/sklearn/tree/_criterion.pyx b/sklearn/tree/_criterion.pyx index 91c347735c5e0..bd1bdef0a6a93 100644 --- a/sklearn/tree/_criterion.pyx +++ b/sklearn/tree/_criterion.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -9,30 +12,47 @@ # Fares Hedayati # Jacob Schreiber # Nelson Liu +# Adam Li +# Jong Shin # # License: BSD 3 clause -from libc.string cimport memcpy -from libc.string cimport memset -from libc.math cimport fabs, INFINITY +from libc.math cimport INFINITY, fabs +from libc.string cimport memcpy, memset import numpy as np + cimport numpy as cnp + cnp.import_array() from scipy.special.cython_special cimport xlogy -from ._utils cimport log -from ._utils 
cimport WeightedMedianCalculator +from ._utils cimport WeightedMedianCalculator, log + # EPSILON is used in the Poisson criterion cdef double EPSILON = 10 * np.finfo('double').eps -cdef class Criterion: - """Interface for impurity criteria. +cdef class BaseCriterion: + """This is an abstract interface for criterion. + + For example, a tree model could + be either supervisedly, or unsupervisedly computing impurity on samples of + covariates, or labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for criteria. + + The downstream classes _must_ implement methods to compute the impurity + in current node and in children nodes. This object stores methods on how to calculate how good a split is using - different metrics. + a set API. + + Samples in the "current" node are stored in `samples[start:end]` which is + partitioned around `pos` (an index in `start:end`) so that: + - the samples of left child node are stored in `samples[start:pos]` + - the samples of right child node are stored in `samples[pos:end]` """ def __getstate__(self): return {} @@ -40,53 +60,6 @@ cdef class Criterion: def __setstate__(self, d): pass - cdef int init( - self, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, - ) except -1 nogil: - """Placeholder for a method which will initialize the criterion. - - Returns -1 in case of failure to allocate memory (and raise MemoryError) - or 0 otherwise. - - Parameters - ---------- - y : ndarray, dtype=DOUBLE_t - y is a buffer that can store values for n_outputs target variables - stored as a Cython memoryview. - sample_weight : ndarray, dtype=DOUBLE_t - The weight of each sample stored as a Cython memoryview. - weighted_n_samples : double - The total weight of the samples being considered - sample_indices : ndarray, dtype=SIZE_t - A mask on the samples. Indices of the samples in X and y we want to use, - where sample_indices[start:end] correspond to the samples in this node. - start : SIZE_t - The first sample to be used on this node - end : SIZE_t - The last sample used on this node - - """ - pass - - cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: - """Initialize sum_missing if there are missing values. - - This method assumes that caller placed the missing samples in - self.sample_indices[-n_missing:] - - Parameters - ---------- - n_missing: SIZE_t - Number of missing values for specific feature. - """ - pass - cdef int reset(self) except -1 nogil: """Reset the criterion at pos=start. @@ -211,9 +184,110 @@ cdef class Criterion: - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Abstract method which will set sample pointers in the criterion. + + The dataset array that we compute criteria on is assumed to consist of 'N' + ordered samples or rows (i.e. sorted). Since we pass this by reference, we + use sample pointers to move the start and end around to consider only a subset of data. + This function should also update relevant statistics that the class uses to compute the final criterion. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to be used on computation of criteria of the current node. 
+ end : SIZE_t + The last sample used on this node + """ + pass + + +cdef class Criterion(BaseCriterion): + """Interface for impurity criteria. + + The supervised criterion computes the impurity of a node and the reduction of + impurity of a split on that node using the distribution of labels in parent and + children nodes. It also computes the output statistics such as the mean in regression + and class probabilities in classification. Instances of this class are responsible + for compute splits' impurity difference. + + Criterion is the base class for criteria used in supervised tree-based models + with a homogeneous float64-dtyped y. + """ + cdef int init( + self, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + double weighted_n_samples, + const SIZE_t[:] sample_indices + ) except -1 nogil: + """Placeholder for a method which will initialize the criterion. + + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + y : ndarray, dtype=DOUBLE_t + y is a buffer that can store values for n_outputs target variables + stored as a Cython memoryview. + sample_weight : ndarray, dtype=DOUBLE_t + The weight of each sample stored as a Cython memoryview. + weighted_n_samples : double + The total weight of the samples being considered + sample_indices : ndarray, dtype=SIZE_t + A mask on the samples. Indices of the samples in X and y we want to use, + where sample_indices[start:end] correspond to the samples in this node. + """ + pass + + cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: + """Initialize sum_missing if there are missing values. + + This method assumes that caller placed the missing samples in + self.sample_indices[-n_missing:] + + Parameters + ---------- + n_missing: SIZE_t + Number of missing values for specific feature. + """ + pass + cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" + cdef void node_samples( + self, + vector[vector[DOUBLE_t]]& dest + ) noexcept nogil: + """Copy the samples of the current node into dest. + + Parameters + ---------- + dest : reference vector[vector[DOUBLE_t]] + The vector of vectors where the samples should be copied. + This is passed by reference and modified in place. + """ + cdef SIZE_t i, j, k + + # Resize the destination vector of vectors + dest.resize(self.n_node_samples) + + # Loop over the samples + for i in range(self.n_node_samples): + # Get the index of the current sample + j = self.sample_indices[self.start + i] + + # Get the sample values for each output + for k in range(self.n_outputs): + dest[i].push_back(self.y[j, k]) + + cdef inline void _move_sums_classification( ClassificationCriterion criterion, double[:, ::1] sum_1, @@ -312,15 +386,10 @@ cdef class ClassificationCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end + const SIZE_t[:] sample_indices ) except -1 nogil: """Initialize the criterion. - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. @@ -335,18 +404,24 @@ cdef class ClassificationCriterion(Criterion): sample_indices : ndarray, dtype=SIZE_t A mask on the samples. Indices of the samples in X and y we want to use, where sample_indices[start:end] correspond to the samples in this node. 
- start : SIZE_t - The first sample to use in the mask - end : SIZE_t - The last sample to use in the mask """ self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + self.n_node_samples = end - start self.start = start self.end = end - self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + self.weighted_n_node_samples = 0.0 cdef SIZE_t i @@ -359,12 +434,12 @@ cdef class ClassificationCriterion(Criterion): memset(&self.sum_total[k, 0], 0, self.n_classes[k] * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] # w is originally set to be 1.0, meaning that if no sample weights # are given, the default weight of each sample is 1.0. - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] # Count weighted class frequency for each target for k in range(self.n_outputs): @@ -375,7 +450,6 @@ cdef class ClassificationCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -614,13 +688,10 @@ cdef class Gini(ClassificationCriterion): This handles cases where the target is a classification taking values 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, then let - count_k = 1/ Nm \sum_{x_i in Rm} I(yi = k) - be the proportion of class k observations in node m. The Gini Index is then defined as: - index = \sum_{k=0}^{K-1} count_k (1 - count_k) = 1 - \sum_{k=0}^{K-1} count_k ** 2 """ @@ -738,7 +809,6 @@ cdef class RegressionCriterion(Criterion): evaluated by computing the variance of the target values left and right of the split point. The computation takes linear time with `n_samples` by using :: - var = \sum_i^n (y_i - y_bar) ** 2 = (\sum_i^n y_i ** 2) - n_samples * y_bar ** 2 """ @@ -750,7 +820,6 @@ cdef class RegressionCriterion(Criterion): ---------- n_outputs : SIZE_t The number of targets to be predicted - n_samples : SIZE_t The total number of samples to fit on """ @@ -781,23 +850,29 @@ cdef class RegressionCriterion(Criterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples + + self.sq_sum_total = 0.0 self.weighted_n_node_samples = 0. 
cdef SIZE_t i @@ -806,14 +881,14 @@ cdef class RegressionCriterion(Criterion): cdef DOUBLE_t y_ik cdef DOUBLE_t w_y_ik cdef DOUBLE_t w = 1.0 - self.sq_sum_total = 0.0 + memset(&self.sum_total[0], 0, self.n_outputs * sizeof(double)) for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): y_ik = self.y[i, k] @@ -825,7 +900,6 @@ cdef class RegressionCriterion(Criterion): # Reset to pos=start self.reset() - return 0 cdef void init_sum_missing(self): """Init sum_missing to hold sums for missing values.""" @@ -962,7 +1036,6 @@ cdef class RegressionCriterion(Criterion): cdef class MSE(RegressionCriterion): """Mean squared error impurity criterion. - MSE = var_left + var_right """ @@ -1110,26 +1183,30 @@ cdef class MAE(RegressionCriterion): const DOUBLE_t[:, ::1] y, const DOUBLE_t[:] sample_weight, double weighted_n_samples, - const SIZE_t[:] sample_indices, - SIZE_t start, - SIZE_t end, + const SIZE_t[:] sample_indices ) except -1 nogil: - """Initialize the criterion. - - This initializes the criterion at node sample_indices[start:end] and children - sample_indices[start:start] and sample_indices[start:end]. - """ - cdef SIZE_t i, p, k - cdef DOUBLE_t w = 1.0 - + """Initialize the criterion.""" # Initialize fields self.y = y self.sample_weight = sample_weight self.sample_indices = sample_indices + self.weighted_n_samples = weighted_n_samples + + return 0 + + cdef void set_sample_pointers( + self, + SIZE_t start, + SIZE_t end + ) noexcept nogil: + """Set sample pointers in the criterion.""" + cdef SIZE_t i, p, k + cdef DOUBLE_t w = 1.0 + self.start = start self.end = end + self.n_node_samples = end - start - self.weighted_n_samples = weighted_n_samples self.weighted_n_node_samples = 0. cdef void** left_child = self.left_child_ptr @@ -1140,10 +1217,10 @@ cdef class MAE(RegressionCriterion): ( right_child[k]).reset() for p in range(start, end): - i = sample_indices[p] + i = self.sample_indices[p] - if sample_weight is not None: - w = sample_weight[i] + if self.sample_weight is not None: + w = self.sample_weight[i] for k in range(self.n_outputs): # push method ends up calling safe_realloc, hence `except -1` @@ -1158,7 +1235,6 @@ cdef class MAE(RegressionCriterion): # Reset to pos=start self.reset() - return 0 cdef void init_missing(self, SIZE_t n_missing) noexcept nogil: """Raise error if n_missing != 0.""" @@ -1424,6 +1500,7 @@ cdef class Poisson(RegressionCriterion): Note that the deviance is >= 0, and since we have `y_pred = mean(y_true)` at the leaves, one always has `sum(y_pred - y_true) = 0`. It remains the implemented impurity (factor 2 is skipped): + 1/n * sum(y_true * log(y_true/y_pred) """ # FIXME in 1.0: diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 02f8ea81404c7..4a23f4d2da946 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -17,9 +17,15 @@ import numpy as np -from ..base import is_classifier -from ..utils._param_validation import HasMethods, Interval, StrOptions, validate_params -from ..utils.validation import check_array, check_is_fitted +from sklearn.base import is_classifier +from sklearn.utils._param_validation import ( + HasMethods, + Interval, + StrOptions, + validate_params, +) +from sklearn.utils.validation import check_array, check_is_fitted + from . 
import DecisionTreeClassifier, DecisionTreeRegressor, _criterion, _tree from ._reingold_tilford import Tree, buchheim diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index acc67a7315add..a6515338c492d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -4,18 +4,22 @@ # Joel Nothman # Arnaud Joly # Jacob Schreiber +# Adam Li +# Jong Shin # # License: BSD 3 clause # See _splitter.pyx for details. -from ._criterion cimport Criterion +from libcpp.vector cimport vector + +from ._criterion cimport BaseCriterion, Criterion +from ._tree cimport DOUBLE_t # Type of y, sample_weight +from ._tree cimport DTYPE_t # Type of X +from ._tree cimport INT32_t # Signed 32 bit integer +from ._tree cimport SIZE_t # Type for indices and counters +from ._tree cimport UINT32_t # Unsigned 32 bit integer -from ._tree cimport DTYPE_t # Type of X -from ._tree cimport DOUBLE_t # Type of y, sample_weight -from ._tree cimport SIZE_t # Type for indices and counters -from ._tree cimport INT32_t # Signed 32 bit integer -from ._tree cimport UINT32_t # Unsigned 32 bit integer cdef struct SplitRecord: # Data to track sample split @@ -30,14 +34,15 @@ cdef struct SplitRecord: unsigned char missing_go_to_left # Controls if missing values go to the left node. SIZE_t n_missing # Number of missing values for the feature being split on -cdef class Splitter: +cdef class BaseSplitter: + """Abstract interface for splitter.""" + # The splitter searches in the input space for a feature and a threshold # to split the samples samples[start:end]. # # The impurity computations are delegated to a criterion object. # Internal structures - cdef public Criterion criterion # Impurity criterion cdef public SIZE_t max_features # Number of features to test cdef public SIZE_t min_samples_leaf # Min samples in a leaf cdef public double min_weight_leaf # Minimum weight in a leaf @@ -56,7 +61,6 @@ cdef class Splitter: cdef SIZE_t start # Start position for the current node cdef SIZE_t end # End position for the current node - cdef const DOUBLE_t[:, ::1] y cdef const DOUBLE_t[:] sample_weight # The samples vector `samples` is maintained by the Splitter object such @@ -76,28 +80,43 @@ cdef class Splitter: # This allows optimization with depth-based tree building. 
# Methods - cdef int init( - self, - object X, - const DOUBLE_t[:, ::1] y, - const DOUBLE_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1 - cdef int node_reset( self, SIZE_t start, SIZE_t end, double* weighted_n_node_samples ) except -1 nogil - cdef int node_split( self, double impurity, # Impurity of the node SplitRecord* split, SIZE_t* n_constant_features ) except -1 nogil - cdef void node_value(self, double* dest) noexcept nogil - cdef double node_impurity(self) noexcept nogil + cdef int pointer_size(self) noexcept nogil + +cdef class Splitter(BaseSplitter): + cdef public Criterion criterion # Impurity criterion + cdef const DOUBLE_t[:, ::1] y + + cdef int init( + self, + object X, + const DOUBLE_t[:, ::1] y, + const DOUBLE_t[:] sample_weight, + const unsigned char[::1] missing_values_in_feature_mask, + ) except -1 + + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil + + # Methods that allow modifications to stopping conditions + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil + cdef bint check_postsplit_conditions( + self + ) noexcept nogil diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 7e60f0023d2a2..bca38d5f04374 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -8,24 +11,24 @@ # Joel Nothman # Fares Hedayati # Jacob Schreiber +# Adam Li +# Jong Shin # -# License: BSD 3 clause -from ._criterion cimport Criterion +# License: BSD 3 clause +from cython cimport final +from libc.math cimport isnan from libc.stdlib cimport qsort from libc.string cimport memcpy -from libc.math cimport isnan -from cython cimport final -import numpy as np +from ._criterion cimport Criterion +import numpy as np from scipy.sparse import isspmatrix_csc -from ._utils cimport log -from ._utils cimport rand_int -from ._utils cimport rand_uniform -from ._utils cimport RAND_R_MAX +from ._utils cimport RAND_R_MAX, log, rand_int, rand_uniform + cdef double INFINITY = np.inf @@ -46,16 +49,78 @@ cdef inline void _init_split(SplitRecord* self, SIZE_t start_pos) noexcept nogil self.missing_go_to_left = False self.n_missing = 0 -cdef class Splitter: - """Abstract splitter class. +cdef class BaseSplitter: + """This is an abstract interface for splitters. - Splitters are called by tree builders to find the best splits on both - sparse and dense data, one split at a time. + For example, a tree model could be either supervisedly, or unsupervisedly computing splits on samples of + covariates, labels, or both. Although scikit-learn currently only contains + supervised tree methods, this class enables 3rd party packages to leverage + scikit-learn's Cython code for splitting. + + A splitter is usually used in conjunction with a criterion class, which explicitly handles + computing the criteria, which we split on. The setting of that criterion class is handled + by downstream classes. + + The downstream classes _must_ implement methods to compute the split in a node. """ + def __getstate__(self): + return {} + + def __setstate__(self, d): + pass + + cdef int node_reset(self, SIZE_t start, SIZE_t end, + double* weighted_n_node_samples) except -1 nogil: + """Reset splitter on node samples[start:end]. 
+ + Returns -1 in case of failure to allocate memory (and raise MemoryError) + or 0 otherwise. + + Parameters + ---------- + start : SIZE_t + The index of the first sample to consider + end : SIZE_t + The index of the last sample to consider + weighted_n_node_samples : ndarray, dtype=double pointer + The total weight of those samples + """ + pass + + cdef int node_split(self, double impurity, SplitRecord* split, + SIZE_t* n_constant_features) except -1 nogil: + """Find the best split on node samples[start:end]. + + This is a placeholder method. The majority of computation will be done + here. + + It should return -1 upon errors. + """ + pass + + cdef void node_value(self, double* dest) noexcept nogil: + """Copy the value of node samples[start:end] into dest.""" + pass + + cdef double node_impurity(self) noexcept nogil: + """Return the impurity of the current node.""" + pass + + cdef int pointer_size(self) noexcept nogil: + """Size of the pointer for split records. + + Overriding this function allows one to use different subclasses of + `SplitRecord`. + """ + return sizeof(SplitRecord) + +cdef class Splitter(BaseSplitter): + """Abstract interface for supervised splitters.""" + def __cinit__(self, Criterion criterion, SIZE_t max_features, SIZE_t min_samples_leaf, double min_weight_leaf, - object random_state): + object random_state, *argv): """ Parameters ---------- @@ -78,7 +143,6 @@ cdef class Splitter: random_state : object The user inputted random state to be used for pseudo-randomness """ - self.criterion = criterion self.n_samples = 0 @@ -89,12 +153,6 @@ cdef class Splitter: self.min_weight_leaf = min_weight_leaf self.random_state = random_state - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): return (type(self), (self.criterion, self.max_features, @@ -134,7 +192,6 @@ cdef class Splitter: has_missing : bool At least one missing values is in X. """ - self.rand_r_state = self.random_state.randint(0, RAND_R_MAX) cdef SIZE_t n_samples = X.shape[0] @@ -172,8 +229,21 @@ cdef class Splitter: self.y = y self.sample_weight = sample_weight + + self.criterion.init( + self.y, + self.sample_weight, + self.weighted_n_samples, + self.samples + ) + + self.criterion.set_sample_pointers( + self.start, + self.end + ) if missing_values_in_feature_mask is not None: self.criterion.init_sum_missing() + return 0 cdef int node_reset(self, SIZE_t start, SIZE_t end, @@ -196,40 +266,72 @@ cdef class Splitter: self.start = start self.end = end - self.criterion.init( - self.y, - self.sample_weight, - self.weighted_n_samples, - self.samples, - start, - end - ) + self.criterion.set_sample_pointers(start, end) weighted_n_node_samples[0] = self.criterion.weighted_n_node_samples return 0 - cdef int node_split(self, double impurity, SplitRecord* split, - SIZE_t* n_constant_features) except -1 nogil: - """Find the best split on node samples[start:end]. - - This is a placeholder method. The majority of computation will be done - here. - - It should return -1 upon errors. 
- """ - - pass - cdef void node_value(self, double* dest) noexcept nogil: """Copy the value of node samples[start:end] into dest.""" self.criterion.node_value(dest) + cdef void node_samples(self, vector[vector[DOUBLE_t]]& dest) noexcept nogil: + """Copy the samples[start:end] into dest.""" + self.criterion.node_samples(dest) + cdef double node_impurity(self) noexcept nogil: """Return the impurity of the current node.""" return self.criterion.node_impurity() + cdef bint check_presplit_conditions( + self, + SplitRecord current_split, + SIZE_t n_missing, + bint missing_go_to_left, + ) noexcept nogil: + """Check stopping conditions pre-split. + + This is typically a metric that is cheaply computed given the + current proposed split, which is stored as a the `current_split` + argument. + """ + cdef SIZE_t min_samples_leaf = self.min_samples_leaf + cdef SIZE_t end_non_missing = self.end - n_missing + cdef SIZE_t n_left, n_right + + if missing_go_to_left: + n_left = current_split.pos - self.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - self.start + n_right = end_non_missing - current_split.pos + n_missing + + # Reject if min_samples_leaf is not guaranteed + if n_left < min_samples_leaf or n_right < min_samples_leaf: + return 1 + + return 0 + + cdef bint check_postsplit_conditions( + self + ) noexcept nogil: + """Check stopping conditions after evaluating the split. + + This takes some metric that is stored in the Criterion + object and checks against internal stop metrics. + """ + cdef double min_weight_leaf = self.min_weight_leaf + + # Reject if min_weight_leaf is not satisfied + if ((self.criterion.weighted_n_left < min_weight_leaf) or + (self.criterion.weighted_n_right < min_weight_leaf)): + return 1 + + return 0 + + cdef inline void shift_missing_values_to_left_if_required( SplitRecord* best, SIZE_t[::1] samples, @@ -248,6 +350,7 @@ cdef inline void shift_missing_values_to_left_if_required( samples[i], samples[current_end] = samples[current_end], samples[i] best.pos += best.n_missing + # Introduce a fused-class to make it possible to share the split implementation # between the dense and sparse cases in the node_split_best and node_split_random # functions. The alternative would have been to use inheritance-based polymorphism @@ -381,7 +484,6 @@ cdef inline int node_split_best( if has_missing: criterion.init_missing(n_missing) # Evaluate all splits - # If there are missing values, then we search twice for the most optimal split. # The first search will have all the missing values going to the right node. # The second search will have all the missing values going to the left node. 
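The two splitter hooks introduced above reduce to the following plain-Python logic (a paraphrase for readers skimming the Cython, not code from the patch): check_presplit_conditions rejects a candidate split position whenever either child would receive fewer than min_samples_leaf samples, accounting for which side the missing values are routed to, and check_postsplit_conditions rejects it when either child's weighted sample count falls below min_weight_leaf.

def check_presplit_conditions(pos, start, end_non_missing, n_missing,
                              missing_go_to_left, min_samples_leaf):
    """Return True when the candidate split would violate min_samples_leaf."""
    if missing_go_to_left:
        n_left = pos - start + n_missing
        n_right = end_non_missing - pos
    else:
        n_left = pos - start
        n_right = end_non_missing - pos + n_missing
    # Reject if min_samples_leaf is not guaranteed on either side.
    return n_left < min_samples_leaf or n_right < min_samples_leaf


def check_postsplit_conditions(weighted_n_left, weighted_n_right, min_weight_leaf):
    """Return True when either child violates min_weight_leaf."""
    return (weighted_n_left < min_weight_leaf
            or weighted_n_right < min_weight_leaf)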
@@ -402,23 +504,22 @@ cdef inline int node_split_best( if p >= end_non_missing: continue - if missing_go_to_left: - n_left = p - start + n_missing - n_right = end_non_missing - p - else: - n_left = p - start - n_right = end_non_missing - p + n_missing + current_split.pos = p # Reject if min_samples_leaf is not guaranteed - if n_left < min_samples_leaf or n_right < min_samples_leaf: + if missing_go_to_left: + n_left = current_split.pos - splitter.start + n_missing + n_right = end_non_missing - current_split.pos + else: + n_left = current_split.pos - splitter.start + n_right = end_non_missing - current_split.pos + n_missing + if splitter.check_presplit_conditions(current_split, n_missing, missing_go_to_left) == 1: continue - current_split.pos = p criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() @@ -644,8 +745,6 @@ cdef inline int node_split_random( cdef SIZE_t n_features = splitter.n_features cdef SIZE_t max_features = splitter.max_features - cdef SIZE_t min_samples_leaf = splitter.min_samples_leaf - cdef double min_weight_leaf = splitter.min_weight_leaf cdef UINT32_t* random_state = &splitter.rand_r_state cdef SplitRecord best_split, current_split @@ -741,8 +840,7 @@ cdef inline int node_split_random( current_split.pos = partitioner.partition_samples(current_split.threshold) # Reject if min_samples_leaf is not guaranteed - if (((current_split.pos - start) < min_samples_leaf) or - ((end - current_split.pos) < min_samples_leaf)): + if splitter.check_presplit_conditions(current_split, 0, 0) == 1: continue # Evaluate split @@ -752,8 +850,7 @@ cdef inline int node_split_random( criterion.update(current_split.pos) # Reject if min_weight_leaf is not satisfied - if ((criterion.weighted_n_left < min_weight_leaf) or - (criterion.weighted_n_right < min_weight_leaf)): + if splitter.check_postsplit_conditions() == 1: continue current_proxy_improvement = criterion.proxy_impurity_improvement() diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index b99f44c0472a2..dedd820c41e0f 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -11,7 +11,10 @@ # See _tree.pyx for details. import numpy as np + cimport numpy as cnp +from libcpp.unordered_map cimport unordered_map +from libcpp.vector cimport vector ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight @@ -19,8 +22,8 @@ ctypedef cnp.npy_intp SIZE_t # Type for indices and counters ctypedef cnp.npy_int32 INT32_t # Signed 32 bit integer ctypedef cnp.npy_uint32 UINT32_t # Unsigned 32 bit integer -from ._splitter cimport Splitter -from ._splitter cimport SplitRecord +from ._splitter cimport SplitRecord, Splitter + cdef struct Node: # Base storage structure for the nodes in a Tree object @@ -35,40 +38,33 @@ cdef struct Node: unsigned char missing_go_to_left # Whether features have missing values -cdef class Tree: - # The Tree object is a binary tree structure constructed by the - # TreeBuilder. The tree structure is used for predictions and - # feature importances. 
- - # Input/Output layout - cdef public SIZE_t n_features # Number of features in X - cdef SIZE_t* n_classes # Number of classes in y[:, k] - cdef public SIZE_t n_outputs # Number of outputs in y - cdef public SIZE_t max_n_classes # max(n_classes) - +cdef class BaseTree: # Inner structures: values are stored separately from node structure, # since size is determined at runtime. cdef public SIZE_t max_depth # Max depth of the tree cdef public SIZE_t node_count # Counter for node IDs cdef public SIZE_t capacity # Capacity of tree, in terms of nodes cdef Node* nodes # Array of nodes - cdef double* value # (capacity, n_outputs, max_n_classes) array of values - cdef SIZE_t value_stride # = n_outputs * max_n_classes - # Methods - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil + cdef SIZE_t value_stride # The dimensionality of a vectorized output per sample + cdef double* value # Array of values prediction values for each node + + # Generic Methods: These are generic methods used by any tree. cdef int _resize(self, SIZE_t capacity) except -1 nogil cdef int _resize_c(self, SIZE_t capacity=*) except -1 nogil - - cdef cnp.ndarray _get_value_ndarray(self) - cdef cnp.ndarray _get_node_ndarray(self) - - cpdef cnp.ndarray predict(self, object X) - + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil + + # Python API methods: These are methods exposed to Python cpdef cnp.ndarray apply(self, object X) cdef cnp.ndarray _apply_dense(self, object X) cdef cnp.ndarray _apply_sparse_csr(self, object X) @@ -80,6 +76,58 @@ cdef class Tree: cpdef compute_node_depths(self) cpdef compute_feature_importances(self, normalize=*) + # Abstract methods: these functions must be implemented by any decision tree + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node, + ) noexcept nogil + +cdef class Tree(BaseTree): + # The Supervised Tree object is a binary tree structure constructed by the + # TreeBuilder. The tree structure is used for predictions and + # feature importances. + # + # Value of upstream properties: + # - value_stride = n_outputs * max_n_classes + # - value = (capacity, n_outputs, max_n_classes) array of values + + # Input/Output layout for supervised tree + cdef public SIZE_t n_features # Number of features in X + cdef SIZE_t* n_classes # Number of classes in y[:, k] + cdef public SIZE_t n_outputs # Number of outputs in y + cdef public SIZE_t max_n_classes # max(n_classes) + + # Enables the use of tree to store distributions of the output to allow + # arbitrary usage of the the leaves. This is used in the quantile + # estimators for example. 
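Conceptually, the per-leaf storage described here is a mapping from leaf node id to the y rows of the training samples that reached that leaf. The pure-Python analogue below is illustrative only: it relies solely on the public apply() method and is not how the Cython unordered_map is populated, but it shows the idea and how quantile predictions fall out of it.

import numpy as np
from collections import defaultdict

def collect_leaf_samples(fitted_tree, X_train, y_train):
    """Pure-Python analogue of the per-leaf storage: leaf node id -> stored y rows."""
    y2d = np.asarray(y_train).reshape(len(y_train), -1)
    leaf_ids = fitted_tree.apply(X_train)   # leaf id reached by each training sample
    per_leaf = defaultdict(list)
    for leaf_id, row in zip(leaf_ids, y2d):
        per_leaf[leaf_id].append(row)
    return {leaf: np.asarray(rows) for leaf, rows in per_leaf.items()}

def quantile_predict(fitted_tree, leaf_samples, X, quantiles=(0.5,)):
    """Quantiles over the stored leaf samples, shape (n_samples, n_quantiles, n_outputs)."""
    leaf_ids = fitted_tree.apply(X)
    return np.stack([np.quantile(leaf_samples[leaf], quantiles, axis=0)
                     for leaf in leaf_ids])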
+ # for storing samples at each leaf node with leaf's node ID as the key and + # the sample values as the value + cdef unordered_map[SIZE_t, vector[vector[DOUBLE_t]]] value_samples + + # Methods + cdef cnp.ndarray _get_value_ndarray(self) + cdef cnp.ndarray _get_node_ndarray(self) + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id) + cdef cnp.ndarray _get_value_samples_keys(self) + + cpdef cnp.ndarray predict(self, object X) # ============================================================================= # Tree builder @@ -101,6 +149,8 @@ cdef class TreeBuilder: cdef SIZE_t max_depth # Maximal tree depth cdef double min_impurity_decrease # Impurity threshold for early stopping + cdef unsigned char store_leaf_values # Whether to store leaf values + cpdef build( self, Tree tree, diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index e7a0ab2f2966d..c44022f54d3a5 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Brian Holt @@ -13,29 +16,27 @@ # License: BSD 3 clause from cpython cimport Py_INCREF, PyObject, PyTypeObject - -from libc.stdlib cimport free -from libc.string cimport memcpy -from libc.string cimport memset -from libc.stdint cimport INTPTR_MAX +from cython.operator cimport dereference as deref from libc.math cimport isnan -from libcpp.vector cimport vector -from libcpp.algorithm cimport pop_heap -from libcpp.algorithm cimport push_heap +from libc.stdint cimport INTPTR_MAX +from libc.stdlib cimport free, malloc +from libc.string cimport memcpy, memset from libcpp cimport bool +from libcpp.algorithm cimport pop_heap, push_heap +from libcpp.vector cimport vector import struct import numpy as np + cimport numpy as cnp + cnp.import_array() -from scipy.sparse import issparse -from scipy.sparse import csr_matrix -from scipy.sparse import isspmatrix_csr +from scipy.sparse import csr_matrix, issparse, isspmatrix_csr + +from ._utils cimport safe_realloc, sizet_ptr_to_ndarray -from ._utils cimport safe_realloc -from ._utils cimport sizet_ptr_to_ndarray cdef extern from "numpy/arrayobject.h": object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr, @@ -152,15 +153,23 @@ cdef struct StackRecord: cdef class DepthFirstTreeBuilder(TreeBuilder): """Build a decision tree in depth-first fashion.""" - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, double min_weight_leaf, - SIZE_t max_depth, double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + double min_impurity_decrease, + unsigned char store_leaf_values=False + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.min_weight_leaf = min_weight_leaf self.max_depth = max_depth self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -203,9 +212,11 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef bint is_left cdef SIZE_t n_node_samples = splitter.n_samples cdef double weighted_n_node_samples - cdef SplitRecord split cdef SIZE_t node_id + cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef double impurity = INFINITY cdef SIZE_t n_constant_features cdef bint is_leaf @@ -255,7 +266,12 @@ 
cdef class DepthFirstTreeBuilder(TreeBuilder): is_leaf = is_leaf or impurity <= EPSILON if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision # issues stop splitting, producing trees that are # dissimilar to v0.18 @@ -263,10 +279,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): (split.improvement + EPSILON < min_impurity_decrease)) - node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + node_id = tree._add_node(parent, is_left, is_leaf, split_ptr, + impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: rc = -1 @@ -296,6 +311,9 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): "is_left": 1, "impurity": split.impurity_left, "n_constant_features": n_constant_features}) + elif self.store_leaf_values and is_leaf: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[node_id]) if depth > max_depth_seen: max_depth_seen = depth @@ -305,6 +323,10 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if rc >= 0: tree.max_depth = max_depth_seen + + # free the memory created for the SplitRecord pointer + free(split_ptr) + if rc == -1: raise MemoryError() @@ -348,10 +370,17 @@ cdef class BestFirstTreeBuilder(TreeBuilder): """ cdef SIZE_t max_leaf_nodes - def __cinit__(self, Splitter splitter, SIZE_t min_samples_split, - SIZE_t min_samples_leaf, min_weight_leaf, - SIZE_t max_depth, SIZE_t max_leaf_nodes, - double min_impurity_decrease): + def __cinit__( + self, + Splitter splitter, + SIZE_t min_samples_split, + SIZE_t min_samples_leaf, + double min_weight_leaf, + SIZE_t max_depth, + SIZE_t max_leaf_nodes, + double min_impurity_decrease, + unsigned char store_leaf_values=False, + ): self.splitter = splitter self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf @@ -359,6 +388,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): self.max_depth = max_depth self.max_leaf_nodes = max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + self.store_leaf_values = store_leaf_values cpdef build( self, @@ -419,6 +449,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + if self.store_leaf_values: + # copy leaf values to leaf_values array + splitter.node_samples(tree.value_samples[record.node_id]) else: # Node is expandable @@ -471,6 +504,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): FrontierRecord* res) except -1 nogil: """Adds node w/ partition ``[start, end)`` to the frontier. 
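+
+        Illustrative note (not part of the Cython code path): the split record is
+        now handled through an opaque ``SplitRecord*`` whose size comes from
+        ``splitter.pointer_size()``, presumably so that specialised splitters can
+        append extra fields to the base struct while the builder only reads the
+        shared ones. A rough, self-contained Python analogue of that design,
+        using a hypothetical extended record, is::
+
+            from dataclasses import dataclass
+
+            @dataclass
+            class SplitRecordSketch:            # base fields every tree consumes
+                feature: int
+                threshold: float
+                missing_go_to_left: bool
+
+            @dataclass
+            class ObliqueSplitSketch(SplitRecordSketch):   # hypothetical subclass
+                weights: tuple = ()             # extra data only the subclass reads
+
+            def add_node(nodes, split):
+                # the base tree only touches the shared fields
+                nodes.append((split.feature, split.threshold,
+                              split.missing_go_to_left))
+
+            nodes = []
+            add_node(nodes, ObliqueSplitSketch(2, 0.5, False, weights=(0.3, 0.7)))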
""" cdef SplitRecord split + cdef SplitRecord* split_ptr = malloc(splitter.pointer_size()) + cdef SIZE_t node_id cdef SIZE_t n_node_samples cdef SIZE_t n_constant_features = 0 @@ -492,7 +527,11 @@ cdef class BestFirstTreeBuilder(TreeBuilder): ) if not is_leaf: - splitter.node_split(impurity, &split, &n_constant_features) + splitter.node_split(impurity, split_ptr, &n_constant_features) + # assign local copy of SplitRecord to assign + # pos, improvement, and impurity scores + split = deref(split_ptr) + # If EPSILON=0 in the below comparison, float precision issues stop # splitting early, producing trees that are dissimilar to v0.18 is_leaf = (is_leaf or split.pos >= end or @@ -502,9 +541,8 @@ cdef class BestFirstTreeBuilder(TreeBuilder): if parent != NULL else _TREE_UNDEFINED, is_left, is_leaf, - split.feature, split.threshold, impurity, n_node_samples, - weighted_n_node_samples, - split.missing_go_to_left) + split_ptr, impurity, n_node_samples, + weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: return -1 @@ -533,6 +571,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): res.impurity_left = impurity res.impurity_right = impurity + free(split_ptr) return 0 @@ -540,194 +579,15 @@ cdef class BestFirstTreeBuilder(TreeBuilder): # Tree # ============================================================================= -cdef class Tree: - """Array-based representation of a binary decision tree. - - The binary tree is represented as a number of parallel arrays. The i-th - element of each array holds information about the node `i`. Node 0 is the - tree's root. You can find a detailed description of all arrays in - `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split - nodes, resp. In this case the values of nodes of the other type are - arbitrary! - - Attributes - ---------- - node_count : int - The number of nodes (internal nodes + leaves) in the tree. - - capacity : int - The current capacity (i.e., size) of the arrays, which is at least as - great as `node_count`. - - max_depth : int - The depth of the tree, i.e. the maximum depth of its leaves. - - children_left : array of int, shape [node_count] - children_left[i] holds the node id of the left child of node i. - For leaves, children_left[i] == TREE_LEAF. Otherwise, - children_left[i] > i. This child handles the case where - X[:, feature[i]] <= threshold[i]. - - children_right : array of int, shape [node_count] - children_right[i] holds the node id of the right child of node i. - For leaves, children_right[i] == TREE_LEAF. Otherwise, - children_right[i] > i. This child handles the case where - X[:, feature[i]] > threshold[i]. - - feature : array of int, shape [node_count] - feature[i] holds the feature to split on, for the internal node i. - - threshold : array of double, shape [node_count] - threshold[i] holds the threshold for the internal node i. - - value : array of double, shape [node_count, n_outputs, max_n_classes] - Contains the constant prediction value of each node. - - impurity : array of double, shape [node_count] - impurity[i] holds the impurity (i.e., the value of the splitting - criterion) at node i. - - n_node_samples : array of int, shape [node_count] - n_node_samples[i] holds the number of training samples reaching node i. +cdef class BaseTree: + """Base class for Cython tree models. - weighted_n_node_samples : array of double, shape [node_count] - weighted_n_node_samples[i] holds the weighted number of training samples - reaching node i. 
+ Downstream classes must implement """ - # Wrap for outside world. - # WARNING: these reference the current `nodes` and `value` buffers, which - # must not be freed by a subsequent memory allocation. - # (i.e. through `_resize` or `__setstate__`) - @property - def n_classes(self): - return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) - - @property - def children_left(self): - return self._get_node_ndarray()['left_child'][:self.node_count] - - @property - def children_right(self): - return self._get_node_ndarray()['right_child'][:self.node_count] - - @property - def n_leaves(self): - return np.sum(np.logical_and( - self.children_left == -1, - self.children_right == -1)) - - @property - def feature(self): - return self._get_node_ndarray()['feature'][:self.node_count] - - @property - def threshold(self): - return self._get_node_ndarray()['threshold'][:self.node_count] - - @property - def impurity(self): - return self._get_node_ndarray()['impurity'][:self.node_count] - - @property - def n_node_samples(self): - return self._get_node_ndarray()['n_node_samples'][:self.node_count] - - @property - def weighted_n_node_samples(self): - return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] - - @property - def missing_go_to_left(self): - return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] - - @property - def value(self): - return self._get_value_ndarray()[:self.node_count] - - # TODO: Convert n_classes to cython.integral memory view once - # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): - """Constructor.""" - cdef SIZE_t dummy = 0 - size_t_dtype = np.array(dummy).dtype - - n_classes = _check_n_classes(n_classes, size_t_dtype) - - # Input/Output layout - self.n_features = n_features - self.n_outputs = n_outputs - self.n_classes = NULL - safe_realloc(&self.n_classes, n_outputs) - - self.max_n_classes = np.max(n_classes) - self.value_stride = n_outputs * self.max_n_classes - - cdef SIZE_t k - for k in range(n_outputs): - self.n_classes[k] = n_classes[k] - - # Inner structures - self.max_depth = 0 - self.node_count = 0 - self.capacity = 0 - self.value = NULL - self.nodes = NULL - - def __dealloc__(self): - """Destructor.""" - # Free all inner structures - free(self.n_classes) - free(self.value) - free(self.nodes) - - def __reduce__(self): - """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) - - def __getstate__(self): - """Getstate re-implementation, for pickling.""" - d = {} - # capacity is inferred during the __setstate__ using nodes - d["max_depth"] = self.max_depth - d["node_count"] = self.node_count - d["nodes"] = self._get_node_ndarray() - d["values"] = self._get_value_ndarray() - return d - - def __setstate__(self, d): - """Setstate re-implementation, for unpickling.""" - self.max_depth = d["max_depth"] - self.node_count = d["node_count"] - - if 'nodes' not in d: - raise ValueError('You have loaded Tree version which ' - 'cannot be imported') - - node_ndarray = d['nodes'] - value_ndarray = d['values'] - - value_shape = (node_ndarray.shape[0], self.n_outputs, - self.max_n_classes) - - node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) - value_ndarray = _check_value_ndarray( - value_ndarray, - expected_dtype=np.dtype(np.float64), - expected_shape=value_shape - ) - - self.capacity = node_ndarray.shape[0] - if 
self._resize_c(self.capacity) != 0: - raise MemoryError("resizing tree to %d" % self.capacity) - - memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), - self.capacity * sizeof(Node)) - memcpy(self.value, cnp.PyArray_DATA(value_ndarray), - self.capacity * self.value_stride * sizeof(double)) - - cdef int _resize(self, SIZE_t capacity) except -1 nogil: + cdef int _resize( + self, + SIZE_t capacity + ) except -1 nogil: """Resize all inner arrays to `capacity`, if `capacity` == -1, then double the size of the inner arrays. @@ -739,7 +599,10 @@ cdef class Tree: with gil: raise MemoryError() - cdef int _resize_c(self, SIZE_t capacity=INTPTR_MAX) except -1 nogil: + cdef int _resize_c( + self, + SIZE_t capacity=INTPTR_MAX + ) except -1 nogil: """Guts of _resize Returns -1 in case of failure to allocate memory (and raise MemoryError) @@ -770,15 +633,93 @@ cdef class Tree: self.capacity = capacity return 0 - cdef SIZE_t _add_node(self, SIZE_t parent, bint is_left, bint is_leaf, - SIZE_t feature, double threshold, double impurity, - SIZE_t n_node_samples, - double weighted_n_node_samples, - unsigned char missing_go_to_left) except -1 nogil: + cdef int _set_split_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set split node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the split node data. + node : Node* + The pointer to the node that will hold the split node. + """ + # left_child and right_child will be set later for a split node + node.feature = split_node.feature + node.threshold = split_node.threshold + return 1 + + cdef int _set_leaf_node( + self, + SplitRecord* split_node, + Node* node + ) except -1 nogil: + """Set leaf node data. + + Parameters + ---------- + split_node : SplitRecord* + The pointer to the record of the leaf node data. + node : Node* + The pointer to the node that will hold the leaf node. + """ + node.left_child = _TREE_LEAF + node.right_child = _TREE_LEAF + node.feature = _TREE_UNDEFINED + node.threshold = _TREE_UNDEFINED + return 1 + + cdef DTYPE_t _compute_feature( + self, + const DTYPE_t[:, :] X_ndarray, + SIZE_t sample_index, + Node *node + ) noexcept nogil: + """Compute feature from a given data matrix, X. + + In axis-aligned trees, this is simply the value in the column of X + for this specific feature. + """ + # the feature index + cdef DTYPE_t feature = X_ndarray[sample_index, node.feature] + return feature + + cdef SIZE_t _add_node( + self, + SIZE_t parent, + bint is_left, + bint is_leaf, + SplitRecord* split_node, + double impurity, + SIZE_t n_node_samples, + double weighted_n_node_samples, + unsigned char missing_go_to_left + ) except -1 nogil: """Add a node to the tree. The new node registers itself as the child of its parent. + Parameters + ---------- + parent : SIZE_t + The index of the parent. If '_TREE_UNDEFINED', then the current + node is a root node. + is_left : bint + Whether or not the current node is to the left of the parent node. + is_leaf : bint + Whether or not the current node is a leaf node. + split_node : SplitRecord* + A pointer to a SplitRecord pointer address. + impurity : double + The impurity of the node to be added. + n_node_samples : SIZE_t + The number of samples in the node. + weighted_n_node_samples : double + The weight of the samples in the node. + Returns (size_t)(-1) on error. 
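+
+        A rough Python analogue of the control flow (a sketch over dict-based
+        nodes, not the actual implementation)::
+
+            def add_node(nodes, parent, is_left, is_leaf, split, impurity,
+                         n_node_samples, weighted_n_node_samples,
+                         missing_go_to_left):
+                node_id = len(nodes)
+                node = {"impurity": impurity,
+                        "n_node_samples": n_node_samples,
+                        "weighted_n_node_samples": weighted_n_node_samples,
+                        "missing_go_to_left": missing_go_to_left}
+                if parent is not None:
+                    key = "left_child" if is_left else "right_child"
+                    nodes[parent][key] = node_id
+                if is_leaf:
+                    # mirrors _set_leaf_node: terminal markers, no split data
+                    node.update(left_child=None, right_child=None,
+                                feature=None, threshold=None)
+                else:
+                    # mirrors _set_split_node: children are filled in later
+                    node.update(feature=split["feature"],
+                                threshold=split["threshold"])
+                nodes.append(node)
+                return node_id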
""" cdef SIZE_t node_id = self.node_count @@ -799,29 +740,19 @@ cdef class Tree: self.nodes[parent].right_child = node_id if is_leaf: - node.left_child = _TREE_LEAF - node.right_child = _TREE_LEAF - node.feature = _TREE_UNDEFINED - node.threshold = _TREE_UNDEFINED - + if self._set_leaf_node(split_node, node) != 1: + with gil: + raise RuntimeError else: - # left_child and right_child will be set later - node.feature = feature - node.threshold = threshold + if self._set_split_node(split_node, node) != 1: + with gil: + raise RuntimeError node.missing_go_to_left = missing_go_to_left self.node_count += 1 return node_id - cpdef cnp.ndarray predict(self, object X): - """Predict target for X.""" - out = self._get_value_ndarray().take(self.apply(X), axis=0, - mode='clip') - if self.n_outputs == 1: - out = out.reshape(X.shape[0], self.max_n_classes) - return out - cpdef cnp.ndarray apply(self, object X): """Finds the terminal region (=leaf node) for each sample in X.""" if issparse(X): @@ -855,9 +786,10 @@ cdef class Tree: with nogil: for i in range(n_samples): node = self.nodes + # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] + X_i_node_feature = self._compute_feature(X_ndarray, i, node) # ... and node.right_child != _TREE_LEAF: if isnan(X_i_node_feature): if node.missing_go_to_left: @@ -925,7 +857,6 @@ cdef class Tree: # ... and node.right_child != _TREE_LEAF: if feature_to_sample[node.feature] == i: feature_value = X_sample[node.feature] - else: feature_value = 0. @@ -974,6 +905,9 @@ cdef class Tree: cdef Node* node = NULL cdef SIZE_t i = 0 + # the feature index + cdef DOUBLE_t feature + with nogil: for i in range(n_samples): node = self.nodes @@ -985,7 +919,9 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - if X_ndarray[i, node.feature] <= node.threshold: + # compute the feature value to compare against threshold + feature = self._compute_feature(X_ndarray, i, node) + if feature <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1114,13 +1050,12 @@ cdef class Tree: cpdef compute_feature_importances(self, normalize=True): """Computes the importance of each feature (aka variable).""" - cdef Node* left - cdef Node* right cdef Node* nodes = self.nodes cdef Node* node = nodes cdef Node* end_node = node + self.node_count cdef double normalizer = 0. + cdef int i = 0 cdef cnp.float64_t[:] importances = np.zeros(self.n_features) @@ -1128,13 +1063,9 @@ cdef class Tree: while node != end_node: if node.left_child != _TREE_LEAF: # ... and node.right_child != _TREE_LEAF: - left = &nodes[node.left_child] - right = &nodes[node.right_child] + self._compute_feature_importances( + importances, node) - importances[node.feature] += ( - node.weighted_n_node_samples * node.impurity - - left.weighted_n_node_samples * left.impurity - - right.weighted_n_node_samples * right.impurity) node += 1 for i in range(self.n_features): @@ -1150,44 +1081,27 @@ cdef class Tree: return np.asarray(importances) - cdef cnp.ndarray _get_value_ndarray(self): - """Wraps value as a 3-d NumPy array. + cdef void _compute_feature_importances( + self, + cnp.float64_t[:] importances, + Node* node + ) noexcept nogil: + """Compute feature importances from a Node in the Tree. - The array keeps a reference to this Tree, which manages the underlying - memory. + Wrapped in a private function to allow subclassing that + computes feature importances. 
""" - cdef cnp.npy_intp shape[3] - shape[0] = self.node_count - shape[1] = self.n_outputs - shape[2] = self.max_n_classes - cdef cnp.ndarray arr - arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + cdef Node* nodes = self.nodes + cdef Node* left + cdef Node* right - cdef cnp.ndarray _get_node_ndarray(self): - """Wraps nodes as a NumPy struct array. + left = &nodes[node.left_child] + right = &nodes[node.right_child] - The array keeps a reference to this Tree, which manages the underlying - memory. Individual fields are publicly accessible as properties of the - Tree. - """ - cdef cnp.npy_intp shape[1] - shape[0] = self.node_count - cdef cnp.npy_intp strides[1] - strides[0] = sizeof(Node) - cdef cnp.ndarray arr - Py_INCREF(NODE_DTYPE) - arr = PyArray_NewFromDescr( cnp.ndarray, - NODE_DTYPE, 1, shape, - strides, self.nodes, - cnp.NPY_ARRAY_DEFAULT, None) - Py_INCREF(self) - if PyArray_SetBaseObject(arr, self) < 0: - raise ValueError("Can't initialize array.") - return arr + importances[node.feature] += ( + node.weighted_n_node_samples * node.impurity - + left.weighted_n_node_samples * left.impurity - + right.weighted_n_node_samples * right.impurity) def compute_partial_dependence(self, DTYPE_t[:, ::1] X, int[::1] target_features, @@ -1296,6 +1210,282 @@ cdef class Tree: total_weight) +cdef class Tree(BaseTree): + """Array-based representation of a binary decision tree. + + The binary tree is represented as a number of parallel arrays. The i-th + element of each array holds information about the node `i`. Node 0 is the + tree's root. You can find a detailed description of all arrays in + `_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split + nodes, resp. In this case the values of nodes of the other type are + arbitrary! + + Attributes + ---------- + node_count : int + The number of nodes (internal nodes + leaves) in the tree. + + capacity : int + The current capacity (i.e., size) of the arrays, which is at least as + great as `node_count`. + + max_depth : int + The depth of the tree, i.e. the maximum depth of its leaves. + + children_left : array of int, shape [node_count] + children_left[i] holds the node id of the left child of node i. + For leaves, children_left[i] == TREE_LEAF. Otherwise, + children_left[i] > i. This child handles the case where + X[:, feature[i]] <= threshold[i]. + + children_right : array of int, shape [node_count] + children_right[i] holds the node id of the right child of node i. + For leaves, children_right[i] == TREE_LEAF. Otherwise, + children_right[i] > i. This child handles the case where + X[:, feature[i]] > threshold[i]. + + feature : array of int, shape [node_count] + feature[i] holds the feature to split on, for the internal node i. + + threshold : array of double, shape [node_count] + threshold[i] holds the threshold for the internal node i. + + value : array of double, shape [node_count, n_outputs, max_n_classes] + Contains the constant prediction value of each node. + + impurity : array of double, shape [node_count] + impurity[i] holds the impurity (i.e., the value of the splitting + criterion) at node i. + + n_node_samples : array of int, shape [node_count] + n_node_samples[i] holds the number of training samples reaching node i. + + weighted_n_node_samples : array of double, shape [node_count] + weighted_n_node_samples[i] holds the weighted number of training samples + reaching node i. 
+ """ + # Wrap for outside world. + # WARNING: these reference the current `nodes` and `value` buffers, which + # must not be freed by a subsequent memory allocation. + # (i.e. through `_resize` or `__setstate__`) + @property + def n_classes(self): + return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs) + + @property + def children_left(self): + return self._get_node_ndarray()['left_child'][:self.node_count] + + @property + def children_right(self): + return self._get_node_ndarray()['right_child'][:self.node_count] + + @property + def n_leaves(self): + return np.sum(np.logical_and( + self.children_left == -1, + self.children_right == -1)) + + @property + def feature(self): + return self._get_node_ndarray()['feature'][:self.node_count] + + @property + def threshold(self): + return self._get_node_ndarray()['threshold'][:self.node_count] + + @property + def impurity(self): + return self._get_node_ndarray()['impurity'][:self.node_count] + + @property + def n_node_samples(self): + return self._get_node_ndarray()['n_node_samples'][:self.node_count] + + @property + def weighted_n_node_samples(self): + return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count] + + @property + def missing_go_to_left(self): + return self._get_node_ndarray()['missing_go_to_left'][:self.node_count] + + @property + def value(self): + return self._get_value_ndarray()[:self.node_count] + + @property + def leaf_nodes_samples(self): + leaf_node_samples = dict() + keys = self._get_value_samples_keys() + for node_id in keys: + leaf_node_samples[node_id] = self._get_value_samples_ndarray(node_id) + return leaf_node_samples + + # TODO: Convert n_classes to cython.integral memory view once + # https://github.com/cython/cython/issues/5243 is fixed + def __cinit__(self, int n_features, cnp.ndarray n_classes, int n_outputs): + """Constructor.""" + cdef SIZE_t dummy = 0 + size_t_dtype = np.array(dummy).dtype + + n_classes = _check_n_classes(n_classes, size_t_dtype) + + # Input/Output layout + self.n_features = n_features + self.n_outputs = n_outputs + self.n_classes = NULL + safe_realloc(&self.n_classes, n_outputs) + + self.max_n_classes = np.max(n_classes) + self.value_stride = n_outputs * self.max_n_classes + + cdef SIZE_t k + for k in range(n_outputs): + self.n_classes[k] = n_classes[k] + + # Inner structures + self.max_depth = 0 + self.node_count = 0 + self.capacity = 0 + self.value = NULL + self.nodes = NULL + + # initialize the hash map for the value samples + self.value_samples = unordered_map[SIZE_t, vector[vector[DOUBLE_t]]]() + + def __dealloc__(self): + """Destructor.""" + # Free all inner structures + free(self.n_classes) + free(self.value) + free(self.nodes) + + def __reduce__(self): + """Reduce re-implementation, for pickling.""" + return (Tree, (self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs), self.__getstate__()) + + def __getstate__(self): + """Getstate re-implementation, for pickling.""" + d = {} + # capacity is inferred during the __setstate__ using nodes + d["max_depth"] = self.max_depth + d["node_count"] = self.node_count + d["nodes"] = self._get_node_ndarray() + d["values"] = self._get_value_ndarray() + d['value_samples'] = self.leaf_nodes_samples + return d + + def __setstate__(self, d): + """Setstate re-implementation, for unpickling.""" + self.max_depth = d["max_depth"] + self.node_count = d["node_count"] + + if 'nodes' not in d: + raise ValueError('You have loaded Tree version which ' + 'cannot be imported') + + node_ndarray = d['nodes'] + 
value_ndarray = d['values'] + + value_shape = (node_ndarray.shape[0], self.n_outputs, + self.max_n_classes) + + node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE) + value_ndarray = _check_value_ndarray( + value_ndarray, + expected_dtype=np.dtype(np.float64), + expected_shape=value_shape + ) + + self.capacity = node_ndarray.shape[0] + if self._resize_c(self.capacity) != 0: + raise MemoryError("resizing tree to %d" % self.capacity) + + memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray), + self.capacity * sizeof(Node)) + memcpy(self.value, cnp.PyArray_DATA(value_ndarray), + self.capacity * self.value_stride * sizeof(double)) + + # store the leaf node samples if they exist + value_samples_dict = d['value_samples'] + for node_id, leaf_samples in value_samples_dict.items(): + self.value_samples[node_id].resize(leaf_samples.shape[0]) + for idx in range(leaf_samples.shape[0]): + for jdx in range(leaf_samples.shape[1]): + self.value_samples[node_id][idx].push_back(leaf_samples[idx, jdx]) + + cdef cnp.ndarray _get_value_samples_ndarray(self, SIZE_t node_id): + """Wraps value_samples as a 2-d NumPy array per node_id.""" + cdef int i, j + cdef int n_samples = self.value_samples[node_id].size() + cdef cnp.ndarray[DOUBLE_t, ndim=2, mode='c'] leaf_node_samples = np.empty(shape=(n_samples, self.n_outputs), dtype=np.float64) + + for i in range(n_samples): + for j in range(self.n_outputs): + leaf_node_samples[i, j] = self.value_samples[node_id][i][j] + return leaf_node_samples + + cdef cnp.ndarray _get_value_samples_keys(self): + """Wraps value_samples keys as a 1-d NumPy array of keys.""" + cdef cnp.ndarray[SIZE_t, ndim=1, mode='c'] keys = np.empty(len(self.value_samples), dtype=np.intp) + cdef unsigned int i = 0 + + for key in self.value_samples: + keys[i] = key.first + i += 1 + return keys + + cdef cnp.ndarray _get_value_ndarray(self): + """Wraps value as a 3-d NumPy array. + + The array keeps a reference to this Tree, which manages the underlying + memory. + """ + cdef cnp.npy_intp shape[3] + shape[0] = self.node_count + shape[1] = self.n_outputs + shape[2] = self.max_n_classes + cdef cnp.ndarray arr + arr = cnp.PyArray_SimpleNewFromData(3, shape, cnp.NPY_DOUBLE, self.value) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cdef cnp.ndarray _get_node_ndarray(self): + """Wraps nodes as a NumPy struct array. + + The array keeps a reference to this Tree, which manages the underlying + memory. Individual fields are publicly accessible as properties of the + Tree. 
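+
+        Rough NumPy analogue of the wrapping done here (the real array is a
+        zero-copy view over the C ``Node`` buffer; the dtype below only sketches
+        the fields and is not the exact ``NODE_DTYPE``)::
+
+            import numpy as np
+
+            node_dtype = np.dtype([
+                ("left_child", np.intp), ("right_child", np.intp),
+                ("feature", np.intp), ("threshold", np.float64),
+                ("impurity", np.float64), ("n_node_samples", np.intp),
+                ("weighted_n_node_samples", np.float64),
+                ("missing_go_to_left", np.uint8),
+            ])
+            nodes = np.zeros(4, dtype=node_dtype)    # stand-in buffer
+            children_left = nodes["left_child"]      # what the property slices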
+ """ + cdef cnp.npy_intp shape[1] + shape[0] = self.node_count + cdef cnp.npy_intp strides[1] + strides[0] = sizeof(Node) + cdef cnp.ndarray arr + Py_INCREF(NODE_DTYPE) + arr = PyArray_NewFromDescr( cnp.ndarray, + NODE_DTYPE, 1, shape, + strides, self.nodes, + cnp.NPY_ARRAY_DEFAULT, None) + Py_INCREF(self) + if PyArray_SetBaseObject(arr, self) < 0: + raise ValueError("Can't initialize array.") + return arr + + cpdef cnp.ndarray predict(self, object X): + """Predict target for X.""" + out = self._get_value_ndarray().take(self.apply(X), axis=0, + mode='clip') + if self.n_outputs == 1: + out = out.reshape(X.shape[0], self.max_n_classes) + return out + + def _check_n_classes(n_classes, expected_dtype): if n_classes.ndim != 1: raise ValueError( @@ -1780,6 +1970,8 @@ cdef _build_pruned_tree( stack[BuildPrunedRecord] prune_stack BuildPrunedRecord stack_record + SplitRecord split + with nogil: # push root node onto stack prune_stack.push({"start": 0, "depth": 0, "parent": _TREE_UNDEFINED, "is_left": 0}) @@ -1796,8 +1988,12 @@ cdef _build_pruned_tree( is_leaf = leaves_in_subtree[orig_node_id] node = &orig_tree.nodes[orig_node_id] + # redefine to a SplitRecord to pass into _add_node + split.feature = node.feature + split.threshold = node.threshold + new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, &split, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 4938d3030245f..4d3575a0526ab 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -9,8 +9,10 @@ # See _utils.pyx for details. cimport numpy as cnp + +from sklearn.neighbors._quad_tree cimport Cell + from ._tree cimport Node -from ..neighbors._quad_tree cimport Cell ctypedef cnp.npy_float32 DTYPE_t # Type of X ctypedef cnp.npy_float64 DOUBLE_t # Type of y, sample_weight diff --git a/sklearn/tree/_utils.pyx b/sklearn/tree/_utils.pyx index 669d69409fdc3..02dc7cf426efc 100644 --- a/sklearn/tree/_utils.pyx +++ b/sklearn/tree/_utils.pyx @@ -1,3 +1,6 @@ +# cython: language_level=3 +# cython: boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True + # Authors: Gilles Louppe # Peter Prettenhofer # Arnaud Joly @@ -7,16 +10,17 @@ # # License: BSD 3 clause -from libc.stdlib cimport free -from libc.stdlib cimport realloc -from libc.math cimport log as ln from libc.math cimport isnan +from libc.math cimport log as ln +from libc.stdlib cimport free, realloc import numpy as np + cimport numpy as cnp + cnp.import_array() -from ..utils._random cimport our_rand_r +from sklearn.utils._random cimport our_rand_r # ============================================================================= # Helper functions diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index cadbe2c9f702e..0ce7a548c7bdb 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -882,7 +882,7 @@ def test_pickle(): else: X, y = diabetes.data, diabetes.target - est = TreeEstimator(random_state=0) + est = TreeEstimator(random_state=0, store_leaf_values=True) est.fit(X, y) score = est.score(X, y) @@ -901,6 +901,7 @@ def test_pickle(): "n_node_samples", "weighted_n_node_samples", "value", + "leaf_nodes_samples", ] fitted_attribute = { attribute: getattr(est.tree_, attribute) for attribute in attributes @@ -915,14 +916,25 @@ def test_pickle(): score == score2 ), "Failed to generate same score after pickling with {0}".format(name) for 
attribute in fitted_attribute: - assert_array_equal( - getattr(est2.tree_, attribute), - fitted_attribute[attribute], - err_msg=( - f"Failed to generate same attribute {attribute} after pickling with" - f" {name}" - ), - ) + if attribute == "leaf_nodes_samples": + for key in fitted_attribute[attribute].keys(): + assert_array_equal( + getattr(est2.tree_, attribute)[key], + fitted_attribute[attribute][key], + err_msg=( + f"Failed to generate same attribute {attribute} after" + f" pickling with {name}" + ), + ) + else: + assert_array_equal( + getattr(est2.tree_, attribute), + fitted_attribute[attribute], + err_msg=( + f"Failed to generate same attribute {attribute} after pickling" + f" with {name}" + ), + ) def test_multioutput(): @@ -2414,7 +2426,9 @@ def test_missing_values_on_equal_nodes_no_missing(criterion): X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6]) - dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion) + dtc = DecisionTreeRegressor( + random_state=42, max_depth=1, criterion=criterion, store_leaf_values=True + ) dtc.fit(X, y) # Goes to right node because it has the most data points @@ -2626,3 +2640,148 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +@pytest.mark.parametrize( + "tree_name", + ALL_TREES, +) +def test_leaf_node_samples(tree_name): + """Test getting leaf node samples from fitted tree.""" + tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=False) + tree.fit(X_small, y_small) + + # Check that the leaf node samples are not stored by default + assert tree.tree_.leaf_nodes_samples == dict() + + # error should be raised if trying to predict quantiles + assert hasattr(tree, "predict_quantiles") + for meth in ["predict_quantiles", "get_leaf_node_samples"]: + if hasattr(tree, meth): + with pytest.raises( + RuntimeError, + match="leaf node samples", + ): + getattr(tree, meth)(X_small) + + quantile_tree = ALL_TREES[tree_name](random_state=0, store_leaf_values=True) + quantile_tree.fit(X_small, y_small) + + score = tree.score(X_small, y_small) + new_score = quantile_tree.score(X_small, y_small) + assert np.isclose(score, new_score) + + # Check that the leaf node samples are what they should be + X_leaves = quantile_tree.apply(X_small) + for idx in range(X_leaves.shape[0]): + leaf_idx = X_leaves[idx] + assert y_small[idx] in quantile_tree.tree_.leaf_nodes_samples[leaf_idx] + assert set(np.unique(X_leaves)) == set( + quantile_tree.tree_.leaf_nodes_samples.keys() + ) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0) + + # fit on binary results in perfect leaves, so all quantiles are the same + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(X_small), pred[:, 0]) + assert_array_equal(est.predict(X_small), pred[:, 1]) + assert_array_equal(est.predict(X_small), pred[:, 2]) + assert_array_equal(pred[:, 0], y_small) + assert np.unique(pred, axis=1).shape[1] == 1 + + est.fit(X_small[:-5], y_small[:-5]) + held_out_X = X_small[-5:, :] + pred = est.predict_quantiles(held_out_X, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(est.predict(held_out_X), pred[:, 0]) + assert_array_equal(est.predict(held_out_X), pred[:, 1]) 
+ assert_array_equal(est.predict(held_out_X), pred[:, 2]) + + # fit on real data + est.fit(iris.data, iris.target) + pred = est.predict_quantiles(iris.data, quantiles=[0.1, 0.5, 0.9]) + assert_array_equal(pred[:, 0], iris.target) + assert_array_equal(pred[:, 1], iris.target) + assert_array_equal(pred[:, 2], iris.target) + + +@pytest.mark.parametrize( + "name", + ALL_TREES, +) +def test_quantile_tree_predict_impure_leaves(name): + TreeEstimator = ALL_TREES[name] + + # test quantile prediction + est = TreeEstimator(store_leaf_values=True, random_state=0, max_depth=4) + # fit on binary results with constrained depth will result in impure leaves + est.fit(X_small, y_small) + pred = est.predict_quantiles(X_small, quantiles=[0.1, 0.5, 0.9]) + assert np.unique(pred, axis=1).shape[1] > 1 + + +def test_multioutput_quantiles(): + # Check estimators on multi-output problems. + X = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + + y = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] + + T = [[-1, -1], [1, 1], [-1, 1], [1, -1]] + y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]] + + # toy classification problem + for name, TreeClassifier in CLF_TREES.items(): + clf = TreeClassifier(random_state=0, store_leaf_values=True) + clf.fit(X, y) + + y_hat = clf.predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + y_hat = y_hat.squeeze() + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2) + + # toy regression problem + for name, TreeRegressor in REG_TREES.items(): + reg = TreeRegressor(random_state=0, store_leaf_values=True) + y_hat = reg.fit(X, y).predict_quantiles(T, quantiles=[0.25, 0.5, 0.75]) + assert_array_equal(y_hat[:, 0], y_true) + assert_array_equal(y_hat[:, 1], y_true) + assert_array_equal(y_hat[:, 2], y_true) + assert y_hat.shape == (4, 3, 2)
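
The tests above exercise the new user-facing surface of this patch. A minimal
end-to-end sketch of that surface, assuming the fork's `store_leaf_values`
parameter, `tree_.leaf_nodes_samples` mapping, and `predict_quantiles` method
behave as the tests assert:

    import numpy as np
    from sklearn.datasets import load_diabetes
    from sklearn.tree import DecisionTreeRegressor

    X, y = load_diabetes(return_X_y=True)

    # Ask the builder to keep the training targets that reach each leaf.
    reg = DecisionTreeRegressor(random_state=0, max_depth=3, store_leaf_values=True)
    reg.fit(X, y)

    # Mapping of leaf node id -> stored sample values for that leaf.
    leaf_samples = reg.tree_.leaf_nodes_samples
    assert set(np.unique(reg.apply(X))) == set(leaf_samples.keys())

    # Quantiles computed from the stored leaf distributions; for a single-output
    # regressor the result is indexed as pred[:, quantile_index].
    pred = reg.predict_quantiles(X, quantiles=[0.1, 0.5, 0.9])
    print(pred.shape)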