PERF Implement PairwiseDistancesReduction backend for RadiusNeighbors.predict_proba (scikit-learn#26828)

OmarManzoor · Omar Salman · jjerphan · web-flow · commit f86f41d80bff · 2023-09-25T10:56:45.000-04:00
Co-authored-by: Omar Salman &lt;omar.salman@arbisoft&gt;
Co-authored-by: Julien Jerphanion &lt;git@jjerphan.xyz&gt;
diff --git a/.gitignore b/.gitignore
@@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
 sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
 sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
+sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
 sklearn/neighbors/_ball_tree.pyx
 sklearn/neighbors/_binary_tree.pxi
 sklearn/neighbors/_kd_tree.pyx
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -280,6 +280,11 @@ Changelog
   :class:`metric.DistanceMetric` objects.
   :pr:`26267` by :user:`Meekail Zain <micky774>`
 
+- |Efficiency| The performance of :meth:`neighbors.RadiusNeighborsClassifier.predict`
+  and of :meth:`neighbors.RadiusNeighborsClassifier.predict_proba` has been improved
+  when `radius` is large and `algorithm="brute"` with non-Euclidean metrics.
+  :pr:`26828` by :user:`Omar Salman <OmarManzoor>`.
+
 :mod:`sklearn.preprocessing`
 ............................
 
diff --git a/setup.cfg b/setup.cfg
@@ -53,6 +53,7 @@ ignore =
     sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
     sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
+    sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
     sklearn/neighbors/_ball_tree.pyx
     sklearn/neighbors/_binary_tree.pxi
     sklearn/neighbors/_kd_tree.pyx
diff --git a/setup.py b/setup.py
@@ -295,6 +295,12 @@ def check_package_status(package, min_version):
             "include_np": True,
             "extra_compile_args": ["-std=c++11"],
         },
+        {
+            "sources": ["_radius_neighbors_classmode.pyx.tp"],
+            "language": "c++",
+            "include_np": True,
+            "extra_compile_args": ["-std=c++11"],
+        },
     ],
     "preprocessing": [
         {"sources": ["_csr_polynomial_expansion.pyx"]},
diff --git a/sklearn/metrics/_pairwise_distances_reduction/__init__.py b/sklearn/metrics/_pairwise_distances_reduction/__init__.py
@@ -91,6 +91,7 @@
     ArgKminClassMode,
     BaseDistancesReductionDispatcher,
     RadiusNeighbors,
+    RadiusNeighborsClassMode,
     sqeuclidean_row_norms,
 )
 
@@ -99,5 +100,6 @@
     "ArgKmin",
     "RadiusNeighbors",
     "ArgKminClassMode",
+    "RadiusNeighborsClassMode",
     "sqeuclidean_row_norms",
 ]
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py b/sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py
@@ -23,6 +23,10 @@
     RadiusNeighbors32,
     RadiusNeighbors64,
 )
+from ._radius_neighbors_classmode import (
+    RadiusNeighborsClassMode32,
+    RadiusNeighborsClassMode64,
+)
 
 
 def sqeuclidean_row_norms(X, num_threads):
@@ -597,3 +601,153 @@ def compute(
             "Only float64 or float32 datasets pairs are supported at this time, "
             f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
         )
+
+
+class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
+    """Compute radius-based class modes of row vectors of X using the
+    those of Y.
+
+    For each row-vector X[i] of the queries X, find all the indices j of
+    row-vectors in Y such that:
+
+                        dist(X[i], Y[j]) <= radius
+
+    RadiusNeighborsClassMode is typically used to perform bruteforce
+    radius neighbors queries when the weighted mode of the labels for
+    the nearest neighbors within the specified radius are required,
+    such as in `predict` methods.
+
+    This class is not meant to be instantiated, one should only use
+    its :meth:`compute` classmethod which handles allocation and
+    deallocation consistently.
+    """
+
+    @classmethod
+    def valid_metrics(cls) -> List[str]:
+        excluded = {
+            # Euclidean is technically usable for RadiusNeighborsClassMode
+            # but it would not be competitive.
+            # TODO: implement Euclidean specialization using GEMM.
+            "euclidean",
+            "sqeuclidean",
+        }
+        return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)
+
+    @classmethod
+    def compute(
+        cls,
+        X,
+        Y,
+        radius,
+        weights,
+        Y_labels,
+        unique_Y_labels,
+        outlier_label,
+        metric="euclidean",
+        chunk_size=None,
+        metric_kwargs=None,
+        strategy=None,
+    ):
+        """Return the results of the reduction for the given arguments.
+        Parameters
+        ----------
+        X : ndarray of shape (n_samples_X, n_features)
+            The input array to be labelled.
+        Y : ndarray of shape (n_samples_Y, n_features)
+            The input array whose class membership is provided through
+            the `Y_labels` parameter.
+        radius : float
+            The radius defining the neighborhood.
+        weights : ndarray
+            The weights applied to the `Y_labels` when computing the
+            weighted mode of the labels.
+        Y_labels : ndarray
+            An array containing the index of the class membership of the
+            associated samples in `Y`. This is used in labeling `X`.
+        unique_Y_labels : ndarray
+            An array containing all unique class labels.
+        outlier_label : int, default=None
+            Label for outlier samples (samples with no neighbors in given
+            radius). In the default case when the value is None if any
+            outlier is detected, a ValueError will be raised. The outlier
+            label should be selected from among the unique 'Y' labels. If
+            it is specified with a different value a warning will be raised
+            and all class probabilities of outliers will be assigned to be 0.
+        metric : str, default='euclidean'
+            The distance metric to use. For a list of available metrics, see
+            the documentation of :class:`~sklearn.metrics.DistanceMetric`.
+            Currently does not support `'precomputed'`.
+        chunk_size : int, default=None,
+            The number of vectors per chunk. If None (default) looks-up in
+            scikit-learn configuration for `pairwise_dist_chunk_size`,
+            and use 256 if it is not set.
+        metric_kwargs : dict, default=None
+            Keyword arguments to pass to specified metric function.
+        strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
+            The chunking strategy defining which dataset parallelization are made on.
+            For both strategies the computations happens with two nested loops,
+            respectively on chunks of X and chunks of Y.
+            Strategies differs on which loop (outer or inner) is made to run
+            in parallel with the Cython `prange` construct:
+              - 'parallel_on_X' dispatches chunks of X uniformly on threads.
+                Each thread then iterates on all the chunks of Y. This strategy is
+                embarrassingly parallel and comes with no datastructures
+                synchronisation.
+              - 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
+                Each thread processes all the chunks of X in turn. This strategy is
+                a sequence of embarrassingly parallel subtasks (the inner loop on Y
+                chunks) with intermediate datastructures synchronisation at each
+                iteration of the sequential outer loop on X chunks.
+              - 'auto' relies on a simple heuristic to choose between
+                'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
+                'parallel_on_X' is usually the most efficient strategy.
+                When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
+                brings more opportunity for parallelism and is therefore more efficient
+                despite the synchronization step at each iteration of the outer loop
+                on chunks of `X`.
+              - None (default) looks-up in scikit-learn configuration for
+                `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
+        Returns
+        -------
+        probabilities : ndarray of shape (n_samples_X, n_classes)
+            An array containing the class probabilities for each sample.
+        """
+        if weights not in {"uniform", "distance"}:
+            raise ValueError(
+                "Only the 'uniform' or 'distance' weights options are supported"
+                f" at this time. Got: {weights=}."
+            )
+        if X.dtype == Y.dtype == np.float64:
+            return RadiusNeighborsClassMode64.compute(
+                X=X,
+                Y=Y,
+                radius=radius,
+                weights=weights,
+                Y_labels=np.array(Y_labels, dtype=np.intp),
+                unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
+                outlier_label=outlier_label,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        if X.dtype == Y.dtype == np.float32:
+            return RadiusNeighborsClassMode32.compute(
+                X=X,
+                Y=Y,
+                radius=radius,
+                weights=weights,
+                Y_labels=np.array(Y_labels, dtype=np.intp),
+                unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
+                outlier_label=outlier_label,
+                metric=metric,
+                chunk_size=chunk_size,
+                metric_kwargs=metric_kwargs,
+                strategy=strategy,
+            )
+
+        raise ValueError(
+            "Only float64 or float32 datasets pairs are supported at this time, "
+            f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
+        )
diff --git a/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp b/sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx.tp
diff --git a/sklearn/metrics/tests/test_pairwise_distances_reduction.py b/sklearn/metrics/tests/test_pairwise_distances_reduction.py
diff --git a/sklearn/neighbors/_classification.py b/sklearn/neighbors/_classification.py

Original file line number	Diff line number	Diff line change
`@@ -91,6 +91,7 @@`
`91`	`91`	`ArgKminClassMode,`
`92`	`92`	`BaseDistancesReductionDispatcher,`
`93`	`93`	`RadiusNeighbors,`
	`94`	`+ RadiusNeighborsClassMode,`
`94`	`95`	`sqeuclidean_row_norms,`
`95`	`96`	`)`
`96`	`97`
`@@ -99,5 +100,6 @@`
`99`	`100`	`"ArgKmin",`
`100`	`101`	`"RadiusNeighbors",`
`101`	`102`	`"ArgKminClassMode",`
	`103`	`+ "RadiusNeighborsClassMode",`
`102`	`104`	`"sqeuclidean_row_norms",`
`103`	`105`	`]`