Skip to content

Commit f86f41d

Browse files
OmarManzoorOmar Salmanjjerphan
authored
PERF Implement PairwiseDistancesReduction backend for RadiusNeighbors.predict_proba (scikit-learn#26828)
Co-authored-by: Omar Salman <omar.salman@arbisoft> Co-authored-by: Julien Jerphanion <git@jjerphan.xyz>
1 parent ae2ccbf commit f86f41d

File tree

9 files changed

+572
-2
lines changed

9 files changed

+572
-2
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pxd
9999
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
100100
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
101101
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
102+
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
102103
sklearn/neighbors/_ball_tree.pyx
103104
sklearn/neighbors/_binary_tree.pxi
104105
sklearn/neighbors/_kd_tree.pyx

doc/whats_new/v1.4.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,11 @@ Changelog
280280
:class:`metric.DistanceMetric` objects.
281281
:pr:`26267` by :user:`Meekail Zain <micky774>`
282282

283+
- |Efficiency| The performance of :meth:`neighbors.RadiusNeighborsClassifier.predict`
284+
and of :meth:`neighbors.RadiusNeighborsClassifier.predict_proba` has been improved
285+
when `radius` is large and `algorithm="brute"` with non-Euclidean metrics.
286+
:pr:`26828` by :user:`Omar Salman <OmarManzoor>`.
287+
283288
:mod:`sklearn.preprocessing`
284289
............................
285290

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ ignore =
5353
sklearn/metrics/_pairwise_distances_reduction/_middle_term_computer.pyx
5454
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pxd
5555
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors.pyx
56+
sklearn/metrics/_pairwise_distances_reduction/_radius_neighbors_classmode.pyx
5657
sklearn/neighbors/_ball_tree.pyx
5758
sklearn/neighbors/_binary_tree.pxi
5859
sklearn/neighbors/_kd_tree.pyx

setup.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,12 @@ def check_package_status(package, min_version):
295295
"include_np": True,
296296
"extra_compile_args": ["-std=c++11"],
297297
},
298+
{
299+
"sources": ["_radius_neighbors_classmode.pyx.tp"],
300+
"language": "c++",
301+
"include_np": True,
302+
"extra_compile_args": ["-std=c++11"],
303+
},
298304
],
299305
"preprocessing": [
300306
{"sources": ["_csr_polynomial_expansion.pyx"]},

sklearn/metrics/_pairwise_distances_reduction/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
ArgKminClassMode,
9292
BaseDistancesReductionDispatcher,
9393
RadiusNeighbors,
94+
RadiusNeighborsClassMode,
9495
sqeuclidean_row_norms,
9596
)
9697

@@ -99,5 +100,6 @@
99100
"ArgKmin",
100101
"RadiusNeighbors",
101102
"ArgKminClassMode",
103+
"RadiusNeighborsClassMode",
102104
"sqeuclidean_row_norms",
103105
]

sklearn/metrics/_pairwise_distances_reduction/_dispatcher.py

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
RadiusNeighbors32,
2424
RadiusNeighbors64,
2525
)
26+
from ._radius_neighbors_classmode import (
27+
RadiusNeighborsClassMode32,
28+
RadiusNeighborsClassMode64,
29+
)
2630

2731

2832
def sqeuclidean_row_norms(X, num_threads):
@@ -597,3 +601,153 @@ def compute(
597601
"Only float64 or float32 datasets pairs are supported at this time, "
598602
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
599603
)
604+
605+
606+
class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher):
607+
"""Compute radius-based class modes of row vectors of X using the
608+
those of Y.
609+
610+
For each row-vector X[i] of the queries X, find all the indices j of
611+
row-vectors in Y such that:
612+
613+
dist(X[i], Y[j]) <= radius
614+
615+
RadiusNeighborsClassMode is typically used to perform bruteforce
616+
radius neighbors queries when the weighted mode of the labels for
617+
the nearest neighbors within the specified radius are required,
618+
such as in `predict` methods.
619+
620+
This class is not meant to be instantiated, one should only use
621+
its :meth:`compute` classmethod which handles allocation and
622+
deallocation consistently.
623+
"""
624+
625+
@classmethod
626+
def valid_metrics(cls) -> List[str]:
627+
excluded = {
628+
# Euclidean is technically usable for RadiusNeighborsClassMode
629+
# but it would not be competitive.
630+
# TODO: implement Euclidean specialization using GEMM.
631+
"euclidean",
632+
"sqeuclidean",
633+
}
634+
return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded)
635+
636+
@classmethod
637+
def compute(
638+
cls,
639+
X,
640+
Y,
641+
radius,
642+
weights,
643+
Y_labels,
644+
unique_Y_labels,
645+
outlier_label,
646+
metric="euclidean",
647+
chunk_size=None,
648+
metric_kwargs=None,
649+
strategy=None,
650+
):
651+
"""Return the results of the reduction for the given arguments.
652+
Parameters
653+
----------
654+
X : ndarray of shape (n_samples_X, n_features)
655+
The input array to be labelled.
656+
Y : ndarray of shape (n_samples_Y, n_features)
657+
The input array whose class membership is provided through
658+
the `Y_labels` parameter.
659+
radius : float
660+
The radius defining the neighborhood.
661+
weights : ndarray
662+
The weights applied to the `Y_labels` when computing the
663+
weighted mode of the labels.
664+
Y_labels : ndarray
665+
An array containing the index of the class membership of the
666+
associated samples in `Y`. This is used in labeling `X`.
667+
unique_Y_labels : ndarray
668+
An array containing all unique class labels.
669+
outlier_label : int, default=None
670+
Label for outlier samples (samples with no neighbors in given
671+
radius). In the default case when the value is None if any
672+
outlier is detected, a ValueError will be raised. The outlier
673+
label should be selected from among the unique 'Y' labels. If
674+
it is specified with a different value a warning will be raised
675+
and all class probabilities of outliers will be assigned to be 0.
676+
metric : str, default='euclidean'
677+
The distance metric to use. For a list of available metrics, see
678+
the documentation of :class:`~sklearn.metrics.DistanceMetric`.
679+
Currently does not support `'precomputed'`.
680+
chunk_size : int, default=None,
681+
The number of vectors per chunk. If None (default) looks-up in
682+
scikit-learn configuration for `pairwise_dist_chunk_size`,
683+
and use 256 if it is not set.
684+
metric_kwargs : dict, default=None
685+
Keyword arguments to pass to specified metric function.
686+
strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None
687+
The chunking strategy defining which dataset parallelization are made on.
688+
For both strategies the computations happens with two nested loops,
689+
respectively on chunks of X and chunks of Y.
690+
Strategies differs on which loop (outer or inner) is made to run
691+
in parallel with the Cython `prange` construct:
692+
- 'parallel_on_X' dispatches chunks of X uniformly on threads.
693+
Each thread then iterates on all the chunks of Y. This strategy is
694+
embarrassingly parallel and comes with no datastructures
695+
synchronisation.
696+
- 'parallel_on_Y' dispatches chunks of Y uniformly on threads.
697+
Each thread processes all the chunks of X in turn. This strategy is
698+
a sequence of embarrassingly parallel subtasks (the inner loop on Y
699+
chunks) with intermediate datastructures synchronisation at each
700+
iteration of the sequential outer loop on X chunks.
701+
- 'auto' relies on a simple heuristic to choose between
702+
'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough,
703+
'parallel_on_X' is usually the most efficient strategy.
704+
When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y'
705+
brings more opportunity for parallelism and is therefore more efficient
706+
despite the synchronization step at each iteration of the outer loop
707+
on chunks of `X`.
708+
- None (default) looks-up in scikit-learn configuration for
709+
`pairwise_dist_parallel_strategy`, and use 'auto' if it is not set.
710+
Returns
711+
-------
712+
probabilities : ndarray of shape (n_samples_X, n_classes)
713+
An array containing the class probabilities for each sample.
714+
"""
715+
if weights not in {"uniform", "distance"}:
716+
raise ValueError(
717+
"Only the 'uniform' or 'distance' weights options are supported"
718+
f" at this time. Got: {weights=}."
719+
)
720+
if X.dtype == Y.dtype == np.float64:
721+
return RadiusNeighborsClassMode64.compute(
722+
X=X,
723+
Y=Y,
724+
radius=radius,
725+
weights=weights,
726+
Y_labels=np.array(Y_labels, dtype=np.intp),
727+
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
728+
outlier_label=outlier_label,
729+
metric=metric,
730+
chunk_size=chunk_size,
731+
metric_kwargs=metric_kwargs,
732+
strategy=strategy,
733+
)
734+
735+
if X.dtype == Y.dtype == np.float32:
736+
return RadiusNeighborsClassMode32.compute(
737+
X=X,
738+
Y=Y,
739+
radius=radius,
740+
weights=weights,
741+
Y_labels=np.array(Y_labels, dtype=np.intp),
742+
unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp),
743+
outlier_label=outlier_label,
744+
metric=metric,
745+
chunk_size=chunk_size,
746+
metric_kwargs=metric_kwargs,
747+
strategy=strategy,
748+
)
749+
750+
raise ValueError(
751+
"Only float64 or float32 datasets pairs are supported at this time, "
752+
f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
753+
)

0 commit comments

Comments
 (0)