|
23 | 23 | RadiusNeighbors32,
|
24 | 24 | RadiusNeighbors64,
|
25 | 25 | )
|
| 26 | +from ._radius_neighbors_classmode import ( |
| 27 | + RadiusNeighborsClassMode32, |
| 28 | + RadiusNeighborsClassMode64, |
| 29 | +) |
26 | 30 |
|
27 | 31 |
|
28 | 32 | def sqeuclidean_row_norms(X, num_threads):
|
@@ -597,3 +601,153 @@ def compute(
|
597 | 601 | "Only float64 or float32 datasets pairs are supported at this time, "
|
598 | 602 | f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}."
|
599 | 603 | )
|
| 604 | + |
| 605 | + |
| 606 | +class RadiusNeighborsClassMode(BaseDistancesReductionDispatcher): |
| 607 | + """Compute radius-based class modes of row vectors of X using the |
| 608 | + those of Y. |
| 609 | +
|
| 610 | + For each row-vector X[i] of the queries X, find all the indices j of |
| 611 | + row-vectors in Y such that: |
| 612 | +
|
| 613 | + dist(X[i], Y[j]) <= radius |
| 614 | +
|
| 615 | + RadiusNeighborsClassMode is typically used to perform bruteforce |
| 616 | + radius neighbors queries when the weighted mode of the labels for |
| 617 | + the nearest neighbors within the specified radius are required, |
| 618 | + such as in `predict` methods. |
| 619 | +
|
| 620 | + This class is not meant to be instantiated, one should only use |
| 621 | + its :meth:`compute` classmethod which handles allocation and |
| 622 | + deallocation consistently. |
| 623 | + """ |
| 624 | + |
| 625 | + @classmethod |
| 626 | + def valid_metrics(cls) -> List[str]: |
| 627 | + excluded = { |
| 628 | + # Euclidean is technically usable for RadiusNeighborsClassMode |
| 629 | + # but it would not be competitive. |
| 630 | + # TODO: implement Euclidean specialization using GEMM. |
| 631 | + "euclidean", |
| 632 | + "sqeuclidean", |
| 633 | + } |
| 634 | + return sorted(set(BaseDistancesReductionDispatcher.valid_metrics()) - excluded) |
| 635 | + |
| 636 | + @classmethod |
| 637 | + def compute( |
| 638 | + cls, |
| 639 | + X, |
| 640 | + Y, |
| 641 | + radius, |
| 642 | + weights, |
| 643 | + Y_labels, |
| 644 | + unique_Y_labels, |
| 645 | + outlier_label, |
| 646 | + metric="euclidean", |
| 647 | + chunk_size=None, |
| 648 | + metric_kwargs=None, |
| 649 | + strategy=None, |
| 650 | + ): |
| 651 | + """Return the results of the reduction for the given arguments. |
| 652 | + Parameters |
| 653 | + ---------- |
| 654 | + X : ndarray of shape (n_samples_X, n_features) |
| 655 | + The input array to be labelled. |
| 656 | + Y : ndarray of shape (n_samples_Y, n_features) |
| 657 | + The input array whose class membership is provided through |
| 658 | + the `Y_labels` parameter. |
| 659 | + radius : float |
| 660 | + The radius defining the neighborhood. |
| 661 | + weights : ndarray |
| 662 | + The weights applied to the `Y_labels` when computing the |
| 663 | + weighted mode of the labels. |
| 664 | + Y_labels : ndarray |
| 665 | + An array containing the index of the class membership of the |
| 666 | + associated samples in `Y`. This is used in labeling `X`. |
| 667 | + unique_Y_labels : ndarray |
| 668 | + An array containing all unique class labels. |
| 669 | + outlier_label : int, default=None |
| 670 | + Label for outlier samples (samples with no neighbors in given |
| 671 | + radius). In the default case when the value is None if any |
| 672 | + outlier is detected, a ValueError will be raised. The outlier |
| 673 | + label should be selected from among the unique 'Y' labels. If |
| 674 | + it is specified with a different value a warning will be raised |
| 675 | + and all class probabilities of outliers will be assigned to be 0. |
| 676 | + metric : str, default='euclidean' |
| 677 | + The distance metric to use. For a list of available metrics, see |
| 678 | + the documentation of :class:`~sklearn.metrics.DistanceMetric`. |
| 679 | + Currently does not support `'precomputed'`. |
| 680 | + chunk_size : int, default=None, |
| 681 | + The number of vectors per chunk. If None (default) looks-up in |
| 682 | + scikit-learn configuration for `pairwise_dist_chunk_size`, |
| 683 | + and use 256 if it is not set. |
| 684 | + metric_kwargs : dict, default=None |
| 685 | + Keyword arguments to pass to specified metric function. |
| 686 | + strategy : str, {'auto', 'parallel_on_X', 'parallel_on_Y'}, default=None |
| 687 | + The chunking strategy defining which dataset parallelization are made on. |
| 688 | + For both strategies the computations happens with two nested loops, |
| 689 | + respectively on chunks of X and chunks of Y. |
| 690 | + Strategies differs on which loop (outer or inner) is made to run |
| 691 | + in parallel with the Cython `prange` construct: |
| 692 | + - 'parallel_on_X' dispatches chunks of X uniformly on threads. |
| 693 | + Each thread then iterates on all the chunks of Y. This strategy is |
| 694 | + embarrassingly parallel and comes with no datastructures |
| 695 | + synchronisation. |
| 696 | + - 'parallel_on_Y' dispatches chunks of Y uniformly on threads. |
| 697 | + Each thread processes all the chunks of X in turn. This strategy is |
| 698 | + a sequence of embarrassingly parallel subtasks (the inner loop on Y |
| 699 | + chunks) with intermediate datastructures synchronisation at each |
| 700 | + iteration of the sequential outer loop on X chunks. |
| 701 | + - 'auto' relies on a simple heuristic to choose between |
| 702 | + 'parallel_on_X' and 'parallel_on_Y': when `X.shape[0]` is large enough, |
| 703 | + 'parallel_on_X' is usually the most efficient strategy. |
| 704 | + When `X.shape[0]` is small but `Y.shape[0]` is large, 'parallel_on_Y' |
| 705 | + brings more opportunity for parallelism and is therefore more efficient |
| 706 | + despite the synchronization step at each iteration of the outer loop |
| 707 | + on chunks of `X`. |
| 708 | + - None (default) looks-up in scikit-learn configuration for |
| 709 | + `pairwise_dist_parallel_strategy`, and use 'auto' if it is not set. |
| 710 | + Returns |
| 711 | + ------- |
| 712 | + probabilities : ndarray of shape (n_samples_X, n_classes) |
| 713 | + An array containing the class probabilities for each sample. |
| 714 | + """ |
| 715 | + if weights not in {"uniform", "distance"}: |
| 716 | + raise ValueError( |
| 717 | + "Only the 'uniform' or 'distance' weights options are supported" |
| 718 | + f" at this time. Got: {weights=}." |
| 719 | + ) |
| 720 | + if X.dtype == Y.dtype == np.float64: |
| 721 | + return RadiusNeighborsClassMode64.compute( |
| 722 | + X=X, |
| 723 | + Y=Y, |
| 724 | + radius=radius, |
| 725 | + weights=weights, |
| 726 | + Y_labels=np.array(Y_labels, dtype=np.intp), |
| 727 | + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), |
| 728 | + outlier_label=outlier_label, |
| 729 | + metric=metric, |
| 730 | + chunk_size=chunk_size, |
| 731 | + metric_kwargs=metric_kwargs, |
| 732 | + strategy=strategy, |
| 733 | + ) |
| 734 | + |
| 735 | + if X.dtype == Y.dtype == np.float32: |
| 736 | + return RadiusNeighborsClassMode32.compute( |
| 737 | + X=X, |
| 738 | + Y=Y, |
| 739 | + radius=radius, |
| 740 | + weights=weights, |
| 741 | + Y_labels=np.array(Y_labels, dtype=np.intp), |
| 742 | + unique_Y_labels=np.array(unique_Y_labels, dtype=np.intp), |
| 743 | + outlier_label=outlier_label, |
| 744 | + metric=metric, |
| 745 | + chunk_size=chunk_size, |
| 746 | + metric_kwargs=metric_kwargs, |
| 747 | + strategy=strategy, |
| 748 | + ) |
| 749 | + |
| 750 | + raise ValueError( |
| 751 | + "Only float64 or float32 datasets pairs are supported at this time, " |
| 752 | + f"got: X.dtype={X.dtype} and Y.dtype={Y.dtype}." |
| 753 | + ) |
0 commit comments