MAINT: Remove np.in1d and np.trapz usages (scikit-learn#27140)

mtsokol · lesteve · web-flow · commit cb15a82e6439 · 2023-09-06T17:06:31.000+02:00
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
diff --git a/examples/semi_supervised/plot_label_propagation_digits_active_learning.py b/examples/semi_supervised/plot_label_propagation_digits_active_learning.py
@@ -79,7 +79,7 @@
     # select up to 5 digit examples that the classifier is most uncertain about
     uncertainty_index = np.argsort(pred_entropies)[::-1]
     uncertainty_index = uncertainty_index[
-        np.in1d(uncertainty_index, unlabeled_indices)
+        np.isin(uncertainty_index, unlabeled_indices)
     ][:5]
 
     # keep track of indices that we get labels for
diff --git a/sklearn/datasets/_twenty_newsgroups.py b/sklearn/datasets/_twenty_newsgroups.py
@@ -319,7 +319,7 @@ def fetch_20newsgroups(
         # Sort the categories to have the ordering of the labels
         labels.sort()
         labels, categories = zip(*labels)
-        mask = np.in1d(data.target, labels)
+        mask = np.isin(data.target, labels)
         data.filenames = data.filenames[mask]
         data.target = data.target[mask]
         # searchsorted to have continuous labels
diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py
@@ -76,7 +76,7 @@ def _mask_edges_weights(mask, edges, weights=None):
     """Apply a mask to edges (weighted or not)"""
     inds = np.arange(mask.size)
     inds = inds[mask.ravel()]
-    ind_mask = np.logical_and(np.in1d(edges[0], inds), np.in1d(edges[1], inds))
+    ind_mask = np.logical_and(np.isin(edges[0], inds), np.isin(edges[1], inds))
     edges = edges[:, ind_mask]
     if weights is not None:
         weights = weights[ind_mask]
diff --git a/sklearn/metrics/_plot/tests/test_precision_recall_display.py b/sklearn/metrics/_plot/tests/test_precision_recall_display.py
@@ -16,6 +16,7 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import shuffle
+from sklearn.utils.fixes import trapezoid
 
 # TODO: Remove when https://github.com/numpy/numpy/issues/14397 is resolved
 pytestmark = pytest.mark.filterwarnings(
@@ -289,7 +290,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth
     # we should obtain the statistics of the "cancer" class
     avg_prec_limit = 0.65
     assert display.average_precision < avg_prec_limit
-    assert -np.trapz(display.precision, display.recall) < avg_prec_limit
+    assert -trapezoid(display.precision, display.recall) < avg_prec_limit
 
     # otherwise we should obtain the statistics of the "not cancer" class
     if constructor_name == "from_estimator":
@@ -308,7 +309,7 @@ def test_plot_precision_recall_pos_label(pyplot, constructor_name, response_meth
         )
     avg_prec_limit = 0.95
     assert display.average_precision > avg_prec_limit
-    assert -np.trapz(display.precision, display.recall) > avg_prec_limit
+    assert -trapezoid(display.precision, display.recall) > avg_prec_limit
 
 
 @pytest.mark.parametrize("constructor_name", ["from_estimator", "from_predictions"])
diff --git a/sklearn/metrics/_plot/tests/test_roc_curve_display.py b/sklearn/metrics/_plot/tests/test_roc_curve_display.py
@@ -11,6 +11,7 @@
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import StandardScaler
 from sklearn.utils import shuffle
+from sklearn.utils.fixes import trapezoid
 
 
 @pytest.fixture(scope="module")
@@ -293,7 +294,7 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
     roc_auc_limit = 0.95679
 
     assert display.roc_auc == pytest.approx(roc_auc_limit)
-    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
+    assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
 
     if constructor_name == "from_estimator":
         display = RocCurveDisplay.from_estimator(
@@ -311,4 +312,4 @@ def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
         )
 
     assert display.roc_auc == pytest.approx(roc_auc_limit)
-    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
+    assert trapezoid(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
@@ -38,6 +38,7 @@
 from ..utils._encode import _encode, _unique
 from ..utils._param_validation import Interval, StrOptions, validate_params
 from ..utils.extmath import stable_cumsum
+from ..utils.fixes import trapezoid
 from ..utils.multiclass import type_of_target
 from ..utils.sparsefuncs import count_nonzero
 from ..utils.validation import _check_pos_label_consistency, _check_sample_weight
@@ -104,9 +105,9 @@ def auc(x, y):
         else:
             raise ValueError("x is neither increasing nor decreasing : {}.".format(x))
 
-    area = direction * np.trapz(y, x)
+    area = direction * trapezoid(y, x)
     if isinstance(area, np.memmap):
-        # Reductions such as .sum used internally in np.trapz do not return a
+        # Reductions such as .sum used internally in trapezoid do not return a
         # scalar by default for numpy.memmap instances contrary to
         # regular numpy.ndarray instances.
         area = area.dtype.type(area)
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -1973,8 +1973,8 @@ def _iter_indices(self, X, y, groups):
             # these are the indices of classes in the partition
             # invert them into data indices
 
-            train = np.flatnonzero(np.in1d(group_indices, group_train))
-            test = np.flatnonzero(np.in1d(group_indices, group_test))
+            train = np.flatnonzero(np.isin(group_indices, group_train))
+            test = np.flatnonzero(np.isin(group_indices, group_test))
 
             yield train, test
 
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -1418,7 +1418,7 @@ def test_grid_search_correct_score_results():
         expected_keys = ("mean_test_score", "rank_test_score") + tuple(
             "split%d_test_score" % cv_i for cv_i in range(n_splits)
         )
-        assert all(np.in1d(expected_keys, result_keys))
+        assert all(np.isin(expected_keys, result_keys))
 
         cv = StratifiedKFold(n_splits=n_splits)
         n_splits = grid_search.n_splits_
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
@@ -987,8 +987,8 @@ def test_group_shuffle_split():
             # First test: no train group is in the test set and vice versa
             l_train_unique = np.unique(l[train])
             l_test_unique = np.unique(l[test])
-            assert not np.any(np.in1d(l[train], l_test_unique))
-            assert not np.any(np.in1d(l[test], l_train_unique))
+            assert not np.any(np.isin(l[train], l_test_unique))
+            assert not np.any(np.isin(l[test], l_train_unique))
 
             # Second test: train and test add up to all the data
             assert l[train].size + l[test].size == l.size
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
@@ -467,7 +467,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None):
         classes = self.classes_
 
         unique_y = np.unique(y)
-        unique_y_in_classes = np.in1d(unique_y, classes)
+        unique_y_in_classes = np.isin(unique_y, classes)
 
         if not np.all(unique_y_in_classes):
             raise ValueError(
diff --git a/sklearn/preprocessing/_label.py b/sklearn/preprocessing/_label.py
@@ -553,7 +553,7 @@ def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False)
         y = column_or_1d(y)
 
         # pick out the known labels from y
-        y_in_classes = np.in1d(y, classes)
+        y_in_classes = np.isin(y, classes)
         y_seen = y[y_in_classes]
         indices = np.searchsorted(sorted_class, y_seen)
         indptr = np.hstack((0, np.cumsum(y_in_classes)))
diff --git a/sklearn/tests/test_isotonic.py b/sklearn/tests/test_isotonic.py
@@ -595,7 +595,7 @@ def test_isotonic_thresholds(increasing):
     # the data is already strictly monotonic which is not the case with
     # this random data)
     assert X_thresholds.shape[0] < X.shape[0]
-    assert np.in1d(X_thresholds, X).all()
+    assert np.isin(X_thresholds, X).all()
 
     # Output thresholds lie in the range of the training set:
     assert y_thresholds.max() <= y.max()
diff --git a/sklearn/utils/_encode.py b/sklearn/utils/_encode.py
@@ -296,7 +296,7 @@ def is_valid(value):
         diff = np.setdiff1d(unique_values, known_values, assume_unique=True)
         if return_mask:
             if diff.size:
-                valid_mask = np.in1d(values, known_values)
+                valid_mask = np.isin(values, known_values)
             else:
                 valid_mask = np.ones(len(values), dtype=bool)
 
diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py
@@ -57,7 +57,7 @@ def compute_class_weight(class_weight, *, classes, y):
         # Find the weight of each class as present in y.
         le = LabelEncoder()
         y_ind = le.fit_transform(y)
-        if not all(np.in1d(classes, le.classes_)):
+        if not all(np.isin(classes, le.classes_)):
             raise ValueError("classes should have valid labels that are in y")
 
         recip_freq = len(y) / (len(le.classes_) * np.bincount(y_ind).astype(np.float64))
@@ -195,7 +195,7 @@ def compute_sample_weight(class_weight, y, *, indices=None):
 
         if classes_missing:
             # Make missing classes' weight zero
-            weight_k[np.in1d(y_full, list(classes_missing))] = 0.0
+            weight_k[np.isin(y_full, list(classes_missing))] = 0.0
 
         expanded_class_weight.append(weight_k)
 
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -200,3 +200,10 @@ def _contents(data_module):
     from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
 else:
     from numpy import ComplexWarning, VisibleDeprecationWarning  # type: ignore  # noqa
+
+
+# TODO: Remove when SciPy 1.6 is the minimum supported version
+try:
+    from scipy.integrate import trapezoid  # type: ignore  # noqa
+except ImportError:
+    from scipy.integrate import trapz as trapezoid  # type: ignore  # noqa

Original file line number	Diff line number	Diff line change
`@@ -1418,7 +1418,7 @@ def test_grid_search_correct_score_results():`
`1418`	`1418`	`expected_keys = ("mean_test_score", "rank_test_score") + tuple(`
`1419`	`1419`	`"split%d_test_score" % cv_i for cv_i in range(n_splits)`
`1420`	`1420`	`)`
`1421`		`- assert all(np.in1d(expected_keys, result_keys))`
	`1421`	`+ assert all(np.isin(expected_keys, result_keys))`
`1422`	`1422`
`1423`	`1423`	`cv = StratifiedKFold(n_splits=n_splits)`
`1424`	`1424`	`n_splits = grid_search.n_splits_`