
Commit 6b4d0e7

Merge branch 'main' into submodulev2
2 parents 45320b4 + 1e8a5b8 commit 6b4d0e7

File tree

16 files changed, +52 -42 lines changed


build_tools/cirrus/update_tracking_issue.sh

Lines changed: 2 additions & 1 deletion
@@ -18,4 +18,5 @@ python maint_tools/update_tracking_issue.py \
     $CIRRUS_TASK_NAME \
     $CIRRUS_REPO_FULL_NAME \
     $LINK_TO_RUN \
-    --tests-passed $TEST_PASSED
+    --tests-passed $TEST_PASSED \
+    --auto-close false

doc/whats_new/v1.3.rst

Lines changed: 5 additions & 0 deletions
@@ -274,6 +274,11 @@ Changelog
 - |Fix| :func:`datasets.fetch_openml` returns improved data types when
   `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.
 
+- |Fix| Following the ARFF specs, only the marker `"?"` is now considered a missing
+  value when opening ARFF files fetched using :func:`datasets.fetch_openml` with the
+  pandas parser. The parameter `read_csv_kwargs` allows overriding this behaviour.
+  :pr:`26551` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using
   the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the
   pandas parser.
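
For readers of the changelog entry above, a minimal sketch of what the change looks like from the user side (the dataset name below is purely illustrative, and fetching it requires network access):

from sklearn.datasets import fetch_openml

# With parser="pandas", only the ARFF marker "?" is now mapped to a missing value.
titanic = fetch_openml("titanic", version=1, as_frame=True, parser="pandas")

# If a particular dataset also encodes missing values as, say, "NA", extra markers
# can be supplied through read_csv_kwargs, which is forwarded to pandas.read_csv.
titanic_custom = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    parser="pandas",
    read_csv_kwargs={"na_values": ["?", "NA"]},
)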

sklearn/cluster/_hdbscan/_tree.pyx

Lines changed: 12 additions & 12 deletions
@@ -36,6 +36,9 @@ import cython
 
 import numpy as np
 
+cdef extern from "numpy/arrayobject.h":
+    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
+
 cdef cnp.float64_t INFTY = np.inf
 cdef cnp.intp_t NOISE = -1
 
@@ -240,40 +243,37 @@ cdef dict _compute_stability(
         cnp.float64_t[::1] result, births
         cnp.intp_t[:] parents = condensed_tree['parent']
 
-        cnp.intp_t parent, cluster_size, result_index
+        cnp.intp_t parent, cluster_size, result_index, idx
         cnp.float64_t lambda_val
         CONDENSED_t condensed_node
-        cnp.float64_t[:, :] result_pre_dict
         cnp.intp_t largest_child = condensed_tree['child'].max()
         cnp.intp_t smallest_cluster = np.min(parents)
        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
+        dict stability_dict = {}
 
     largest_child = max(largest_child, smallest_cluster)
     births = np.full(largest_child + 1, np.nan, dtype=np.float64)
 
-    births = np.full(largest_child + 1, np.nan, dtype=np.float64)
-    for condensed_node in condensed_tree:
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
         births[condensed_node.child] = condensed_node.value
 
     births[smallest_cluster] = 0.0
 
     result = np.zeros(num_clusters, dtype=np.float64)
-    for condensed_node in condensed_tree:
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
         parent = condensed_node.parent
         lambda_val = condensed_node.value
         cluster_size = condensed_node.cluster_size
 
         result_index = parent - smallest_cluster
         result[result_index] += (lambda_val - births[parent]) * cluster_size
 
-    result_pre_dict = np.vstack(
-        (
-            np.arange(smallest_cluster, np.max(parents) + 1),
-            result
-        )
-    ).T
+    for idx in range(num_clusters):
+        stability_dict[idx + smallest_cluster] = result[idx]
 
-    return dict(result_pre_dict)
+    return stability_dict
 
 
 cdef list bfs_from_cluster_tree(
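
The `_compute_stability` rewrite above does two things: it iterates over the structured condensed-tree array by explicit index (via `PyArray_SHAPE`) instead of relying on implicit record iteration, and it fills the `{cluster_label: stability}` dict directly instead of round-tripping through a stacked two-column array. A rough pure-Python/NumPy sketch of the same computation, on hypothetical toy data rather than the Cython types used above:

import numpy as np

# Hypothetical condensed-tree records: (parent, child, lambda value, cluster size).
condensed_tree = np.array(
    [(5, 0, 1.2, 1), (5, 1, 1.5, 1), (5, 6, 0.8, 2), (6, 2, 2.0, 1), (6, 3, 2.0, 1)],
    dtype=[("parent", np.intp), ("child", np.intp),
           ("value", np.float64), ("cluster_size", np.intp)],
)

parents = condensed_tree["parent"]
smallest_cluster = parents.min()
num_clusters = parents.max() - smallest_cluster + 1
largest_child = max(condensed_tree["child"].max(), smallest_cluster)

# Birth lambda of each node: the lambda at which it split off from its parent.
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
for node in condensed_tree:
    births[node["child"]] = node["value"]
births[smallest_cluster] = 0.0

# Stability of a cluster: sum over members of (lambda_member - lambda_birth) * size.
result = np.zeros(num_clusters, dtype=np.float64)
for node in condensed_tree:
    result[node["parent"] - smallest_cluster] += (
        node["value"] - births[node["parent"]]
    ) * node["cluster_size"]

# Build the {cluster_label: stability} dict directly, as the new Cython code does.
stability = {idx + smallest_cluster: result[idx] for idx in range(num_clusters)}
print(stability)  # e.g. {5: 4.3, 6: 2.4}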

sklearn/datasets/_arff_parser.py

Lines changed: 1 addition & 0 deletions
@@ -387,6 +387,7 @@ def _pandas_arff_parser(
         "header": None,
         "index_col": False,  # always force pandas to not use the first column as index
         "na_values": ["?"],  # missing values are represented by `?`
+        "keep_default_na": False,  # only `?` is a missing value given the ARFF specs
         "comment": "%",  # skip line starting by `%` since they are comments
         "quotechar": '"',  # delimiter to use for quoted strings
         "skipinitialspace": True,  # skip spaces after delimiter to follow ARFF specs

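The practical effect of the added `keep_default_na: False` entry can be reproduced with plain pandas (toy data below, not the actual parser internals): by default, `pandas.read_csv` also converts strings such as "NA" and empty fields to NaN, whereas the ARFF format designates only "?" as the missing-value marker.

import io
import pandas as pd

data = "color,size\nred,NA\n?,small\n"

# pandas defaults: both "?" (via na_values) and "NA" (via the built-in NA list)
# end up as NaN.
default = pd.read_csv(io.StringIO(data), na_values=["?"])

# With keep_default_na=False, only the explicit ARFF marker "?" is treated as
# missing; the literal string "NA" is kept as an ordinary value.
arff_like = pd.read_csv(io.StringIO(data), na_values=["?"], keep_default_na=False)

print(default)
print(arff_like)
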
sklearn/decomposition/_dict_learning.py

Lines changed: 1 addition & 1 deletion
@@ -2412,7 +2412,7 @@ def fit(self, X, y=None):
             self.n_steps_ = i + 1
             self.n_iter_ = np.ceil(self.n_steps_ / n_steps_per_iter)
         else:
-            # TODO remove this branch in 1.3
+            # TODO remove this branch in 1.4
             n_iter = 1000 if self.n_iter == "deprecated" else self.n_iter
 
             batches = gen_batches(n_samples, self._batch_size)

sklearn/linear_model/_logistic.py

Lines changed: 3 additions & 0 deletions
@@ -506,6 +506,9 @@ def _logistic_regression_path(
                 w0 = np.concatenate([coef_.ravel(), intercept_])
             else:
                 w0 = coef_.ravel()
+            # n_iter_i is an array for each class. However, `target` is always encoded
+            # in {-1, 1}, so we only take the first element of n_iter_i.
+            n_iter_i = n_iter_i.item()
 
         elif solver in ["sag", "saga"]:
             if multi_class == "multinomial":
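
The `.item()` call added here, like the ones in the test files below, converts a one-element array to a Python scalar explicitly instead of implicitly; the likely motivation (an inference, not stated in the diff) is that NumPy 1.25 deprecates implicit conversion of arrays with ndim > 0 to scalars. A minimal NumPy sketch:

import numpy as np

n_iter_i = np.array([7])  # one-element array, as liblinear reports per class

# int(n_iter_i) still works but emits a DeprecationWarning on NumPy >= 1.25:
# "Conversion of an array with ndim > 0 to a scalar is deprecated".
n_iter_scalar = n_iter_i.item()  # explicit and warning-free -> 7, a plain Python int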

sklearn/linear_model/tests/test_sag.py

Lines changed: 2 additions & 2 deletions
@@ -95,7 +95,7 @@ def sag(
 
     for epoch in range(n_iter):
         for k in range(n_samples):
-            idx = int(rng.rand(1) * n_samples)
+            idx = int(rng.rand() * n_samples)
             # idx = k
             entry = X[idx]
             seen.add(idx)
@@ -167,7 +167,7 @@ def sag_sparse(
     for epoch in range(n_iter):
         for k in range(n_samples):
             # idx = k
-            idx = int(rng.rand(1) * n_samples)
+            idx = int(rng.rand() * n_samples)
             entry = X[idx]
             seen.add(idx)
 
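
The only difference between `rng.rand(1)` and `rng.rand()` is the return type, which matters for the same NumPy 1.25 deprecation noted above. A minimal sketch:

import numpy as np

rng = np.random.RandomState(77)
n_samples = 10

a = rng.rand(1)  # ndarray of shape (1,)
b = rng.rand()   # plain Python float

# int(a * n_samples) relies on the deprecated array-to-scalar conversion on
# NumPy >= 1.25, whereas int(b * n_samples) does not.
idx = int(b * n_samples)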

sklearn/metrics/tests/test_pairwise.py

Lines changed: 2 additions & 3 deletions
@@ -298,11 +298,10 @@ def test_pairwise_precomputed_non_negative():
 def callable_rbf_kernel(x, y, **kwds):
     # Callable version of pairwise.rbf_kernel.
     K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds)
-    return K
+    # unpack the output since this is a scalar packed in a 0-dim array
+    return K.item()
 
 
-# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
-@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize(
     "func, metric, kwds",
     [

sklearn/mixture/tests/test_bayesian_mixture.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ def test_log_wishart_norm():
                 ),
                 0,
             )
-        )
+        ).item()
     predected_norm = _log_wishart_norm(
         degrees_of_freedom, log_det_precisions_chol, n_features
     )

sklearn/neighbors/tests/test_neighbors.py

Lines changed: 0 additions & 2 deletions
@@ -2120,8 +2120,6 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo):
     assert model._fit_method == expected_algo
 
 
-# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
-@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize(
     "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"]))
 )
