
Commit 6b4d0e7

Merge branch 'main' into submodulev2
2 parents 45320b4 + 1e8a5b8 commit 6b4d0e7

File tree

16 files changed, +52 -42 lines changed


build_tools/cirrus/update_tracking_issue.sh

Lines changed: 2 additions & 1 deletion
@@ -18,4 +18,5 @@ python maint_tools/update_tracking_issue.py \
     $CIRRUS_TASK_NAME \
     $CIRRUS_REPO_FULL_NAME \
     $LINK_TO_RUN \
-    --tests-passed $TEST_PASSED
+    --tests-passed $TEST_PASSED \
+    --auto-close false

doc/whats_new/v1.3.rst

Lines changed: 5 additions & 0 deletions
@@ -274,6 +274,11 @@ Changelog
 - |Fix| :func:`datasets.fetch_openml` returns improved data types when
   `as_frame=True` and `parser="liac-arff"`. :pr:`26386` by `Thomas Fan`_.
 
+- |Fix| Following the ARFF specs, only the marker `"?"` is now considered a missing
+  value when opening ARFF files fetched using :func:`datasets.fetch_openml` with the
+  pandas parser. The parameter `read_csv_kwargs` allows overriding this behaviour.
+  :pr:`26551` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 - |Enhancement| Allows to overwrite the parameters used to open the ARFF file using
   the parameter `read_csv_kwargs` in :func:`datasets.fetch_openml` when using the
   pandas parser.
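
For readers of the changelog entry above, a minimal sketch of what the change looks like from the user side (the dataset name below is purely illustrative, and fetching it requires network access):

from sklearn.datasets import fetch_openml

# With parser="pandas", only the ARFF marker "?" is now mapped to a missing value.
titanic = fetch_openml("titanic", version=1, as_frame=True, parser="pandas")

# If a particular dataset also encodes missing values as, say, "NA", extra markers
# can be supplied through read_csv_kwargs, which is forwarded to pandas.read_csv.
titanic_custom = fetch_openml(
    "titanic",
    version=1,
    as_frame=True,
    parser="pandas",
    read_csv_kwargs={"na_values": ["?", "NA"]},
)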

sklearn/cluster/_hdbscan/_tree.pyx

Lines changed: 12 additions & 12 deletions
@@ -36,6 +36,9 @@ import cython
 
 import numpy as np
 
+cdef extern from "numpy/arrayobject.h":
+    intp_t * PyArray_SHAPE(cnp.PyArrayObject *)
+
 cdef cnp.float64_t INFTY = np.inf
 cdef cnp.intp_t NOISE = -1
 
@@ -240,40 +243,37 @@ cdef dict _compute_stability(
         cnp.float64_t[::1] result, births
         cnp.intp_t[:] parents = condensed_tree['parent']
 
-        cnp.intp_t parent, cluster_size, result_index
+        cnp.intp_t parent, cluster_size, result_index, idx
         cnp.float64_t lambda_val
         CONDENSED_t condensed_node
-        cnp.float64_t[:, :] result_pre_dict
         cnp.intp_t largest_child = condensed_tree['child'].max()
         cnp.intp_t smallest_cluster = np.min(parents)
        cnp.intp_t num_clusters = np.max(parents) - smallest_cluster + 1
+        dict stability_dict = {}
 
     largest_child = max(largest_child, smallest_cluster)
     births = np.full(largest_child + 1, np.nan, dtype=np.float64)
 
-    births = np.full(largest_child + 1, np.nan, dtype=np.float64)
-    for condensed_node in condensed_tree:
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
         births[condensed_node.child] = condensed_node.value
 
     births[smallest_cluster] = 0.0
 
     result = np.zeros(num_clusters, dtype=np.float64)
-    for condensed_node in condensed_tree:
+    for idx in range(PyArray_SHAPE(<cnp.PyArrayObject*> condensed_tree)[0]):
+        condensed_node = condensed_tree[idx]
         parent = condensed_node.parent
         lambda_val = condensed_node.value
         cluster_size = condensed_node.cluster_size
 
         result_index = parent - smallest_cluster
         result[result_index] += (lambda_val - births[parent]) * cluster_size
 
-    result_pre_dict = np.vstack(
-        (
-            np.arange(smallest_cluster, np.max(parents) + 1),
-            result
-        )
-    ).T
+    for idx in range(num_clusters):
+        stability_dict[idx + smallest_cluster] = result[idx]
 
-    return dict(result_pre_dict)
+    return stability_dict
 
 
 cdef list bfs_from_cluster_tree(
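
The `_compute_stability` rewrite above does two things: it iterates over the structured condensed-tree array by explicit index (via `PyArray_SHAPE`) instead of relying on implicit record iteration, and it fills the `{cluster_label: stability}` dict directly instead of round-tripping through a stacked two-column array. A rough pure-Python/NumPy sketch of the same computation, on hypothetical toy data rather than the Cython types used above:

import numpy as np

# Hypothetical condensed-tree records: (parent, child, lambda value, cluster size).
condensed_tree = np.array(
    [(5, 0, 1.2, 1), (5, 1, 1.5, 1), (5, 6, 0.8, 2), (6, 2, 2.0, 1), (6, 3, 2.0, 1)],
    dtype=[("parent", np.intp), ("child", np.intp),
           ("value", np.float64), ("cluster_size", np.intp)],
)

parents = condensed_tree["parent"]
smallest_cluster = parents.min()
num_clusters = parents.max() - smallest_cluster + 1
largest_child = max(condensed_tree["child"].max(), smallest_cluster)

# Birth lambda of each node: the lambda at which it split off from its parent.
births = np.full(largest_child + 1, np.nan, dtype=np.float64)
for node in condensed_tree:
    births[node["child"]] = node["value"]
births[smallest_cluster] = 0.0

# Stability of a cluster: sum over members of (lambda_member - lambda_birth) * size.
result = np.zeros(num_clusters, dtype=np.float64)
for node in condensed_tree:
    result[node["parent"] - smallest_cluster] += (
        node["value"] - births[node["parent"]]
    ) * node["cluster_size"]

# Build the {cluster_label: stability} dict directly, as the new Cython code does.
stability = {idx + smallest_cluster: result[idx] for idx in range(num_clusters)}
print(stability)  # e.g. {5: 4.3, 6: 2.4}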

sklearn/datasets/_arff_parser.py

Lines changed: 1 addition & 0 deletions
@@ -387,6 +387,7 @@ def _pandas_arff_parser(
         "header": None,
         "index_col": False,  # always force pandas to not use the first column as index
         "na_values": ["?"],  # missing values are represented by `?`
+        "keep_default_na": False,  # only `?` is a missing value given the ARFF specs
         "comment": "%",  # skip line starting by `%` since they are comments
         "quotechar": '"',  # delimiter to use for quoted strings
         "skipinitialspace": True,  # skip spaces after delimiter to follow ARFF specs

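The practical effect of the added `keep_default_na: False` entry can be reproduced with plain pandas (toy data below, not the actual parser internals): by default, `pandas.read_csv` also converts strings such as "NA" and empty fields to NaN, whereas the ARFF format designates only "?" as the missing-value marker.

import io
import pandas as pd

data = "color,size\nred,NA\n?,small\n"

# pandas defaults: both "?" (via na_values) and "NA" (via the built-in NA list)
# end up as NaN.
default = pd.read_csv(io.StringIO(data), na_values=["?"])

# With keep_default_na=False, only the explicit ARFF marker "?" is treated as
# missing; the literal string "NA" is kept as an ordinary value.
arff_like = pd.read_csv(io.StringIO(data), na_values=["?"], keep_default_na=False)

print(default)
print(arff_like)
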
sklearn/decomposition/_dict_learning.py

Lines changed: 1 addition & 1 deletion
@@ -2412,7 +2412,7 @@ def fit(self, X, y=None):
             self.n_steps_ = i + 1
             self.n_iter_ = np.ceil(self.n_steps_ / n_steps_per_iter)
         else:
-            # TODO remove this branch in 1.3
+            # TODO remove this branch in 1.4
             n_iter = 1000 if self.n_iter == "deprecated" else self.n_iter
 
             batches = gen_batches(n_samples, self._batch_size)

sklearn/linear_model/_logistic.py

Lines changed: 3 additions & 0 deletions
@@ -506,6 +506,9 @@ def _logistic_regression_path(
                 w0 = np.concatenate([coef_.ravel(), intercept_])
             else:
                 w0 = coef_.ravel()
+            # n_iter_i is an array for each class. However, `target` is always encoded
+            # in {-1, 1}, so we only take the first element of n_iter_i.
+            n_iter_i = n_iter_i.item()
 
         elif solver in ["sag", "saga"]:
             if multi_class == "multinomial":
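
The `.item()` call added here, like the ones in the test files below, converts a one-element array to a Python scalar explicitly instead of implicitly; the likely motivation (an inference, not stated in the diff) is that NumPy 1.25 deprecates implicit conversion of arrays with ndim > 0 to scalars. A minimal NumPy sketch:

import numpy as np

n_iter_i = np.array([7])  # one-element array, as liblinear reports per class

# int(n_iter_i) still works but emits a DeprecationWarning on NumPy >= 1.25:
# "Conversion of an array with ndim > 0 to a scalar is deprecated".
n_iter_scalar = n_iter_i.item()  # explicit and warning-free -> 7, a plain Python int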

sklearn/linear_model/tests/test_sag.py

Lines changed: 2 additions & 2 deletions
@@ -95,7 +95,7 @@ def sag(
 
     for epoch in range(n_iter):
         for k in range(n_samples):
-            idx = int(rng.rand(1) * n_samples)
+            idx = int(rng.rand() * n_samples)
             # idx = k
             entry = X[idx]
             seen.add(idx)
@@ -167,7 +167,7 @@ def sag_sparse(
     for epoch in range(n_iter):
         for k in range(n_samples):
             # idx = k
-            idx = int(rng.rand(1) * n_samples)
+            idx = int(rng.rand() * n_samples)
             entry = X[idx]
             seen.add(idx)
 
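
The only difference between `rng.rand(1)` and `rng.rand()` is the return type, which matters for the same NumPy 1.25 deprecation noted above. A minimal sketch:

import numpy as np

rng = np.random.RandomState(77)
n_samples = 10

a = rng.rand(1)  # ndarray of shape (1,)
b = rng.rand()   # plain Python float

# int(a * n_samples) relies on the deprecated array-to-scalar conversion on
# NumPy >= 1.25, whereas int(b * n_samples) does not.
idx = int(b * n_samples)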

sklearn/metrics/tests/test_pairwise.py

Lines changed: 2 additions & 3 deletions
@@ -298,11 +298,10 @@ def test_pairwise_precomputed_non_negative():
 def callable_rbf_kernel(x, y, **kwds):
     # Callable version of pairwise.rbf_kernel.
     K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds)
-    return K
+    # unpack the output since this is a scalar packed in a 0-dim array
+    return K.item()
 
 
-# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
-@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize(
     "func, metric, kwds",
     [

sklearn/mixture/tests/test_bayesian_mixture.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ def test_log_wishart_norm():
                 ),
                 0,
             )
-        )
+        ).item()
     predected_norm = _log_wishart_norm(
         degrees_of_freedom, log_det_precisions_chol, n_features
     )

sklearn/neighbors/tests/test_neighbors.py

Lines changed: 0 additions & 2 deletions
@@ -2120,8 +2120,6 @@ def test_auto_algorithm(X, metric, metric_params, expected_algo):
     assert model._fit_method == expected_algo
 
 
-# TODO: Remove filterwarnings in 1.3 when wminkowski is removed
-@pytest.mark.filterwarnings("ignore:WMinkowskiDistance:FutureWarning:sklearn")
 @pytest.mark.parametrize(
     "metric", sorted(set(neighbors.VALID_METRICS["brute"]) - set(["precomputed"]))
 )
