Skip to content

Commit aae5837

Browse files
authored
Fix low level crash in BisectingKMeans.predict on rescaled data (scikit-learn#27167)
1 parent 50dde09 commit aae5837

File tree

4 files changed

+54
-1
lines changed

4 files changed

+54
-1
lines changed

doc/whats_new/v1.3.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,13 @@ Changes impacting all modules
1818
Changelog
1919
---------
2020

21+
:mod:`sklearn.cluster`
22+
......................
23+
24+
- |Fix| :class:`cluster.BisectingKMeans` could crash when predicting on data
25+
with a different scale than the data used to fit the model.
26+
:pr:`27167` by `Olivier Grisel`_.
27+
2128
:mod:`sklearn.impute`
2229
.....................
2330

sklearn/cluster/_k_means_elkan.pyx

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,14 @@ def elkan_iter_chunked_dense(
259259
int n_features = X.shape[1]
260260
int n_clusters = centers_new.shape[0]
261261

262+
if n_samples == 0:
263+
# An empty array was passed, do nothing and return early (before
264+
# attempting to compute n_chunks). This can typically happen when
265+
# calling the prediction function of a bisecting k-means model with a
266+
# large fraction of outliers.
267+
return
268+
269+
cdef:
262270
# hard-coded number of samples per chunk. Splitting in chunks is
263271
# necessary to get parallelism. Chunk size chosen to be same as lloyd's
264272
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
@@ -494,6 +502,14 @@ def elkan_iter_chunked_sparse(
494502
int n_features = X.shape[1]
495503
int n_clusters = centers_new.shape[0]
496504

505+
if n_samples == 0:
506+
# An empty array was passed, do nothing and return early (before
507+
# attempting to compute n_chunks). This can typically happen when
508+
# calling the prediction function of a bisecting k-means model with a
509+
# large fraction of outliers.
510+
return
511+
512+
cdef:
497513
floating[::1] X_data = X.data
498514
int[::1] X_indices = X.indices
499515
int[::1] X_indptr = X.indptr

sklearn/cluster/_k_means_lloyd.pyx

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,14 @@ def lloyd_iter_chunked_dense(
7878
int n_features = X.shape[1]
7979
int n_clusters = centers_old.shape[0]
8080

81+
if n_samples == 0:
82+
# An empty array was passed, do nothing and return early (before
83+
# attempting to compute n_chunks). This can typically happen when
84+
# calling the prediction function of a bisecting k-means model with a
85+
# large fraction of outliers.
86+
return
87+
88+
cdef:
8189
# hard-coded number of samples per chunk. Appeared to be close to
8290
# optimal in all situations.
8391
int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
@@ -263,12 +271,19 @@ def lloyd_iter_chunked_sparse(
263271
the algorithm. This is useful especially when calling predict on a
264272
fitted model.
265273
"""
266-
# print(X.indices.dtype)
267274
cdef:
268275
int n_samples = X.shape[0]
269276
int n_features = X.shape[1]
270277
int n_clusters = centers_old.shape[0]
271278

279+
if n_samples == 0:
280+
# An empty array was passed, do nothing and return early (before
281+
# attempting to compute n_chunks). This can typically happen when
282+
# calling the prediction function of a bisecting k-means model with a
283+
# large fraction of outliers.
284+
return
285+
286+
cdef:
272287
# Choose same as for dense. Does not have the same impact since with
273288
# sparse data the pairwise distances matrix is not precomputed.
274289
# However, splitting in chunks is necessary to get parallelism.

sklearn/cluster/tests/test_bisect_k_means.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,18 @@ def test_float32_float64_equivalence(csr_container):
133133

134134
assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
135135
assert_array_equal(km32.labels_, km64.labels_)
136+
137+
138+
@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
139+
def test_no_crash_on_empty_bisections(algorithm):
140+
# Non-regression test for:
141+
# https://github.com/scikit-learn/scikit-learn/issues/27081
142+
rng = np.random.RandomState(0)
143+
X_train = rng.rand(3000, 10)
144+
bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train)
145+
146+
# predict on scaled data to trigger pathological case
147+
# where the inner mask leads to empty bisections.
148+
X_test = 50 * rng.rand(100, 10)
149+
labels = bkm.predict(X_test) # should not crash with idiv by 0
150+
assert np.isin(np.unique(labels), np.arange(10)).all()

0 commit comments

Comments
 (0)