Fix low level crash in BisectingKMeans.predict on rescaled data (scikit-learn#27167)

ogrisel · web-flow · commit aae5837ecf70 · 2023-08-28T14:54:18.000+05:00
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -18,6 +18,13 @@ Changes impacting all modules
 Changelog
 ---------
 
+:mod:`sklearn.cluster`
+......................
+
+- |Fix| :class:`cluster.BisectingKMeans` could crash when predicting on data
+  with a different scale than the data used to fit the model.
+  :pr:`27167` by `Olivier Grisel`_.
+
 :mod:`sklearn.impute`
 .....................
 
diff --git a/sklearn/cluster/_k_means_elkan.pyx b/sklearn/cluster/_k_means_elkan.pyx
@@ -259,6 +259,14 @@ def elkan_iter_chunked_dense(
         int n_features = X.shape[1]
         int n_clusters = centers_new.shape[0]
 
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
         # hard-coded number of samples per chunk. Splitting in chunks is
         # necessary to get parallelism. Chunk size chosen to be same as lloyd's
         int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
@@ -494,6 +502,14 @@ def elkan_iter_chunked_sparse(
         int n_features = X.shape[1]
         int n_clusters = centers_new.shape[0]
 
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
         floating[::1] X_data = X.data
         int[::1] X_indices = X.indices
         int[::1] X_indptr = X.indptr
diff --git a/sklearn/cluster/_k_means_lloyd.pyx b/sklearn/cluster/_k_means_lloyd.pyx
@@ -78,6 +78,14 @@ def lloyd_iter_chunked_dense(
         int n_features = X.shape[1]
         int n_clusters = centers_old.shape[0]
 
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
         # hard-coded number of samples per chunk. Appeared to be close to
         # optimal in all situations.
         int n_samples_chunk = CHUNK_SIZE if n_samples > CHUNK_SIZE else n_samples
@@ -263,12 +271,19 @@ def lloyd_iter_chunked_sparse(
           the algorithm. This is useful especially when calling predict on a
           fitted model.
     """
-    # print(X.indices.dtype)
     cdef:
         int n_samples = X.shape[0]
         int n_features = X.shape[1]
         int n_clusters = centers_old.shape[0]
 
+    if n_samples == 0:
+        # An empty array was passed, do nothing and return early (before
+        # attempting to compute n_chunks). This can typically happen when
+        # calling the prediction function of a bisecting k-means model with a
+        # large fraction of outliers.
+        return
+
+    cdef:
         # Choose same as for dense. Does not have the same impact since with
         # sparse data the pairwise distances matrix is not precomputed.
         # However, splitting in chunks is necessary to get parallelism.
diff --git a/sklearn/cluster/tests/test_bisect_k_means.py b/sklearn/cluster/tests/test_bisect_k_means.py
@@ -133,3 +133,18 @@ def test_float32_float64_equivalence(csr_container):
 
     assert_allclose(km32.cluster_centers_, km64.cluster_centers_)
     assert_array_equal(km32.labels_, km64.labels_)
+
+
+@pytest.mark.parametrize("algorithm", ("lloyd", "elkan"))
+def test_no_crash_on_empty_bisections(algorithm):
+    # Non-regression test for:
+    # https://github.com/scikit-learn/scikit-learn/issues/27081
+    rng = np.random.RandomState(0)
+    X_train = rng.rand(3000, 10)
+    bkm = BisectingKMeans(n_clusters=10, algorithm=algorithm).fit(X_train)
+
+    # predict on scaled data to trigger pathological case
+    # where the inner mask leads to empty bisections.
+    X_test = 50 * rng.rand(100, 10)
+    labels = bkm.predict(X_test)  # should not crash with idiv by 0
+    assert np.isin(np.unique(labels), np.arange(10)).all()