@@ -31,7 +31,7 @@
     create_memmap_backed_data,
 )
 from sklearn.utils.extmath import row_norms
-from sklearn.utils.fixes import threadpool_limits
+from sklearn.utils.fixes import CSR_CONTAINERS, threadpool_limits

 # TODO(1.4): Remove
 msg = (
@@ -53,12 +53,16 @@
 X, true_labels = make_blobs(
     n_samples=n_samples, centers=centers, cluster_std=1.0, random_state=42
 )
-X_csr = sp.csr_matrix(X)
+X_as_any_csr = [container(X) for container in CSR_CONTAINERS]
+data_containers = [np.array] + CSR_CONTAINERS
+data_containers_ids = (
+    ["dense", "sparse_matrix", "sparse_array"]
+    if len(X_as_any_csr) == 2
+    else ["dense", "sparse_matrix"]
+)


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 @pytest.mark.parametrize("dtype", [np.float32, np.float64])
 def test_kmeans_results(array_constr, algo, dtype):
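The module-level fixtures above lean on `CSR_CONTAINERS` from `sklearn.utils.fixes`, which lists the CSR container types available in the installed SciPy: always `csr_matrix`, plus `csr_array` on versions that ship the sparse-array API. A minimal sketch of that shim, assuming a SciPy 1.8+ gate (the real definition lives in sklearn/utils/fixes.py and may use different version plumbing):

    # Hypothetical sketch of the CSR_CONTAINERS compatibility shim.
    import scipy
    import scipy.sparse
    from packaging.version import parse as parse_version

    CSR_CONTAINERS = [scipy.sparse.csr_matrix]
    if parse_version(scipy.__version__) >= parse_version("1.8"):
        # csr_array (the sparse-array API) first appeared in SciPy 1.8.
        CSR_CONTAINERS.append(scipy.sparse.csr_array)

Note the two parametrization styles this introduces: `X_as_any_csr` holds pre-built CSR copies of the module-level `X` (tests receive data), while `data_containers` holds constructors (tests build their own arrays). With both containers available, `data_containers_ids` resolves to ["dense", "sparse_matrix", "sparse_array"], so each parametrized test below runs once per container type.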
@@ -82,9 +86,7 @@ def test_kmeans_results(array_constr, algo, dtype):
     assert kmeans.n_iter_ == expected_n_iter


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_kmeans_relocated_clusters(array_constr, algo):
     # check that empty clusters are relocated as expected
@@ -115,9 +117,7 @@ def test_kmeans_relocated_clusters(array_constr, algo):
     assert_allclose(kmeans.cluster_centers_, expected_centers)


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 def test_relocate_empty_clusters(array_constr):
     # test for the _relocate_empty_clusters_(dense/sparse) helpers

@@ -160,9 +160,7 @@ def test_relocate_empty_clusters(array_constr):


 @pytest.mark.parametrize("distribution", ["normal", "blobs"])
-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize("tol", [1e-2, 1e-8, 1e-100, 0])
 def test_kmeans_elkan_results(distribution, array_constr, tol, global_random_seed):
     # Check that results are identical between lloyd and elkan algorithms
@@ -238,7 +236,8 @@ def test_predict_sample_weight_deprecation_warning(Estimator):
         kmeans.predict(X, sample_weight=sample_weight)


-def test_minibatch_update_consistency(global_random_seed):
+@pytest.mark.parametrize("X_csr", X_as_any_csr)
+def test_minibatch_update_consistency(X_csr, global_random_seed):
     # Check that dense and sparse minibatch update give the same results
     rng = np.random.RandomState(global_random_seed)

@@ -315,19 +314,23 @@ def _check_fitted_model(km):
     assert km.inertia_ > 0.0


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
 @pytest.mark.parametrize(
     "init",
     ["random", "k-means++", centers, lambda X, k, random_state: centers],
     ids=["random", "k-means++", "ndarray", "callable"],
 )
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_all_init(Estimator, data, init):
+def test_all_init(Estimator, input_data, init):
     # Check KMeans and MiniBatchKMeans with all possible init.
     n_init = 10 if isinstance(init, str) else 1
     km = Estimator(
         init=init, n_clusters=n_clusters, random_state=42, n_init=n_init
-    ).fit(data)
+    ).fit(input_data)
     _check_fitted_model(km)


@@ -485,8 +488,12 @@ def test_minibatch_sensible_reassign(global_random_seed):
     assert km.cluster_centers_.any(axis=1).sum() > 10


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
-def test_minibatch_reassign(data, global_random_seed):
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
+def test_minibatch_reassign(input_data, global_random_seed):
     # Check the reassignment part of the minibatch step with very high or very
     # low reassignment ratio.
     perfect_centers = np.empty((n_clusters, n_features))
@@ -499,10 +506,10 @@ def test_minibatch_reassign(data, global_random_seed):
     # Give a perfect initialization, but a large reassignment_ratio, as a
     # result many centers should be reassigned and the model should no longer
     # be good
-    score_before = -_labels_inertia(data, sample_weight, perfect_centers, 1)[1]
+    score_before = -_labels_inertia(input_data, sample_weight, perfect_centers, 1)[1]

     _mini_batch_step(
-        data,
+        input_data,
         sample_weight,
         perfect_centers,
         centers_new,
@@ -512,14 +519,14 @@ def test_minibatch_reassign(data, global_random_seed):
         reassignment_ratio=1,
     )

-    score_after = -_labels_inertia(data, sample_weight, centers_new, 1)[1]
+    score_after = -_labels_inertia(input_data, sample_weight, centers_new, 1)[1]

     assert score_before > score_after

     # Give a perfect initialization, with a small reassignment_ratio,
     # no center should be reassigned.
     _mini_batch_step(
-        data,
+        input_data,
         sample_weight,
         perfect_centers,
         centers_new,
@@ -641,9 +648,7 @@ def test_score_max_iter(Estimator, global_random_seed):
     assert s2 > s1


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize(
     "Estimator, algorithm",
     [(KMeans, "lloyd"), (KMeans, "elkan"), (MiniBatchKMeans, None)],
@@ -684,8 +689,9 @@ def test_kmeans_predict(
     assert_array_equal(pred, np.arange(10))


+@pytest.mark.parametrize("X_csr", X_as_any_csr)
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_dense_sparse(Estimator, global_random_seed):
+def test_dense_sparse(Estimator, X_csr, global_random_seed):
     # Check that the results are the same for dense and sparse input.
     sample_weight = np.random.RandomState(global_random_seed).random_sample(
         (n_samples,)
@@ -703,11 +709,12 @@ def test_dense_sparse(Estimator, global_random_seed):
     assert_allclose(km_dense.cluster_centers_, km_sparse.cluster_centers_)


+@pytest.mark.parametrize("X_csr", X_as_any_csr)
 @pytest.mark.parametrize(
     "init", ["random", "k-means++", centers], ids=["random", "k-means++", "ndarray"]
 )
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_predict_dense_sparse(Estimator, init):
+def test_predict_dense_sparse(Estimator, init, X_csr):
     # check that models trained on sparse input also works for dense input at
     # predict time and vice versa.
     n_init = 10 if isinstance(init, str) else 1
@@ -720,9 +727,7 @@ def test_predict_dense_sparse(Estimator, init):
     assert_array_equal(km.predict(X_csr), km.labels_)


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize("dtype", [np.int32, np.int64])
 @pytest.mark.parametrize("init", ["k-means++", "ndarray"])
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
@@ -810,9 +815,13 @@ def test_k_means_function(global_random_seed):
     assert inertia > 0.0


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_float_precision(Estimator, data, global_random_seed):
+def test_float_precision(Estimator, input_data, global_random_seed):
     # Check that the results are the same for single and double precision.
     km = Estimator(n_init=1, random_state=global_random_seed)

@@ -822,7 +831,7 @@ def test_float_precision(Estimator, data, global_random_seed):
     labels = {}

     for dtype in [np.float64, np.float32]:
-        X = data.astype(dtype, copy=False)
+        X = input_data.astype(dtype, copy=False)
         km.fit(X)

         inertia[dtype] = km.inertia_
@@ -863,12 +872,18 @@ def test_centers_not_mutated(Estimator, dtype):
     assert not np.may_share_memory(km.cluster_centers_, centers_new_type)


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
-def test_kmeans_init_fitted_centers(data):
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
+def test_kmeans_init_fitted_centers(input_data):
     # Check that starting fitting from a local optimum shouldn't change the
     # solution
-    km1 = KMeans(n_clusters=n_clusters).fit(data)
-    km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, n_init=1).fit(data)
+    km1 = KMeans(n_clusters=n_clusters).fit(input_data)
+    km2 = KMeans(n_clusters=n_clusters, init=km1.cluster_centers_, n_init=1).fit(
+        input_data
+    )

     assert_allclose(km1.cluster_centers_, km2.cluster_centers_)

@@ -920,31 +935,39 @@ def test_weighted_vs_repeated(global_random_seed):
     )


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_unit_weights_vs_no_weights(Estimator, data, global_random_seed):
+def test_unit_weights_vs_no_weights(Estimator, input_data, global_random_seed):
     # Check that not passing sample weights should be equivalent to passing
     # sample weights all equal to one.
     sample_weight = np.ones(n_samples)

     km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1)
-    km_none = clone(km).fit(data, sample_weight=None)
-    km_ones = clone(km).fit(data, sample_weight=sample_weight)
+    km_none = clone(km).fit(input_data, sample_weight=None)
+    km_ones = clone(km).fit(input_data, sample_weight=sample_weight)

     assert_array_equal(km_none.labels_, km_ones.labels_)
     assert_allclose(km_none.cluster_centers_, km_ones.cluster_centers_)


-@pytest.mark.parametrize("data", [X, X_csr], ids=["dense", "sparse"])
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+    ids=data_containers_ids,
+)
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
-def test_scaled_weights(Estimator, data, global_random_seed):
+def test_scaled_weights(Estimator, input_data, global_random_seed):
     # Check that scaling all sample weights by a common factor
     # shouldn't change the result
     sample_weight = np.random.RandomState(global_random_seed).uniform(size=n_samples)

     km = Estimator(n_clusters=n_clusters, random_state=global_random_seed, n_init=1)
-    km_orig = clone(km).fit(data, sample_weight=sample_weight)
-    km_scaled = clone(km).fit(data, sample_weight=0.5 * sample_weight)
+    km_orig = clone(km).fit(input_data, sample_weight=sample_weight)
+    km_scaled = clone(km).fit(input_data, sample_weight=0.5 * sample_weight)

     assert_array_equal(km_orig.labels_, km_scaled.labels_)
     assert_allclose(km_orig.cluster_centers_, km_scaled.cluster_centers_)
@@ -957,9 +980,7 @@ def test_kmeans_elkan_iter_attribute():
     assert km.n_iter_ == 1


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 def test_kmeans_empty_cluster_relocated(array_constr):
     # check that empty clusters are correctly relocated when using sample
     # weights (#13486)
@@ -1005,9 +1026,7 @@ def test_warning_elkan_1_cluster():
         KMeans(n_clusters=1, algorithm="elkan").fit(X)


-@pytest.mark.parametrize(
-    "array_constr", [np.array, sp.csr_matrix], ids=["dense", "sparse"]
-)
+@pytest.mark.parametrize("array_constr", data_containers, ids=data_containers_ids)
 @pytest.mark.parametrize("algo", ["lloyd", "elkan"])
 def test_k_means_1_iteration(array_constr, algo, global_random_seed):
     # check the results after a single iteration (E-step M-step E-step) by
@@ -1196,11 +1215,14 @@ def test_kmeans_plusplus_wrong_params(param, match):
         kmeans_plusplus(X, n_clusters, **param)


-@pytest.mark.parametrize("data", [X, X_csr])
+@pytest.mark.parametrize(
+    "input_data",
+    [X] + X_as_any_csr,
+)
 @pytest.mark.parametrize("dtype", [np.float64, np.float32])
-def test_kmeans_plusplus_output(data, dtype, global_random_seed):
+def test_kmeans_plusplus_output(input_data, dtype, global_random_seed):
     # Check for the correct number of seeds and all positive values
-    data = data.astype(dtype)
+    data = input_data.astype(dtype)
     centers, indices = kmeans_plusplus(
         data, n_clusters, random_state=global_random_seed
     )
@@ -1289,15 +1311,15 @@ def test_feature_names_out(Klass, method):
     assert_array_equal([f"{class_name}{i}" for i in range(n_clusters)], names_out)


-@pytest.mark.parametrize("is_sparse", [True, False])
-def test_predict_does_not_change_cluster_centers(is_sparse):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS + [None])
+def test_predict_does_not_change_cluster_centers(csr_container):
     """Check that predict does not change cluster centers.

     Non-regression test for gh-24253.
     """
     X, _ = make_blobs(n_samples=200, n_features=10, centers=10, random_state=0)
-    if is_sparse:
-        X = sp.csr_matrix(X)
+    if csr_container is not None:
+        X = csr_container(X)

     kmeans = KMeans()
     y_pred1 = kmeans.fit_predict(X)
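Taken together, these changes run every dense/sparse test once per available CSR container, with pytest ids that attribute a failure to a specific container type. The pattern is easy to reuse outside scikit-learn; a self-contained sketch under assumed names (toy test, not part of this patch):

    # Standalone illustration of container-parametrized tests: `container`
    # builds the input, and sparse inputs are densified before comparison.
    import numpy as np
    import pytest
    import scipy.sparse

    CONTAINERS = [np.asarray, scipy.sparse.csr_matrix]
    IDS = ["dense", "sparse_matrix"]

    @pytest.mark.parametrize("container", CONTAINERS, ids=IDS)
    def test_identity_roundtrip(container):
        X = container(np.eye(3))
        # toarray() exists on sparse containers; ndarrays pass through as-is.
        dense = X.toarray() if scipy.sparse.issparse(X) else X
        np.testing.assert_array_equal(dense, np.eye(3))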