3
3
4
4
import numpy as np
5
5
from numpy import linalg
6
- from scipy .sparse import csr_matrix , dok_matrix , issparse
6
+ from scipy .sparse import issparse
7
7
from scipy .spatial .distance import (
8
8
cdist ,
9
9
cityblock ,
62
62
assert_array_equal ,
63
63
ignore_warnings ,
64
64
)
65
- from sklearn .utils .fixes import parse_version , sp_version
65
+ from sklearn .utils .fixes import (
66
+ BSR_CONTAINERS ,
67
+ COO_CONTAINERS ,
68
+ CSC_CONTAINERS ,
69
+ CSR_CONTAINERS ,
70
+ DOK_CONTAINERS ,
71
+ parse_version ,
72
+ sp_version ,
73
+ )
66
74
from sklearn .utils .parallel import Parallel , delayed
67
75
68
76
69
- def test_pairwise_distances (global_dtype ):
77
+ def test_pairwise_distances_for_dense_data (global_dtype ):
70
78
# Test the pairwise_distances helper function.
71
79
rng = np .random .RandomState (0 )
72
80
@@ -144,10 +152,23 @@ def test_pairwise_distances(global_dtype):
144
152
assert S .shape [1 ] == Y .shape [0 ]
145
153
assert_allclose (S , S2 )
146
154
155
+
156
+ @pytest .mark .parametrize ("coo_container" , COO_CONTAINERS )
157
+ @pytest .mark .parametrize ("csc_container" , CSC_CONTAINERS )
158
+ @pytest .mark .parametrize ("bsr_container" , BSR_CONTAINERS )
159
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
160
+ def test_pairwise_distances_for_sparse_data (
161
+ coo_container , csc_container , bsr_container , csr_container , global_dtype
162
+ ):
163
+ # Test the pairwise_distances helper function.
164
+ rng = np .random .RandomState (0 )
165
+ X = rng .random_sample ((5 , 4 )).astype (global_dtype , copy = False )
166
+ Y = rng .random_sample ((2 , 4 )).astype (global_dtype , copy = False )
167
+
147
168
# Test with sparse X and Y,
148
169
# currently only supported for Euclidean, L1 and cosine.
149
- X_sparse = csr_matrix (X )
150
- Y_sparse = csr_matrix (Y )
170
+ X_sparse = csr_container (X )
171
+ Y_sparse = csr_container (Y )
151
172
152
173
S = pairwise_distances (X_sparse , Y_sparse , metric = "euclidean" )
153
174
S2 = euclidean_distances (X_sparse , Y_sparse )
@@ -159,8 +180,8 @@ def test_pairwise_distances(global_dtype):
159
180
assert_allclose (S , S2 )
160
181
assert S .dtype == S2 .dtype == global_dtype
161
182
162
- S = pairwise_distances (X_sparse , Y_sparse . tocsc ( ), metric = "manhattan" )
163
- S2 = manhattan_distances (X_sparse . tobsr ( ), Y_sparse . tocoo ( ))
183
+ S = pairwise_distances (X_sparse , csc_container ( Y ), metric = "manhattan" )
184
+ S2 = manhattan_distances (bsr_container ( X ), coo_container ( Y ))
164
185
assert_allclose (S , S2 )
165
186
if global_dtype == np .float64 :
166
187
assert S .dtype == S2 .dtype == global_dtype
@@ -368,7 +389,8 @@ def test_pairwise_callable_nonstrict_metric():
368
389
"metric" ,
369
390
["rbf" , "laplacian" , "sigmoid" , "polynomial" , "linear" , "chi2" , "additive_chi2" ],
370
391
)
371
- def test_pairwise_kernels (metric ):
392
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
393
+ def test_pairwise_kernels (metric , csr_container ):
372
394
# Test the pairwise_kernels helper function.
373
395
374
396
rng = np .random .RandomState (0 )
@@ -390,8 +412,8 @@ def test_pairwise_kernels(metric):
390
412
assert_allclose (K1 , K2 )
391
413
392
414
# Test with sparse X and Y
393
- X_sparse = csr_matrix (X )
394
- Y_sparse = csr_matrix (Y )
415
+ X_sparse = csr_container (X )
416
+ Y_sparse = csr_container (Y )
395
417
if metric in ["chi2" , "additive_chi2" ]:
396
418
# these don't support sparse matrices yet
397
419
return
@@ -432,7 +454,8 @@ def test_pairwise_kernels_filter_param():
432
454
433
455
434
456
@pytest .mark .parametrize ("metric, func" , PAIRED_DISTANCES .items ())
435
- def test_paired_distances (metric , func ):
457
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
458
+ def test_paired_distances (metric , func , csr_container ):
436
459
# Test the paired_distances helper function.
437
460
rng = np .random .RandomState (0 )
438
461
# Euclidean distance should be equivalent to calling the function.
@@ -443,7 +466,7 @@ def test_paired_distances(metric, func):
443
466
S = paired_distances (X , Y , metric = metric )
444
467
S2 = func (X , Y )
445
468
assert_allclose (S , S2 )
446
- S3 = func (csr_matrix (X ), csr_matrix (Y ))
469
+ S3 = func (csr_container (X ), csr_container (Y ))
447
470
assert_allclose (S , S3 )
448
471
if metric in PAIRWISE_DISTANCE_FUNCTIONS :
449
472
# Check the pairwise_distances implementation
@@ -473,13 +496,15 @@ def test_paired_distances_callable(global_dtype):
473
496
paired_distances (X , Y )
474
497
475
498
476
- def test_pairwise_distances_argmin_min (global_dtype ):
499
+ @pytest .mark .parametrize ("dok_container" , DOK_CONTAINERS )
500
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
501
+ def test_pairwise_distances_argmin_min (dok_container , csr_container , global_dtype ):
477
502
# Check pairwise minimum distances computation for any metric
478
503
X = np .asarray ([[0 ], [1 ]], dtype = global_dtype )
479
504
Y = np .asarray ([[- 2 ], [3 ]], dtype = global_dtype )
480
505
481
- Xsp = dok_matrix (X )
482
- Ysp = csr_matrix (Y , dtype = global_dtype )
506
+ Xsp = dok_container (X )
507
+ Ysp = csr_container (Y , dtype = global_dtype )
483
508
484
509
expected_idx = [0 , 1 ]
485
510
expected_vals = [2 , 2 ]
@@ -633,9 +658,19 @@ def test_pairwise_distances_chunked_reduce_none(global_dtype):
633
658
[
634
659
lambda D , start : list (D ),
635
660
lambda D , start : np .array (D ),
636
- lambda D , start : csr_matrix (D ),
637
661
lambda D , start : (list (D ), list (D )),
638
- lambda D , start : (dok_matrix (D ), np .array (D ), list (D )),
662
+ ]
663
+ + [
664
+ lambda D , start , scipy_csr_type = scipy_csr_type : scipy_csr_type (D )
665
+ for scipy_csr_type in CSR_CONTAINERS
666
+ ]
667
+ + [
668
+ lambda D , start , scipy_dok_type = scipy_dok_type : (
669
+ scipy_dok_type (D ),
670
+ np .array (D ),
671
+ list (D ),
672
+ )
673
+ for scipy_dok_type in DOK_CONTAINERS
639
674
],
640
675
)
641
676
def test_pairwise_distances_chunked_reduce_valid (good_reduce ):
@@ -759,10 +794,14 @@ def test_pairwise_distances_chunked(global_dtype):
759
794
760
795
761
796
@pytest .mark .parametrize (
762
- "x_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
797
+ "x_array_constr" ,
798
+ [np .array ] + CSR_CONTAINERS ,
799
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
763
800
)
764
801
@pytest .mark .parametrize (
765
- "y_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
802
+ "y_array_constr" ,
803
+ [np .array ] + CSR_CONTAINERS ,
804
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
766
805
)
767
806
def test_euclidean_distances_known_result (x_array_constr , y_array_constr ):
768
807
# Check the pairwise Euclidean distances computation on known result
@@ -773,7 +812,9 @@ def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
773
812
774
813
775
814
@pytest .mark .parametrize (
776
- "y_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
815
+ "y_array_constr" ,
816
+ [np .array ] + CSR_CONTAINERS ,
817
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
777
818
)
778
819
def test_euclidean_distances_with_norms (global_dtype , y_array_constr ):
779
820
# check that we still get the right answers with {X,Y}_norm_squared
@@ -842,10 +883,14 @@ def test_euclidean_distances_norm_shapes():
842
883
843
884
844
885
@pytest .mark .parametrize (
845
- "x_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
886
+ "x_array_constr" ,
887
+ [np .array ] + CSR_CONTAINERS ,
888
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
846
889
)
847
890
@pytest .mark .parametrize (
848
- "y_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
891
+ "y_array_constr" ,
892
+ [np .array ] + CSR_CONTAINERS ,
893
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
849
894
)
850
895
def test_euclidean_distances (global_dtype , x_array_constr , y_array_constr ):
851
896
# check that euclidean distances gives same result as scipy cdist
@@ -869,7 +914,9 @@ def test_euclidean_distances(global_dtype, x_array_constr, y_array_constr):
869
914
870
915
871
916
@pytest .mark .parametrize (
872
- "x_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
917
+ "x_array_constr" ,
918
+ [np .array ] + CSR_CONTAINERS ,
919
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
873
920
)
874
921
def test_euclidean_distances_sym (global_dtype , x_array_constr ):
875
922
# check that euclidean distances gives same result as scipy pdist
@@ -891,10 +938,14 @@ def test_euclidean_distances_sym(global_dtype, x_array_constr):
891
938
892
939
@pytest .mark .parametrize ("batch_size" , [None , 5 , 7 , 101 ])
893
940
@pytest .mark .parametrize (
894
- "x_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
941
+ "x_array_constr" ,
942
+ [np .array ] + CSR_CONTAINERS ,
943
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
895
944
)
896
945
@pytest .mark .parametrize (
897
- "y_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
946
+ "y_array_constr" ,
947
+ [np .array ] + CSR_CONTAINERS ,
948
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
898
949
)
899
950
def test_euclidean_distances_upcast (batch_size , x_array_constr , y_array_constr ):
900
951
# check batches handling when Y != X (#13910)
@@ -918,7 +969,9 @@ def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr):
918
969
919
970
@pytest .mark .parametrize ("batch_size" , [None , 5 , 7 , 101 ])
920
971
@pytest .mark .parametrize (
921
- "x_array_constr" , [np .array , csr_matrix ], ids = ["dense" , "sparse" ]
972
+ "x_array_constr" ,
973
+ [np .array ] + CSR_CONTAINERS ,
974
+ ids = ["dense" ] + [container .__name__ for container in CSR_CONTAINERS ],
922
975
)
923
976
def test_euclidean_distances_upcast_sym (batch_size , x_array_constr ):
924
977
# check batches handling when X is Y (#13910)
@@ -1267,10 +1320,11 @@ def test_kernel_symmetry(kernel):
1267
1320
cosine_similarity ,
1268
1321
),
1269
1322
)
1270
- def test_kernel_sparse (kernel ):
1323
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
1324
+ def test_kernel_sparse (kernel , csr_container ):
1271
1325
rng = np .random .RandomState (0 )
1272
1326
X = rng .random_sample ((5 , 4 ))
1273
- X_sparse = csr_matrix (X )
1327
+ X_sparse = csr_container (X )
1274
1328
K = kernel (X , X )
1275
1329
K2 = kernel (X_sparse , X_sparse )
1276
1330
assert_allclose (K , K2 )
@@ -1305,14 +1359,16 @@ def test_laplacian_kernel():
1305
1359
1306
1360
1307
1361
@pytest .mark .parametrize (
1308
- "metric, pairwise_func" , [("linear" , linear_kernel ), ("cosine" , cosine_similarity )]
1362
+ "metric, pairwise_func" ,
1363
+ [("linear" , linear_kernel ), ("cosine" , cosine_similarity )],
1309
1364
)
1310
- def test_pairwise_similarity_sparse_output (metric , pairwise_func ):
1365
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
1366
+ def test_pairwise_similarity_sparse_output (metric , pairwise_func , csr_container ):
1311
1367
rng = np .random .RandomState (0 )
1312
1368
X = rng .random_sample ((5 , 4 ))
1313
1369
Y = rng .random_sample ((3 , 4 ))
1314
- Xcsr = csr_matrix (X )
1315
- Ycsr = csr_matrix (Y )
1370
+ Xcsr = csr_container (X )
1371
+ Ycsr = csr_container (Y )
1316
1372
1317
1373
# should be sparse
1318
1374
K1 = pairwise_func (Xcsr , Ycsr , dense_output = False )
@@ -1328,14 +1384,15 @@ def test_pairwise_similarity_sparse_output(metric, pairwise_func):
1328
1384
assert_allclose (K1 .toarray (), K3 )
1329
1385
1330
1386
1331
- def test_cosine_similarity ():
1387
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
1388
+ def test_cosine_similarity (csr_container ):
1332
1389
# Test the cosine_similarity.
1333
1390
1334
1391
rng = np .random .RandomState (0 )
1335
1392
X = rng .random_sample ((5 , 4 ))
1336
1393
Y = rng .random_sample ((3 , 4 ))
1337
- Xcsr = csr_matrix (X )
1338
- Ycsr = csr_matrix (Y )
1394
+ Xcsr = csr_container (X )
1395
+ Ycsr = csr_container (Y )
1339
1396
1340
1397
for X_ , Y_ in ((X , None ), (X , Y ), (Xcsr , None ), (Xcsr , Ycsr )):
1341
1398
# Test that the cosine kernel is equal to a linear kernel when data
@@ -1399,13 +1456,14 @@ def test_check_invalid_dimensions():
1399
1456
check_pairwise_arrays (XA , XB )
1400
1457
1401
1458
1402
- def test_check_sparse_arrays ():
1459
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
1460
+ def test_check_sparse_arrays (csr_container ):
1403
1461
# Ensures that checks return valid sparse matrices.
1404
1462
rng = np .random .RandomState (0 )
1405
1463
XA = rng .random_sample ((5 , 4 ))
1406
- XA_sparse = csr_matrix (XA )
1464
+ XA_sparse = csr_container (XA )
1407
1465
XB = rng .random_sample ((5 , 4 ))
1408
- XB_sparse = csr_matrix (XB )
1466
+ XB_sparse = csr_container (XB )
1409
1467
XA_checked , XB_checked = check_pairwise_arrays (XA_sparse , XB_sparse )
1410
1468
# compare their difference because testing csr matrices for
1411
1469
# equality with '==' does not work as expected.
@@ -1550,10 +1608,11 @@ def test_numeric_pairwise_distances_datatypes(metric, global_dtype, y_is_x):
1550
1608
assert_allclose (dist , expected_dist )
1551
1609
1552
1610
1553
- def test_sparse_manhattan_readonly_dataset ():
1611
+ @pytest .mark .parametrize ("csr_container" , CSR_CONTAINERS )
1612
+ def test_sparse_manhattan_readonly_dataset (csr_container ):
1554
1613
# Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981
1555
- matrices1 = [csr_matrix (np .ones ((5 , 5 )))]
1556
- matrices2 = [csr_matrix (np .ones ((5 , 5 )))]
1614
+ matrices1 = [csr_container (np .ones ((5 , 5 )))]
1615
+ matrices2 = [csr_container (np .ones ((5 , 5 )))]
1557
1616
# Joblib memory maps datasets which makes them read-only.
1558
1617
# The following call was reported as failing in #7981, but this must pass.
1559
1618
Parallel (n_jobs = 2 , max_nbytes = 0 )(
0 commit comments