Skip to content

Commit df60c75

Browse files
TST Extend tests for scipy.sparse.*array in test_pairwise.py (scikit-learn#27288)
1 parent bdf66d0 commit df60c75

File tree

1 file changed

+100
-41
lines changed

1 file changed

+100
-41
lines changed

sklearn/metrics/tests/test_pairwise.py

Lines changed: 100 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import numpy as np
55
from numpy import linalg
6-
from scipy.sparse import csr_matrix, dok_matrix, issparse
6+
from scipy.sparse import issparse
77
from scipy.spatial.distance import (
88
cdist,
99
cityblock,
@@ -62,11 +62,19 @@
6262
assert_array_equal,
6363
ignore_warnings,
6464
)
65-
from sklearn.utils.fixes import parse_version, sp_version
65+
from sklearn.utils.fixes import (
66+
BSR_CONTAINERS,
67+
COO_CONTAINERS,
68+
CSC_CONTAINERS,
69+
CSR_CONTAINERS,
70+
DOK_CONTAINERS,
71+
parse_version,
72+
sp_version,
73+
)
6674
from sklearn.utils.parallel import Parallel, delayed
6775

6876

69-
def test_pairwise_distances(global_dtype):
77+
def test_pairwise_distances_for_dense_data(global_dtype):
7078
# Test the pairwise_distances helper function.
7179
rng = np.random.RandomState(0)
7280

@@ -144,10 +152,23 @@ def test_pairwise_distances(global_dtype):
144152
assert S.shape[1] == Y.shape[0]
145153
assert_allclose(S, S2)
146154

155+
156+
@pytest.mark.parametrize("coo_container", COO_CONTAINERS)
157+
@pytest.mark.parametrize("csc_container", CSC_CONTAINERS)
158+
@pytest.mark.parametrize("bsr_container", BSR_CONTAINERS)
159+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
160+
def test_pairwise_distances_for_sparse_data(
161+
coo_container, csc_container, bsr_container, csr_container, global_dtype
162+
):
163+
# Test the pairwise_distances helper function on sparse inputs.
164+
rng = np.random.RandomState(0)
165+
X = rng.random_sample((5, 4)).astype(global_dtype, copy=False)
166+
Y = rng.random_sample((2, 4)).astype(global_dtype, copy=False)
167+
147168
# Test with sparse X and Y,
148169
# currently only supported for Euclidean, L1 and cosine.
149-
X_sparse = csr_matrix(X)
150-
Y_sparse = csr_matrix(Y)
170+
X_sparse = csr_container(X)
171+
Y_sparse = csr_container(Y)
151172

152173
S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
153174
S2 = euclidean_distances(X_sparse, Y_sparse)
@@ -159,8 +180,8 @@ def test_pairwise_distances(global_dtype):
159180
assert_allclose(S, S2)
160181
assert S.dtype == S2.dtype == global_dtype
161182

162-
S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
163-
S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
183+
S = pairwise_distances(X_sparse, csc_container(Y), metric="manhattan")
184+
S2 = manhattan_distances(bsr_container(X), coo_container(Y))
164185
assert_allclose(S, S2)
165186
if global_dtype == np.float64:
166187
assert S.dtype == S2.dtype == global_dtype
@@ -368,7 +389,8 @@ def test_pairwise_callable_nonstrict_metric():
368389
"metric",
369390
["rbf", "laplacian", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"],
370391
)
371-
def test_pairwise_kernels(metric):
392+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
393+
def test_pairwise_kernels(metric, csr_container):
372394
# Test the pairwise_kernels helper function.
373395

374396
rng = np.random.RandomState(0)
@@ -390,8 +412,8 @@ def test_pairwise_kernels(metric):
390412
assert_allclose(K1, K2)
391413

392414
# Test with sparse X and Y
393-
X_sparse = csr_matrix(X)
394-
Y_sparse = csr_matrix(Y)
415+
X_sparse = csr_container(X)
416+
Y_sparse = csr_container(Y)
395417
if metric in ["chi2", "additive_chi2"]:
396418
# these don't support sparse matrices yet
397419
return
@@ -432,7 +454,8 @@ def test_pairwise_kernels_filter_param():
432454

433455

434456
@pytest.mark.parametrize("metric, func", PAIRED_DISTANCES.items())
435-
def test_paired_distances(metric, func):
457+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
458+
def test_paired_distances(metric, func, csr_container):
436459
# Test the paired_distances helper function.
437460
rng = np.random.RandomState(0)
438461
# Euclidean distance should be equivalent to calling the function.
@@ -443,7 +466,7 @@ def test_paired_distances(metric, func):
443466
S = paired_distances(X, Y, metric=metric)
444467
S2 = func(X, Y)
445468
assert_allclose(S, S2)
446-
S3 = func(csr_matrix(X), csr_matrix(Y))
469+
S3 = func(csr_container(X), csr_container(Y))
447470
assert_allclose(S, S3)
448471
if metric in PAIRWISE_DISTANCE_FUNCTIONS:
449472
# Check the pairwise_distances implementation
@@ -473,13 +496,15 @@ def test_paired_distances_callable(global_dtype):
473496
paired_distances(X, Y)
474497

475498

476-
def test_pairwise_distances_argmin_min(global_dtype):
499+
@pytest.mark.parametrize("dok_container", DOK_CONTAINERS)
500+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
501+
def test_pairwise_distances_argmin_min(dok_container, csr_container, global_dtype):
477502
# Check pairwise minimum distances computation for any metric
478503
X = np.asarray([[0], [1]], dtype=global_dtype)
479504
Y = np.asarray([[-2], [3]], dtype=global_dtype)
480505

481-
Xsp = dok_matrix(X)
482-
Ysp = csr_matrix(Y, dtype=global_dtype)
506+
Xsp = dok_container(X)
507+
Ysp = csr_container(Y, dtype=global_dtype)
483508

484509
expected_idx = [0, 1]
485510
expected_vals = [2, 2]
@@ -633,9 +658,19 @@ def test_pairwise_distances_chunked_reduce_none(global_dtype):
633658
[
634659
lambda D, start: list(D),
635660
lambda D, start: np.array(D),
636-
lambda D, start: csr_matrix(D),
637661
lambda D, start: (list(D), list(D)),
638-
lambda D, start: (dok_matrix(D), np.array(D), list(D)),
662+
]
663+
+ [
664+
lambda D, start, scipy_csr_type=scipy_csr_type: scipy_csr_type(D)
665+
for scipy_csr_type in CSR_CONTAINERS
666+
]
667+
+ [
668+
lambda D, start, scipy_dok_type=scipy_dok_type: (
669+
scipy_dok_type(D),
670+
np.array(D),
671+
list(D),
672+
)
673+
for scipy_dok_type in DOK_CONTAINERS
639674
],
640675
)
641676
def test_pairwise_distances_chunked_reduce_valid(good_reduce):
@@ -759,10 +794,14 @@ def test_pairwise_distances_chunked(global_dtype):
759794

760795

761796
@pytest.mark.parametrize(
762-
"x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
797+
"x_array_constr",
798+
[np.array] + CSR_CONTAINERS,
799+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
763800
)
764801
@pytest.mark.parametrize(
765-
"y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
802+
"y_array_constr",
803+
[np.array] + CSR_CONTAINERS,
804+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
766805
)
767806
def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
768807
# Check the pairwise Euclidean distances computation on known result
@@ -773,7 +812,9 @@ def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
773812

774813

775814
@pytest.mark.parametrize(
776-
"y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
815+
"y_array_constr",
816+
[np.array] + CSR_CONTAINERS,
817+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
777818
)
778819
def test_euclidean_distances_with_norms(global_dtype, y_array_constr):
779820
# check that we still get the right answers with {X,Y}_norm_squared
@@ -842,10 +883,14 @@ def test_euclidean_distances_norm_shapes():
842883

843884

844885
@pytest.mark.parametrize(
845-
"x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
886+
"x_array_constr",
887+
[np.array] + CSR_CONTAINERS,
888+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
846889
)
847890
@pytest.mark.parametrize(
848-
"y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
891+
"y_array_constr",
892+
[np.array] + CSR_CONTAINERS,
893+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
849894
)
850895
def test_euclidean_distances(global_dtype, x_array_constr, y_array_constr):
851896
# check that euclidean distances gives same result as scipy cdist
@@ -869,7 +914,9 @@ def test_euclidean_distances(global_dtype, x_array_constr, y_array_constr):
869914

870915

871916
@pytest.mark.parametrize(
872-
"x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
917+
"x_array_constr",
918+
[np.array] + CSR_CONTAINERS,
919+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
873920
)
874921
def test_euclidean_distances_sym(global_dtype, x_array_constr):
875922
# check that euclidean distances gives same result as scipy pdist
@@ -891,10 +938,14 @@ def test_euclidean_distances_sym(global_dtype, x_array_constr):
891938

892939
@pytest.mark.parametrize("batch_size", [None, 5, 7, 101])
893940
@pytest.mark.parametrize(
894-
"x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
941+
"x_array_constr",
942+
[np.array] + CSR_CONTAINERS,
943+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
895944
)
896945
@pytest.mark.parametrize(
897-
"y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
946+
"y_array_constr",
947+
[np.array] + CSR_CONTAINERS,
948+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
898949
)
899950
def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr):
900951
# check batches handling when Y != X (#13910)
@@ -918,7 +969,9 @@ def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr):
918969

919970
@pytest.mark.parametrize("batch_size", [None, 5, 7, 101])
920971
@pytest.mark.parametrize(
921-
"x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
972+
"x_array_constr",
973+
[np.array] + CSR_CONTAINERS,
974+
ids=["dense"] + [container.__name__ for container in CSR_CONTAINERS],
922975
)
923976
def test_euclidean_distances_upcast_sym(batch_size, x_array_constr):
924977
# check batches handling when X is Y (#13910)
@@ -1267,10 +1320,11 @@ def test_kernel_symmetry(kernel):
12671320
cosine_similarity,
12681321
),
12691322
)
1270-
def test_kernel_sparse(kernel):
1323+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
1324+
def test_kernel_sparse(kernel, csr_container):
12711325
rng = np.random.RandomState(0)
12721326
X = rng.random_sample((5, 4))
1273-
X_sparse = csr_matrix(X)
1327+
X_sparse = csr_container(X)
12741328
K = kernel(X, X)
12751329
K2 = kernel(X_sparse, X_sparse)
12761330
assert_allclose(K, K2)
@@ -1305,14 +1359,16 @@ def test_laplacian_kernel():
13051359

13061360

13071361
@pytest.mark.parametrize(
1308-
"metric, pairwise_func", [("linear", linear_kernel), ("cosine", cosine_similarity)]
1362+
"metric, pairwise_func",
1363+
[("linear", linear_kernel), ("cosine", cosine_similarity)],
13091364
)
1310-
def test_pairwise_similarity_sparse_output(metric, pairwise_func):
1365+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
1366+
def test_pairwise_similarity_sparse_output(metric, pairwise_func, csr_container):
13111367
rng = np.random.RandomState(0)
13121368
X = rng.random_sample((5, 4))
13131369
Y = rng.random_sample((3, 4))
1314-
Xcsr = csr_matrix(X)
1315-
Ycsr = csr_matrix(Y)
1370+
Xcsr = csr_container(X)
1371+
Ycsr = csr_container(Y)
13161372

13171373
# should be sparse
13181374
K1 = pairwise_func(Xcsr, Ycsr, dense_output=False)
@@ -1328,14 +1384,15 @@ def test_pairwise_similarity_sparse_output(metric, pairwise_func):
13281384
assert_allclose(K1.toarray(), K3)
13291385

13301386

1331-
def test_cosine_similarity():
1387+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
1388+
def test_cosine_similarity(csr_container):
13321389
# Test the cosine_similarity.
13331390

13341391
rng = np.random.RandomState(0)
13351392
X = rng.random_sample((5, 4))
13361393
Y = rng.random_sample((3, 4))
1337-
Xcsr = csr_matrix(X)
1338-
Ycsr = csr_matrix(Y)
1394+
Xcsr = csr_container(X)
1395+
Ycsr = csr_container(Y)
13391396

13401397
for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
13411398
# Test that the cosine kernel is equal to a linear kernel when data
@@ -1399,13 +1456,14 @@ def test_check_invalid_dimensions():
13991456
check_pairwise_arrays(XA, XB)
14001457

14011458

1402-
def test_check_sparse_arrays():
1459+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
1460+
def test_check_sparse_arrays(csr_container):
14031461
# Ensures that checks return valid sparse matrices.
14041462
rng = np.random.RandomState(0)
14051463
XA = rng.random_sample((5, 4))
1406-
XA_sparse = csr_matrix(XA)
1464+
XA_sparse = csr_container(XA)
14071465
XB = rng.random_sample((5, 4))
1408-
XB_sparse = csr_matrix(XB)
1466+
XB_sparse = csr_container(XB)
14091467
XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)
14101468
# compare their difference because testing csr matrices for
14111469
# equality with '==' does not work as expected.
@@ -1550,10 +1608,11 @@ def test_numeric_pairwise_distances_datatypes(metric, global_dtype, y_is_x):
15501608
assert_allclose(dist, expected_dist)
15511609

15521610

1553-
def test_sparse_manhattan_readonly_dataset():
1611+
@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
1612+
def test_sparse_manhattan_readonly_dataset(csr_container):
15541613
# Non-regression test for: https://github.com/scikit-learn/scikit-learn/issues/7981
1555-
matrices1 = [csr_matrix(np.ones((5, 5)))]
1556-
matrices2 = [csr_matrix(np.ones((5, 5)))]
1614+
matrices1 = [csr_container(np.ones((5, 5)))]
1615+
matrices2 = [csr_container(np.ones((5, 5)))]
15571616
# Joblib memory maps datasets which makes them read-only.
15581617
# The following call was reported as failing in #7981, but it must pass.
15591618
Parallel(n_jobs=2, max_nbytes=0)(

0 commit comments

Comments
 (0)