Commit 462a6d3

TST Extend tests for scipy.sparse.*array in sklearn/decomposition/tests/test_online_lda.py (scikit-learn#27144)
1 parent 57bc768 commit 462a6d3

File tree

1 file changed: +56 −35 lines changed

sklearn/decomposition/tests/test_online_lda.py

Lines changed: 56 additions & 35 deletions
@@ -5,7 +5,6 @@
 import pytest
 from numpy.testing import assert_array_equal
 from scipy.linalg import block_diag
-from scipy.sparse import csr_matrix
 from scipy.special import psi
 
 from sklearn.decomposition import LatentDirichletAllocation
@@ -20,23 +19,25 @@
     assert_array_almost_equal,
     if_safe_multiprocessing_with_blas,
 )
+from sklearn.utils.fixes import CSR_CONTAINERS
 
 
-def _build_sparse_mtx():
+def _build_sparse_array(csr_container):
     # Create 3 topics and each topic has 3 distinct words.
     # (Each word only belongs to a single topic.)
     n_components = 3
     block = np.full((3, 3), n_components, dtype=int)
     blocks = [block] * n_components
     X = block_diag(*blocks)
-    X = csr_matrix(X)
+    X = csr_container(X)
     return (n_components, X)
 
 
-def test_lda_default_prior_params():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_default_prior_params(csr_container):
     # default prior parameter should be `1 / topics`
     # and verbose params should not affect result
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     prior = 1.0 / n_components
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
@@ -50,10 +51,11 @@ def test_lda_default_prior_params():
     assert_almost_equal(topic_distr_1, topic_distr_2)
 
 
-def test_lda_fit_batch():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_batch(csr_container):
     # Test LDA batch learning_offset (`fit` method with 'batch' learning)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         evaluate_every=1,
@@ -69,10 +71,11 @@ def test_lda_fit_batch():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_fit_online():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_online(csr_container):
     # Test LDA online learning (`fit` method with 'online' learning)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         learning_offset=10.0,
@@ -89,11 +92,12 @@ def test_lda_fit_online():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_partial_fit():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_partial_fit(csr_container):
     # Test LDA online learning (`partial_fit` method)
     # (same as test_lda_batch)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         learning_offset=10.0,
@@ -109,10 +113,11 @@ def test_lda_partial_fit():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_dense_input():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_dense_input(csr_container):
     # Test LDA with dense input.
    rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components, learning_method="batch", random_state=rng
     )
@@ -175,9 +180,10 @@ def test_lda_no_component_error():
 
 
 @if_safe_multiprocessing_with_blas
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_multi_jobs(method):
-    n_components, X = _build_sparse_mtx()
+def test_lda_multi_jobs(method, csr_container):
+    n_components, X = _build_sparse_array(csr_container)
     # Test LDA batch training with multi CPU
     rng = np.random.RandomState(0)
     lda = LatentDirichletAllocation(
@@ -196,10 +202,11 @@ def test_lda_multi_jobs(method):
 
 
 @if_safe_multiprocessing_with_blas
-def test_lda_partial_fit_multi_jobs():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_partial_fit_multi_jobs(csr_container):
     # Test LDA online training with multi CPU
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         n_jobs=2,
@@ -240,10 +247,11 @@ def test_lda_preplexity_mismatch():
 
 
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_perplexity(method):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_perplexity(method, csr_container):
     # Test LDA perplexity for batch training
     # perplexity should be lower after each iteration
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -271,10 +279,11 @@ def test_lda_perplexity(method):
 
 
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_score(method):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_score(method, csr_container):
     # Test LDA score for batch training
     # score should be higher after each iteration
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -297,10 +306,11 @@ def test_lda_score(method):
     assert score_2 >= score_1
 
 
-def test_perplexity_input_format():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_perplexity_input_format(csr_container):
     # Test LDA perplexity for sparse and dense input
     # score should be the same for both dense and sparse input
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -314,9 +324,10 @@ def test_perplexity_input_format():
     assert_almost_equal(perp_1, perp_2)
 
 
-def test_lda_score_perplexity():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_score_perplexity(csr_container):
     # Test the relationship between LDA score and perplexity
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components, max_iter=10, random_state=0
     )
@@ -328,10 +339,11 @@ def test_lda_score_perplexity():
     assert_almost_equal(perplexity_1, perplexity_2)
 
 
-def test_lda_fit_perplexity():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_perplexity(csr_container):
     # Test that the perplexity computed during fit is consistent with what is
     # returned by the perplexity method
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -350,10 +362,11 @@ def test_lda_fit_perplexity():
     assert_almost_equal(perplexity1, perplexity2)
 
 
-def test_lda_empty_docs():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_empty_docs(csr_container):
     """Test LDA on empty document (all-zero rows)."""
     Z = np.zeros((5, 4))
-    for X in [Z, csr_matrix(Z)]:
+    for X in [Z, csr_container(Z)]:
         lda = LatentDirichletAllocation(max_iter=750).fit(X)
         assert_almost_equal(
             lda.components_.sum(axis=0), np.ones(lda.components_.shape[1])
@@ -376,8 +389,10 @@ def test_dirichlet_expectation():
     )
 
 
-def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities):
-    n_components, X = _build_sparse_mtx()
+def check_verbosity(
+    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+):
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=3,
@@ -409,13 +424,19 @@ def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexiti
         (True, 2, 3, 1),
     ],
 )
-def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities):
-    check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities)
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_verbosity(
+    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+):
+    check_verbosity(
+        verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+    )
 
 
-def test_lda_feature_names_out():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_feature_names_out(csr_container):
     """Check feature names out for LatentDirichletAllocation."""
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(n_components=n_components).fit(X)
 
     names = lda.get_feature_names_out()

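For readers unfamiliar with the parametrization pattern used throughout this diff, the sketch below shows the same idea in a self-contained form. The local CSR_CONTAINERS list is a stand-in assumption, not the actual scikit-learn helper: sklearn.utils.fixes.CSR_CONTAINERS is expected to hold scipy.sparse.csr_matrix plus scipy.sparse.csr_array when the installed SciPy provides it.

# Minimal sketch of the pytest parametrization pattern applied in this commit.
import numpy as np
import pytest
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix

# Local stand-in for sklearn.utils.fixes.CSR_CONTAINERS (assumption):
# include csr_array only where the installed SciPy offers it.
CSR_CONTAINERS = [csr_matrix]
try:
    from scipy.sparse import csr_array  # available in SciPy >= 1.8
    CSR_CONTAINERS.append(csr_array)
except ImportError:
    pass


def _build_sparse_array(csr_container):
    # Three topics, each owning three distinct words: a block-diagonal
    # document-term matrix, wrapped in the requested sparse container.
    n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
    X = block_diag(*([block] * n_components))
    return n_components, csr_container(X)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_build_sparse_array_shape(csr_container):
    # Each parametrized run receives one container class, so the same test
    # body covers both the spmatrix and the sparray code paths.
    n_components, X = _build_sparse_array(csr_container)
    assert X.shape == (3 * n_components, 3 * n_components)
    assert X.nnz == 9 * n_components

Running pytest -v on this sketch would list one test id per container (e.g. [csr_matrix] and, where available, [csr_array]), which is how the updated tests now exercise both sparse container types.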