Commit 462a6d3

TST Extend tests for scipy.sparse.*array in sklearn/decomposition/tests/test_online_lda.py (scikit-learn#27144)
1 parent 57bc768 commit 462a6d3

File tree

1 file changed: +56 −35 lines changed

sklearn/decomposition/tests/test_online_lda.py

Lines changed: 56 additions & 35 deletions
@@ -5,7 +5,6 @@
 import pytest
 from numpy.testing import assert_array_equal
 from scipy.linalg import block_diag
-from scipy.sparse import csr_matrix
 from scipy.special import psi
 
 from sklearn.decomposition import LatentDirichletAllocation
@@ -20,23 +19,25 @@
     assert_array_almost_equal,
     if_safe_multiprocessing_with_blas,
 )
+from sklearn.utils.fixes import CSR_CONTAINERS
 
 
-def _build_sparse_mtx():
+def _build_sparse_array(csr_container):
     # Create 3 topics and each topic has 3 distinct words.
     # (Each word only belongs to a single topic.)
     n_components = 3
     block = np.full((3, 3), n_components, dtype=int)
     blocks = [block] * n_components
     X = block_diag(*blocks)
-    X = csr_matrix(X)
+    X = csr_container(X)
     return (n_components, X)
 
 
-def test_lda_default_prior_params():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_default_prior_params(csr_container):
     # default prior parameter should be `1 / topics`
     # and verbose params should not affect result
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     prior = 1.0 / n_components
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
@@ -50,10 +51,11 @@ def test_lda_default_prior_params():
     assert_almost_equal(topic_distr_1, topic_distr_2)
 
 
-def test_lda_fit_batch():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_batch(csr_container):
     # Test LDA batch learning_offset (`fit` method with 'batch' learning)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         evaluate_every=1,
@@ -69,10 +71,11 @@ def test_lda_fit_batch():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_fit_online():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_online(csr_container):
     # Test LDA online learning (`fit` method with 'online' learning)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         learning_offset=10.0,
@@ -89,11 +92,12 @@ def test_lda_fit_online():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_partial_fit():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_partial_fit(csr_container):
     # Test LDA online learning (`partial_fit` method)
     # (same as test_lda_batch)
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         learning_offset=10.0,
@@ -109,10 +113,11 @@ def test_lda_partial_fit():
         assert tuple(sorted(top_idx)) in correct_idx_grps
 
 
-def test_lda_dense_input():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_dense_input(csr_container):
     # Test LDA with dense input.
    rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components, learning_method="batch", random_state=rng
     )
@@ -175,9 +180,10 @@ def test_lda_no_component_error():
 
 
 @if_safe_multiprocessing_with_blas
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_multi_jobs(method):
-    n_components, X = _build_sparse_mtx()
+def test_lda_multi_jobs(method, csr_container):
+    n_components, X = _build_sparse_array(csr_container)
     # Test LDA batch training with multi CPU
     rng = np.random.RandomState(0)
     lda = LatentDirichletAllocation(
@@ -196,10 +202,11 @@ def test_lda_multi_jobs(method):
 
 
 @if_safe_multiprocessing_with_blas
-def test_lda_partial_fit_multi_jobs():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_partial_fit_multi_jobs(csr_container):
     # Test LDA online training with multi CPU
     rng = np.random.RandomState(0)
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         n_jobs=2,
@@ -240,10 +247,11 @@ def test_lda_preplexity_mismatch():
 
 
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_perplexity(method):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_perplexity(method, csr_container):
     # Test LDA perplexity for batch training
     # perplexity should be lower after each iteration
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -271,10 +279,11 @@ def test_lda_perplexity(method):
 
 
 @pytest.mark.parametrize("method", ("online", "batch"))
-def test_lda_score(method):
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_score(method, csr_container):
     # Test LDA score for batch training
     # score should be higher after each iteration
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda_1 = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -297,10 +306,11 @@ def test_lda_score(method):
     assert score_2 >= score_1
 
 
-def test_perplexity_input_format():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_perplexity_input_format(csr_container):
     # Test LDA perplexity for sparse and dense input
     # score should be the same for both dense and sparse input
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -314,9 +324,10 @@ def test_perplexity_input_format():
     assert_almost_equal(perp_1, perp_2)
 
 
-def test_lda_score_perplexity():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_score_perplexity(csr_container):
     # Test the relationship between LDA score and perplexity
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components, max_iter=10, random_state=0
     )
@@ -328,10 +339,11 @@ def test_lda_score_perplexity():
     assert_almost_equal(perplexity_1, perplexity_2)
 
 
-def test_lda_fit_perplexity():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_fit_perplexity(csr_container):
     # Test that the perplexity computed during fit is consistent with what is
     # returned by the perplexity method
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=1,
@@ -350,10 +362,11 @@ def test_lda_fit_perplexity():
     assert_almost_equal(perplexity1, perplexity2)
 
 
-def test_lda_empty_docs():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_empty_docs(csr_container):
     """Test LDA on empty document (all-zero rows)."""
     Z = np.zeros((5, 4))
-    for X in [Z, csr_matrix(Z)]:
+    for X in [Z, csr_container(Z)]:
         lda = LatentDirichletAllocation(max_iter=750).fit(X)
         assert_almost_equal(
             lda.components_.sum(axis=0), np.ones(lda.components_.shape[1])
@@ -376,8 +389,10 @@ def test_dirichlet_expectation():
     )
 
 
-def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities):
-    n_components, X = _build_sparse_mtx()
+def check_verbosity(
+    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+):
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(
         n_components=n_components,
         max_iter=3,
@@ -409,13 +424,19 @@ def check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexiti
         (True, 2, 3, 1),
     ],
 )
-def test_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities):
-    check_verbosity(verbose, evaluate_every, expected_lines, expected_perplexities)
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_verbosity(
+    verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+):
+    check_verbosity(
+        verbose, evaluate_every, expected_lines, expected_perplexities, csr_container
+    )
 
 
-def test_lda_feature_names_out():
+@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
+def test_lda_feature_names_out(csr_container):
     """Check feature names out for LatentDirichletAllocation."""
-    n_components, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_array(csr_container)
     lda = LatentDirichletAllocation(n_components=n_components).fit(X)
 
     names = lda.get_feature_names_out()

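For readers unfamiliar with the parametrization pattern used throughout this diff, the sketch below shows the same idea in a self-contained form. The local CSR_CONTAINERS list is a stand-in assumption, not the actual scikit-learn helper: sklearn.utils.fixes.CSR_CONTAINERS is expected to hold scipy.sparse.csr_matrix plus scipy.sparse.csr_array when the installed SciPy provides it.

# Minimal sketch of the pytest parametrization pattern applied in this commit.
import numpy as np
import pytest
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix

# Local stand-in for sklearn.utils.fixes.CSR_CONTAINERS (assumption):
# include csr_array only where the installed SciPy offers it.
CSR_CONTAINERS = [csr_matrix]
try:
    from scipy.sparse import csr_array  # available in SciPy >= 1.8
    CSR_CONTAINERS.append(csr_array)
except ImportError:
    pass


def _build_sparse_array(csr_container):
    # Three topics, each owning three distinct words: a block-diagonal
    # document-term matrix, wrapped in the requested sparse container.
    n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
    X = block_diag(*([block] * n_components))
    return n_components, csr_container(X)


@pytest.mark.parametrize("csr_container", CSR_CONTAINERS)
def test_build_sparse_array_shape(csr_container):
    # Each parametrized run receives one container class, so the same test
    # body covers both the spmatrix and the sparray code paths.
    n_components, X = _build_sparse_array(csr_container)
    assert X.shape == (3 * n_components, 3 * n_components)
    assert X.nnz == 9 * n_components

Running pytest -v on this sketch would list one test id per container (e.g. [csr_matrix] and, where available, [csr_array]), which is how the updated tests now exercise both sparse container types.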