
Commit db5b133

Allow for custom corpus in LSA primitive (#148)
* allow for custom corpus in LSA primitive
* update release notes
* update new test answers
* update test
* more test cleanup
* update _create_trainer
* use arpack instead of randomized
* update doctest
* remove doctest
* update algo again
* add back doctest
* update docstring, algorithm and tests
* lint fix
* Update nlp_primitives/lsa.py
  Co-authored-by: Gaurav Sheni <gvsheni@gmail.com>
* catch bad SVD algorithm input
* update docstring to include args
* fix doc link

Co-authored-by: Gaurav Sheni <gvsheni@gmail.com>
1 parent 31863e5 commit db5b133
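
For context, here is a minimal usage sketch of the options this commit adds, mirroring the new doctest in the lsa.py diff below. The top-level import path is an assumption; the corpus values are illustrative.

    # Sketch only: assumes LSA is exposed at the package top level.
    from nlp_primitives import LSA

    custom_corpus = ["dogs ate food", "she ate pineapple", "hello"]

    # New in this commit: pass a custom corpus and choose the TruncatedSVD
    # algorithm ("randomized", the default, or "arpack"); any other value
    # raises a ValueError.
    lsa = LSA(corpus=custom_corpus, algorithm="arpack")

    # Two rows come back (one per LSA component), each with one value per input string.
    res = lsa(["The dogs ate food.", "Hello"])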

3 files changed: 89 additions & 10 deletions

docs/source/changelog.rst

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ Changelog
 Future Release
 ==============
     * Enhancements
+        * Allow users to optionally pass in a custom corpus to use with the LSA primitive (:pr:`148`)
     * Fixes
         * Fix bug in ``CountString`` with null values (:pr:`154`)
         * Fix a bug where nltk data was not included in the package (:pr:`157`)
@@ -27,7 +28,7 @@ v2.6.0 Jun 16, 2022
         * Fixed unit tests workflow test choice logic (:pr:`151`)

     Thanks to the following people for contributing to this release:
-    :user:`gsheni`, :user:`rwedge`
+    :user:`gsheni`, :user:`rwedge`, :user:`thehomebrewnerd`

 v2.5.0 Apr 7, 2022
 ==================

nlp_primitives/lsa.py

Lines changed: 47 additions & 9 deletions
@@ -19,11 +19,26 @@ class LSA(TransformPrimitive):
         Given a list of strings, transforms those strings using tf-idf and singular
         value decomposition to go from a sparse matrix to a compact matrix with two
         values for each string. These values represent the Latent Semantic Analysis
-        of each string. These values will represent their context with respect to
-        (nltk's gutenberg corpus.)[https://www.nltk.org/book/ch02.html#gutenberg-corpus]
+        of each string. By default these values will represent their context with respect to
+        `nltk's gutenberg corpus <https://www.nltk.org/book/ch02.html#gutenberg-corpus>`_.
+        Users can optionally pass in a custom corpus when initializing the primitive
+        by specifying the corpus values in a list with the corpus parameter.

         If a string is missing, return `NaN`.

+        Note: If a small custom corpus is used, the output of the primitive may vary
+        depending on the computer architecture being used (Linux, MacOS, Windows). This
+        is especially true when using the default "randomized" algorithm for the
+        TruncatedSVD component.
+
+    Args:
+        random_seed (int, optional): The random seed value to use for the call to TruncatedSVD.
+            Will default to 0 if not specified.
+        corpus (list[str], optional): A list of strings to use as a custom corpus. Will
+            default to the NLTK Gutenberg corpus if not specified.
+        algorithm (str, optional): The algorithm to use for the call to TruncatedSVD. Should be either
+            "randomized" or "arpack". Will default to "randomized" if not specified.
+
     Examples:
         >>> lsa = LSA()
         >>> x = ["he helped her walk,", "me me me eat food", "the sentence doth long"]
@@ -32,8 +47,8 @@ class LSA(TransformPrimitive):
         >>> res
         [[0.01, 0.01, 0.01], [0.0, 0.0, 0.01]]

-        Now, if we change the values of the input corpus, to something that better resembles
-        the given text, the same given input text will result in a different, more discerning,
+        Now, if we change the values of the input text, to something that better resembles
+        the given corpus, the same given input text will result in a different, more discerning,
         output. Also, NaN values are handled, as well as strings without words.

         >>> lsa = LSA()
@@ -43,25 +58,48 @@ class LSA(TransformPrimitive):
         >>> res
         [[0.02, 0.0, nan, 0.0], [0.02, 0.0, nan, 0.0]]

+        Users can optionally also pass in a custom corpus and specify the algorithm to use
+        for the TruncatedSVD component used by the primitive.
+
+        >>> custom_corpus = ["dogs ate food", "she ate pineapple", "hello"]
+        >>> lsa = LSA(corpus=custom_corpus, algorithm="arpack")
+        >>> x = ["The dogs ate food.",
+        ... "She ate a pineapple",
+        ... "Consume Electrolytes, he told me.",
+        ... "Hello",]
+        >>> res = lsa(x).tolist()
+        >>> for i in range(len(res)): res[i] = [abs(round(x, 2)) for x in res[i]]
+        >>> res
+        [[0.68, 0.78, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
     """

     name = "lsa"
     input_types = [ColumnSchema(logical_type=NaturalLanguage)]
     return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})
     default_value = 0

-    def __init__(self, random_seed=0):
-        # TODO: allow user to use own corpus
+    def __init__(self, random_seed=0, corpus=None, algorithm=None):
         self.number_output_features = 2
         self.n = 2
         self.trainer = None
         self.random_seed = random_seed
+        self.corpus = corpus
+        self.algorithm = algorithm or "randomized"
+        if self.algorithm not in ["randomized", "arpack"]:
+            raise ValueError(
+                "TruncatedSVD algorithm must be either 'randomized' or 'arpack'"
+            )

     def _create_trainer(self):
-        gutenberg = nltk.corpus.gutenberg.sents()
-        svd = TruncatedSVD(random_state=self.random_seed)
+        if self.corpus is None:
+            gutenberg = nltk.corpus.gutenberg.sents()
+            corpus = [" ".join(sent) for sent in gutenberg]
+        else:
+            corpus = self.corpus
+        svd = TruncatedSVD(random_state=self.random_seed, algorithm=self.algorithm)
+
         self.trainer = make_pipeline(TfidfVectorizer(), svd)
-        self.trainer.fit([" ".join(sent) for sent in gutenberg])
+        self.trainer.fit(corpus)

     def get_function(self):
         if self.trainer is None:
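
The trainer that _create_trainer assembles above is a scikit-learn pipeline: a TfidfVectorizer feeding a two-component TruncatedSVD, fit on either the Gutenberg sentences or the user-supplied corpus. Below is a minimal standalone sketch of that pipeline (illustrative corpus values, not the package's code); "arpack" is the alternative the diff allows to the default "randomized" solver, which the new docstring note flags as a source of platform-dependent output on small corpora.

    # Minimal standalone sketch of the equivalent scikit-learn pipeline.
    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline

    corpus = ["dogs ate food", "she ate pineapple", "hello"]  # stand-in for a custom corpus
    svd = TruncatedSVD(n_components=2, algorithm="arpack", random_state=0)
    trainer = make_pipeline(TfidfVectorizer(), svd)
    trainer.fit(corpus)

    # Each input string is reduced to two LSA component values.
    print(trainer.transform(["The dogs ate food.", "Hello"]))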

nlp_primitives/tests/test_lsa.py

Lines changed: 40 additions & 0 deletions
@@ -1,5 +1,7 @@
+import nltk
 import numpy as np
 import pandas as pd
+import pytest

 from ..lsa import LSA
 from .test_utils import PrimitiveT, find_applicable_primitives, valid_dfs
@@ -42,6 +44,39 @@ def test_strings(self):
             decimal=2,
         )

+    def test_strings_custom_corpus(self):
+        x = pd.Series(
+            [
+                "The dogs ate food.",
+                "She ate a pineapple",
+                "Consume Electrolytes, he told me.",
+                "Hello",
+            ]
+        )
+        # Create a new corpus using only the first 10000 elements from Gutenberg
+        gutenberg = nltk.corpus.gutenberg.sents()
+        corpus = [" ".join(sent) for sent in gutenberg]
+        corpus = corpus[:10000]
+        primitive_func = self.primitive(corpus=corpus).get_function()
+
+        answers = pd.Series(
+            [
+                [0.03858566832087156, 0.04979961879358504, 0.013042488281432613, 0.0],
+                [
+                    -0.0010495388842080527,
+                    -0.0011128696986250912,
+                    0.001556757056617563,
+                    0.0,
+                ],
+            ]
+        )
+        results = primitive_func(x)
+        np.testing.assert_array_almost_equal(
+            np.concatenate(([np.array(answers[0])], [np.array(answers[1])]), axis=0),
+            np.concatenate(([np.array(results[0])], [np.array(results[1])]), axis=0),
+            decimal=2,
+        )
+
     def test_nan(self):
         x = pd.Series([np.nan, "#;.<", "This IS a STRING."])
         primitive_func = self.primitive().get_function()
@@ -69,3 +104,8 @@ def test_with_featuretools(self, es):
         valid_dfs(
             es, aggregation, transform, self.primitive.name.upper(), multi_output=True
         )
+
+    def test_bad_algorithm_input_value(self):
+        err_message = "TruncatedSVD algorithm must be either 'randomized' or 'arpack'"
+        with pytest.raises(ValueError, match=err_message):
+            LSA(algorithm="bad_algo")
