
Commit 77b6a08

Add ruff, remove isort, and add more pre-commits (#237)
* add ruff
* release notes

1 parent 2dbfc4b · commit 77b6a08

21 files changed (+107, -73 lines)

.flake8

Lines changed: 0 additions & 7 deletions
This file was deleted.

.pre-commit-config.yaml

Lines changed: 27 additions & 12 deletions
@@ -1,19 +1,34 @@
+exclude: |
+    (?x)
+    ^nlp_primitives/data/nltk-data/|
+    .html$|.csv$|.svg$|.md$|.txt$|.json$|.xml$|.pickle$|^.github/|
+    (LICENSE.*|README.*)
 default_stages: [commit]
-exclude: ^LICENSE/|\.(html|csv|svg|md|txt|json|tab|bib|adv)$
 repos:
-  - repo: https://github.com/MarcoGorelli/absolufy-imports
-    rev: v0.3.1
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: 'v4.3.0'
     hooks:
-      - id: absolufy-imports
-        files: ^nlp_primitives/
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.0.4
+      - id: check-yaml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+  - repo: https://github.com/abravalheri/validate-pyproject
+    rev: 'v0.10.1'
     hooks:
-      - id: isort
-        args: [--settings-path=pyproject.toml]
+      - id: validate-pyproject
+  - repo: https://github.com/asottile/add-trailing-comma
+    rev: 'v2.2.3'
+    hooks:
+      - id: add-trailing-comma
+        name: Add trailing comma
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: 'v0.0.191'
+    hooks:
+      - id: ruff
+        args: ["--fix"]
   - repo: https://github.com/python/black
-    rev: 22.8.0
+    rev: 22.12.0
     hooks:
       - id: black
-        args: [--target-version=py310]
-        types_or: [python]
+        args: [--preview]
+        additional_dependencies: [".[jupyter]"]
+        types_or: [python, jupyter]
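
For reference, the updated hook set can be exercised locally with the standard pre-commit CLI (a usage sketch, assuming pre-commit is installed in the development environment; these commands are not part of the diff):

    pre-commit install          # register the hooks so they run on every commit
    pre-commit run --all-files  # run all configured hooks once across the repository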

LICENSE

Lines changed: 1 addition & 1 deletion
@@ -26,4 +26,4 @@ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(Whitespace-only change; the visible text of the last line is unchanged.)

Makefile

Lines changed: 4 additions & 5 deletions
@@ -18,14 +18,13 @@ clean:

 .PHONY: lint
 lint:
-	isort --check-only nlp_primitives
-	black nlp_primitives -t py310 --check
-	flake8 nlp_primitives
+	black . --check --preview
+	ruff .

 .PHONY: lint-fix
 lint-fix:
-	black -t py310 nlp_primitives
-	isort nlp_primitives
+	black . --preview
+	ruff . --fix

 .PHONY: test
 test:
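
The lint targets above remain the local entry points, now delegating formatting to black and lint rules to ruff; a usage sketch, assuming both tools are installed as dev dependencies:

    make lint      # check only: black . --check --preview, then ruff .
    make lint-fix  # apply fixes: black . --preview, then ruff . --fix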

nlp_primitives/lsa.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def __init__(self, random_seed=0, corpus=None, algorithm="randomized"):
         self.algorithm = algorithm
         if self.algorithm not in ["randomized", "arpack"]:
             raise ValueError(
-                "TruncatedSVD algorithm must be either 'randomized' or 'arpack'"
+                "TruncatedSVD algorithm must be either 'randomized' or 'arpack'",
             )

     def _create_trainer(self):

nlp_primitives/part_of_speech_count.py

Lines changed: 0 additions & 1 deletion
@@ -38,7 +38,6 @@ def __init__(self):
         self.n = 15

     def get_function(self):
-
         # For more info about the different parts of speech, see here: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
         types = [
             "C",  # cardinal digits

nlp_primitives/tensorflow/elmo.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def elmo(col):
                 [
                     tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.tables_initializer(),
-                ]
+                ],
             )
             embeddings = session.run(self.embed(col.tolist()))
             return embeddings.transpose()

nlp_primitives/tensorflow/universal_sentence_encoder.py

Lines changed: 5 additions & 2 deletions
@@ -29,7 +29,10 @@ class UniversalSentenceEncoder(TransformPrimitive):
     return_type = ColumnSchema(logical_type=Double, semantic_tags={"numeric"})

     def __init__(self):
-        message = "In order to use the UniversalSentenceEncoder primitive install 'nlp_primitives[complete]'"
+        message = (
+            "In order to use the UniversalSentenceEncoder primitive install"
+            " 'nlp_primitives[complete]'"
+        )
         self.tf = import_or_raise("tensorflow", message)
         hub = import_or_raise("tensorflow_hub", message)
         self.tf.compat.v1.disable_eager_execution()
@@ -45,7 +48,7 @@ def universal_sentence_encoder(col):
                 [
                     self.tf.compat.v1.global_variables_initializer(),
                     self.tf.compat.v1.tables_initializer(),
-                ]
+                ],
             )
             embeddings = session.run(self.embed(col.tolist()))
             return embeddings.transpose()

nlp_primitives/tests/test_diversity_score.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def test_primitive_func_1(self):
                 "Not diverse not diverse not",
                 "this is a semi diverse diverse example",
                 "a a",
-            ]
+            ],
         )
         primitive_instance = self.primitive()
         primitive_func = primitive_instance.get_function()

nlp_primitives/tests/test_elmo.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def test_regular(self):
                 "The roller coaster was built in 1885.",
                 "When will humans go to mars?",
                 "Mitochondria is the powerhouse of the cell",
-            ]
+            ],
         )
         new_results = primitive(words)
         assert round(sum(new_results[:, 0]), 3) == 8.744

nlp_primitives/tests/test_lsa.py

Lines changed: 12 additions & 6 deletions
@@ -21,15 +21,15 @@ def test_strings(self):
                 "She ate a pineapple",
                 "Consume Electrolytes, he told me.",
                 "Hello",
-            ]
+            ],
         )
         primitive_func = self.primitive().get_function()

         answers = pd.Series(
             [
                 [2.41e-03, 6.29e-04, 7.26e-03, -1.85e-19],
                 [1.28e-03, 5.51e-04, 5.37e-03, -1.20e-15],
-            ]
+            ],
         )
         results = primitive_func(x)
         np.testing.assert_array_almost_equal(
@@ -45,7 +45,7 @@ def test_strings_custom_corpus(self):
                 "She ate a pineapple",
                 "Consume Electrolytes, he told me.",
                 "Hello",
-            ]
+            ],
         )
         # Create a new corpus using only the first 10000 elements from Gutenberg
         gutenberg = nltk.corpus.gutenberg.sents()
@@ -62,7 +62,7 @@
                     1.56e-03,
                     0.0,
                 ],
-            ]
+            ],
         )
         results = primitive_func(x)
         np.testing.assert_array_almost_equal(
@@ -96,7 +96,11 @@ def test_with_featuretools(self, es):
         primitive_instance = self.primitive()
         transform.append(primitive_instance)
         valid_dfs(
-            es, aggregation, transform, self.primitive.name.upper(), multi_output=True
+            es,
+            aggregation,
+            transform,
+            self.primitive.name.upper(),
+            multi_output=True,
         )

     def test_bad_algorithm_input_value(self):
@@ -120,6 +124,8 @@ def test_args_strings(self):

         # Test all args
         args_string = self.primitive(
-            random_seed=100, corpus=custom_corpus, algorithm="arpack"
+            random_seed=100,
+            corpus=custom_corpus,
+            algorithm="arpack",
         ).get_args_string()
         assert args_string == ", random_seed=100, corpus=user_defined, algorithm=arpack"

nlp_primitives/tests/test_mean_characters_per_sentence.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def test_sentences(self):
                 "Ab. Bb. Db.",
                 "And? Why! Box. Car? Rat.",
                 "Yep.",
-            ]
+            ],
         )
         primitive_func = self.primitive().get_function()
         answers = pd.Series([3.0, 4.0, 4.0])

nlp_primitives/tests/test_number_of_sentences.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@ def test_regular_input(self):
                 "Hello. Hello! Hello? Hello.",
                 "and?",
                 "yes no",
-            ]
+            ],
         )
         expected = [4.0, 1.0, 1.0]
         actual = self.primitive().get_function()(x)
@@ -34,7 +34,7 @@ def test_multiline(self):
         x = pd.Series(
             [
                 "Yes\n, this is true!",
-            ]
+            ],
         )

         expected = [1.0]

nlp_primitives/tests/test_part_of_speech_count.py

Lines changed: 8 additions & 4 deletions
@@ -14,7 +14,7 @@ class TestPartOfSpeechCount(PrimitiveT):

     def test_strings(self):
         x = pd.Series(
-            ["This IS a STRING.", "Testing AAA", "Testing AAA-BBB", "Testing AA3"]
+            ["This IS a STRING.", "Testing AAA", "Testing AAA-BBB", "Testing AA3"],
         )
         primitive_func = self.primitive().get_function()

@@ -35,7 +35,7 @@ def test_strings(self):
                 [0.0, 0.0, 0.0, 0.0],
                 [0.0, 1.0, 1.0, 1.0],
                 [0.0, 0.0, 0.0, 0.0],
-            ]
+            ],
         )

         pd.testing.assert_series_equal(primitive_func(x), answers, check_names=False)
@@ -61,7 +61,7 @@ def test_nan(self):
                 [np.nan, 0.0, 0.0],
                 [np.nan, 0.0, 0.0],
                 [np.nan, 0.0, 0.0],
-            ]
+            ],
         )
         pd.testing.assert_series_equal(primitive_func(x), answers, check_names=False)

@@ -70,5 +70,9 @@ def test_with_featuretools(self, es):
         primitive_instance = self.primitive()
         transform.append(primitive_instance)
         valid_dfs(
-            es, aggregation, transform, self.primitive.name.upper(), multi_output=True
+            es,
+            aggregation,
+            transform,
+            self.primitive.name.upper(),
+            multi_output=True,
         )

nlp_primitives/tests/test_polarity_score.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ class TestPolarityScore(PrimitiveT):

     def test_primitive_func_1(self):
         array = pd.Series(
-            ["He hates cars!", "She loves everything", "This is neutral", "!12323"]
+            ["He hates cars!", "She loves everything", "This is neutral", "!12323"],
         )
         primitive_instance = self.primitive()
         primitive_func = primitive_instance.get_function()

nlp_primitives/tests/test_stopword_count.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def test_strings(self):
                 "This is second string",
                 "third string",
                 "This IS the fourth string.",
-            ]
+            ],
         )
         primitive_func = self.primitive().get_function()
         answers = pd.Series([3, 2, 0, 3])

nlp_primitives/tests/test_universal_sentence_encoder.py

Lines changed: 7 additions & 4 deletions
@@ -21,7 +21,7 @@ def test_regular(universal_sentence_encoder):
             "The roller coaster was built in 1885.",
             "When will humans go to mars?",
             "Mitochondria is the powerhouse of the cell",
-        ]
+        ],
     )
     a = pd.DataFrame(universal_sentence_encoder(sentences))
     a = a.mean().round(7).to_numpy()
@@ -43,7 +43,10 @@ def mock_remove_tensorflow():


 def test_without_tensorflow(universal_sentence_encoder, mock_remove_tensorflow):
-    err_message = "In order to use the UniversalSentenceEncoder primitive install 'nlp_primitives[complete]'"
+    err_message = (
+        "In order to use the UniversalSentenceEncoder primitive install"
+        " 'nlp_primitives[complete]'"
+    )
     with pytest.raises(ImportError) as error:
         UniversalSentenceEncoder()
     assert error.value.args[0] == err_message
@@ -57,7 +60,7 @@ def test_primitive_serialization(universal_sentence_encoder):
             "The roller coaster was built in 1885.",
             "When will humans go to mars?",
             "Mitochondria is the powerhouse of the cell",
-        ]
+        ],
     )
     serialized_primitive = serialize_primitive(universal_sentence_encoder)
     deserializer = PrimitivesDeserializer()
@@ -77,7 +80,7 @@ def test_feature_serialization(universal_sentence_encoder, tmpdir):
             "The roller coaster was built in 1885.",
             "When will humans go to mars?",
             "Mitochondria is the powerhouse of the cell",
-        ]
+        ],
     )

     es = ft.EntitySet("es")

nlp_primitives/tests/test_utils.py

Lines changed: 10 additions & 7 deletions
@@ -107,10 +107,12 @@ def find_applicable_primitives(primitive):
     all_transform_primitives = list(get_transform_primitives().values())
     all_aggregation_primitives = list(get_aggregation_primitives().values())
     applicable_transforms = find_stackable_primitives(
-        all_transform_primitives, primitive
+        all_transform_primitives,
+        primitive,
     )
     applicable_aggregations = find_stackable_primitives(
-        all_aggregation_primitives, primitive
+        all_aggregation_primitives,
+        primitive,
     )
     return applicable_transforms, applicable_aggregations

@@ -153,13 +155,14 @@ def valid_dfs(
             applicable_features.append(feat)
     if len(applicable_features) == 0:
         raise ValueError(
-            "No feature names with %s, verify the name attribute \
-            is defined and/or generate_name() is defined to \
-            return %s "
-            % (feature_substrings, feature_substrings)
+            "No feature names with %s, verify the name attribute "
+            " is defined and/or generate_name() is defined to "
+            " return %s " % (feature_substrings, feature_substrings),
         )
     df = ft.calculate_feature_matrix(
-        entityset=es, features=applicable_features, instance_ids=instance_ids
+        entityset=es,
+        features=applicable_features,
+        instance_ids=instance_ids,
     )

     ft.encode_features(df, applicable_features)

nlp_primitives/utilities.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ def clean_tokens(text: str) -> List[str]:

     # Remove stopwords and punctuation
     stopwords_and_punctuation = set(nltk.corpus.stopwords.words("english")).union(
-        set(string.punctuation)
+        set(string.punctuation),
     )
     text = [word for word in text if word not in stopwords_and_punctuation]
