pandas-dev · dontgoto · Mar 19, 2024 · Mar 19, 2024 · Mar 20, 2024 · Mar 23, 2024
diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py
@@ -279,7 +279,6 @@ def test_validate_all_ignore_errors(self, monkeypatch):
         assert exit_status == 2 * 2 - 1
 
 
-
 class TestApiItems:
     @property
     def api_doc(self):
@@ -374,9 +373,10 @@ class TestPandasDocstringClass:
     )
     def test_encode_content_write_to_file(self, name) -> None:
         # GH25466
-        docstr = validate_docstrings.PandasDocstring(name).validate_pep8()
+        docstr = validate_docstrings.PandasDocstring(name)
+        errors = validate_docstrings.validate_pep8(docstr)[docstr]
         # the list of pep8 errors should be empty
-        assert not list(docstr)
+        assert not list(errors)
 
 
 class TestMainFunction:

diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py
@@ -20,7 +20,6 @@
 import doctest
 import importlib
 import json
-import os
 import pathlib
 import subprocess
 import sys
@@ -153,6 +152,100 @@ def get_api_items(api_doc_fd):
         previous_line = line_stripped
 
 
+def validate_pep8(docs: list[PandasDocstring] | PandasDocstring
+                  ) -> dict[PandasDocstring, list[tuple]]:
+    """
+    Call the pep8 validation for docstrings with examples and add the found errors.
+
+    Parameters
+    ----------
+    docs : list[PandasDocString]
+        List of docstrings to validate.
+
+    Returns
+    -------
+    dict[PandasDocstring, list]
+       Dict of function names and the pep8 error messages found in their docstrings.
+       The errors messages are of the form
+       (error_code, message, line_number, col_number).
+    """
+    if isinstance(docs, PandasDocstring):
+        docs = [docs]
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        doc_to_filename = {}
+        for doc in docs:
+            if not doc.examples:
+                continue
+
+            # F401 is needed to not generate flake8 errors in examples
+            # that do not use numpy or pandas
+            content = "".join(
+                (
+                    "import numpy as np  # noqa: F401\n",
+                    "import pandas as pd  # noqa: F401\n",
+                    *doc.examples_source_code,
+                )
+            )
+
+            temp_file = tempfile.NamedTemporaryFile(mode="w",
+                                                    dir=temp_dir,
+                                                    encoding="utf-8",
+                                                    delete=False)
+            temp_file.write(content)
+            temp_file.flush()
+            doc_to_filename[doc] = temp_file.name
+
+        # No docs with examples to process
+        if not doc_to_filename:
+            return {}
+
+        cmd = [
+            sys.executable,
+            "-m",
+            "flake8",
+            "--format=%(row)d\t%(col)d\t%(code)s\t%(text)s",
+            "--max-line-length=88",
+            "--ignore=E203,E3,W503,W504,E402,E731,E128,E124,E704",
+        ]
+        cmd.extend(doc_to_filename.values())
+        response = subprocess.run(cmd, capture_output=True, check=False,
+                                  text=True)
+
+    all_docs_error_messages = {doc: [] for doc in docs}
+    for doc, temp_file_name in doc_to_filename.items():
+        # one output for each error, each error must be mapped to the func_name
+        for output in ("stdout", "stderr"):
+            out = getattr(response, output)
+            out = out.replace(temp_file_name, "").strip("\n").splitlines()
+            if out:
+                all_docs_error_messages[doc].extend(out)
+
+    for doc, raw_error_messages in all_docs_error_messages.items():
+        doc_error_messages = []
+        for raw_error_message in raw_error_messages:
+            line_num, col_num, err_code, msg = raw_error_message.split("\t", maxsplit=3)
+            # Note: we subtract 2 from the line number because
+            # 'import numpy as np\nimport pandas as pd\n'
+            # is prepended to the docstrings.
+            doc_error_messages.append(
+                (
+                    err_code,
+                    msg,
+                    int(line_num) - 2,
+                    int(col_num)
+                )
+            )
+        all_docs_error_messages[doc] = doc_error_messages
+
+    for doc in docs:
+        if doc.examples and doc not in all_docs_error_messages.keys():
+            raise KeyError(f"Docstring\n###\n{doc}\n###\nhas examples but "
+                           f"no pep8 validation results.")
+
+    return all_docs_error_messages
+
+
 class PandasDocstring(Validator):
     def __init__(self, func_name: str, doc_obj=None) -> None:
         self.func_name = func_name
@@ -173,55 +266,6 @@ def examples_source_code(self):
         lines = doctest.DocTestParser().get_examples(self.raw_doc)
         return [line.source for line in lines]
 
-    def validate_pep8(self):
-        if not self.examples:
-            return
-
-        # F401 is needed to not generate flake8 errors in examples
-        # that do not user numpy or pandas
-        content = "".join(
-            (
-                "import numpy as np  # noqa: F401\n",
-                "import pandas as pd  # noqa: F401\n",
-                *self.examples_source_code,
-            )
-        )
-
-        error_messages = []
-
-        file = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
-        try:
-            file.write(content)
-            file.flush()
-            cmd = [
-                sys.executable,
-                "-m",
-                "flake8",
-                "--format=%(row)d\t%(col)d\t%(code)s\t%(text)s",
-                "--max-line-length=88",
-                "--ignore=E203,E3,W503,W504,E402,E731,E128,E124,E704",
-                file.name,
-            ]
-            response = subprocess.run(cmd, capture_output=True, check=False, text=True)
-            for output in ("stdout", "stderr"):
-                out = getattr(response, output)
-                out = out.replace(file.name, "")
-                messages = out.strip("\n").splitlines()
-                if messages:
-                    error_messages.extend(messages)
-        finally:
-            file.close()
-            os.unlink(file.name)
-
-        for error_message in error_messages:
-            line_number, col_number, error_code, message = error_message.split(
-                "\t", maxsplit=3
-            )
-            # Note: we subtract 2 from the line number because
-            # 'import numpy as np\nimport pandas as pd\n'
-            # is prepended to the docstrings.
-            yield error_code, message, int(line_number) - 2, int(col_number)
-
     def non_hyphenated_array_like(self):
         return "array_like" in self.raw_doc
 
@@ -264,14 +308,14 @@ def pandas_validate(func_name: str):
 
     result["examples_errs"] = ""
     if doc.examples:
-        for error_code, error_message, line_number, col_number in doc.validate_pep8():
+        for err_code, err_message, line_num, col_num in validate_pep8(doc)[doc]:
             result["errors"].append(
                 pandas_error(
                     "EX03",
-                    error_code=error_code,
-                    error_message=error_message,
-                    line_number=line_number,
-                    col_number=col_number,
+                    error_code=err_code,
+                    error_message=err_message,
+                    line_number=line_num,
+                    col_number=col_num,
                 )
             )
         examples_source_code = "".join(doc.examples_source_code)
@@ -346,7 +390,7 @@ def print_validate_all_results(
     output_format: str,
     prefix: str | None,
     ignore_deprecated: bool,
-    ignore_errors: dict[str, set[str]],
+    ignore_errors: dict[str | None, set[str]],
 ):
     if output_format not in ("default", "json", "actions"):
         raise ValueError(f'Unknown output_format "{output_format}"')
@@ -384,7 +428,7 @@ def print_validate_all_results(
 
 
 def print_validate_one_results(func_name: str,
-                               ignore_errors: dict[str, set[str]]) -> int:
+                               ignore_errors: dict[str | None, set[str]]) -> int:
     def header(title, width=80, char="#") -> str:
         full_line = char * width
         side_len = (width - len(title) - 2) // 2