Skip to content

Adds verification of TagNode.location_path to the integration tests #87

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@ end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.md]
indent_style = space
indent_size = 2
max_line_length = 80

[*.py]
indent_style = space
indent_size = 4
max_line_length = 88

[*.rst]
indent_style = space
max_line_length = 80

[*.yml]
Expand Down
15 changes: 12 additions & 3 deletions integration-tests/README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
# Integration tests against corpora

This folder serves as playground for tests of basic functionality against many
XML documents, mostly TEI-encodings. They are supposed to be executed with
XML documents, mostly TEI encodings. They are supposed to be executed with
major code change proposals and before releases.

## Test corpus

Place document collections into the `corpora` folder. The `fetch-corpora.py`
script helps to get going with the minimal requirement (~3GB) of tests.
script helps to get going with the minimal requirement (~6GB) of data.
Set any non-empty string as environment variable `SKIP_EXISTING` to skip
downloading a corpus whose target folder already exists.

Due to the `lb` tag [issue](https://github.com/deutschestextarchiv/dtabf/issues/33)
with the DTABf the DTA corpus isn't considered. It could be an experiment to
use *delb* for transformations with regards to the conclusions of that issue.

The `normalize-corpora.py` script addresses issues that were found in the text
encodings and must be run before the tests.
encodings and must be run after fetching test data.
One of the corpus folder names can be passed as an argument to the script in
order to process only that corpus's contents.

## Tests

Expand All @@ -25,3 +29,8 @@ reserves a `report.txt` for messages redirected from *stdout*.

When problems occur, carefully investigate that it's not due to the source, and
if not extract simple enough cases for the unit tests.

## TODO

After adding the third kind of test, wrap all scripts here into a
[textual](https://textual.textualize.io) app.
2 changes: 2 additions & 0 deletions integration-tests/fetch-corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import Final, NamedTuple

from httpx import AsyncClient, HTTPError
from tenacity import retry, wait_random_exponential

import delb

Expand Down Expand Up @@ -243,6 +244,7 @@ class Archive(NamedTuple):
http_client: Final = AsyncClient()


@retry(wait=wait_random_exponential(multiplier=1, max=120))
async def fetch_resource(url: str, destination: io.BufferedWriter) -> bool:
async with http_client.stream("GET", url, follow_redirects=True) as response:
try:
Expand Down
19 changes: 13 additions & 6 deletions integration-tests/normalize-corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from functools import partial
from pathlib import Path
from typing import Final
from sys import argv, stderr
from collections.abc import Callable
from sys import argv

import tqdm

from utils import indicate_progress

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

Expand All @@ -31,28 +35,31 @@
cr_ent_to_lf = partial(re.compile(re.escape(b"
"), flags=re.IGNORECASE).subn, b"\n")


async def normalize_file(file: Path):
async def normalize_file(file: Path, indicate: callable = indicate_progress):
match file.parent.name:
case "casebooks":
contents, subs = adjust_casebooks_dtd_path(file.read_bytes())
case "papyri":
contents, subs = cr_ent_to_lf(file.read_bytes())
case _:
indicate()
return

if subs:
file.write_bytes(contents)
stderr.write("✓")
indicate()


async def main():
    """Normalize all XML files below the corpora root concurrently.

    An optional first CLI argument restricts processing to a single corpus
    folder below ``CORPORA_PATH``.
    """
    root = CORPORA_PATH
    if len(argv) > 1:
        root /= argv[1]
    print(f"Normalizing contents of {root}")
    files = list(root.rglob("*.xml"))
    # use the progress bar as a context manager so it is always closed and
    # the terminal state is restored, even when a task raises
    with tqdm.tqdm(total=len(files), mininterval=.5) as pbar:
        async with asyncio.TaskGroup() as tasks:
            for file in files:
                tasks.create_task(normalize_file(file, pbar.update))


if __name__ == "__main__":
Expand Down
106 changes: 106 additions & 0 deletions integration-tests/test-location-paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/bin/env python3

import multiprocessing as mp
import random
from collections.abc import Iterable
from itertools import chain
from pathlib import Path
from sys import stderr
from typing import Final

from _delb.plugins.core_loaders import path_loader
from delb import is_tag_node, Document, FailedDocumentLoading, ParserOptions

from utils import indicate_progress

BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

DOCUMENT_SAMPLES_PERCENT: Final = 25
LOCATIONS_PATHS_SAMPLES_PERCENT: Final = 25


def verify_location_paths(file: Path):
    """Sample tag nodes of *file* and verify that each sampled node's
    ``location_path`` resolves via XPath to exactly that node."""
    parser_options = ParserOptions(
        collapse_whitespace=False, resolve_entities=False, unplugged=True
    )
    try:
        document = Document(file, parser_options=parser_options)
    except FailedDocumentLoading as exc:
        print(
            f"\nFailed to load {file.name}: {exc.excuses[path_loader]}",
            end="",
        )
        return

    root = document.root
    tag_nodes = chain((root,), root.iterate_descendants(is_tag_node))
    for tag_node in tag_nodes:
        # sample only a share of the nodes to keep runtime bounded
        if random.randint(1, 100) > LOCATIONS_PATHS_SAMPLES_PERCENT:
            continue

        results = document.xpath(tag_node.location_path)
        if not (results.size == 1 and results.first is tag_node):
            print(
                f"\nXPath query `{tag_node.location_path}` in {file} yielded unexpected "
                "results."
            )
            stderr.write("🕱")
        else:
            indicate_progress()


def dispatch_batch(files: Iterable[Path]):
    """Verify location paths for every file of a batch.

    Unexpected exceptions are reported on stdout instead of propagated, so
    a single broken document doesn't abort the whole batch.
    """
    for file in files:
        try:
            verify_location_paths(file)
        except Exception as error:
            print(f"\nUnhandled exception while testing {file}: {error}")


def main():
    """Sample documents from the corpora and verify their tag nodes'
    location paths in a pool of worker processes."""
    mp.set_start_method("forkserver")

    all_counter = counter = 0
    selected_files = []
    dispatched_tasks = []

    with mp.Pool(CPU_COUNT) as pool:
        for file in CORPORA_PATH.rglob("*.xml"):
            all_counter += 1
            if random.randint(1, 100) > DOCUMENT_SAMPLES_PERCENT:
                continue

            selected_files.append(file)
            counter += 1
            if len(selected_files) < BATCH_SIZE:
                continue

            dispatched_tasks.append(
                pool.apply_async(dispatch_batch, (tuple(selected_files),))
            )
            selected_files.clear()

            # throttle submission: rebuild the pending list instead of
            # calling .remove() while iterating the same list, which
            # skips elements and could miss finished tasks
            while len(dispatched_tasks) >= CPU_COUNT:
                dispatched_tasks = [t for t in dispatched_tasks if not t.ready()]

            stderr.flush()

        # wait for all outstanding batches before leaving the context
        # manager; its __exit__ calls terminate(), which would otherwise
        # kill still-running workers mid-batch
        for task in dispatched_tasks:
            task.wait()

    # process the remainder that didn't fill a whole batch
    dispatch_batch(selected_files)
    stderr.flush()

    print(
        f"\n\nTested against {counter} *randomly* selected out of {all_counter} "
        "documents."
        f"\n{LOCATIONS_PATHS_SAMPLES_PERCENT}% of the tag nodes' `location_path` "
        f"attribute were verified per document."
    )


if __name__ == "__main__":
    main()
12 changes: 8 additions & 4 deletions integration-tests/test-parse-serialize-equality.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from _delb.plugins.core_loaders import path_loader
from delb import compare_trees, Document, FailedDocumentLoading, ParserOptions

from utils import indicate_progress


BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()
Expand All @@ -26,7 +28,8 @@ def parse_serialize_compare(file: Path):

try:
document = Document(
file, parser_options=ParserOptions(collapse_whitespace=False)
file,
parser_options=ParserOptions(collapse_whitespace=False, unplugged=True),
)
except FailedDocumentLoading as exc:
print(
Expand Down Expand Up @@ -57,22 +60,23 @@ def parse_serialize_compare(file: Path):
# TODO? compare with lxml as well
else:
result_file.unlink()
stderr.write("✓")
indicate_progress()


def dispatch_batch(files_list: list[Path]):
    """Run the parse/serialize comparison for every file of a batch.

    Unexpected exceptions are printed instead of propagated so that one
    broken document doesn't abort the remaining files.
    """
    for file in files_list:
        try:
            parse_serialize_compare(file)
        except Exception as error:
            print(f"\nUnhandled exception while testing {file}: {error}")


def main():
counter = 0
mp.set_start_method("forkserver")

counter = 0
dispatched_tasks = []

with mp.Pool(CPU_COUNT) as pool:
for file_list in batched(CORPORA_PATH.rglob("*.xml"), n=BATCH_SIZE):
dispatched_tasks.append(
Expand Down
10 changes: 10 additions & 0 deletions integration-tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import random
from sys import stderr
from typing import Final

# Neukölln's digest
PROGRESS_INDICATION_CHARCATERS: Final = "✓→🚴✊★☆⯪𓄁𓅯▶️✴️🪇⚒️🧻🚬🗿🎳⏳🌝☕🐑🐞🌼🪱🌸🏵💮️"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the i think fairly standard font i use for code doesn't have a lot of those charcaters...



def indicate_progress():
    """Write one randomly picked progress glyph to stderr."""
    glyph = random.choice(PROGRESS_INDICATION_CHARCATERS)
    stderr.write(glyph)
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,14 @@ coverage-report = """
--cov=_delb --cov=delb \
tests
"""

[tool.hatch.envs.integration-tests]
dependencies = [
"pytest-httpx",
"tenacity",
"tqdm",
]
[tool.hatch.envs.integration-tests.scripts]
fetch = "python integration-tests/fetch-corpora.py"
normalize = "python integration-tests/normalize-corpora.py"
test = "python integration-tests/test-*"