Skip to content

Adds verification of TagNode.location_path to the integration tests #87

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@ end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.md]
indent_style = space
indent_size = 2
max_line_length = 80

[*.py]
indent_style = space
indent_size = 4
max_line_length = 88

[*.rst]
indent_style = space
max_line_length = 80

[*.yml]
Expand Down
15 changes: 12 additions & 3 deletions integration-tests/README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
# Integration tests against corpora

This folder serves as playground for tests of basic functionality against many
XML documents, mostly TEI-encodings. They are supposed to be executed with
XML documents, mostly TEI encodings. They are supposed to be executed with
major code change proposals and before releases.

## Test corpus

Place document collections into the `corpora` folder. The `fetch-corpora.py`
script helps to get going with the minimal requirement (~3GB) of tests.
script helps to get going with the minimal requirement (~6GB) of data.
Set any non-empty string as environment variable `SKIP_EXISTING` to skip
downloading a corpus whose target folder already exists.

Due to the `lb` tag [issue](https://github.com/deutschestextarchiv/dtabf/issues/33)
with the DTABf the DTA corpus isn't considered. It could be an experiment to
use *delb* for transformations with regards to the conclusions of that issue.

The `normalize-corpora.py` script addresses issues that were found in the text
encodings and must be run before the tests.
encodings and must be run after fetching test data.
One of the corpus folder names can be passed as an argument to the script in
order to process only that corpus's contents.

## Tests

Expand All @@ -25,3 +29,8 @@ reserves a `report.txt` for messages redirected from *stdout*.

When problems occur, carefully investigate that it's not due to the source, and
if not extract simple enough cases for the unit tests.

## TODO

After adding the third kind of test, wrap all scripts here into a
[textual](https://textual.textualize.io) app.
2 changes: 2 additions & 0 deletions integration-tests/fetch-corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from typing import Final, NamedTuple

from httpx import AsyncClient, HTTPError
from tenacity import retry, wait_random_exponential

import delb

Expand Down Expand Up @@ -243,6 +244,7 @@ class Archive(NamedTuple):
http_client: Final = AsyncClient()


@retry(wait=wait_random_exponential(multiplier=1, max=120))
async def fetch_resource(url: str, destination: io.BufferedWriter) -> bool:
async with http_client.stream("GET", url, follow_redirects=True) as response:
try:
Expand Down
19 changes: 13 additions & 6 deletions integration-tests/normalize-corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@
from functools import partial
from pathlib import Path
from typing import Final
from sys import argv, stderr
from collections.abc import Callable
from sys import argv

import tqdm

from utils import indicate_progress

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

Expand All @@ -31,28 +35,31 @@
cr_ent_to_lf = partial(re.compile(re.escape(b"
"), flags=re.IGNORECASE).subn, b"\n")


async def normalize_file(file: Path):
async def normalize_file(file: Path, indicate: callable = indicate_progress):
match file.parent.name:
case "casebooks":
contents, subs = adjust_casebooks_dtd_path(file.read_bytes())
case "papyri":
contents, subs = cr_ent_to_lf(file.read_bytes())
case _:
indicate()
return

if subs:
file.write_bytes(contents)
stderr.write("✓")
indicate()


async def main():
    """Normalize all XML files below the corpora root concurrently.

    An optional first CLI argument restricts processing to a single corpus
    folder below ``CORPORA_PATH``.
    """
    root = CORPORA_PATH
    if len(argv) > 1:
        root /= argv[1]
    print(f"Normalizing contents of {root}")
    files = list(root.rglob("*.xml"))
    # use the progress bar as a context manager so it is always closed and
    # the terminal state is restored, even when a task raises
    with tqdm.tqdm(total=len(files), mininterval=.5) as pbar:
        async with asyncio.TaskGroup() as tasks:
            for file in files:
                tasks.create_task(normalize_file(file, pbar.update))


if __name__ == "__main__":
Expand Down
106 changes: 106 additions & 0 deletions integration-tests/test-location-paths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#!/bin/env python3

import multiprocessing as mp
import random
from collections.abc import Iterable
from itertools import chain
from pathlib import Path
from sys import stderr
from typing import Final

from _delb.plugins.core_loaders import path_loader
from delb import is_tag_node, Document, FailedDocumentLoading, ParserOptions

from utils import indicate_progress

BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()

CORPORA_PATH: Final = Path(__file__).parent.resolve() / "corpora"

DOCUMENT_SAMPLES_PERCENT: Final = 25
LOCATIONS_PATHS_SAMPLES_PERCENT: Final = 25


def verify_location_paths(file: Path):
    """Sample tag nodes of *file* and verify that each sampled node's
    ``location_path`` resolves via XPath to exactly that node."""
    parser_options = ParserOptions(
        collapse_whitespace=False, resolve_entities=False, unplugged=True
    )
    try:
        document = Document(file, parser_options=parser_options)
    except FailedDocumentLoading as exc:
        print(
            f"\nFailed to load {file.name}: {exc.excuses[path_loader]}",
            end="",
        )
        return

    root = document.root
    tag_nodes = chain((root,), root.iterate_descendants(is_tag_node))
    for tag_node in tag_nodes:
        # sample only a share of the nodes to keep runtime bounded
        if random.randint(1, 100) > LOCATIONS_PATHS_SAMPLES_PERCENT:
            continue

        results = document.xpath(tag_node.location_path)
        if not (results.size == 1 and results.first is tag_node):
            print(
                f"\nXPath query `{tag_node.location_path}` in {file} yielded unexpected "
                "results."
            )
            stderr.write("🕱")
        else:
            indicate_progress()


def dispatch_batch(files: Iterable[Path]):
    """Verify location paths for every file of a batch.

    Unexpected exceptions are reported on stdout instead of propagated, so
    a single broken document doesn't abort the whole batch.
    """
    for file in files:
        try:
            verify_location_paths(file)
        except Exception as error:
            print(f"\nUnhandled exception while testing {file}: {error}")


def main():
    """Sample documents from the corpora and verify their tag nodes'
    location paths in a pool of worker processes."""
    mp.set_start_method("forkserver")

    all_counter = counter = 0
    selected_files = []
    dispatched_tasks = []

    with mp.Pool(CPU_COUNT) as pool:
        for file in CORPORA_PATH.rglob("*.xml"):
            all_counter += 1
            if random.randint(1, 100) > DOCUMENT_SAMPLES_PERCENT:
                continue

            selected_files.append(file)
            counter += 1
            if len(selected_files) < BATCH_SIZE:
                continue

            dispatched_tasks.append(
                pool.apply_async(dispatch_batch, (tuple(selected_files),))
            )
            selected_files.clear()

            # throttle submission: rebuild the pending list instead of
            # calling .remove() while iterating the same list, which
            # skips elements and could miss finished tasks
            while len(dispatched_tasks) >= CPU_COUNT:
                dispatched_tasks = [t for t in dispatched_tasks if not t.ready()]

            stderr.flush()

        # wait for all outstanding batches before leaving the context
        # manager; its __exit__ calls terminate(), which would otherwise
        # kill still-running workers mid-batch
        for task in dispatched_tasks:
            task.wait()

    # process the remainder that didn't fill a whole batch
    dispatch_batch(selected_files)
    stderr.flush()

    print(
        f"\n\nTested against {counter} *randomly* selected out of {all_counter} "
        "documents."
        f"\n{LOCATIONS_PATHS_SAMPLES_PERCENT}% of the tag nodes' `location_path` "
        f"attribute were verified per document."
    )


if __name__ == "__main__":
    main()
12 changes: 8 additions & 4 deletions integration-tests/test-parse-serialize-equality.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from _delb.plugins.core_loaders import path_loader
from delb import compare_trees, Document, FailedDocumentLoading, ParserOptions

from utils import indicate_progress


BATCH_SIZE: Final = 64
CPU_COUNT: Final = mp.cpu_count()
Expand All @@ -26,7 +28,8 @@ def parse_serialize_compare(file: Path):

try:
document = Document(
file, parser_options=ParserOptions(collapse_whitespace=False)
file,
parser_options=ParserOptions(collapse_whitespace=False, unplugged=True),
)
except FailedDocumentLoading as exc:
print(
Expand Down Expand Up @@ -57,22 +60,23 @@ def parse_serialize_compare(file: Path):
# TODO? compare with lxml as well
else:
result_file.unlink()
stderr.write("✓")
indicate_progress()


def dispatch_batch(files_list: list[Path]):
    """Run the parse/serialize comparison for every file of a batch.

    Unexpected exceptions are printed instead of propagated so that one
    broken document doesn't abort the remaining files.
    """
    for file in files_list:
        try:
            parse_serialize_compare(file)
        except Exception as error:
            print(f"\nUnhandled exception while testing {file}: {error}")


def main():
counter = 0
mp.set_start_method("forkserver")

counter = 0
dispatched_tasks = []

with mp.Pool(CPU_COUNT) as pool:
for file_list in batched(CORPORA_PATH.rglob("*.xml"), n=BATCH_SIZE):
dispatched_tasks.append(
Expand Down
10 changes: 10 additions & 0 deletions integration-tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import random
from sys import stderr
from typing import Final

# Neukölln's digest
PROGRESS_INDICATION_CHARCATERS: Final = "✓→🚴✊★☆⯪𓄁𓅯▶️✴️🪇⚒️🧻🚬🗿🎳⏳🌝☕🐑🐞🌼🪱🌸🏵💮️"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the i think fairly standard font i use for code doesn't have a lot of those charcaters...



def indicate_progress():
    """Write one randomly picked progress glyph to stderr."""
    glyph = random.choice(PROGRESS_INDICATION_CHARCATERS)
    stderr.write(glyph)
11 changes: 11 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -192,3 +192,14 @@ coverage-report = """
--cov=_delb --cov=delb \
tests
"""

[tool.hatch.envs.integration-tests]
dependencies = [
"pytest-httpx",
"tenacity",
"tqdm",
]
[tool.hatch.envs.integration-tests.scripts]
fetch = "python integration-tests/fetch-corpora.py"
normalize = "python integration-tests/normalize-corpora.py"
test = "python integration-tests/test-*"