Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
121 commits
Select commit Hold shift + click to select a range
6f76462
Add pycache to gitignore
Bill-hbrhbr Jul 10, 2025
6fa551a
Preemptively update docs
Bill-hbrhbr Jul 10, 2025
01fba04
Add project setup
Bill-hbrhbr Jul 10, 2025
4ac0005
Add integration tests to python linting
Bill-hbrhbr Jul 10, 2025
6f76924
Add sample tests and task workflow to run them
Bill-hbrhbr Jul 10, 2025
6c6701d
Big update
Bill-hbrhbr Jul 11, 2025
c659eb4
Add clp-s test code
Bill-hbrhbr Jul 13, 2025
dc62fd4
Package restructure
Bill-hbrhbr Jul 14, 2025
ded7d4c
Complete clp-s testing (with bug)
Bill-hbrhbr Jul 14, 2025
6880a0f
Make clp-s test workable with keys and rows sorting
Bill-hbrhbr Jul 14, 2025
c24cdaf
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Jul 21, 2025
9587f99
Address some review comments
Bill-hbrhbr Jul 24, 2025
6d73f31
Address more review comments
Bill-hbrhbr Jul 24, 2025
09fc85c
turn download and extract fixture into a private helper function
Bill-hbrhbr Jul 24, 2025
6216e68
UNcomment larget datasets
Bill-hbrhbr Jul 24, 2025
3a6753b
Apply suggestions from code review
Bill-hbrhbr Jul 24, 2025
ac5a9d8
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Jul 24, 2025
734ce73
Move download dir and other attributes inside dataset_logs fixture
Bill-hbrhbr Jul 28, 2025
777a0d3
Make json compare into a helper function
Bill-hbrhbr Jul 28, 2025
5dbe1fc
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Jul 28, 2025
97d881c
Add package basic validity check
Bill-hbrhbr Jul 28, 2025
55a4b21
remove dup class def
Bill-hbrhbr Jul 28, 2025
4aadaae
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Jul 31, 2025
3acbf02
Address review comments
Bill-hbrhbr Aug 1, 2025
be64383
remove unnecessary class param customizations
Bill-hbrhbr Aug 1, 2025
6f83578
Add back missing dataset tests
Bill-hbrhbr Aug 1, 2025
cb0d282
Update dev utils
Bill-hbrhbr Aug 1, 2025
57cf51b
Rename fixtures and fields according to offline discussions
Bill-hbrhbr Aug 4, 2025
bdbe7d2
Address most review comments
Bill-hbrhbr Aug 5, 2025
9c92e93
Furthur renaming from package_config to test_config
Bill-hbrhbr Aug 5, 2025
0857b49
Use __post_init__ to improve dataclass design
Bill-hbrhbr Aug 5, 2025
c035d9d
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 6, 2025
e747bfc
Update integration-tests/tests/test_identity_transformation.py
Bill-hbrhbr Aug 6, 2025
1b10e01
Rename to avoid classes starting with Test
Bill-hbrhbr Aug 8, 2025
61909bd
Address review comment
Bill-hbrhbr Aug 8, 2025
54b26a9
Uncomment tests
Bill-hbrhbr Aug 8, 2025
2922b47
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 8, 2025
14b0141
Split out package config
Bill-hbrhbr Aug 8, 2025
709d9c8
Apply suggestions from code review
Bill-hbrhbr Aug 9, 2025
1a33927
Use singular term for name
Bill-hbrhbr Aug 9, 2025
b047457
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 9, 2025
f0c5a8f
abbreviate var
Bill-hbrhbr Aug 9, 2025
9ca7500
Revert test_log_name singular noun change
Bill-hbrhbr Aug 9, 2025
28119ea
Apply suggestions from code review
Bill-hbrhbr Aug 10, 2025
0cedb12
Lint fix
Bill-hbrhbr Aug 10, 2025
3508514
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 13, 2025
0378bed
Address most review comments
Bill-hbrhbr Aug 16, 2025
64dbb72
Test using CLP core bins instead of package
Bill-hbrhbr Aug 16, 2025
5ae418c
Rename tasks
Bill-hbrhbr Aug 16, 2025
812450c
Add uv requirement to core building
Bill-hbrhbr Aug 16, 2025
011d073
Add README shell script lang hint
Bill-hbrhbr Aug 16, 2025
fce6489
Apply suggestions from code review
Bill-hbrhbr Aug 16, 2025
5303ec9
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 16, 2025
9b40d16
Address coderabbit ai copmments
Bill-hbrhbr Aug 16, 2025
53a6768
Add helper for validating directory exists
Bill-hbrhbr Aug 16, 2025
29c2ded
Add docstring for dataclasses
Bill-hbrhbr Aug 17, 2025
85c37f2
Add docstrings for test utils and improve functions
Bill-hbrhbr Aug 17, 2025
37ffcac
Lint fix
Bill-hbrhbr Aug 17, 2025
35f1690
Add mypy and ruff linters
Bill-hbrhbr Aug 17, 2025
95b6886
ruff lint
Bill-hbrhbr Aug 17, 2025
0a68983
Add missing __init__ files
Bill-hbrhbr Aug 17, 2025
e4f4891
Pass mypy test
Bill-hbrhbr Aug 17, 2025
ca43014
Disable warning about assert
Bill-hbrhbr Aug 17, 2025
ebbc541
Add mypy taskflow and fix all ruff complaints
Bill-hbrhbr Aug 17, 2025
baa6e9d
Update integration-tests/.pytest.ini
Bill-hbrhbr Aug 17, 2025
d22c534
Address coderabbit comment
Bill-hbrhbr Aug 17, 2025
9833069
Lint fix
Bill-hbrhbr Aug 17, 2025
7c818d4
logic fix
Bill-hbrhbr Aug 17, 2025
37c0e23
Improve docstrings
Bill-hbrhbr Aug 17, 2025
c608df6
Add linting section to README
Bill-hbrhbr Aug 17, 2025
1cae84e
Apply suggestions from code review
Bill-hbrhbr Aug 17, 2025
1aa50e6
Add yoda-condition check skips
Bill-hbrhbr Aug 17, 2025
d49cedd
Update integration-tests/README.md
Bill-hbrhbr Aug 17, 2025
5ea3289
Space out README code section
Bill-hbrhbr Aug 17, 2025
8f81696
Update integration-tests/pyproject.toml
Bill-hbrhbr Aug 17, 2025
c5ad284
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 17, 2025
1d63ccc
Make use of python class property
Bill-hbrhbr Aug 17, 2025
4139341
Improve taskfile
Bill-hbrhbr Aug 17, 2025
d5b7f76
Fix tab spaces
Bill-hbrhbr Aug 17, 2025
9f30065
Create taskfile python linting for projects using uv
Bill-hbrhbr Aug 17, 2025
6648734
satisfy yaml linter
Bill-hbrhbr Aug 17, 2025
5b219fb
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 18, 2025
09fbd51
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 18, 2025
6da26a1
Refactor docs.
kirkrodrigues Aug 18, 2025
789f4d1
Apply suggestions from code review
Bill-hbrhbr Aug 20, 2025
d7619cb
Address review concern
Bill-hbrhbr Aug 20, 2025
c8bd52f
Apply suggestions from code review
Bill-hbrhbr Aug 20, 2025
8400e01
Address review comments
Bill-hbrhbr Aug 20, 2025
95d19ba
Update integration-tests/tests/utils/config.py
Bill-hbrhbr Aug 20, 2025
bc4e20b
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 20, 2025
9f022f8
Make integration task depend on the whole package
Bill-hbrhbr Aug 20, 2025
aaba2f1
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Aug 24, 2025
cbd0e8e
Apply suggestions from code review
Bill-hbrhbr Sep 2, 2025
a4382d6
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 2, 2025
b53c04a
Rename assert_utils to asserting_utils
Bill-hbrhbr Sep 2, 2025
73dd3d1
Change validate to validates in docstring start
Bill-hbrhbr Sep 2, 2025
9d280d7
abbreviate validate_dir_exists
Bill-hbrhbr Sep 2, 2025
0de00bb
localize integration tests taskfile vars
Bill-hbrhbr Sep 2, 2025
2f2c97f
Add back missing asserting_utils.py
Bill-hbrhbr Sep 2, 2025
bafa272
Update docs/src/dev-docs/index.md
Bill-hbrhbr Sep 2, 2025
98b228d
lint fix
Bill-hbrhbr Sep 2, 2025
eaf04e4
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 3, 2025
51ec20c
Remove unrelated changes
Bill-hbrhbr Sep 3, 2025
fa81d00
Update docs/src/dev-docs/index.md
Bill-hbrhbr Sep 3, 2025
7ca37da
Fix docs.
kirkrodrigues Sep 4, 2025
9365193
Apply Rabbit's suggestion.
kirkrodrigues Sep 8, 2025
b641bd9
Alphabetize .gitignore.
kirkrodrigues Sep 8, 2025
c2073cd
Update integration-tests/README.md
kirkrodrigues Sep 8, 2025
1fcc997
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 10, 2025
fb274d1
Address review comments
Bill-hbrhbr Sep 10, 2025
4840360
typo fix
Bill-hbrhbr Sep 10, 2025
4f3bb75
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 10, 2025
66f61a7
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 11, 2025
47ccb1c
Move python linting checks specific for unit tests into their own cat…
Bill-hbrhbr Sep 11, 2025
37f59bb
Apply suggestions from code review
Bill-hbrhbr Sep 14, 2025
ba35f88
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 14, 2025
8c98e4d
Address review comment
Bill-hbrhbr Sep 14, 2025
60501f1
lint fix
Bill-hbrhbr Sep 14, 2025
31e69e3
use shutil to find chmod binary
Bill-hbrhbr Sep 14, 2025
f424f21
use shutil to find the curl executable
Bill-hbrhbr Sep 15, 2025
31da58e
Merge branch 'main' into integration-tests-boilerplate
Bill-hbrhbr Sep 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
.task/
build/
**/dist/
**/__pycache__/
10 changes: 10 additions & 0 deletions docs/src/dev-guide/building-package.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ prebuilt version instead, check out the [releases](https://github.com/y-scope/cl
* python3-venv (for the version of Python installed)
* [Task] >= 3.38.0 and < 3.43.0
* We constrain the version due to unresolved [issues][clp-issue-872].
* [uv] 0.7.19 or newer

## Setup

Expand Down Expand Up @@ -61,6 +62,14 @@ where `<flavour>` is `json` or `text`.
The tar will be written to `build/clp-<flavour>-<os>-<arch>-v<version>.tar.gz`, with appropriate
values for the fields in angle brackets.

## Test

To test the package, run:

```shell
task test-package
```

## Cleanup

To clean up all build artifacts, run:
Expand All @@ -71,3 +80,4 @@ task clean

[clp-issue-872]: https://github.com/y-scope/clp/issues/872
[Task]: https://taskfile.dev/
[uv]: https://docs.astral.sh/uv/
15 changes: 15 additions & 0 deletions integration-tests/.pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[pytest]
addopts =
--strict-config
--strict-markers
--capture=no
--verbose
--color=yes
--code-highlight=yes
env =
D:CLP_BUILD_DIR=../build
D:CLP_PACKAGE_DIR=../build/clp-package
markers =
binaries: mark tests that directly call the binaries in the package bin
clp: mark tests that use the CLP storage engine
clp_s: mark tests that use the CLP-S storage engine
1 change: 1 addition & 0 deletions integration-tests/.python-version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
3.10
Empty file added integration-tests/README.md
Empty file.
23 changes: 23 additions & 0 deletions integration-tests/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[project]
name = "integration-tests"
version = "0.1.0"
description = "Integration tests for the CLP project."
readme = "README.md"
authors = [
{ name = "YScope Inc.", email = "dev@yscope.com" }
]
requires-python = ">=3.10"

[project.scripts]
integration-tests = "integration_tests:main"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[dependency-groups]
dev = [
"pytest>=8.3.5",
"pytest-benchmark>=5.1.0",
"pytest-env>=1.1.5",
]
2 changes: 2 additions & 0 deletions integration-tests/src/integration_tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def main() -> None:
    """Print the placeholder greeting for the `integration-tests` console script."""
    greeting = "Hello from integration-tests!"
    print(greeting)
7 changes: 7 additions & 0 deletions integration-tests/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from tests.fixtures.base_config import base_config
from tests.fixtures.dataset_logs import (
download_and_extract_dataset,
hive_24hr,
postgresql,
spark_event_logs,
)
22 changes: 22 additions & 0 deletions integration-tests/tests/fixtures/base_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from pathlib import Path

import pytest
from tests.utils.config import BaseConfig
from tests.utils.utils import get_env_var


@pytest.fixture(scope="session")
def base_config() -> BaseConfig:
    """
    Builds the session-scoped `BaseConfig` from the `CLP_BUILD_DIR` and `CLP_PACKAGE_DIR`
    environment variables and makes sure the test output and uncompressed-logs directories exist.

    :return: The populated configuration.
    """
    build_root = Path(get_env_var("CLP_BUILD_DIR")).resolve()
    package_root = Path(get_env_var("CLP_PACKAGE_DIR")).resolve()
    var_dir = build_root / "var"

    config = BaseConfig(
        clp_bin_dir=package_root / "bin",
        clp_package_dir=package_root,
        clp_sbin_dir=package_root / "sbin",
        test_output_dir=var_dir / "logs" / "pytest",
        uncompressed_logs_dir=var_dir / "data" / "pytest" / "downloads",
    )

    # Tests write into these directories, so create them up front.
    for required_dir in (config.test_output_dir, config.uncompressed_logs_dir):
        required_dir.mkdir(parents=True, exist_ok=True)
    return config
71 changes: 71 additions & 0 deletions integration-tests/tests/fixtures/dataset_logs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import shutil

import pytest
from tests.utils.config import (
BaseConfig,
DatasetLogs,
)
from tests.utils.utils import run_and_assert


@pytest.fixture(scope="session")
def hive_24hr() -> DatasetLogs:
    """Describes the `hive-24hr` dataset and the URL of its tarball."""
    dataset = DatasetLogs(
        name="hive-24hr",
        tar_url="https://zenodo.org/records/7094921/files/hive-24hr.tar.gz?download=1",
    )
    return dataset


@pytest.fixture(scope="session")
def elasticsearch() -> DatasetLogs:
    """Describes the `elasticsearch` dataset and the URL of its tarball."""
    dataset = DatasetLogs(
        name="elasticsearch",
        tar_url="https://zenodo.org/records/10516227/files/elasticsearch.tar.gz?download=1",
    )
    return dataset


@pytest.fixture(scope="session")
def spark_event_logs() -> DatasetLogs:
    """Describes the `spark-event-logs` dataset and the URL of its tarball."""
    dataset = DatasetLogs(
        name="spark-event-logs",
        tar_url="https://zenodo.org/records/10516346/files/spark-event-logs.tar.gz?download=1",
    )
    return dataset


@pytest.fixture(scope="session")
def postgresql() -> DatasetLogs:
    """Describes the `postgresql` dataset and the URL of its tarball."""
    dataset = DatasetLogs(
        name="postgresql",
        tar_url="https://zenodo.org/records/10516402/files/postgresql.tar.gz?download=1",
    )
    return dataset


@pytest.fixture(autouse=True)
def download_and_extract_dataset(request, base_config: BaseConfig) -> DatasetLogs:
    """
    Downloads and extracts the dataset selected through indirect parametrization.

    `request.param` must be the name of a fixture that returns a `DatasetLogs`. The tarball is
    downloaded with `curl` into `base_config.uncompressed_logs_dir` and unpacked into a
    subdirectory named after the dataset. A flag in the pytest cache records a successful
    download so that later runs can skip it.

    :param request: The pytest fixture request; provides the parameter and the cache.
    :param base_config: Supplies the directory that holds the downloaded datasets.
    :return: The `DatasetLogs` of the requested dataset.
    :raise AssertionError: If the download command or the extraction fails.
    """
    dataset_config = request.getfixturevalue(request.param)
    dataset_name = dataset_config.name
    download_path = base_config.uncompressed_logs_dir / f"{dataset_name}.tar.gz"
    extract_path = base_config.uncompressed_logs_dir / dataset_name

    # The cache flag is stored separately from the extracted logs, so only trust it while the
    # extracted directory is still on disk.
    if request.config.cache.get(dataset_name, False) and extract_path.is_dir():
        print(f"Uncompressed logs for dataset `{dataset_name}` is up-to-date.")
        return dataset_config

    # fmt: off
    cmds = [
        "curl",
        "--fail",
        "--location",
        "--output", str(download_path),
        "--show-error",
        dataset_config.tar_url,
    ]
    # fmt: on
    run_and_assert(cmds)

    try:
        shutil.unpack_archive(str(download_path), str(extract_path))
    except Exception as err:
        # Catch `Exception` rather than everything so that Ctrl-C and `SystemExit` still
        # propagate, and chain the cause so the real extraction error is visible.
        raise AssertionError(
            f"Tar extraction failed for downloaded dataset `{dataset_name}`."
        ) from err

    request.config.cache.set(dataset_name, True)
    return dataset_config
122 changes: 122 additions & 0 deletions integration-tests/tests/test_identity_transformation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
import shutil
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import IO

import pytest
from tests.utils.config import (
BaseConfig,
DatasetLogs,
)
from tests.utils.utils import (
diff_equal,
run_and_assert,
)

# Tag every test in this module with the `binaries` marker.
pytestmark = pytest.mark.binaries

# Indirect parametrization of the `download_and_extract_dataset` fixture: each entry is the name
# of a dataset fixture. This set is applied to the `clp` test below.
text_datasets = pytest.mark.parametrize(
    "download_and_extract_dataset",
    [
        "hive_24hr",
    ],
    indirect=["download_and_extract_dataset"],
)

# Same indirect parametrization, with the dataset fixtures applied to the `clp-s` test below.
json_datasets = pytest.mark.parametrize(
    "download_and_extract_dataset",
    [
        "spark_event_logs",
        "postgresql",
    ],
    indirect=["download_and_extract_dataset"],
)


@pytest.mark.clp
@text_datasets
def test_clp_identity_transform(
    request, base_config: BaseConfig, download_and_extract_dataset: DatasetLogs
) -> None:
    """
    Compresses a dataset with `clp`, decompresses the resulting archives, and checks that the
    decompressed logs match the downloaded ones.

    :param request: The pytest fixture request (unused by the test body).
    :param base_config: Supplies the binary, download and output directories.
    :param download_and_extract_dataset: The dataset under test.
    """
    name = download_and_extract_dataset.name
    clp_bin = str(base_config.clp_bin_dir / "clp")
    original_logs_dir = base_config.uncompressed_logs_dir / name
    output_root = base_config.test_output_dir
    archives_dir = output_root / f"{name}-archives"
    extract_dir = output_root / f"{name}-logs"
    scratch_dirs = (archives_dir, extract_dir)

    # Start from a clean slate in case an earlier run left output behind.
    for scratch_dir in scratch_dirs:
        shutil.rmtree(scratch_dir, ignore_errors=True)

    # fmt: off
    compress = [
        clp_bin,
        "c",
        "--progress",
        "--remove-path-prefix", str(original_logs_dir),
        str(archives_dir),
        str(original_logs_dir),
    ]
    # fmt: on
    decompress = [clp_bin, "x", str(archives_dir), str(extract_dir)]
    run_and_assert(compress)
    run_and_assert(decompress)

    diff_equal(original_logs_dir, extract_dir)

    for scratch_dir in scratch_dirs:
        shutil.rmtree(scratch_dir, ignore_errors=True)


@pytest.mark.clp_s
@json_datasets
def test_clp_s_identity_transform(
    request, base_config: BaseConfig, download_and_extract_dataset: DatasetLogs
) -> None:
    """
    Compresses a dataset with `clp-s`, decompresses it, then recompresses and decompresses the
    decompressed output, and checks that both decompressed outputs match once keys and rows are
    sorted.

    :param request: The pytest fixture request (unused by the test body).
    :param base_config: Supplies the binary, download and output directories.
    :param download_and_extract_dataset: The dataset under test.
    """
    binary_path_str = str(base_config.clp_bin_dir / "clp-s")
    dataset_name = download_and_extract_dataset.name
    download_dir = base_config.uncompressed_logs_dir / dataset_name
    archives_dir = base_config.test_output_dir / f"{dataset_name}-archives"
    extract_dir = base_config.test_output_dir / f"{dataset_name}-logs"

    shutil.rmtree(archives_dir, ignore_errors=True)
    shutil.rmtree(extract_dir, ignore_errors=True)

    run_and_assert([binary_path_str, "c", str(archives_dir), str(download_dir)])
    run_and_assert([binary_path_str, "x", str(archives_dir), str(extract_dir)])

    # Recompress the decompressed single-file output and decompress it again to verify consistency.
    # TODO: Remove this check once we can directly compare decompressed logs (which would preserve
    # the directory structure and row/key order) with the original downloaded logs.
    # See also: https://docs.yscope.com/clp/main/user-guide/core-clp-s.html#current-limitations
    single_file_archives_dir = base_config.test_output_dir / f"{dataset_name}-single-file-archives"
    single_file_extract_dir = base_config.test_output_dir / f"{dataset_name}-single-file-logs"

    shutil.rmtree(single_file_archives_dir, ignore_errors=True)
    shutil.rmtree(single_file_extract_dir, ignore_errors=True)

    # Stringify the paths so that every command element is a `str`, matching the invocations
    # above and the `List[str]` signature of `run_and_assert`.
    run_and_assert([binary_path_str, "c", str(single_file_archives_dir), str(extract_dir)])
    run_and_assert(
        [binary_path_str, "x", str(single_file_archives_dir), str(single_file_extract_dir)]
    )

    # Key and row orders are not preserved during `clp-s` operations, so sort before diffing.
    with _sort_json_keys_and_rows(extract_dir / "original") as s1, _sort_json_keys_and_rows(
        single_file_extract_dir / "original"
    ) as s2:
        diff_equal(s1.name, s2.name)

    shutil.rmtree(archives_dir, ignore_errors=True)
    shutil.rmtree(extract_dir, ignore_errors=True)
    shutil.rmtree(single_file_archives_dir, ignore_errors=True)
    shutil.rmtree(single_file_extract_dir, ignore_errors=True)


def _sort_json_keys_and_rows(json_fp: Path) -> IO[str]:
    """
    Writes a normalized copy of a JSON file to a temporary file so that two files can be diffed
    regardless of key or row order.

    `jq` rewrites every JSON value with its keys sorted and on a single line, and `sort` then
    orders those lines.

    :param json_fp: Path of the JSON file to normalize.
    :return: The open, text-mode temporary file holding the normalized content. It is deleted
        when closed, so the caller must close it (e.g. by using it as a context manager).
    :raise AssertionError: If `jq` or `sort` exits with a non-zero code.
    """
    keys_and_rows_sorted = NamedTemporaryFile(mode="w+", delete=True)
    try:
        with NamedTemporaryFile(mode="w+", delete=True) as keys_sorted:
            # `--compact-output` keeps each row on one line; without it `sort` would shuffle
            # fragments of pretty-printed rows rather than whole rows.
            run_and_assert(
                ["jq", "--sort-keys", "--compact-output", ".", str(json_fp)],
                stdout=keys_sorted,
            )
            keys_sorted.flush()
            run_and_assert(["sort", keys_sorted.name], stdout=keys_and_rows_sorted)
        keys_and_rows_sorted.flush()
    except BaseException:
        # Don't leak the output file when a command fails before it is handed to the caller.
        keys_and_rows_sorted.close()
        raise
    return keys_and_rows_sorted
17 changes: 17 additions & 0 deletions integration-tests/tests/utils/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class BaseConfig:
    """Immutable set of directory paths shared by the integration tests."""

    # Directory containing the `clp` and `clp-s` binaries that the tests invoke.
    clp_bin_dir: Path
    # Root directory of the CLP package.
    clp_package_dir: Path
    # The package's `sbin` directory.
    clp_sbin_dir: Path
    # Directory where tests write archives and decompressed logs.
    test_output_dir: Path
    # Directory where datasets are downloaded and extracted.
    uncompressed_logs_dir: Path


@dataclass(frozen=True)
class DatasetLogs:
    """Immutable description of a downloadable log dataset."""

    # Dataset name; also used to name the downloaded tarball and its extraction directory.
    name: str
    # URL from which the dataset's tarball is downloaded.
    tar_url: str
25 changes: 25 additions & 0 deletions integration-tests/tests/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import os
import subprocess
from pathlib import Path
from typing import List


def diff_equal(path1: Path, path2: Path) -> None:
    """
    Asserts that two files or directory trees are identical according to
    `diff --brief --recursive`.

    :param path1: First file or directory.
    :param path2: Second file or directory.
    :raise AssertionError: If the contents differ (`diff` exits with 1) or `diff` itself fails
        (any other non-zero exit code).
    """
    cmd = ["diff", "--brief", "--recursive", str(path1), str(path2)]
    # Capture stdout so that the list of differing files doesn't clutter the test output.
    proc = subprocess.run(cmd, stdout=subprocess.PIPE)
    if proc.returncode == 0:
        return

    # Raise explicitly instead of `assert False`, which is stripped under `python -O` and would
    # turn a mismatch into a silent pass.
    if proc.returncode == 1:
        raise AssertionError("Files/Directories don't match.")
    raise AssertionError(f"Command failed {' '.join(cmd)}")


def get_env_var(var_name: str) -> str:
    """
    Looks up an environment variable that must be set.

    :param var_name: Name of the environment variable.
    :return: The variable's value (possibly an empty string).
    :raise AssertionError: If the variable is not set.
    """
    env_value = os.getenv(var_name)
    missing_msg = f"Environment variable {var_name} is not set."
    assert env_value is not None, missing_msg
    return env_value


def run_and_assert(cmd: List[str], **kwargs) -> subprocess.CompletedProcess:
    """
    Runs a command and asserts that it exits with code 0.

    :param cmd: The command and its arguments.
    :param kwargs: Extra keyword arguments forwarded to `subprocess.run`.
    :return: The completed process.
    :raise AssertionError: If the command exits with a non-zero code.
    """
    proc = subprocess.run(cmd, **kwargs)
    if proc.returncode != 0:
        # Stringify each argument so that path-like arguments can't turn the failure report
        # into a `TypeError`. Raise explicitly so the check also holds under `python -O`.
        raise AssertionError(f"Command failed: {' '.join(str(arg) for arg in cmd)}")
    return proc
Loading
Loading