feat(docker-images): Decouple image build flow from benchmark runs and add checksum-based rebuild detection. #17
@@ -7,34 +7,44 @@ Initialize and update submodules:

```
git submodule update --init --recursive
```

## Download Datasets

You can download all the datasets we use in the benchmark using the [download_all.py](/scripts/download_all.py) script we provide.

The [download_all.py](/scripts/download_all.py) script downloads all datasets into the correct directories **with** the specified names, concatenates multi-file datasets into a single file, and generates any modified versions of the datasets needed for tools like Presto + CLP.
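As a sketch (the precise invocation and options are defined by the script itself), a full download from the repository root might look like:

```shell
# Assumed invocation; run from the repository root so the datasets land under data/.
python3 scripts/download_all.py
```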
## Docker Environments

Benchmarks are executed inside Docker containers to ensure reproducible, isolated test environments with controlled resource limits.

### Build All Docker Images

To build all Docker images concurrently:

```shell
task docker-images:build
```
### Build a Single Docker Image

To build a specific image for a given engine:

```shell
uv run src/log_archival_bench/scripts/docker_images/build.py --engine-name <engine_name>
```

Each image corresponds to a specific engine (e.g. `clp`, `clickhouse`, `elasticsearch`, `sparksql`).
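The script also accepts an optional `--dump-config-path` flag (see `build.py` in this PR). An illustrative invocation for the `clp` engine, with an arbitrary output path:

```shell
# Build the image for the `clp` engine and save its `docker inspect` output.
# The output path below is only an example; choose any writable location.
uv run src/log_archival_bench/scripts/docker_images/build.py \
    --engine-name clp \
    --dump-config-path build/docker-images/clp/docker_inspect.json
```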
## Run Everything

Follow the instructions above to set up your virtual environment.

Stay in the [Log Archival Bench](/) directory and run [scripts/benchall.py](/scripts/benchall.py). This script runs the tools + parameters in its "benchmarks" variable across all datasets under [data/](/data).

## Run One Engine

Execute `./assets/{engine_name}/main.py {path to <dataset_name>.log}` to run ingestion and search on that dataset.
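For instance, a run of the `clp` engine against one downloaded dataset (the filename is a placeholder) would be:

```shell
# <dataset_name> is a placeholder for a log file downloaded under data/.
./assets/clp/main.py data/<dataset_name>.log
```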
## Contributing
@@ -0,0 +1 @@
"""Scripts for docker-container-related tasks."""
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""Build a Docker image and optionally dump its configuration as JSON."""

import argparse
import subprocess
import sys
from pathlib import Path

from log_archival_bench.scripts.docker_images.utils import get_image_name
from log_archival_bench.utils.path_utils import (
    get_config_dir,
    get_package_root,
    which,
)


def main(argv: list[str]) -> int:
    """
    Build a Docker image and optionally dump its configuration as JSON.

    :param argv: Command-line arguments, where `argv[0]` is the program name.
    :return: 0 on success, non-zero error code on failure.
    """
    args_parser = argparse.ArgumentParser()
    args_parser.add_argument(
        "--engine-name", required=True, help="The engine to be installed inside the Docker image."
    )
    args_parser.add_argument(
        "--dump-config-path", help="Path to the file to dump the Docker image JSON metadata."
    )

    parsed_args = args_parser.parse_args(argv[1:])
    engine_name = parsed_args.engine_name
    dump_config_path = parsed_args.dump_config_path

    image_name = get_image_name(engine_name)

    docker_file_path = get_config_dir() / "docker-images" / engine_name / "Dockerfile"
    if not docker_file_path.is_file():
        err_msg = f"Dockerfile for `{engine_name}` does not exist."
        raise RuntimeError(err_msg)

    docker_bin = which("docker")
    # fmt: off
    build_cmds = [
        docker_bin,
        "build",
        "--tag", image_name,
        str(get_package_root()),
        "--file", str(docker_file_path),
    ]
    # fmt: on
    subprocess.run(build_cmds, check=True)

    if dump_config_path is not None:
        output_path = Path(dump_config_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("w", encoding="utf-8") as f:
            dump_cmds = [
                docker_bin,
                "inspect",
                "--type=image",
                image_name,
            ]
            subprocess.run(dump_cmds, check=True, stdout=f)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
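For reference, for the `clp` engine the script assembles a command roughly equivalent to the following (illustrative only; `<package_root>` stands for the installed `log_archival_bench` package directory, and the tag suffix depends on `$USER`, per `get_image_name()` in the utils module below):

```shell
docker build \
    --tag "log-archival-bench-clp-ubuntu-jammy:dev-${USER}" \
    --file "<package_root>/config/docker-images/clp/Dockerfile" \
    "<package_root>"
```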
@@ -0,0 +1,12 @@
"""Shared helpers for Docker image scripts."""

import os


def get_image_name(engine_name: str) -> str:
    """
    :param engine_name: The service engine inside the Docker image.
    :return: The Docker image name.
    """
    user = os.getenv("USER", "clp-user")
    return f"log-archival-bench-{engine_name}-ubuntu-jammy:dev-{user}"
@@ -0,0 +1 @@
"""Scripts providing general python utilities for the project."""
@@ -0,0 +1,41 @@
"""Helpers for getting paths and determining package layouts."""

import shutil
from pathlib import Path

import log_archival_bench

_PACKAGE_ROOT = Path(log_archival_bench.__file__).parent

_BUILD_DIR = _PACKAGE_ROOT / "build"
_CONFIG_DIR = _PACKAGE_ROOT / "config"


def get_package_root() -> Path:
    """:return: The path to the `log_archival_bench` package root."""
    return _PACKAGE_ROOT


def get_build_dir() -> Path:
    """:return: The path to the build output directory."""
    return _BUILD_DIR


def get_config_dir() -> Path:
    """:return: The path to the project config directory."""
    return _CONFIG_DIR


def which(binary_name: str) -> str:
    """
    Locate the full path of an executable.

    :param binary_name: Name of the binary to search for.
    :return: Full path to the executable as a string.
    :raises RuntimeError: If the binary is not found in PATH.
    """
    bin_path = shutil.which(binary_name)
    if bin_path is None:
        err_msg = f"Executable for {binary_name} is not found."
        raise RuntimeError(err_msg)
    return bin_path
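A minimal usage sketch of these helpers, assuming the `log_archival_bench` package is importable (e.g. after `uv sync`):

```python
# Resolve the docker binary and the Dockerfile path the build script would use.
from log_archival_bench.utils.path_utils import get_config_dir, which

docker_bin = which("docker")  # Raises RuntimeError if docker is not on PATH.
dockerfile = get_config_dir() / "docker-images" / "clp" / "Dockerfile"
print(docker_bin, dockerfile)
```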
@@ -0,0 +1,76 @@
version: "3"

includes:
  utils:
    internal: true
    taskfile: "../../tools/yscope-dev-utils/exports/taskfiles/utils/utils.yaml"

vars:
  G_DOCKER_IMAGE_CHECKSUMS_DIR: "{{.G_OUTPUT_DIR}}/docker-images/checksums"
  G_DOCKER_IMAGE_SCRIPT_DIR: "{{.G_PROJECT_SRC_DIR}}/scripts/docker_images"

  G_DOCKER_IMAGE_ENGINES:
    - "clickhouse"
    - "clp"
    - "elasticsearch"
    #- "presto_clp"
    #- "presto_parquet"
    - "sparksql"
    - "zstandard"

tasks:
  build:
    deps:
      - task: ":init"
      - task: "init"
    cmds:
      - task: "build-all-parallel"

  init:
    internal: true
    silent: true
    run: "once"
    cmds:
      - "mkdir -p '{{.G_DOCKER_IMAGE_CHECKSUMS_DIR}}'"

  build-all-parallel:
    internal: true
    run: "once"
    deps:
      - for:
          var: "G_DOCKER_IMAGE_ENGINES"
        task: "build-single-image"
        vars:
          ENGINE_NAME: "{{.ITEM}}"

  build-single-image:
    internal: true
    label: "{{.TASK}}:{{.ENGINE_NAME}}"
    vars:
      CHECKSUM_FILE: "{{.G_DOCKER_IMAGE_CHECKSUMS_DIR}}/{{.ENGINE_NAME}}.md5"
      OUTPUT_DIR: "{{.G_DOCKER_IMAGE_CHECKSUMS_DIR}}/{{.ENGINE_NAME}}"
    requires:
      vars: ["ENGINE_NAME"]
    sources:
      - "{{.G_PROJECT_SRC_DIR}}/config/docker-images/{{.ENGINE_NAME}}/Dockerfile"
      - "{{.G_PROJECT_SRC_DIR}}/scripts/docker_images/build.py"
      - "{{.G_PROJECT_SRC_DIR}}/scripts/docker_images/utils.py"
      - "{{.TASKFILE}}"
    generates: ["{{.CHECKSUM_FILE}}"]
    deps:
      - task: "utils:checksum:validate"
        vars:
          CHECKSUM_FILE: "{{.CHECKSUM_FILE}}"
          INCLUDE_PATTERNS: ["{{.OUTPUT_DIR}}"]
    cmds:
      - |-
        mkdir -p {{.OUTPUT_DIR}}
        uv run {{.G_DOCKER_IMAGE_SCRIPT_DIR}}/build.py \
          --engine-name {{.ENGINE_NAME}} \
          --dump-config-path "{{.OUTPUT_DIR}}/docker_inspect.json"
      # This command must be last
      - task: "utils:checksum:compute"
        vars:
          CHECKSUM_FILE: "{{.CHECKSUM_FILE}}"
          INCLUDE_PATTERNS: ["{{.OUTPUT_DIR}}"]
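The `sources`/`generates` pairing, together with the checksum tasks from yscope-dev-utils, provides the checksum-based rebuild detection: `build-single-image` is re-run only when the engine's Dockerfile, the build scripts, or this Taskfile change. As a sketch (the exact output directory depends on `G_OUTPUT_DIR`, which is defined outside this file), a rebuild of a single engine can be forced by removing its checksum file:

```shell
# Hypothetical paths: substitute the project's actual output directory.
rm -f "<output-dir>/docker-images/checksums/clp.md5"
task docker-images:build
```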