cmu-db
diff --git a/‎README.md
Lines changed: 86 additions & 1 deletion b/‎README.md
Lines changed: 86 additions & 1 deletion
diff --git a/‎benchmark/tpch/cli.py
Lines changed: 2 additions & 2 deletions b/‎benchmark/tpch/cli.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎dbms/postgres/build_repo.sh
Lines changed: 9 additions & 9 deletions b/‎dbms/postgres/build_repo.sh
Lines changed: 9 additions & 9 deletions
diff --git a/‎dbms/postgres/cli.py
Lines changed: 58 additions & 59 deletions b/‎dbms/postgres/cli.py
Lines changed: 58 additions & 59 deletions
@@ -1 +1,86 @@
-# Database Gym
+# 🛢️ Database Gym 🏋️
+[\[Slides\]](http://www.cidrdb.org/cidr2023/slides/p27-lim-slides.pdf) [\[Paper\]](https://www.cidrdb.org/cidr2023/papers/p27-lim.pdf)
+
+*An end-to-end research vehicle for the field of self-driving DBMSs.*
+
+## Quickstart
+
+These steps were tested on a fresh clone of the repository on Ubuntu 22.04.
+
+```
+# Setup dependencies.
+# You may want to create a Python virtual environment (e.g. with conda) before doing this.
+./dependency/install_dependencies.sh
+
+# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
+./scripts/quickstart.sh postgres tpch 0.01 protox
+```
+
+## Overview
+
+Autonomous DBMS research often involves more engineering than research.
+As new advances in state-of-the-art technology are made, it is common to find that they have
+reimplemented the database tuning pipeline from scratch: workload capture, database setup,
+training data collection, model creation, model deployment, and more.
+Moreover, these bespoke pipelines make it difficult to combine different techniques even when they
+should be independent (e.g., using a different operator latency model in a tuning algorithm).
+
+The database gym project is our attempt at standardizing the APIs between these disparate tasks,
+allowing researchers to mix-and-match the different pipeline components.
+It draws inspiration from the Farama Foundation's Gymnasium (formerly OpenAI Gym), which
+accelerates the development and comparison of reinforcement learning algorithms by providing a set
+of agents, environments, and a standardized API for communicating between them.
+Through the database gym, we hope to save other people time and reimplementation effort by
+providing an extensible open-source platform for autonomous DBMS research.
+
+This project is under active development.
+Currently, we decompose the database tuning pipeline into the following components:
+
+1. Workload: collection, forecasting, synthesis
+2. Database: database loading, instrumentation, orchestrating workload execution
+3. Agent: identifying tuning actions, suggesting an action
+
+## Repository Structure
+
+`task.py` is the entrypoint for all tasks.
+The tasks are grouped into categories that correspond to the top-level directories of the repository:
+
+- `benchmark` - tasks to generate data and queries for different benchmarks (e.g., TPC-H, JOB)
+- `dbms` - tasks to build and start DBMSs (e.g., PostgreSQL)
+- `tune` - tasks to train autonomous database tuning agents
+
+## Credits
+
+The Database Gym project rose from the ashes of the [NoisePage](https://db.cs.cmu.edu/projects/noisepage/) self-driving DBMS project.
+
+The first prototype was written by [Patrick Wang](https://github.com/wangpatrick57), integrating [Boot (VLDB 2024)](https://github.com/lmwnshn/boot) and [Proto-X (VLDB 2024)](https://github.com/17zhangw/protox) into a cohesive system.
+
+## Citing This Repository
+
+If you use this repository in an academic paper, please cite:
+
+```
+@inproceedings{lim23,
+  author = {Lim, Wan Shen and Butrovich, Matthew and Zhang, William and Crotty, Andrew and Ma, Lin and Xu, Peijing and Gehrke, Johannes and Pavlo, Andrew},
+  title = {Database Gyms},
+  booktitle = {{CIDR} 2023, Conference on Innovative Data Systems Research},
+  year = {2023},
+  url = {https://db.cs.cmu.edu/papers/2023/p27-lim.pdf},
+ }
+```
+
+Additionally, please cite any module-specific paper that is relevant to your use.
+
+**Accelerating Training Data Generation**
+
+```
+(citation pending)
+Boot, appearing at VLDB 2024.
+```
+
+**Simultaneously Tuning Multiple Configuration Spaces with Proto Actions**
+
+```
+(citation pending)
+Proto-X, appearing at VLDB 2024.
+```
@@ -21,8 +21,8 @@ def tpch_group(dbgym_cfg: DBGymConfig):
 @tpch_group.command(name="data")
 @click.argument("scale-factor", type=float)
 @click.pass_obj
-# The reason generate-data is separate from create-pgdata is because generate-data is generic
-#   to all DBMSs while create-pgdata is specific to Postgres.
+# The reason generate data is separate from create dbdata is that generate data is generic
+#   to all DBMSs while create dbdata is specific to a single DBMS.
 def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
     _clone(dbgym_cfg)
     _generate_data(dbgym_cfg, scale_factor)
 
@@ -4,34 +4,34 @@ set -euxo pipefail
 
 REPO_REAL_PARENT_DPATH="$1"
 
-# download and make postgres from the boot repository
+# Download and make postgres from the boot repository.
 mkdir -p "${REPO_REAL_PARENT_DPATH}"
 cd "${REPO_REAL_PARENT_DPATH}"
-git clone git@github.com:lmwnshn/boot.git --single-branch --branch boot --depth 1
+git clone git@github.com:lmwnshn/boot.git --single-branch --branch vldb_2024 --depth 1
 cd ./boot
 ./cmudb/build/configure.sh release "${REPO_REAL_PARENT_DPATH}/boot/build/postgres"
 make clean
 make install-world-bin -j4
 
-# download and make bytejack
-cd ./cmudb/extension/bytejack_rs/
+# Download and make boot.
+cd ./cmudb/extension/boot_rs/
 cargo build --release
-cbindgen . -o target/bytejack_rs.h --lang c
+cbindgen . -o target/boot_rs.h --lang c
 cd "${REPO_REAL_PARENT_DPATH}/boot"
 
-cd ./cmudb/extension/bytejack/
+cd ./cmudb/extension/boot/
 make clean
 make install -j
 cd "${REPO_REAL_PARENT_DPATH}/boot"
 
-# download and make hypopg
+# Download and make hypopg.
 git clone git@github.com:HypoPG/hypopg.git
 cd ./hypopg
 PG_CONFIG="${REPO_REAL_PARENT_DPATH}/boot/build/postgres/bin/pg_config" make install
 cd "${REPO_REAL_PARENT_DPATH}/boot"
 
-# download and make pg_hint_plan
-# we need -L to follow links
+# Download and make pg_hint_plan.
+# We need -L to follow links.
 curl -L https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL15_1_5_1.tar.gz -o REL15_1_5_1.tar.gz
 tar -xzf REL15_1_5_1.tar.gz
 rm REL15_1_5_1.tar.gz
 
@@ -1,5 +1,5 @@
 """
-At a high level, this file's goal is to (1) install+build postgres and (2) create pgdata.
+At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata).
 On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
     a Postgres instance during agent tuning.
 util.pg provides helpers used by *both* of the above files (as well as other files).
@@ -10,11 +10,10 @@
 import subprocess
 from pathlib import Path
 import click
-import ssd_checker
 
 from benchmark.tpch.load_info import TpchLoadInfo
 from dbms.load_info_base_class import LoadInfoBaseClass
-from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_pgdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_pgdata_parent_dpath
+from misc.utils import DBGymConfig, conv_inputpath_to_realabspath, link_result, open_and_save, save_file, get_dbdata_tgz_name, default_pgbin_path, WORKSPACE_PATH_PLACEHOLDER, default_dbdata_parent_dpath, is_ssd
 from util.shell import subprocess_run
 from sqlalchemy import Connection
 from util.pg import SHARED_PRELOAD_LIBRARIES, conn_execute, sql_file_execute, DBGYM_POSTGRES_DBNAME, create_conn, DEFAULT_POSTGRES_PORT, DBGYM_POSTGRES_USER, DBGYM_POSTGRES_PASS, DEFAULT_POSTGRES_DBNAME
@@ -32,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):
 
 @postgres_group.command(
     name="build",
-    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata.",
+    help="Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata.",
 )
 @click.pass_obj
 @click.option("--rebuild", is_flag=True, help="Include this flag to rebuild Postgres even if it already exists.")
@@ -41,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):
 
 
 @postgres_group.command(
-    name="pgdata",
-    help="Build a .tgz file of pgdata with various specifications for its contents.",
+    name="dbdata",
+    help="Build a .tgz file of dbdata with various specifications for its contents.",
 )
 @click.pass_obj
 @click.argument("benchmark_name", type=str)
 @click.option("--scale-factor", type=float, default=1)
 @click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
 @click.option(
-    "--intended-pgdata-hardware",
+    "--intended-dbdata-hardware",
     type=click.Choice(["hdd", "ssd"]),
     default="hdd",
-    help=f"The intended hardware pgdata should be on. Used as a sanity check for --pgdata-parent-dpath.",
+    help=f"The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
 )
 @click.option(
-    "--pgdata-parent-dpath",
+    "--dbdata-parent-dpath",
     default=None,
     type=Path,
-    help=f"The path to the parent directory of the pgdata which will be actively tuned. The default is {default_pgdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
+    help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
 )
-def postgres_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_pgdata_hardware: str, pgdata_parent_dpath: Path):
+def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
     # Set args to defaults programmatically (do this before doing anything else in the function)
     if pgbin_path == None:
         pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
-    if pgdata_parent_dpath == None:
-        pgdata_parent_dpath = default_pgdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
+    if dbdata_parent_dpath == None:
+        dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)
 
     # Convert all input paths to absolute paths
     pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
-    pgdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, pgdata_parent_dpath)
+    dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)
 
     # Check assertions on args
-    if intended_pgdata_hardware == "hdd":
-        assert not ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is HDD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an SSD"
-    elif intended_pgdata_hardware == "ssd":
-        assert ssd_checker.is_ssd(pgdata_parent_dpath), f"Intended hardware is SSD but pgdata_parent_dpath ({pgdata_parent_dpath}) is an HDD"
+    if intended_dbdata_hardware == "hdd":
+        assert not is_ssd(dbdata_parent_dpath), f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD"
+    elif intended_dbdata_hardware == "ssd":
+        assert is_ssd(dbdata_parent_dpath), f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD"
     else:
         assert False
 
-    # Create pgdata
-    _create_pgdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, pgdata_parent_dpath)
+    # Create dbdata
+    _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)
 
 
 def _get_pgbin_symlink_path(dbgym_cfg: DBGymConfig) -> Path:
@@ -109,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
     dbms_postgres_logger.info(f"Set up repo in {expected_repo_symlink_dpath}")
 
 
-def _create_pgdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, pgdata_parent_dpath: Path) -> None:
+def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
     """
-    I chose *not* for this function to skip by default if pgdata_tgz_symlink_path already exists. This
+    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
       is because, while the generated data is deterministic given benchmark_name and scale_factor, any
-      change in the _create_pgdata() function would result in a different pgdata. Since _create_pgdata()
+      change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
       may change somewhat frequently, I decided to get rid of the footgun of having changes to
-      _create_pgdata() not propagate to [pgdata].tgz by default.
+      _create_dbdata() not propagate to [dbdata].tgz by default.
     """
 
-    # It's ok for the pgdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
-    pgdata_dpath = pgdata_parent_dpath / "pgdata_being_created"
-    # We might be reusing the same pgdata_parent_dpath, so delete pgdata_dpath if it already exists
-    if pgdata_dpath.exists():
-        shutil.rmtree(pgdata_dpath)
+    # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
+    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
+    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
+    if dbdata_dpath.exists():
+        shutil.rmtree(dbdata_dpath)
 
     # Call initdb.
     # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
     save_file(dbgym_cfg, pgbin_path / "initdb")
-    subprocess_run(f'./initdb -D "{pgdata_dpath}"', cwd=pgbin_path)
+    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)
 
-    # Start Postgres (all other pgdata setup requires postgres to be started).
+    # Start Postgres (all other dbdata setup requires postgres to be started).
     # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
-    start_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)
 
     # Set up Postgres.
-    _generic_pgdata_setup(dbgym_cfg)
-    _load_benchmark_into_pgdata(dbgym_cfg, benchmark_name, scale_factor)
+    _generic_dbdata_setup(dbgym_cfg)
+    _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)
 
     # Stop Postgres so that we don't "leak" processes.
-    stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath)
+    stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)
 
     # Create .tgz file.
-    # Note that you can't pass "[pgdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[pgdata].tgz" as a dir.
-    pgdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
+    # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
+    dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
         mkdir=True
-    ) / get_pgdata_tgz_name(benchmark_name, scale_factor)
-    # We need to cd into pgdata_dpath so that the tar file does not contain folders for the whole path of pgdata_dpath.
-    subprocess_run(f"tar -czf {pgdata_tgz_real_fpath} .", cwd=pgdata_dpath)
+    ) / get_dbdata_tgz_name(benchmark_name, scale_factor)
+    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
+    subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)
 
     # Create symlink.
-    # Only link at the end so that the link only ever points to a complete pgdata.
-    pgdata_tgz_symlink_path = link_result(dbgym_cfg, pgdata_tgz_real_fpath)
-    dbms_postgres_logger.info(f"Created pgdata in {pgdata_tgz_symlink_path}")
+    # Only link at the end so that the link only ever points to a complete dbdata.
+    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
+    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")
 
 
-def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
+def _generic_dbdata_setup(dbgym_cfg: DBGymConfig):
     # get necessary vars
     pgbin_real_dpath = _get_pgbin_symlink_path(dbgym_cfg).resolve()
     assert pgbin_real_dpath.exists()
@@ -182,29 +181,29 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
             cwd=pgbin_real_dpath,
         )
 
-    # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
-    # as opposed to using databases named after the benchmark
+    # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
+    # as opposed to using databases named after the benchmark.
     subprocess_run(
         f"./psql -c \"create database {DBGYM_POSTGRES_DBNAME} with owner = '{dbgym_pguser}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
         cwd=pgbin_real_dpath,
     )
 
 
-def _load_benchmark_into_pgdata(
+def _load_benchmark_into_dbdata(
     dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
 ):
     with create_conn(use_psycopg=False) as conn:
         if benchmark_name == "tpch":
             load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
         else:
             raise AssertionError(
-                f"_load_benchmark_into_pgdata(): the benchmark of name {benchmark_name} is not implemented"
+                f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
             )
 
-        _load_into_pgdata(dbgym_cfg, conn, load_info)
+        _load_into_dbdata(dbgym_cfg, conn, load_info)
 
 
-def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
+def _load_into_dbdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadInfoBaseClass):
     sql_file_execute(dbgym_cfg, conn, load_info.get_schema_fpath())
 
     # truncate all tables first before even loading a single one
@@ -223,29 +222,29 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI
         sql_file_execute(dbgym_cfg, conn, constraints_fpath)
 
 
-def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, True)
+def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, True)
 
 
-def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path) -> None:
-    _start_or_stop_postgres(dbgym_cfg, pgbin_path, pgdata_dpath, False)
+def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
+    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, False)
 
 
-def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, pgdata_dpath: Path, is_start: bool) -> None:
+def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
     # They should be absolute paths and should exist
     assert pgbin_path.is_absolute() and pgbin_path.exists()
-    assert pgdata_dpath.is_absolute() and pgdata_dpath.exists()
+    assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
     # The inputs may be symlinks so we need to resolve them first
     pgbin_real_dpath = pgbin_path.resolve()
-    pgdata_dpath = pgdata_dpath.resolve()
+    dbdata_dpath = dbdata_dpath.resolve()
     pgport = DEFAULT_POSTGRES_PORT
     save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")
 
     if is_start:
         # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
         # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
         # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
-        result = subprocess.run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
+        result = subprocess.run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start", cwd=pgbin_real_dpath, shell=True)
         result.check_returncode()
     else:
-        subprocess_run(f"./pg_ctl -D \"{pgdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
+        subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)