Skip to content

Commit ef24dc1

Browse files
Formatting with black and isort (#38)
**Summary**: Added `black` and `isort` formatting checks to the GitHub Actions (GHA) workflow, and reformatted the existing code so that the new checks pass. **Demo**: ![Screenshot 2024-08-25 at 10 15 23](https://github.com/user-attachments/assets/c159bb58-a3a6-416f-a377-f3dc87552ac3) [Passing GHA](https://github.com/cmu-db/dbgym/actions/runs/10547256041)
1 parent 9ef1ee6 commit ef24dc1

39 files changed

+1736
-815
lines changed

.github/workflows/tests_ci.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ on:
66
branches: [main]
77

88
jobs:
9-
build:
9+
ci:
1010
runs-on: self-hosted
1111

1212
steps:
@@ -32,12 +32,16 @@ jobs:
3232
run: |
3333
./dependencies/install_dependencies.sh
3434
35+
- name: Check formatting
36+
run: |
37+
./scripts/check_format.sh
38+
3539
- name: Run unit tests
3640
run: |
3741
. "$HOME/.cargo/env"
3842
python scripts/run_unit_tests.py
3943
40-
- name: Run integration test
44+
- name: Run integration tests
4145
# Delete the workspace. Run once with a clean workspace. Run again from the existing workspace.
4246
# Need to run with a non-root user in order to start Postgres.
4347
run: |

benchmark/tpch/cli.py

Lines changed: 59 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,14 @@
44

55
import click
66

7-
from misc.utils import DBGymConfig, get_scale_factor_string, link_result, workload_name_fn
8-
from util.shell import subprocess_run
7+
from misc.utils import (
8+
DBGymConfig,
9+
get_scale_factor_string,
10+
link_result,
11+
workload_name_fn,
12+
)
913
from util.pg import *
14+
from util.shell import subprocess_run
1015

1116
benchmark_tpch_logger = logging.getLogger("benchmark/tpch")
1217
benchmark_tpch_logger.setLevel(logging.INFO)
@@ -29,8 +34,18 @@ def tpch_data(dbgym_cfg: DBGymConfig, scale_factor: float):
2934

3035

3136
@tpch_group.command(name="workload")
32-
@click.option("--seed-start", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).")
33-
@click.option("--seed-end", type=int, default=15721, help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).")
37+
@click.option(
38+
"--seed-start",
39+
type=int,
40+
default=15721,
41+
help="A workload consists of queries from multiple seeds. This is the starting seed (inclusive).",
42+
)
43+
@click.option(
44+
"--seed-end",
45+
type=int,
46+
default=15721,
47+
help="A workload consists of queries from multiple seeds. This is the ending seed (inclusive).",
48+
)
3449
@click.option(
3550
"--query-subset",
3651
type=click.Choice(["all", "even", "odd"]),
@@ -45,7 +60,9 @@ def tpch_workload(
4560
query_subset: str,
4661
scale_factor: float,
4762
):
48-
assert seed_start <= seed_end, f'seed_start ({seed_start}) must be <= seed_end ({seed_end})'
63+
assert (
64+
seed_start <= seed_end
65+
), f"seed_start ({seed_start}) must be <= seed_end ({seed_end})"
4966
_clone(dbgym_cfg)
5067
_generate_queries(dbgym_cfg, seed_start, seed_end, scale_factor)
5168
_generate_workload(dbgym_cfg, seed_start, seed_end, query_subset, scale_factor)
@@ -56,7 +73,9 @@ def _get_queries_dname(seed: int, scale_factor: float) -> str:
5673

5774

5875
def _clone(dbgym_cfg: DBGymConfig):
59-
expected_symlink_dpath = dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
76+
expected_symlink_dpath = (
77+
dbgym_cfg.cur_symlinks_build_path(mkdir=True) / "tpch-kit.link"
78+
)
6079
if expected_symlink_dpath.exists():
6180
benchmark_tpch_logger.info(f"Skipping clone: {expected_symlink_dpath}")
6281
return
@@ -73,22 +92,32 @@ def _clone(dbgym_cfg: DBGymConfig):
7392

7493
def _get_tpch_kit_dpath(dbgym_cfg: DBGymConfig) -> Path:
7594
tpch_kit_dpath = (dbgym_cfg.cur_symlinks_build_path() / "tpch-kit.link").resolve()
76-
assert tpch_kit_dpath.exists() and tpch_kit_dpath.is_absolute() and not tpch_kit_dpath.is_symlink()
95+
assert (
96+
tpch_kit_dpath.exists()
97+
and tpch_kit_dpath.is_absolute()
98+
and not tpch_kit_dpath.is_symlink()
99+
)
77100
return tpch_kit_dpath
78101

79102

80-
def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float):
103+
def _generate_queries(
104+
dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, scale_factor: float
105+
):
81106
tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
82107
data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
83108
benchmark_tpch_logger.info(
84109
f"Generating queries: {data_path} [{seed_start}, {seed_end}]"
85110
)
86111
for seed in range(seed_start, seed_end + 1):
87-
expected_queries_symlink_dpath = data_path / (_get_queries_dname(seed, scale_factor) + ".link")
112+
expected_queries_symlink_dpath = data_path / (
113+
_get_queries_dname(seed, scale_factor) + ".link"
114+
)
88115
if expected_queries_symlink_dpath.exists():
89116
continue
90117

91-
real_dir = dbgym_cfg.cur_task_runs_data_path(_get_queries_dname(seed, scale_factor), mkdir=True)
118+
real_dir = dbgym_cfg.cur_task_runs_data_path(
119+
_get_queries_dname(seed, scale_factor), mkdir=True
120+
)
92121
for i in range(1, 22 + 1):
93122
target_sql = (real_dir / f"{i}.sql").resolve()
94123
subprocess_run(
@@ -106,16 +135,20 @@ def _generate_queries(dbgym_cfg: DBGymConfig, seed_start: int, seed_end: int, sc
106135
def _generate_data(dbgym_cfg: DBGymConfig, scale_factor: float):
107136
tpch_kit_dpath = _get_tpch_kit_dpath(dbgym_cfg)
108137
data_path = dbgym_cfg.cur_symlinks_data_path(mkdir=True)
109-
expected_tables_symlink_dpath = data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
138+
expected_tables_symlink_dpath = (
139+
data_path / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
140+
)
110141
if expected_tables_symlink_dpath.exists():
111-
benchmark_tpch_logger.info(f"Skipping generation: {expected_tables_symlink_dpath}")
142+
benchmark_tpch_logger.info(
143+
f"Skipping generation: {expected_tables_symlink_dpath}"
144+
)
112145
return
113146

114147
benchmark_tpch_logger.info(f"Generating: {expected_tables_symlink_dpath}")
115-
subprocess_run(
116-
f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen"
148+
subprocess_run(f"./dbgen -vf -s {scale_factor}", cwd=tpch_kit_dpath / "dbgen")
149+
real_dir = dbgym_cfg.cur_task_runs_data_path(
150+
f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True
117151
)
118-
real_dir = dbgym_cfg.cur_task_runs_data_path(f"tables_sf{get_scale_factor_string(scale_factor)}", mkdir=True)
119152
subprocess_run(f"mv ./*.tbl {real_dir}", cwd=tpch_kit_dpath / "dbgen")
120153

121154
tables_symlink_dpath = link_result(dbgym_cfg, real_dir)
@@ -135,9 +168,7 @@ def _generate_workload(
135168
expected_workload_symlink_dpath = symlink_data_dpath / (workload_name + ".link")
136169

137170
benchmark_tpch_logger.info(f"Generating: {expected_workload_symlink_dpath}")
138-
real_dpath = dbgym_cfg.cur_task_runs_data_path(
139-
workload_name, mkdir=True
140-
)
171+
real_dpath = dbgym_cfg.cur_task_runs_data_path(workload_name, mkdir=True)
141172

142173
queries = None
143174
if query_subset == "all":
@@ -150,12 +181,19 @@ def _generate_workload(
150181
with open(real_dpath / "order.txt", "w") as f:
151182
for seed in range(seed_start, seed_end + 1):
152183
for qnum in queries:
153-
sql_fpath = (symlink_data_dpath / (_get_queries_dname(seed, scale_factor) + ".link")).resolve() / f"{qnum}.sql"
154-
assert sql_fpath.exists() and not sql_fpath.is_symlink() and sql_fpath.is_absolute(), "We should only write existent real absolute paths to a file"
184+
sql_fpath = (
185+
symlink_data_dpath
186+
/ (_get_queries_dname(seed, scale_factor) + ".link")
187+
).resolve() / f"{qnum}.sql"
188+
assert (
189+
sql_fpath.exists()
190+
and not sql_fpath.is_symlink()
191+
and sql_fpath.is_absolute()
192+
), "We should only write existent real absolute paths to a file"
155193
output = ",".join([f"S{seed}-Q{qnum}", str(sql_fpath)])
156194
print(output, file=f)
157195
# TODO(WAN): add option to deep-copy the workload.
158-
196+
159197
workload_symlink_dpath = link_result(dbgym_cfg, real_dpath)
160198
assert workload_symlink_dpath == expected_workload_symlink_dpath
161199
benchmark_tpch_logger.info(f"Generated: {expected_workload_symlink_dpath}")

benchmark/tpch/load_info.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from dbms.load_info_base_class import LoadInfoBaseClass
22
from misc.utils import DBGymConfig, get_scale_factor_string
33

4-
54
TPCH_SCHEMA_FNAME = "tpch_schema.sql"
65
TPCH_CONSTRAINTS_FNAME = "tpch_constraints.sql"
76

@@ -39,11 +38,17 @@ def __init__(self, dbgym_cfg: DBGymConfig, scale_factor: float):
3938
), f"self._constraints_fpath ({self._constraints_fpath}) does not exist"
4039

4140
# tables
42-
data_root_dpath = dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data"
43-
tables_symlink_dpath = data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
41+
data_root_dpath = (
42+
dbgym_cfg.dbgym_symlinks_path / TpchLoadInfo.CODEBASE_DNAME / "data"
43+
)
44+
tables_symlink_dpath = (
45+
data_root_dpath / f"tables_sf{get_scale_factor_string(scale_factor)}.link"
46+
)
4447
tables_dpath = tables_symlink_dpath.resolve()
4548
assert (
46-
tables_dpath.exists() and tables_dpath.is_absolute() and not tables_dpath.is_symlink()
49+
tables_dpath.exists()
50+
and tables_dpath.is_absolute()
51+
and not tables_dpath.is_symlink()
4752
), f"tables_dpath ({tables_dpath}) should be an existent real absolute path. Make sure you have generated the TPC-H data"
4853
self._tables_and_fpaths = []
4954
for table in TpchLoadInfo.TABLES:

0 commit comments

Comments
 (0)