Skip to content

Commit 9d111d3

Browse files
committed
added and passed test_postgres_dbdata
1 parent e0cde22 commit 9d111d3

File tree

6 files changed

+156
-61
lines changed

6 files changed

+156
-61
lines changed

benchmark/tpch/cli.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
from gymlib.symlinks_paths import (
55
get_tables_dirname,
66
get_tables_symlink_path,
7-
get_workload_dirname,
87
get_workload_suffix,
8+
get_workload_symlink_path,
9+
linkname_to_name,
910
)
1011

1112
from benchmark.constants import DEFAULT_SCALE_FACTOR
@@ -17,7 +18,6 @@
1718
fully_resolve_path,
1819
get_scale_factor_string,
1920
is_fully_resolved,
20-
link_result,
2121
)
2222

2323
TPCH_KIT_DIRNAME = "tpch-kit"
@@ -194,16 +194,14 @@ def _generate_tpch_workload(
194194
query_subset: str,
195195
scale_factor: float,
196196
) -> None:
197-
workload_name = get_workload_dirname(
197+
expected_workload_symlink_path = get_workload_symlink_path(
198+
dbgym_workspace.dbgym_workspace_path,
198199
"tpch",
199200
scale_factor,
200201
get_workload_suffix(
201202
"tpch", seed_start=seed_start, seed_end=seed_end, query_subset=query_subset
202203
),
203204
)
204-
expected_workload_symlink_path = dbgym_workspace.dbgym_cur_symlinks_path / (
205-
workload_name + ".link"
206-
)
207205
if expected_workload_symlink_path.exists():
208206
logging.getLogger(DBGYM_LOGGER_NAME).info(
209207
f"Skipping generation: {expected_workload_symlink_path}"
@@ -213,7 +211,9 @@ def _generate_tpch_workload(
213211
logging.getLogger(DBGYM_LOGGER_NAME).info(
214212
f"Generating: {expected_workload_symlink_path}"
215213
)
216-
workload_path = dbgym_workspace.dbgym_this_run_path / workload_name
214+
workload_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name(
215+
expected_workload_symlink_path.name
216+
)
217217
workload_path.mkdir(parents=False, exist_ok=False)
218218

219219
query_names = None

dbms/postgres/cli.py

Lines changed: 65 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010

1111
import click
1212
import sqlalchemy
13-
from gymlib.symlinks_paths import get_pgbin_symlink_path, get_repo_symlink_path
13+
from gymlib.symlinks_paths import (
14+
get_dbdata_tgz_symlink_path,
15+
get_pgbin_symlink_path,
16+
get_repo_symlink_path,
17+
linkname_to_name,
18+
)
1419

1520
from benchmark.constants import DEFAULT_SCALE_FACTOR
1621
from benchmark.job.load_info import JobLoadInfo
@@ -33,13 +38,9 @@
3338
WORKSPACE_PATH_PLACEHOLDER,
3439
DBGymWorkspace,
3540
fully_resolve_path,
36-
get_dbdata_tgz_filename,
3741
get_default_dbdata_parent_dpath,
3842
is_fully_resolved,
3943
is_ssd,
40-
link_result,
41-
open_and_save,
42-
save_file,
4344
)
4445

4546

@@ -127,6 +128,27 @@ def postgres_dbdata(
127128
intended_dbdata_hardware: str,
128129
dbdata_parent_dpath: Optional[Path],
129130
) -> None:
131+
_postgres_dbdata(
132+
dbgym_workspace,
133+
benchmark_name,
134+
scale_factor,
135+
pgbin_path,
136+
intended_dbdata_hardware,
137+
dbdata_parent_dpath,
138+
)
139+
140+
141+
def _postgres_dbdata(
142+
dbgym_workspace: DBGymWorkspace,
143+
benchmark_name: str,
144+
scale_factor: float,
145+
pgbin_path: Optional[Path],
146+
intended_dbdata_hardware: str,
147+
dbdata_parent_dpath: Optional[Path],
148+
) -> None:
149+
"""
150+
This function exists as a hook for integration tests.
151+
"""
130152
# Set args to defaults programmatically (do this before doing anything else in the function)
131153
if pgbin_path is None:
132154
pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path)
@@ -165,46 +187,54 @@ def _create_dbdata(
165187
dbdata_parent_dpath: Path,
166188
) -> None:
167189
"""
168-
I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
169-
is because, while the generated data is deterministic given benchmark_name and scale_factor, any
170-
change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
171-
may change somewhat frequently, I decided to get rid of the footgun of having changes to
172-
_create_dbdata() not propagate to [dbdata].tgz by default.
190+
If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run
191+
`dbms postgres dbdata` it will re-create the dbdata.
173192
"""
193+
expected_dbdata_tgz_symlink_path = get_dbdata_tgz_symlink_path(
194+
dbgym_workspace.dbgym_workspace_path,
195+
benchmark_name,
196+
scale_factor,
197+
)
198+
if expected_dbdata_tgz_symlink_path.exists():
199+
logging.getLogger(DBGYM_LOGGER_NAME).info(
200+
f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}"
201+
)
202+
return
174203

175204
# It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
176-
dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
177-
# We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
178-
if dbdata_dpath.exists():
179-
shutil.rmtree(dbdata_dpath)
205+
dbdata_path = dbdata_parent_dpath / "dbdata_being_created"
206+
# We might be reusing the same dbdata_parent_dpath, so delete dbdata_path if it already exists
207+
if dbdata_path.exists():
208+
shutil.rmtree(dbdata_path)
180209

181210
# Call initdb.
182211
# Save any script we call from pgbin_path because it is a dependency generated by another task run.
183-
save_file(dbgym_workspace, pgbin_path / "initdb")
184-
subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)
212+
dbgym_workspace.save_file(pgbin_path / "initdb")
213+
subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path)
185214

186215
# Start Postgres (all other dbdata setup requires postgres to be started).
187216
# Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
188-
start_postgres(dbgym_workspace, pgbin_path, dbdata_dpath)
217+
start_postgres(dbgym_workspace, pgbin_path, dbdata_path)
189218

190219
# Set up Postgres.
191220
_generic_dbdata_setup(dbgym_workspace)
192221
_load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor)
193222

194223
# Stop Postgres so that we don't "leak" processes.
195-
stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath)
224+
stop_postgres(dbgym_workspace, pgbin_path, dbdata_path)
196225

197226
# Create .tgz file.
198227
# Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
199-
dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path(
200-
mkdir=True
201-
) / get_dbdata_tgz_filename(benchmark_name, scale_factor)
202-
# We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
203-
subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)
228+
dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name(
229+
expected_dbdata_tgz_symlink_path.name
230+
)
231+
# We need to cd into dbdata_path so that the tar file does not contain folders for the whole path of dbdata_path.
232+
subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path)
204233

205234
# Create symlink.
206235
# Only link at the end so that the link only ever points to a complete dbdata.
207-
dbdata_tgz_symlink_path = link_result(dbgym_workspace, dbdata_tgz_real_fpath)
236+
dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path)
237+
assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path)
208238
logging.getLogger(DBGYM_LOGGER_NAME).info(
209239
f"Created dbdata in {dbdata_tgz_symlink_path}"
210240
)
@@ -221,7 +251,7 @@ def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None:
221251
pgport = DEFAULT_POSTGRES_PORT
222252

223253
# Create user
224-
save_file(dbgym_workspace, pgbin_real_dpath / "psql")
254+
dbgym_workspace.save_file(pgbin_real_dpath / "psql")
225255
subprocess_run(
226256
f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
227257
cwd=pgbin_real_dpath,
@@ -278,7 +308,7 @@ def _load_into_dbdata(
278308
sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE")
279309
# Then, load the tables.
280310
for table, table_fpath in load_info.get_tables_and_paths():
281-
with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv:
311+
with dbgym_workspace.open_and_save(table_fpath, "r") as table_csv:
282312
assert conn.connection.dbapi_connection is not None
283313
cur = conn.connection.dbapi_connection.cursor()
284314
try:
@@ -301,41 +331,41 @@ def _load_into_dbdata(
301331
# even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the
302332
# tuning environment.
303333
def start_postgres(
304-
dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path
334+
dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path
305335
) -> None:
306-
_start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True)
336+
_start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, True)
307337

308338

309339
def stop_postgres(
310-
dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path
340+
dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path
311341
) -> None:
312-
_start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False)
342+
_start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, False)
313343

314344

315345
def _start_or_stop_postgres(
316346
dbgym_workspace: DBGymWorkspace,
317347
pgbin_path: Path,
318-
dbdata_dpath: Path,
348+
dbdata_path: Path,
319349
is_start: bool,
320350
) -> None:
321351
# They should be absolute paths and should exist
322352
assert is_fully_resolved(pgbin_path)
323-
assert is_fully_resolved(dbdata_dpath)
353+
assert is_fully_resolved(dbdata_path)
324354
pgport = DEFAULT_POSTGRES_PORT
325-
save_file(dbgym_workspace, pgbin_path / "pg_ctl")
355+
dbgym_workspace.save_file(pgbin_path / "pg_ctl")
326356

327357
if is_start:
328358
# We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
329359
# The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
330360
# On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
331361
result = subprocess.run(
332-
f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start",
362+
f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' start",
333363
cwd=pgbin_path,
334364
shell=True,
335365
)
336366
result.check_returncode()
337367
else:
338368
subprocess_run(
339-
f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop",
369+
f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop",
340370
cwd=pgbin_path,
341371
)

dbms/tests/integtest_dbms.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@
22
import unittest
33
from pathlib import Path
44

5-
from gymlib.symlinks_paths import get_repo_symlink_path
5+
from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path, get_repo_symlink_path
66

7-
from dbms.postgres.cli import _postgres_build
7+
from benchmark.tpch.cli import _tpch_tables
8+
from dbms.postgres.cli import _postgres_build, _postgres_dbdata
89
from util.workspace import (
910
DBGymWorkspace,
1011
fully_resolve_path,
@@ -37,6 +38,26 @@ def test_postgres_build(self) -> None:
3738
self.assertTrue(repo_path.exists())
3839
self.assertTrue(fully_resolve_path(repo_path).exists())
3940

41+
def test_postgres_dbdata(self) -> None:
    """
    Check that _postgres_dbdata() creates the "tpch" dbdata .tgz symlink and that the symlink resolves.
    """
    scale_factor = 0.01

    # Setup
    # Recreate self.workspace after every prerequisite so that each function call counts as its own run.
    prerequisites = (
        lambda: _postgres_build(self.workspace, False),
        lambda: _tpch_tables(self.workspace, scale_factor),
    )
    for run_prerequisite in prerequisites:
        run_prerequisite()
        DBGymWorkspace.num_times_created_this_run = 0
        self.workspace = DBGymWorkspace(self.workspace.dbgym_workspace_path)

    # Test
    workspace_path = self.workspace.dbgym_workspace_path
    expected_symlink = get_dbdata_tgz_symlink_path(workspace_path, "tpch", scale_factor)
    # The symlink must not exist before the call, and must exist and resolve after it.
    self.assertFalse(expected_symlink.exists())
    _postgres_dbdata(self.workspace, "tpch", scale_factor, None, "hdd", None)
    self.assertTrue(expected_symlink.exists())
    self.assertTrue(fully_resolve_path(expected_symlink).exists())
60+
4061

4162
if __name__ == "__main__":
4263
unittest.main()

env/tests/gymlib_integtest_util.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
# TODO: remove symlinks_paths from the import
77
from gymlib.symlinks_paths import (
8+
get_dbdata_tgz_symlink_path,
89
get_pgbin_symlink_path,
910
get_workload_suffix,
1011
get_workload_symlink_path,
@@ -16,7 +17,6 @@
1617
DBGymWorkspace,
1718
fully_resolve_path,
1819
get_default_dbdata_parent_dpath,
19-
get_default_pristine_dbdata_snapshot_path,
2020
get_workspace_path_from_config,
2121
)
2222

@@ -98,7 +98,7 @@ def get_default_metadata() -> TuningMetadata:
9898
),
9999
),
100100
pristine_dbdata_snapshot_path=fully_resolve_path(
101-
get_default_pristine_dbdata_snapshot_path(
101+
get_dbdata_tgz_symlink_path(
102102
dbgym_workspace.dbgym_workspace_path,
103103
GymlibIntegtestManager.BENCHMARK,
104104
GymlibIntegtestManager.SCALE_FACTOR,

gymlib_package/gymlib/symlinks_paths.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ def get_workload_dirname(benchmark: str, scale_factor: float | str, suffix: str)
3939
return f"workload_{benchmark}_sf{get_scale_factor_string(scale_factor)}_{suffix}"
4040

4141

42+
def get_dbdata_tgz_filename(benchmark_name: str, scale_factor: float | str) -> str:
    """
    Build the filename of the pristine dbdata .tgz for a benchmark and scale factor.

    The scale factor is rendered with get_scale_factor_string() before being embedded in the name.
    """
    scale_factor_string = get_scale_factor_string(scale_factor)
    return f"{benchmark_name}_sf{scale_factor_string}_pristine_dbdata.tgz"
44+
45+
4246
def get_tables_symlink_path(
4347
workspace_path: Path, benchmark: str, scale_factor: float | str
4448
) -> Path:
@@ -67,3 +71,25 @@ def get_repo_symlink_path(workspace_path: Path) -> Path:
6771

6872
def get_pgbin_symlink_path(workspace_path: Path) -> Path:
6973
return get_repo_symlink_path(workspace_path) / "boot" / "build" / "postgres" / "bin"
74+
75+
76+
def get_dbdata_tgz_symlink_path(
    workspace_path: Path, benchmark_name: str, scale_factor: float | str
) -> Path:
    """
    Return the path of the symlink to the pristine dbdata .tgz inside the workspace.

    The path is workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME / <tgz filename + ".link">.
    """
    # Use the shared helper instead of hand-appending ".link" so the suffix convention lives in one place.
    linkname = name_to_linkname(get_dbdata_tgz_filename(benchmark_name, scale_factor))
    return workspace_path / SYMLINKS_DNAME / DBGYM_APP_NAME / linkname
85+
86+
87+
# TODO: refactor stuff to use this
88+
def name_to_linkname(name: str) -> str:
    """
    Return the symlink filename for `name` by appending the ".link" suffix.

    Raises AssertionError if `name` already ends with ".link". The check is an explicit raise rather
    than an `assert` statement so that it is still enforced when Python runs with -O.
    """
    if name.endswith(".link"):
        raise AssertionError(f'"{name}" already ends with ".link"')
    return f"{name}.link"
91+
92+
93+
def linkname_to_name(linkname: str) -> str:
    """
    Return the name a symlink filename refers to by stripping its trailing ".link" suffix.

    Raises AssertionError if `linkname` does not end with ".link". The check is an explicit raise
    rather than an `assert` statement: with `assert` stripped under -O, a name without the suffix
    would silently lose its last five characters.
    """
    if not linkname.endswith(".link"):
        raise AssertionError(f'"{linkname}" does not end with ".link"')
    return linkname[: -len(".link")]

0 commit comments

Comments
 (0)