1
1
"""
2
- At a high level, this file's goal is to (1) install+ build postgres and (2) create pgdata.
2
+ At a high level, this file's goal is to (1) build postgres and (2) create dbdata (aka pgdata) .
3
3
On the other hand, the goal of tune.protox.env.util.postgres is to provide helpers to manage
4
4
a Postgres instance during agent tuning.
5
5
util.pg provides helpers used by *both* of the above files (as well as other files).
10
10
import subprocess
11
11
from pathlib import Path
12
12
import click
13
- import ssd_checker
14
13
15
14
from benchmark .tpch .load_info import TpchLoadInfo
16
15
from dbms .load_info_base_class import LoadInfoBaseClass
17
- from misc .utils import DBGymConfig , conv_inputpath_to_realabspath , link_result , open_and_save , save_file , get_pgdata_tgz_name , default_pgbin_path , WORKSPACE_PATH_PLACEHOLDER , default_pgdata_parent_dpath
16
+ from misc .utils import DBGymConfig , conv_inputpath_to_realabspath , link_result , open_and_save , save_file , get_dbdata_tgz_name , default_pgbin_path , WORKSPACE_PATH_PLACEHOLDER , default_dbdata_parent_dpath , is_ssd
18
17
from util .shell import subprocess_run
19
18
from sqlalchemy import Connection
20
19
from util .pg import SHARED_PRELOAD_LIBRARIES , conn_execute , sql_file_execute , DBGYM_POSTGRES_DBNAME , create_conn , DEFAULT_POSTGRES_PORT , DBGYM_POSTGRES_USER , DBGYM_POSTGRES_PASS , DEFAULT_POSTGRES_DBNAME
@@ -32,7 +31,7 @@ def postgres_group(dbgym_cfg: DBGymConfig):
32
31
33
32
@postgres_group .command (
34
33
name = "build" ,
35
- help = "Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create pgdata ." ,
34
+ help = "Download and build the Postgres repository and all necessary extensions/shared libraries. Does not create dbdata ." ,
36
35
)
37
36
@click .pass_obj
38
37
@click .option ("--rebuild" , is_flag = True , help = "Include this flag to rebuild Postgres even if it already exists." )
@@ -41,46 +40,46 @@ def postgres_build(dbgym_cfg: DBGymConfig, rebuild: bool):
41
40
42
41
43
42
@postgres_group.command(
    name="dbdata",
    help="Build a .tgz file of dbdata with various specifications for its contents.",
)
@click.pass_obj
@click.argument("benchmark_name", type=str)
@click.option("--scale-factor", type=float, default=1)
@click.option("--pgbin-path", type=Path, default=None, help=f"The path to the bin containing Postgres executables. The default is {default_pgbin_path(WORKSPACE_PATH_PLACEHOLDER)}.")
@click.option(
    "--intended-dbdata-hardware",
    type=click.Choice(["hdd", "ssd"]),
    default="hdd",
    help="The intended hardware dbdata should be on. Used as a sanity check for --dbdata-parent-dpath.",
)
@click.option(
    "--dbdata-parent-dpath",
    default=None,
    type=Path,
    help=f"The path to the parent directory of the dbdata which will be actively tuned. The default is {default_dbdata_parent_dpath(WORKSPACE_PATH_PLACEHOLDER)}.",
)
def postgres_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, intended_dbdata_hardware: str, dbdata_parent_dpath: Path):
    """
    CLI entry point for the "dbdata" command.

    Resolves default paths, converts the input paths with conv_inputpath_to_realabspath(),
    checks that dbdata_parent_dpath is on the intended kind of storage, and then delegates
    to _create_dbdata().

    Raises:
        AssertionError: if the storage type reported by is_ssd() does not match
            intended_dbdata_hardware, or if intended_dbdata_hardware is neither
            "hdd" nor "ssd".
    """
    # Set args to defaults programmatically (do this before doing anything else in the function)
    if pgbin_path is None:
        pgbin_path = default_pgbin_path(dbgym_cfg.dbgym_workspace_path)
    if dbdata_parent_dpath is None:
        dbdata_parent_dpath = default_dbdata_parent_dpath(dbgym_cfg.dbgym_workspace_path)

    # Convert all input paths to absolute paths
    pgbin_path = conv_inputpath_to_realabspath(dbgym_cfg, pgbin_path)
    dbdata_parent_dpath = conv_inputpath_to_realabspath(dbgym_cfg, dbdata_parent_dpath)

    # Check assertions on args.
    # These are raised explicitly (rather than via `assert`) so that the sanity check is not
    # silently skipped when Python runs with -O.
    if intended_dbdata_hardware == "hdd":
        if is_ssd(dbdata_parent_dpath):
            raise AssertionError(f"Intended hardware is HDD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an SSD")
    elif intended_dbdata_hardware == "ssd":
        if not is_ssd(dbdata_parent_dpath):
            raise AssertionError(f"Intended hardware is SSD but dbdata_parent_dpath ({dbdata_parent_dpath}) is an HDD")
    else:
        # click.Choice above only admits "hdd" and "ssd", so this indicates a programming error.
        raise AssertionError

    # Create dbdata
    _create_dbdata(dbgym_cfg, benchmark_name, scale_factor, pgbin_path, dbdata_parent_dpath)
84
83
85
84
86
85
def _get_pgbin_symlink_path (dbgym_cfg : DBGymConfig ) -> Path :
@@ -109,52 +108,52 @@ def _build_repo(dbgym_cfg: DBGymConfig, rebuild):
109
108
dbms_postgres_logger .info (f"Set up repo in { expected_repo_symlink_dpath } " )
110
109
111
110
112
def _create_dbdata(dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float, pgbin_path: Path, dbdata_parent_dpath: Path) -> None:
    """
    Build a fresh dbdata directory under dbdata_parent_dpath, load the benchmark into it,
    archive it as a .tgz in the current task run's data path, and link the result.

    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
    is because, while the generated data is deterministic given benchmark_name and scale_factor, any
    change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
    may change somewhat frequently, I decided to get rid of the footgun of having changes to
    _create_dbdata() not propagate to [dbdata].tgz by default.
    """

    # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
    if dbdata_dpath.exists():
        shutil.rmtree(dbdata_dpath)

    # Call initdb.
    # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
    save_file(dbgym_cfg, pgbin_path / "initdb")
    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)

    # Start Postgres (all other dbdata setup requires postgres to be started).
    start_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)
    try:
        # Set up Postgres.
        _generic_dbdata_setup(dbgym_cfg)
        _load_benchmark_into_dbdata(dbgym_cfg, benchmark_name, scale_factor)
    finally:
        # Stop Postgres so that we don't "leak" processes. This is in a `finally` so that the
        # server is also stopped when setup or loading raises.
        stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath)

    # Create .tgz file.
    # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
    dbdata_tgz_real_fpath = dbgym_cfg.cur_task_runs_data_path(
        mkdir=True
    ) / get_dbdata_tgz_name(benchmark_name, scale_factor)
    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
    # The output path is quoted (like the initdb call above) so that paths containing spaces work.
    subprocess_run(f'tar -czf "{dbdata_tgz_real_fpath}" .', cwd=dbdata_dpath)

    # Create symlink.
    # Only link at the end so that the link only ever points to a complete dbdata.
    dbdata_tgz_symlink_path = link_result(dbgym_cfg, dbdata_tgz_real_fpath)
    dbms_postgres_logger.info(f"Created dbdata in {dbdata_tgz_symlink_path}")
155
154
156
155
157
- def _generic_pgdata_setup (dbgym_cfg : DBGymConfig ):
156
+ def _generic_dbdata_setup (dbgym_cfg : DBGymConfig ):
158
157
# get necessary vars
159
158
pgbin_real_dpath = _get_pgbin_symlink_path (dbgym_cfg ).resolve ()
160
159
assert pgbin_real_dpath .exists ()
@@ -182,29 +181,29 @@ def _generic_pgdata_setup(dbgym_cfg: DBGymConfig):
182
181
cwd = pgbin_real_dpath ,
183
182
)
184
183
185
- # Create the dbgym database. since one pgdata dir maps to one benchmark, all benchmarks will use the same database
186
- # as opposed to using databases named after the benchmark
184
+ # Create the dbgym database. Since one dbdata dir maps to one benchmark, all benchmarks will use the same database
185
+ # as opposed to using databases named after the benchmark.
187
186
subprocess_run (
188
187
f"./psql -c \" create database { DBGYM_POSTGRES_DBNAME } with owner = '{ dbgym_pguser } '\" { DEFAULT_POSTGRES_DBNAME } -p { pgport } -h localhost" ,
189
188
cwd = pgbin_real_dpath ,
190
189
)
191
190
192
191
193
def _load_benchmark_into_dbdata(
    dbgym_cfg: DBGymConfig, benchmark_name: str, scale_factor: float
):
    """
    Load the named benchmark into the database reached through create_conn().

    Only "tpch" is implemented; any other benchmark_name raises AssertionError
    (after the connection has been opened).
    """
    with create_conn(use_psycopg=False) as conn:
        # Guard clause: reject anything we do not know how to load.
        if benchmark_name != "tpch":
            raise AssertionError(
                f"_load_benchmark_into_dbdata(): the benchmark of name {benchmark_name} is not implemented"
            )

        benchmark_load_info = TpchLoadInfo(dbgym_cfg, scale_factor)
        _load_into_dbdata(dbgym_cfg, conn, benchmark_load_info)
205
204
206
205
207
- def _load_into_pgdata (dbgym_cfg : DBGymConfig , conn : Connection , load_info : LoadInfoBaseClass ):
206
+ def _load_into_dbdata (dbgym_cfg : DBGymConfig , conn : Connection , load_info : LoadInfoBaseClass ):
208
207
sql_file_execute (dbgym_cfg , conn , load_info .get_schema_fpath ())
209
208
210
209
# truncate all tables first before even loading a single one
@@ -223,29 +222,29 @@ def _load_into_pgdata(dbgym_cfg: DBGymConfig, conn: Connection, load_info: LoadI
223
222
sql_file_execute (dbgym_cfg , conn , constraints_fpath )
224
223
225
224
226
def start_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
    """Start Postgres on dbdata_dpath by delegating to _start_or_stop_postgres()."""
    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, is_start=True)
228
227
229
228
230
def stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path) -> None:
    """Stop Postgres on dbdata_dpath by delegating to _start_or_stop_postgres()."""
    _start_or_stop_postgres(dbgym_cfg, pgbin_path, dbdata_dpath, is_start=False)
232
231
233
232
234
def _start_or_stop_postgres(dbgym_cfg: DBGymConfig, pgbin_path: Path, dbdata_dpath: Path, is_start: bool) -> None:
    """
    Run `pg_ctl start` or `pg_ctl stop` for dbdata_dpath, using DEFAULT_POSTGRES_PORT.

    Args:
        dbgym_cfg: config object, passed to save_file() to record the pg_ctl dependency.
        pgbin_path: absolute path to the directory containing pg_ctl (may be a symlink).
        dbdata_dpath: absolute path to the data directory (may be a symlink).
        is_start: True to start Postgres, False to stop it.

    Raises:
        AssertionError: if either path is not absolute or does not exist.
        subprocess.CalledProcessError: if `pg_ctl start` exits with a non-zero status.
    """
    # They should be absolute paths and should exist
    assert pgbin_path.is_absolute() and pgbin_path.exists()
    assert dbdata_dpath.is_absolute() and dbdata_dpath.exists()
    # The inputs may be symlinks so we need to resolve them first
    pgbin_real_dpath = pgbin_path.resolve()
    dbdata_dpath = dbdata_dpath.resolve()
    pgport = DEFAULT_POSTGRES_PORT
    save_file(dbgym_cfg, pgbin_real_dpath / "pg_ctl")

    if is_start:
        # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
        # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
        # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
        #
        # The command is passed as an argument list (no shell) so that characters such as
        # quotes or `$` in dbdata_dpath cannot be reinterpreted by a shell. "./pg_ctl" is
        # looked up relative to cwd. check=True raises CalledProcessError on a non-zero exit.
        subprocess.run(
            ["./pg_ctl", "-D", str(dbdata_dpath), "-o", f"-p {pgport}", "start"],
            cwd=pgbin_real_dpath,
            check=True,
        )
    else:
        subprocess_run(f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop", cwd=pgbin_real_dpath)
0 commit comments