@@ -11,6 +11,11 @@
 import click
 import sqlalchemy
-from gymlib.symlinks_paths import get_pgbin_symlink_path, get_repo_symlink_path
+from gymlib.symlinks_paths import (
+    get_dbdata_tgz_symlink_path,
+    get_pgbin_symlink_path,
+    get_repo_symlink_path,
+    linkname_to_name,
+)

 from benchmark.constants import DEFAULT_SCALE_FACTOR
 from benchmark.job.load_info import JobLoadInfo
@@ -33,11 +38,7 @@
     WORKSPACE_PATH_PLACEHOLDER,
     DBGymWorkspace,
     fully_resolve_path,
-    get_dbdata_tgz_filename,
     get_default_dbdata_parent_dpath,
     is_fully_resolved,
     is_ssd,
-    link_result,
-    open_and_save,
-    save_file,
 )
@@ -127,6 +128,27 @@ def postgres_dbdata(
     intended_dbdata_hardware: str,
     dbdata_parent_dpath: Optional[Path],
 ) -> None:
+    _postgres_dbdata(
+        dbgym_workspace,
+        benchmark_name,
+        scale_factor,
+        pgbin_path,
+        intended_dbdata_hardware,
+        dbdata_parent_dpath,
+    )
+
+
+def _postgres_dbdata(
+    dbgym_workspace: DBGymWorkspace,
+    benchmark_name: str,
+    scale_factor: float,
+    pgbin_path: Optional[Path],
+    intended_dbdata_hardware: str,
+    dbdata_parent_dpath: Optional[Path],
+) -> None:
+    """
+    This function exists as a hook for integration tests.
+    """
     # Set args to defaults programmatically (do this before doing anything else in the function)
     if pgbin_path is None:
         pgbin_path = get_pgbin_symlink_path(dbgym_workspace.dbgym_workspace_path)
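The only documentation on the new wrapper is its docstring ("a hook for integration tests"). A minimal sketch of how such a test might call it directly, bypassing Click — the import paths, the `DBGymWorkspace` constructor, and the argument values here are assumptions for illustration, not anything shown in this diff:

```python
# Hypothetical integration test. The import paths and DBGymWorkspace(...) call
# are assumed; only _postgres_dbdata's signature comes from this diff.
import unittest
from pathlib import Path

from benchmark.constants import DEFAULT_SCALE_FACTOR
from dbms.postgres.cli import _postgres_dbdata
from util.workspace import DBGymWorkspace


class PostgresDbdataIntegtest(unittest.TestCase):
    def test_create_dbdata(self) -> None:
        workspace = DBGymWorkspace(Path("integtest_workspace"))  # assumed constructor
        # Passing None for the optional args exercises the same default
        # resolution as the `dbms postgres dbdata` CLI command.
        _postgres_dbdata(
            workspace,
            benchmark_name="tpch",
            scale_factor=DEFAULT_SCALE_FACTOR,
            pgbin_path=None,
            intended_dbdata_hardware="hdd",
            dbdata_parent_dpath=None,
        )


if __name__ == "__main__":
    unittest.main()
```

Calling the private helper rather than invoking the Click command keeps such a test independent of CLI parsing while still covering the default-resolution logic at the top of the function.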
@@ -165,46 +187,54 @@ def _create_dbdata(
     dbdata_parent_dpath: Path,
 ) -> None:
     """
-    I chose *not* for this function to skip by default if dbdata_tgz_symlink_path already exists. This
-    is because, while the generated data is deterministic given benchmark_name and scale_factor, any
-    change in the _create_dbdata() function would result in a different dbdata. Since _create_dbdata()
-    may change somewhat frequently, I decided to get rid of the footgun of having changes to
-    _create_dbdata() not propagate to [dbdata].tgz by default.
+    If you change the code of _create_dbdata(), you should also delete the symlink so that the next time you run
+    `dbms postgres dbdata` it will re-create the dbdata.
     """
+    expected_dbdata_tgz_symlink_path = get_dbdata_tgz_symlink_path(
+        dbgym_workspace.dbgym_workspace_path,
+        benchmark_name,
+        scale_factor,
+    )
+    if expected_dbdata_tgz_symlink_path.exists():
+        logging.getLogger(DBGYM_LOGGER_NAME).info(
+            f"Skipping _create_dbdata: {expected_dbdata_tgz_symlink_path}"
+        )
+        return

     # It's ok for the dbdata/ directory to be temporary. It just matters that the .tgz is saved in a safe place.
-    dbdata_dpath = dbdata_parent_dpath / "dbdata_being_created"
-    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_dpath if it already exists
-    if dbdata_dpath.exists():
-        shutil.rmtree(dbdata_dpath)
+    dbdata_path = dbdata_parent_dpath / "dbdata_being_created"
+    # We might be reusing the same dbdata_parent_dpath, so delete dbdata_path if it already exists
+    if dbdata_path.exists():
+        shutil.rmtree(dbdata_path)

     # Call initdb.
     # Save any script we call from pgbin_symlink_dpath because they are dependencies generated from another task run.
-    save_file(dbgym_workspace, pgbin_path / "initdb")
-    subprocess_run(f'./initdb -D "{dbdata_dpath}"', cwd=pgbin_path)
+    dbgym_workspace.save_file(pgbin_path / "initdb")
+    subprocess_run(f'./initdb -D "{dbdata_path}"', cwd=pgbin_path)

     # Start Postgres (all other dbdata setup requires postgres to be started).
     # Note that subprocess_run() never returns when running "pg_ctl start", so I'm using subprocess.run() instead.
-    start_postgres(dbgym_workspace, pgbin_path, dbdata_dpath)
+    start_postgres(dbgym_workspace, pgbin_path, dbdata_path)

     # Set up Postgres.
     _generic_dbdata_setup(dbgym_workspace)
     _load_benchmark_into_dbdata(dbgym_workspace, benchmark_name, scale_factor)

     # Stop Postgres so that we don't "leak" processes.
-    stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath)
+    stop_postgres(dbgym_workspace, pgbin_path, dbdata_path)

     # Create .tgz file.
     # Note that you can't pass "[dbdata].tgz" as an arg to cur_task_runs_data_path() because that would create "[dbdata].tgz" as a dir.
-    dbdata_tgz_real_fpath = dbgym_workspace.cur_task_runs_data_path(
-        mkdir=True
-    ) / get_dbdata_tgz_filename(benchmark_name, scale_factor)
-    # We need to cd into dbdata_dpath so that the tar file does not contain folders for the whole path of dbdata_dpath.
-    subprocess_run(f"tar -czf {dbdata_tgz_real_fpath} .", cwd=dbdata_dpath)
+    dbdata_tgz_real_path = dbgym_workspace.dbgym_this_run_path / linkname_to_name(
+        expected_dbdata_tgz_symlink_path.name
+    )
+    # We need to cd into dbdata_path so that the tar file does not contain folders for the whole path of dbdata_path.
+    subprocess_run(f"tar -czf {dbdata_tgz_real_path} .", cwd=dbdata_path)

     # Create symlink.
     # Only link at the end so that the link only ever points to a complete dbdata.
-    dbdata_tgz_symlink_path = link_result(dbgym_workspace, dbdata_tgz_real_fpath)
+    dbdata_tgz_symlink_path = dbgym_workspace.link_result(dbdata_tgz_real_path)
+    assert expected_dbdata_tgz_symlink_path.samefile(dbdata_tgz_symlink_path)
     logging.getLogger(DBGYM_LOGGER_NAME).info(
         f"Created dbdata in {dbdata_tgz_symlink_path}"
     )
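The rewritten docstring makes the skip-vs-rebuild tradeoff the caller's responsibility: once the `.tgz` symlink exists, `_create_dbdata()` returns early, so changes to its code only take effect after the symlink is removed. A sketch of that manual step, assuming only the `get_dbdata_tgz_symlink_path()` call shape already used above (the helper function itself is illustrative, not part of this PR):

```python
# Illustrative dev helper, not part of this PR: remove the symlink that
# _create_dbdata() checks so the next `dbms postgres dbdata` run rebuilds dbdata.
from pathlib import Path

from gymlib.symlinks_paths import get_dbdata_tgz_symlink_path


def force_dbdata_rebuild(workspace_path: Path, benchmark_name: str, scale_factor: float) -> None:
    symlink_path = get_dbdata_tgz_symlink_path(workspace_path, benchmark_name, scale_factor)
    # missing_ok=True makes this a no-op if dbdata was never generated.
    symlink_path.unlink(missing_ok=True)
```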
@@ -221,7 +251,7 @@ def _generic_dbdata_setup(dbgym_workspace: DBGymWorkspace) -> None:
     pgport = DEFAULT_POSTGRES_PORT

     # Create user
-    save_file(dbgym_workspace, pgbin_real_dpath / "psql")
+    dbgym_workspace.save_file(pgbin_real_dpath / "psql")
     subprocess_run(
         f"./psql -c \"create user {dbgym_pguser} with superuser password '{dbgym_pgpass}'\" {DEFAULT_POSTGRES_DBNAME} -p {pgport} -h localhost",
         cwd=pgbin_real_dpath,
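The change above is one instance of a pattern that runs through the whole diff: the free functions `save_file()`, `open_and_save()`, and `link_result()`, which previously took `dbgym_workspace` as their first argument, are now methods on `DBGymWorkspace`. The real class lives outside this diff; the outline below is only an assumed sketch of that interface, to make the call-site changes easier to read:

```python
# Assumed outline only -- the real DBGymWorkspace is not shown in this diff.
# Method purposes are inferred from how the call sites above use them.
from pathlib import Path
from typing import IO


class DBGymWorkspace:
    dbgym_workspace_path: Path
    dbgym_this_run_path: Path

    def save_file(self, path: Path) -> None:
        """Record an input file (e.g. a pgbin script) as a dependency of this run."""
        ...

    def open_and_save(self, path: Path, mode: str = "r") -> IO[str]:
        """open() a file for reading and record it as a dependency of this run."""
        ...

    def link_result(self, result_path: Path) -> Path:
        """Create a symlink pointing at result_path and return the symlink's path."""
        ...
```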
@@ -278,7 +308,7 @@ def _load_into_dbdata(
         sqlalchemy_conn_execute(conn, f"TRUNCATE {table} CASCADE")
     # Then, load the tables.
     for table, table_fpath in load_info.get_tables_and_paths():
-        with open_and_save(dbgym_workspace, table_fpath, "r") as table_csv:
+        with dbgym_workspace.open_and_save(table_fpath, "r") as table_csv:
             assert conn.connection.dbapi_connection is not None
             cur = conn.connection.dbapi_connection.cursor()
             try:
@@ -301,41 +331,41 @@ def _load_into_dbdata(
 # even though they are a little redundant. It seems better than making `dbms` depend on the behavior of the
 # tuning environment.
 def start_postgres(
-    dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path
+    dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path
 ) -> None:
-    _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, True)
+    _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, True)


 def stop_postgres(
-    dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_dpath: Path
+    dbgym_workspace: DBGymWorkspace, pgbin_path: Path, dbdata_path: Path
 ) -> None:
-    _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_dpath, False)
+    _start_or_stop_postgres(dbgym_workspace, pgbin_path, dbdata_path, False)


 def _start_or_stop_postgres(
     dbgym_workspace: DBGymWorkspace,
     pgbin_path: Path,
-    dbdata_dpath: Path,
+    dbdata_path: Path,
     is_start: bool,
 ) -> None:
     # They should be absolute paths and should exist
     assert is_fully_resolved(pgbin_path)
-    assert is_fully_resolved(dbdata_dpath)
+    assert is_fully_resolved(dbdata_path)
     pgport = DEFAULT_POSTGRES_PORT
-    save_file(dbgym_workspace, pgbin_path / "pg_ctl")
+    dbgym_workspace.save_file(pgbin_path / "pg_ctl")

     if is_start:
         # We use subprocess.run() because subprocess_run() never returns when running "pg_ctl start".
         # The reason subprocess_run() never returns is because pg_ctl spawns a postgres process so .poll() always returns None.
         # On the other hand, subprocess.run() does return normally, like calling `./pg_ctl` on the command line would do.
         result = subprocess.run(
-            f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' start",
+            f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' start",
             cwd=pgbin_path,
             shell=True,
         )
         result.check_returncode()
     else:
         subprocess_run(
-            f"./pg_ctl -D \"{dbdata_dpath}\" -o '-p {pgport}' stop",
+            f"./pg_ctl -D \"{dbdata_path}\" -o '-p {pgport}' stop",
             cwd=pgbin_path,
         )
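The comments in the start branch explain that the repo's `subprocess_run()` wrapper never returns for `pg_ctl start`, so plain `subprocess.run()` is used there instead. `subprocess_run()` itself is not shown in this diff; the sketch below is an assumed, common shape for such an output-streaming wrapper and illustrates one way the hang can arise, not the repo's actual implementation:

```python
# Illustrative only: NOT the real subprocess_run() from this repo.
import subprocess
from pathlib import Path
from typing import Optional


def subprocess_run_sketch(cmd: str, cwd: Optional[Path] = None) -> None:
    with subprocess.Popen(
        cmd, cwd=cwd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
    ) as proc:
        assert proc.stdout is not None
        # Stream output until EOF. `pg_ctl start` launches a long-lived postgres
        # server that inherits this pipe and can keep it open, so EOF may never
        # arrive while the server runs -- which is why the diff calls
        # subprocess.run() directly for the start case and keeps the wrapper
        # only for "pg_ctl stop".
        for line in proc.stdout:
            print(line.decode().rstrip())
    if proc.returncode != 0:
        raise RuntimeError(f"Command failed with exit code {proc.returncode}: {cmd}")
```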