Skip to content

Commit 041feb1

Browse files
committed
added untarred_original_dname option to _download_and_untar_dir
1 parent e649d23 commit 041feb1

File tree

2 files changed

+30
-4
lines changed

2 files changed

+30
-4
lines changed

benchmark/job/cli.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
import click
45

@@ -175,15 +176,28 @@ def _download_job_data(dbgym_cfg: DBGymConfig) -> None:
175176

176177

177178
def _download_job_queries(dbgym_cfg: DBGymConfig) -> None:
178-
_download_and_untar_dir(dbgym_cfg, JOB_QUERIES_URL, "job.tgz", JOB_QUERIES_DNAME)
179+
_download_and_untar_dir(
180+
dbgym_cfg,
181+
JOB_QUERIES_URL,
182+
"job.tgz",
183+
JOB_QUERIES_DNAME,
184+
untarred_original_dname="job",
185+
)
179186

180187

181188
def _download_and_untar_dir(
182189
dbgym_cfg: DBGymConfig,
183190
download_url: str,
184191
download_tarred_fname: str,
185192
untarred_dname: str,
193+
untarred_original_dname: Optional[str] = None,
186194
) -> None:
195+
"""
196+
Some .tgz files are built from a directory (the archive holds one top-level directory),
197+
while others are built from the directory's contents (files sit at the archive root). If
198+
the .tgz file we're untarring is built from a directory, that directory has an "original"
199+
name; set `untarred_original_dname` to it so that it gets renamed to `untarred_dname`.
200+
"""
187201
expected_symlink_dpath = (
188202
dbgym_cfg.cur_symlinks_data_path(mkdir=True) / f"{untarred_dname}.link"
189203
)
@@ -196,8 +210,20 @@ def _download_and_untar_dir(
196210
logging.getLogger(DBGYM_LOGGER_NAME).info(f"Downloading: {expected_symlink_dpath}")
197211
real_data_path = dbgym_cfg.cur_task_runs_data_path(mkdir=True)
198212
subprocess_run(f"curl -O {download_url}", cwd=real_data_path)
199-
untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname, mkdir=True)
200-
subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
213+
untarred_data_dpath = dbgym_cfg.cur_task_runs_data_path(untarred_dname)
214+
215+
if untarred_original_dname is not None:
216+
assert not untarred_data_dpath.exists()
217+
subprocess_run(f"tar -zxvf {download_tarred_fname}", cwd=real_data_path)
218+
assert (real_data_path / untarred_original_dname).exists()
219+
subprocess_run(
220+
f"mv {untarred_original_dname} {untarred_dname}", cwd=real_data_path
221+
)
222+
else:
223+
untarred_data_dpath.mkdir(parents=True, exist_ok=False)
224+
subprocess_run(f"tar -zxvf ../{download_tarred_fname}", cwd=untarred_data_dpath)
225+
226+
assert untarred_data_dpath.exists()
201227
subprocess_run(f"rm {download_tarred_fname}", cwd=real_data_path)
202228
symlink_dpath = link_result(dbgym_cfg, untarred_data_dpath)
203229
assert expected_symlink_dpath.samefile(symlink_dpath)

scripts/run_protox_e2e_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,5 +200,5 @@ def run_e2e_for_benchmark(benchmark_name: str, intended_dbdata_hardware: str) ->
200200
# Set the config file so that we use resources that don't conflict with normal usage (e.g. a different workspace, different ports, etc.).
201201
os.environ["DBGYM_CONFIG_PATH"] = str(E2ETEST_DBGYM_CONFIG_FPATH)
202202

203-
run_e2e_for_benchmark("tpch", intended_dbdata_hardware)
203+
# run_e2e_for_benchmark("tpch", intended_dbdata_hardware) # TODO: Uncomment this
204204
run_e2e_for_benchmark("job", intended_dbdata_hardware)

0 commit comments

Comments
 (0)