Skip to content

Commit b5fd76e

Browse files
committed
fully removed the old open_and_save, save_file, and link_result
1 parent 1dcf669 commit b5fd76e

File tree

4 files changed

+3
-166
lines changed

4 files changed

+3
-166
lines changed

benchmark/job/cli.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,7 @@
1111
from benchmark.constants import DEFAULT_SCALE_FACTOR
1212
from util.log import DBGYM_LOGGER_NAME
1313
from util.shell import subprocess_run
14-
from util.workspace import (
15-
DBGymWorkspace,
16-
fully_resolve_path,
17-
is_fully_resolved,
18-
link_result,
19-
)
14+
from util.workspace import DBGymWorkspace, fully_resolve_path
2015

2116
JOB_TABLES_URL = "https://event.cwi.nl/da/job/imdb.tgz"
2217
JOB_QUERIES_URL = "https://event.cwi.nl/da/job/job.tgz"

env/pg_conn.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
from util.log import DBGYM_LOGGER_NAME
2626
from util.pg import DBGYM_POSTGRES_DBNAME, SHARED_PRELOAD_LIBRARIES, get_kv_connstr
27-
from util.workspace import DBGymWorkspace, open_and_save, parent_path_of_path
27+
from util.workspace import DBGymWorkspace, parent_path_of_path
2828

2929
CONNECT_TIMEOUT = 300
3030

util/pg.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import sqlalchemy
1212
from sqlalchemy import create_engine, text
1313

14-
from util.workspace import DBGymWorkspace, open_and_save
14+
from util.workspace import DBGymWorkspace
1515

1616
DBGYM_POSTGRES_USER = "dbgym_user"
1717
DBGYM_POSTGRES_PASS = "dbgym_pass"

util/workspace.py

Lines changed: 0 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -524,42 +524,6 @@ def is_child_path(child_path: os.PathLike[str], parent_path: os.PathLike[str]) -
524524
)
525525

526526

527-
# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
528-
def open_and_save(
529-
dbgym_workspace: DBGymWorkspace, open_path: Path, mode: str = "r"
530-
) -> IO[Any]:
531-
"""
532-
Open a file and "save" it to [workspace]/task_runs/run_*/.
533-
It takes in a str | Path to match the interface of open().
534-
This function does not work if open_path is a symlink, to make its interface identical to that of open().
535-
Make sure to resolve all symlinks with fully_resolve_path().
536-
To avoid confusion, I'm enforcing this function to only work with absolute paths.
537-
# TODO: maybe make it work on non-fully-resolved paths to better match open()
538-
See the comment of save_file() for what "saving" means
539-
If you are generating a "result" for the run, _do not_ use this. Just use the normal open().
540-
This shouldn't be too hard to remember because this function crashes if open_path doesn't exist,
541-
and when you write results you're usually opening open_paths which do not exist.
542-
"""
543-
# validate open_path
544-
assert isinstance(open_path, Path)
545-
assert is_fully_resolved(
546-
open_path
547-
), f"open_and_save(): open_path ({open_path}) should be a fully resolved path"
548-
assert not os.path.islink(
549-
open_path
550-
), f"open_path ({open_path}) should not be a symlink"
551-
assert os.path.exists(open_path), f"open_path ({open_path}) does not exist"
552-
# open_and_save *must* be called on files because it doesn't make sense to open a directory. note that this doesn't mean we'll always save
553-
# a file though. we sometimes save a directory (see save_file() for details)
554-
assert os.path.isfile(open_path), f"open_path ({open_path}) is not a file"
555-
556-
# save
557-
save_file(dbgym_workspace, open_path)
558-
559-
# open
560-
return open(open_path, mode=mode)
561-
562-
563527
def extract_from_task_run_path(
564528
dbgym_workspace: DBGymWorkspace, task_run_path: Path
565529
) -> tuple[Path, str, Path, str]:
@@ -593,128 +557,6 @@ def extract_from_task_run_path(
593557
return codebase_path, codebase_dname, org_path, org_dname
594558

595559

596-
# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
597-
def save_file(dbgym_workspace: DBGymWorkspace, path: Path) -> None:
598-
"""
599-
If an external function takes in a file/directory as input, you will not be able to call open_and_save().
600-
In these situations, just call save_file().
601-
Like open_and_save(), this function only works with real absolute paths.
602-
"Saving" can mean either copying the file or creating a symlink to it
603-
We copy the file if it is a "config", meaning it just exists without having been generated
604-
We create a symlink if it is a "dependency", meaning a task.py command was run to generate it
605-
In these cases we create a symlink so we have full provenance for how the dependency was created
606-
"""
607-
# validate path
608-
assert is_fully_resolved(path), f"path ({path}) should be a fully resolved path"
609-
assert os.path.isfile(path), f"path ({path}) should be a file"
610-
assert not is_child_path(
611-
path, dbgym_workspace.dbgym_this_run_path
612-
), f"path ({path}) was generated in this task run ({dbgym_workspace.dbgym_this_run_path}). You do not need to save it"
613-
614-
# save _something_ to dbgym_this_run_path
615-
# save a symlink if the opened file was generated by a run. this is for two reasons:
616-
# 1. files or dirs generated by a run are supposed to be immutable so saving a symlink is safe
617-
# 2. files or dirs generated by a run may be very large (up to 100s of GBs) so we don't want to copy them
618-
if is_child_path(path, dbgym_workspace.dbgym_runs_path):
619-
# get paths we'll need later.
620-
_, codebase_dname, org_path, org_dname = extract_from_task_run_path(
621-
dbgym_workspace, path
622-
)
623-
this_run_save_path = (
624-
dbgym_workspace.dbgym_this_run_path / codebase_dname / org_dname
625-
)
626-
os.makedirs(this_run_save_path, exist_ok=True)
627-
628-
# if the path file is directly in org_path, we symlink the file directly
629-
parent_path = parent_path_of_path(path)
630-
if parent_path.samefile(org_path):
631-
fname = basename_of_path(path)
632-
symlink_path = this_run_save_path / (fname + ".link")
633-
try_create_symlink(path, symlink_path)
634-
# else, we know the path file is _not_ directly inside org_path dir
635-
# we go as far back as we can while still staying in org_path and symlink that "base" dir
636-
# this is because lots of runs create dirs within org_path and it's just a waste of space to symlink every individual file
637-
else:
638-
# set base_path such that its parent is org_path
639-
base_path = parent_path
640-
while not parent_path_of_path(base_path).samefile(org_path):
641-
base_path = parent_path_of_path(base_path)
642-
643-
# create symlink
644-
open_base_dname = basename_of_path(base_path)
645-
symlink_path = this_run_save_path / (open_base_dname + ".link")
646-
try_create_symlink(base_path, symlink_path)
647-
# if it wasn't generated by a run
648-
else:
649-
# since we don't know where the file is at all, the location is "unknown" and the org is "all"
650-
this_run_save_path = dbgym_workspace.dbgym_this_run_path / "unknown" / "all"
651-
os.makedirs(this_run_save_path, exist_ok=True)
652-
fname = basename_of_path(path)
653-
# in this case, we want to copy instead of symlinking since it might disappear in the future
654-
copy_path = this_run_save_path / fname
655-
shutil.copy(path, copy_path)
656-
657-
658-
# TODO(phw2): deprecate this once I'm done with unittest_workspace.py
659-
def link_result(
660-
dbgym_workspace: DBGymWorkspace,
661-
result_path: Path,
662-
custom_result_name: Optional[str] = None,
663-
) -> Path:
664-
"""
665-
result_path must be a "result", meaning it was generated inside dbgym_workspace.dbgym_this_run_path.
666-
Further, result_path must have been generated by this invocation to task.py. This also means that
667-
result_path itself can be a file or a dir but not a symlink.
668-
Given a file or directory in task_runs/run_*/[codebase]/[org], this will create a symlink inside
669-
symlinks/[codebase]/[org]/.
670-
Will override the old symlink if there is one, so that symlinks/ always contains the latest generated
671-
version of a file.
672-
This function will return the path to the symlink that was created.
673-
"""
674-
assert isinstance(result_path, Path)
675-
assert is_fully_resolved(
676-
result_path
677-
), f"result_path ({result_path}) should be a fully resolved path"
678-
assert is_child_path(result_path, dbgym_workspace.dbgym_this_run_path)
679-
assert not os.path.islink(result_path)
680-
681-
if type(custom_result_name) is str:
682-
result_name = custom_result_name
683-
else:
684-
if os.path.isfile(result_path):
685-
result_name = basename_of_path(result_path) + ".link"
686-
elif os.path.isdir(result_path):
687-
result_name = basename_of_path(result_path) + ".link"
688-
else:
689-
raise AssertionError("result_path must be either a file or dir")
690-
691-
# Figure out the parent directory path of the symlink
692-
codebase_path, codebase_dname, _, org_dname = extract_from_task_run_path(
693-
dbgym_workspace, result_path
694-
)
695-
# We're only supposed to save files generated by us, which means they should be in cur_task_runs_path()
696-
assert codebase_path.samefile(
697-
dbgym_workspace.cur_task_runs_path()
698-
), f"link_result should only be called on files generated by this invocation to task.py"
699-
symlink_parent_path = (
700-
dbgym_workspace.dbgym_symlinks_path / codebase_dname / org_dname
701-
)
702-
symlink_parent_path.mkdir(parents=True, exist_ok=True)
703-
704-
# Remove the old symlink ("old" meaning created in an earlier run) if there is one
705-
# Note that in a multi-threaded setting, this might remove one created by a process in the same run,
706-
# meaning it's not "old" by our definition of "old". However, we'll always end up with a symlink
707-
# file of the current run regardless of the order of threads.
708-
assert result_name.endswith(".link") and not result_name.endswith(
709-
".link.link"
710-
), f'result_name ({result_name}) should end with ".link"'
711-
symlink_path = symlink_parent_path / result_name
712-
try_remove_file(symlink_path)
713-
try_create_symlink(result_path, symlink_path)
714-
715-
return symlink_path
716-
717-
718560
def try_create_symlink(src_path: Path, dst_path: Path) -> None:
719561
"""
720562
Our functions that create symlinks might be called by multiple processes at once

0 commit comments

Comments
 (0)