Skip to content

Commit 3245aab

Browse files
Automated workspace pruning/cleaning (#35)
**Summary**: A command to find and remove unneeded files from the workspace directory.

**Demo**: Passing 25 unit tests covering many edge cases. Each test creates files/directories, calls `clean_workspace()`, and then verifies the workspace's contents. ![Screenshot 2024-07-07 at 17 05 07](https://github.com/cmu-db/dbgym/assets/20631215/ed2edeae-062a-40ae-b38e-e2ad3718d6b3)

**Details**
* "Aggressive" mode removes all task_runs/\*/ directories that are not directly pointed to by a symlink in symlinks/.
* "Safe" mode also keeps task_runs/\*/ directories which are indirectly pointed to by a symlink. This can happen if a symlink points to a task_runs/\*/ directory which has a symlink in it that points to another task_runs/\*/ directory.
* I chose to write so many unit tests because this operation must be bug-free.
1 parent 2f17bd4 commit 3245aab

File tree

251 files changed

+1030
-130
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

251 files changed

+1030
-130
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
__pycache__/
22
.conda/
33
.idea/
4+
test_clean_scratchspace/
45

56
workspace/
67
default_*_benchbase_config_*.xml

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ These steps were tested on a fresh repository clone, Ubuntu 22.04.
99

1010
```
1111
# Setup dependencies.
12-
# You may want to create a Python virtual environment (e.g. with conda) before doing this.
12+
# You may want to create a Python 3.10 virtual environment (e.g. with conda) before doing this.
1313
./dependency/install_dependencies.sh
1414
1515
# Compile a custom fork of PostgreSQL, load TPC-H (SF 0.01), train the Proto-X agent, and tune.
File renamed without changes.
File renamed without changes.

dependencies/rust.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
2+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
File renamed without changes.

manage/cli.py

Lines changed: 217 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,217 @@
1+
import shutil
2+
from typing import List, Set
3+
import click
4+
import yaml
5+
import logging
6+
from pathlib import Path
7+
from misc.utils import DBGymConfig, is_child_path, parent_dpath_of_path
8+
from itertools import chain
9+
import os
10+
11+
12+
# Logger named "task"; the commands in this file report what they read or changed through it.
# Its level is set to INFO so the info-level messages emitted below are not filtered by this logger.
task_logger = logging.getLogger("task")
task_logger.setLevel(logging.INFO)
14+
15+
16+
@click.group(name="manage")
def manage_group():
    # Container for the `manage` subcommands; it does no work of its own.
    # (A comment rather than a docstring, so click's generated help text is unchanged.)
    return None
19+
20+
21+
@click.command(name="show")
@click.argument("keys", nargs=-1)
@click.pass_obj
def manage_show(dbgym_cfg, keys):
    # Print the config value reached by following `keys` into the loaded YAML.
    # With no keys, the whole document is printed.
    # (Comments rather than a docstring, so click's generated help text is unchanged.)
    config_path = dbgym_cfg.path

    # Descend one level per key.
    node = dbgym_cfg.yaml
    for key in keys:
        node = node[key]

    if type(node) is dict:
        # Mappings are dumped as block-style YAML; nested selections are indented
        # so they read as a sub-tree of the file.
        rendered = yaml.dump(node, default_flow_style=False)
        if keys:
            rendered = " " + rendered.replace("\n", "\n ")
        rendered = rendered.rstrip()
    else:
        # Scalars and lists are printed as-is.
        rendered = node
    print(rendered)

    task_logger.info(f"Read: {Path(config_path)}")
44+
45+
46+
@click.command(name="write")
@click.argument("keys", nargs=-1)
@click.argument("value_type")
@click.argument("value")
@click.pass_obj
def manage_write(dbgym_cfg, keys, value_type, value):
    # Overwrite one non-mapping value in the config YAML and write the file back.
    #
    #   keys:       path of keys leading to the value to replace (at least one is required).
    #   value_type: name of a builtin callable (e.g. "int", "float", "str") used to convert `value`.
    #   value:      the new value, as text from the command line.
    #
    # Raises click.UsageError when no keys are given or the target is a mapping; a missing key
    # raises KeyError and an unknown `value_type` raises AttributeError.
    # (Comments rather than a docstring, so click's generated help text is unchanged.)

    # `__builtins__` is a module only in `__main__`; in an imported module it is a dict, so
    # getattr() on it fails. The `builtins` module works in both situations.
    import builtins

    config_path = dbgym_cfg.path
    root_yaml = dbgym_cfg.yaml

    if not keys:
        raise click.UsageError("At least one key is required to locate the value to write.")

    # Walk down to the mapping that holds the final key.
    parent_yaml = root_yaml
    for key in keys[:-1]:
        parent_yaml = parent_yaml[key]

    leaf_key = keys[-1]
    # Refuse to replace a whole sub-tree with a scalar.
    if isinstance(parent_yaml[leaf_key], dict):
        raise click.UsageError(
            f"The value at {' '.join(keys)} is a mapping; only non-mapping values can be written."
        )

    # NOTE(review): `value_type` is looked up among *all* builtins, so names such as "eval" are
    # accepted and would be applied to `value`. Only use this command with trusted input.
    # Also note that bool("False") is True, because any non-empty string is truthy.
    converter = getattr(builtins, value_type)
    parent_yaml[leaf_key] = converter(value)

    # Serialize the whole document (not just the modified sub-tree) back to the config file.
    new_yaml = yaml.dump(root_yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(new_yaml)

    task_logger.info(f"Updated: {Path(config_path)}")
67+
68+
69+
@click.command(name="standardize")
@click.pass_obj
def manage_standardize(dbgym_cfg):
    # Rewrite the config file from the already-loaded YAML, so it ends up in yaml.dump's
    # block-style layout with trailing whitespace stripped.
    # (Comments rather than a docstring, so click's generated help text is unchanged.)
    target_fpath = Path(dbgym_cfg.path)
    serialized = yaml.dump(dbgym_cfg.yaml, default_flow_style=False)
    target_fpath.write_text(serialized.rstrip())

    task_logger.info(f"Updated: {target_fpath}")
80+
81+
82+
@click.command("clean")
@click.pass_obj
@click.option(
    "--mode",
    type=click.Choice(["safe", "aggressive"]),
    default="safe",
    help=(
        'The mode to clean the workspace (default="safe"). '
        '"aggressive" means "only keep run_*/ folders referenced by a file in symlinks/". '
        '"safe" means "in addition to that, recursively keep any run_*/ folders '
        'referenced by any symlinks in run_*/ folders we are keeping."'
    ),
)
def manage_clean(dbgym_cfg: DBGymConfig, mode: str):
    # CLI entry point for workspace cleaning: always runs verbosely so the removal summary is logged.
    # (Comments rather than a docstring, so click's generated help text is unchanged.)
    clean_workspace(dbgym_cfg, verbose=True, mode=mode)
92+
93+
94+
@click.command("count")
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig):
    # Print the total produced by _count_files_in_workspace for this workspace.
    # (Comments rather than a docstring, so click's generated help text is unchanged.)
    entry_total = _count_files_in_workspace(dbgym_cfg)
    summary = f"The workspace ({dbgym_cfg.dbgym_workspace_path}) has {entry_total} total files/dirs/symlinks."
    print(summary)
99+
100+
101+
def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None:
    """
    Queue every not-yet-seen symlink found anywhere under root_dpath.

    The tree is traversed with os.walk. Each entry that is a symlink and is not already in
    processed_symlinks is appended to symlinks_stack and recorded in processed_symlinks.
    Both collections are modified in place; nothing is returned.
    """
    for current_dir, subdir_names, leaf_names in os.walk(root_dpath):
        parent_dpath = Path(current_dir)
        # A symlink may be listed among the directories or among the files, so scan both lists.
        for entry_name in chain(subdir_names, leaf_names):
            candidate = parent_dpath / entry_name
            if candidate in processed_symlinks or not candidate.is_symlink():
                continue
            symlinks_stack.append(candidate)
            processed_symlinks.add(candidate)
113+
114+
115+
def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int:
    """
    Tally the entries found while walking dbgym_cfg.dbgym_workspace_path.

    For every directory visited, each name os.walk lists as a file is counted, as is each
    sub-directory that is not a symlink. Names in the directory list that are symlinks are dropped
    before counting, so they do not contribute to the total. The workspace root itself is not counted.
    """
    entry_total = 0
    for current_dir, subdir_names, leaf_names in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False):
        real_subdirs = [name for name in subdir_names if not os.path.islink(os.path.join(current_dir, name))]
        # Assign through the slice so os.walk sees the pruned list as well.
        subdir_names[:] = real_subdirs
        entry_total += len(leaf_names) + len(real_subdirs)

    return entry_total
128+
129+
130+
def clean_workspace(dbgym_cfg: DBGymConfig, mode: str="safe", verbose=False) -> None:
    """
    Delete every direct child of [workspace]/task_runs/ that is not referenced by any "active symlink".

    If mode is "aggressive", "active symlinks" means *only* the symlinks found under [workspace]/symlinks/.
    If mode is "safe", "active symlinks" additionally means any symlinks found inside the task_runs/
    children we have already decided to keep (followed transitively).

    Args:
        dbgym_cfg: Supplies dbgym_symlinks_path and dbgym_runs_path (and, through
            _count_files_in_workspace, dbgym_workspace_path).
        mode: "safe" or "aggressive". Only the exact value "safe" enables the transitive behavior.
        verbose: When true, log how many entries were removed and the before/after totals.

    Raises:
        AssertionError: If a symlink points at another symlink, or resolves to task_runs/ itself.
    """
    runs_dpath = dbgym_cfg.dbgym_runs_path

    # This stack holds the symlinks that are left to be processed.
    symlink_fpaths_to_process: List[Path] = []
    # This set holds every symlink ever queued, which prevents infinite loops on cyclic references.
    processed_symlinks: Set[Path] = set()

    # 1. Seed the stack with the symlinks under symlinks/.
    if dbgym_cfg.dbgym_symlinks_path.exists():
        add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks)

    # 2. Go through the symlinks, figuring out which "children of task runs" to keep.
    # Based on the rules of the framework, children of task_runs/ should be run_*/ directories.
    # However, the user's workspace might break these rules by putting other directories, or plain
    # files, directly in task_runs/. Hence the name "task_run_child_fordpaths" (file-or-dir paths)
    # rather than "run_dpaths".
    task_run_child_fordpaths_to_keep: Set[Path] = set()

    if runs_dpath.exists():
        while symlink_fpaths_to_process:
            symlink_fpath = symlink_fpaths_to_process.pop()
            assert symlink_fpath.is_symlink()

            # Path.resolve() follows every layer of symlinks, while os.readlink() follows exactly one
            # and returns the link's literal contents. A relative link is interpreted relative to the
            # directory containing it, so make it absolute before comparing the two results.
            real_fordpath = symlink_fpath.resolve()
            one_layer_resolved_fordpath = Path(os.readlink(symlink_fpath))
            if not one_layer_resolved_fordpath.is_absolute():
                one_layer_resolved_fordpath = Path(
                    os.path.normpath(symlink_fpath.parent / one_layer_resolved_fordpath)
                )
            # Raised explicitly (rather than via `assert`) so this safety check still runs under -O.
            if str(real_fordpath) != str(one_layer_resolved_fordpath):
                raise AssertionError(
                    f"symlink_fpath ({symlink_fpath}) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."
                )

            # A dangling link protects nothing, so ignore it.
            if not real_fordpath.exists():
                continue
            # We only care about targets located under task_runs/.
            # NOTE(review): relies on is_child_path(path, parent) from misc.utils, which is not
            # visible here; it is presumed to report whether `path` lies under `parent`.
            if not is_child_path(real_fordpath, runs_dpath):
                continue

            assert not os.path.samefile(real_fordpath, runs_dpath)

            # Climb from the target to the ancestor that sits directly inside task_runs/. When the
            # target is itself a direct child, the loop body never runs. Targets are supposed to
            # live in task_runs/run_*/[codebase]/[organization]/, but even when the workspace does
            # not follow that rule we keep the containing child rather than delete it.
            task_run_child_fordpath = real_fordpath
            while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), runs_dpath):
                task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
            task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

            # In safe mode, symlinks inside a kept child keep their targets alive as well.
            if mode == "safe":
                add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks)

    # 3. Go through all children of task_runs/, deleting any that we weren't told to keep.
    # Symlinks may also point outside of task_runs/; those targets are never touched.
    starting_num_files = _count_files_in_workspace(dbgym_cfg)
    if runs_dpath.exists():
        # Kept paths come from resolve(), while iterdir() yields paths spelled like runs_dpath. The
        # two spellings differ when the workspace path contains a symlink, so compare by name:
        # every kept path is a direct child of task_runs/, where names are unique.
        names_to_keep = {kept_fordpath.name for kept_fordpath in task_run_child_fordpaths_to_keep}
        for child_fordpath in runs_dpath.iterdir():
            if child_fordpath.name in names_to_keep:
                continue
            # shutil.rmtree() refuses to operate on a symlink, so a link to a directory is removed
            # like a file (which deletes the link only, not its target).
            if child_fordpath.is_dir() and not child_fordpath.is_symlink():
                shutil.rmtree(child_fordpath)
            else:
                os.remove(child_fordpath)
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
        task_logger.info(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files")
        task_logger.info(f"Workspace went from {starting_num_files} to {ending_num_files}")
211+
212+
213+
# Register every subcommand on the `manage` group.
for _subcommand in (manage_show, manage_write, manage_standardize, manage_clean, manage_count):
    manage_group.add_command(_subcommand)
del _subcommand

manage/tests/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)