1
+ import shutil
2
+ from typing import List , Set
3
+ import click
4
+ import yaml
5
+ import logging
6
+ from pathlib import Path
7
+ from misc .utils import DBGymConfig , is_child_path , parent_dpath_of_path
8
+ from itertools import chain
9
+ import os
10
+
11
+
12
# Shared "task" logger for the manage commands; INFO is enabled so the
# "Read:"/"Updated:"/cleanup summaries emitted below are not filtered out.
task_logger = logging.getLogger(name="task")
task_logger.setLevel(level=logging.INFO)
14
+
15
+
16
@click.group(name="manage")
def manage_group():
    # Container for the "manage" subcommands. The group callback itself does
    # nothing; the subcommands are attached with manage_group.add_command().
    # (A comment rather than a docstring, so click's --help text is unchanged.)
    return None
19
+
20
+
21
@click.command(name="show")
@click.argument("keys", nargs=-1)
@click.pass_obj
def manage_show(dbgym_cfg, keys):
    """Print the config value reached by following KEYS into the YAML config."""
    # dbgym_cfg: click context object exposing `.path` (location of the config
    #            file) and `.yaml` (the parsed config).
    # keys:      zero or more keys applied one after another; with no keys the
    #            whole config is shown. A missing key raises KeyError.
    config_path = dbgym_cfg.path
    config_yaml = dbgym_cfg.yaml

    # Traverse the YAML.
    for key in keys:
        config_yaml = config_yaml[key]

    # Pretty-print the requested YAML value.
    if isinstance(config_yaml, dict):
        output_str = yaml.dump(config_yaml, default_flow_style=False)
        if keys:
            # Indent *every* line of a nested mapping, not just the first one,
            # so the dump reads as a block belonging to the requested key.
            indent = " "
            output_str = indent + output_str.replace("\n", "\n" + indent)
        # Drops the trailing newline from yaml.dump (and the indent added after it).
        output_str = output_str.rstrip()
    else:
        # Non-mapping values are printed as-is.
        output_str = config_yaml
    print(output_str)

    task_logger.info(f"Read: {Path(config_path)} ")
44
+
45
+
46
@click.command(name="write")
@click.argument("keys", nargs=-1)
@click.argument("value_type")
@click.argument("value")
@click.pass_obj
def manage_write(dbgym_cfg, keys, value_type, value):
    """Set the config entry addressed by KEYS to VALUE converted with the builtin type VALUE_TYPE."""
    # dbgym_cfg:  click context object exposing `.path` and `.yaml`.
    # keys:       path to an existing, non-mapping entry; at least one key.
    # value_type: name of a builtin type such as "str", "int", "float", "bool".
    # value:      raw command-line string to convert.
    #
    # `__builtins__` is a module only inside `__main__`; in an imported module
    # it is a plain dict (CPython detail), so getattr() on it fails. The
    # `builtins` module is the portable way to look up a builtin by name.
    import builtins

    config_path = dbgym_cfg.path
    config_yaml = dbgym_cfg.yaml

    if not keys:
        raise click.UsageError("At least one key is required to locate the value to write.")

    # Only accept names of builtin *types*; this rejects arbitrary builtin
    # functions (eval, exec, open, ...) being invoked on the argument.
    converter = getattr(builtins, value_type, None)
    if not isinstance(converter, type):
        raise click.BadParameter(
            f"{value_type!r} is not the name of a builtin type", param_hint="value_type"
        )

    try:
        if converter is bool:
            # bool("False") is True, so booleans are parsed by spelling instead.
            normalized = value.strip().lower()
            if normalized in ("true", "1", "yes", "on"):
                new_value = True
            elif normalized in ("false", "0", "no", "off", ""):
                new_value = False
            else:
                raise ValueError(f"cannot interpret {value!r} as a boolean")
        else:
            new_value = converter(value)
    except (TypeError, ValueError) as err:
        raise click.BadParameter(str(err), param_hint="value") from err

    # Traverse the YAML.
    root_yaml = config_yaml
    for key in keys[:-1]:
        config_yaml = config_yaml[key]

    # Modify the requested YAML value and write the YAML file.
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if isinstance(config_yaml[keys[-1]], dict):
        raise click.UsageError(
            "The requested key refers to a mapping; only leaf values can be written."
        )
    config_yaml[keys[-1]] = new_value
    new_yaml = yaml.dump(root_yaml, default_flow_style=False).rstrip()
    Path(config_path).write_text(new_yaml)

    task_logger.info(f"Updated: {Path(config_path)} ")
67
+
68
+
69
@click.command(name="standardize")
@click.pass_obj
def manage_standardize(dbgym_cfg):
    # Re-serialize the already-parsed config in block style and write it back
    # over the config file, so the file ends up in yaml.dump's canonical layout.
    # (A comment rather than a docstring, so click's --help text is unchanged.)
    target_fpath = Path(dbgym_cfg.path)
    serialized = yaml.dump(dbgym_cfg.yaml, default_flow_style=False)

    # Write the YAML file (trailing whitespace from the dump is stripped).
    target_fpath.write_text(serialized.rstrip())

    task_logger.info(f"Updated: {target_fpath} ")
80
+
81
+
82
@click.command("clean")
@click.pass_obj
@click.option(
    "--mode",
    type=click.Choice(["safe", "aggressive"]),
    default="safe",
    help="The mode to clean the workspace (default=\" safe\" ). \" aggressive\" means \" only keep run_*/ folders referenced by a file in symlinks/\" . \" safe\" means \" in addition to that, recursively keep any run_*/ folders referenced by any symlinks in run_*/ folders we are keeping.\" ",
)
def manage_clean(dbgym_cfg: DBGymConfig, mode: str):
    # CLI entry point for workspace cleanup: forwards the chosen mode to
    # clean_workspace() and always asks it to log a summary.
    clean_workspace(dbgym_cfg, mode, verbose=True)
92
+
93
+
94
@click.command("count")
@click.pass_obj
def manage_count(dbgym_cfg: DBGymConfig):
    # Print how many entries _count_files_in_workspace() finds in the workspace.
    entry_total = _count_files_in_workspace(dbgym_cfg)
    workspace_dpath = dbgym_cfg.dbgym_workspace_path
    print(f"The workspace ({workspace_dpath} ) has {entry_total} total files/dirs/symlinks.")
99
+
100
+
101
def add_symlinks_in_dpath(symlinks_stack: List[Path], root_dpath: Path, processed_symlinks: Set[Path]) -> None:
    """
    Push every symlink found under root_dpath that has not been seen yet.

    Both symlinks_stack and processed_symlinks are modified in place: each newly
    discovered symlink is appended to the stack and recorded in the set, so a
    later call will not push the same path again.
    """
    for current_dir, subdir_names, plain_names in os.walk(root_dpath):
        # A symlink can be reported as a directory or as a file, so both lists are scanned.
        for entry_name in chain(subdir_names, plain_names):
            candidate = Path(current_dir, entry_name)
            if not candidate.is_symlink():
                continue
            if candidate in processed_symlinks:
                continue
            processed_symlinks.add(candidate)
            symlinks_stack.append(candidate)
113
+
114
+
115
def _count_files_in_workspace(dbgym_cfg: DBGymConfig) -> int:
    """
    Counts the number of files (regular file or dir or symlink) in the workspace.

    Every entry below dbgym_cfg.dbgym_workspace_path is counted exactly once; the
    workspace directory itself is not counted. Symlinks are counted as entries but
    never followed, so whatever they point to is not counted through them.
    """
    total_count = 0
    # With followlinks=False, os.walk still lists symlinks to directories in
    # dirnames but does not descend into them. They must therefore stay in the
    # tally: filtering them out would make directory symlinks the only kind of
    # entry missing from the count.
    for _dirpath, dirnames, filenames in os.walk(dbgym_cfg.dbgym_workspace_path, followlinks=False):
        total_count += len(filenames) + len(dirnames)

    return total_count
128
+
129
+
130
def clean_workspace(dbgym_cfg: DBGymConfig, mode: str = "safe", verbose=False) -> None:
    """
    Clean all [workspace]/task_runs/run_*/ directories that are not referenced by any "active symlinks".
    If mode is "aggressive", "active symlinks" means *only* the symlinks found under [workspace]/symlinks/.
    If mode is "safe", "active symlinks" means the symlinks found under [workspace]/symlinks/ as well as
        any symlinks found inside task_runs/ children we have already decided to keep.

    Args:
        dbgym_cfg: provides dbgym_symlinks_path, dbgym_runs_path and dbgym_workspace_path.
        mode: "safe" or "aggressive" (see above). Any value other than "safe" behaves like "aggressive".
        verbose: if true, log how many workspace entries were removed.

    Raises:
        AssertionError: if a processed symlink does not point directly at its fully resolved target
            (e.g. it points to another symlink), or if it resolves to task_runs/ itself.
    """
    # This stack holds the symlinks that are left to be processed
    symlink_fpaths_to_process: List[Path] = []
    # This set holds the symlinks that have already been processed to avoid infinite loops
    processed_symlinks: Set[Path] = set()

    # 1. Initialize paths to process
    if dbgym_cfg.dbgym_symlinks_path.exists():
        add_symlinks_in_dpath(symlink_fpaths_to_process, dbgym_cfg.dbgym_symlinks_path, processed_symlinks)

    # 2. Go through symlinks, figuring out which "children of task runs" to keep
    # Based on the rules of the framework, "children of task runs" should be run_*/ directories.
    # However, the user's workspace might happen to break these rules by putting directories not
    # named "run_*/" or files directly in task_runs/. Thus, I use the term "task_run_child_fordpaths"
    # instead of "run_dpaths".
    task_run_child_fordpaths_to_keep: Set[Path] = set()

    if dbgym_cfg.dbgym_runs_path.exists():
        while symlink_fpaths_to_process:
            symlink_fpath: Path = symlink_fpaths_to_process.pop()
            assert symlink_fpath.is_symlink()
            # Path.resolve() resolves all layers of symlinks while os.readlink() only resolves one layer.
            # However, os.readlink() literally reads the string contents of the link, so a relative
            # target is first made absolute (relative to the directory containing the link) before the
            # two are compared. Absolute targets are compared exactly as stored.
            real_fordpath = symlink_fpath.resolve()
            one_layer_resolved_fordpath = os.readlink(symlink_fpath)
            if not os.path.isabs(one_layer_resolved_fordpath):
                one_layer_resolved_fordpath = os.path.normpath(
                    os.path.join(symlink_fpath.parent, one_layer_resolved_fordpath)
                )
            assert str(real_fordpath) == str(one_layer_resolved_fordpath), f"symlink_fpath ({symlink_fpath} ) seems to point to *another* symlink. This is difficult to handle, so it is currently disallowed. Please resolve this situation manually."

            # If the file doesn't exist, we'll just ignore it.
            if not real_fordpath.exists():
                continue
            # We're only trying to figure out which direct children of task_runs/ to save. If the file isn't
            # even a descendant, we don't care about it.
            if not is_child_path(real_fordpath, dbgym_cfg.dbgym_runs_path):
                continue

            assert not os.path.samefile(real_fordpath, dbgym_cfg.dbgym_runs_path)

            # Figure out the task_run_child_fordpath to put into task_run_child_fordpaths_to_keep by walking
            # up from the target until we reach a direct child of task_runs/. The loop body never runs when
            # the target already is a direct child. Neither case is strictly allowed by the framework
            # (symlinks should point into task_runs/run_*/[codebase]/[organization]/), but we won't nuke
            # files just because the workspace doesn't follow the rules for some reason.
            task_run_child_fordpath = real_fordpath
            while not os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path):
                task_run_child_fordpath = parent_dpath_of_path(task_run_child_fordpath)
            assert os.path.samefile(parent_dpath_of_path(task_run_child_fordpath), dbgym_cfg.dbgym_runs_path), f"task_run_child_fordpath ({task_run_child_fordpath} ) is not a direct child of dbgym_cfg.dbgym_runs_path"
            task_run_child_fordpaths_to_keep.add(task_run_child_fordpath)

            # If on safe mode, add symlinks inside the task_run_child_fordpath to be processed
            if mode == "safe":
                add_symlinks_in_dpath(symlink_fpaths_to_process, task_run_child_fordpath, processed_symlinks)

    # 3. Go through all children of task_runs/*, deleting any that we weren't told to keep
    # It's true that symlinks might link outside of task_runs/*. We'll just not care about those
    starting_num_files = _count_files_in_workspace(dbgym_cfg)
    if dbgym_cfg.dbgym_runs_path.exists():
        # The keep-set holds fully resolved paths. If dbgym_runs_path itself contains a symlinked
        # component, iterdir() yields a different spelling of the same entry, so each child is also
        # checked under its canonical spelling. This can only ever delete *less* than a plain
        # membership test would.
        canonical_runs_dpath = dbgym_cfg.dbgym_runs_path.resolve()
        for child_fordpath in dbgym_cfg.dbgym_runs_path.iterdir():
            if child_fordpath in task_run_child_fordpaths_to_keep:
                continue
            if canonical_runs_dpath / child_fordpath.name in task_run_child_fordpaths_to_keep:
                continue
            # shutil.rmtree() refuses to operate on a symlink, so a symlink to a directory is removed
            # like a file (only the link goes away, never its target).
            if child_fordpath.is_dir() and not child_fordpath.is_symlink():
                shutil.rmtree(child_fordpath)
            else:
                os.remove(child_fordpath)
    ending_num_files = _count_files_in_workspace(dbgym_cfg)

    if verbose:
        task_logger.info(f"Removed {starting_num_files - ending_num_files} out of {starting_num_files} files")
        task_logger.info(f"Workspace went from {starting_num_files} to {ending_num_files}")
211
+
212
+
213
# Attach every subcommand to the "manage" group, in the same order as before.
for _subcommand in (
    manage_show,
    manage_write,
    manage_standardize,
    manage_clean,
    manage_count,
):
    manage_group.add_command(_subcommand)
del _subcommand  # keep the loop variable out of the module namespace
0 commit comments