@@ -1,17 +1,17 @@
 import os
 import posixpath
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any, TypedDict, Union
+from typing import TYPE_CHECKING, Optional, TypedDict, Union

 from dvc.fs.callbacks import DEFAULT_CALLBACK
-from dvc.repo.worktree import worktree_view
 from dvc.ui import ui
+from dvc_data.index.view import DataIndexView

 if TYPE_CHECKING:
     from dvc.fs.callbacks import Callback
     from dvc.repo import Repo
     from dvc.scm import Git, NoSCM
-    from dvc_data.index import DataIndex
+    from dvc_data.index import BaseDataIndex, DataIndex, DataIndexKey
     from dvc_data.index.diff import Change

@@ -47,12 +47,13 @@ def _adapt_path(change: "Change") -> str:


 def _diff(
-    old: "DataIndex",
-    new: "DataIndex",
+    old: "BaseDataIndex",
+    new: "BaseDataIndex",
     *,
     granular: bool = False,
     not_in_cache: bool = False,
     callback: "Callback" = DEFAULT_CALLBACK,
+    filter_keys: Optional[list["DataIndexKey"]] = None,
 ) -> dict[str, list[str]]:
     from dvc_data.index.diff import UNCHANGED, UNKNOWN, diff

@@ -74,6 +75,16 @@ def _add_change(typ, change):
         with_unknown=True,
         callback=callback,
     ):
+        # The index is a trie, so even when we filter by a specific path
+        # like `dir/file`, all parent nodes leading to that path (e.g. `dir/`)
+        # still appear in the view. As a result, keys like `dir/` will be present
+        # even if only `dir/file` matches the filter.
+        # Skip such entries to avoid showing the root of tracked directories.
+        if filter_keys and not any(
+            change.key[: len(filter_key)] == filter_key for filter_key in filter_keys
+        ):
+            continue
+
         if (
             change.typ == UNCHANGED
             and (not change.old or not change.old.hash_info)
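The skip above is a plain tuple-prefix test: data index keys in dvc_data are tuples of path parts, so slicing a key to the length of the filter key and comparing answers "is this entry at or below the target?". A minimal standalone sketch (illustrative only, not taken from the patch):

def is_under(key: tuple[str, ...], filter_key: tuple[str, ...]) -> bool:
    # ("dir", "file")[:1] == ("dir",) -> True, so `dir/file` is kept for target `dir`
    return key[: len(filter_key)] == filter_key

assert is_under(("dir", "file"), ("dir",))        # descendant of the target: kept
assert not is_under(("other", "file"), ("dir",))  # unrelated path: skipped
assert not is_under(("dir",), ("dir", "file"))    # bare parent node: skipped
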
@@ -136,7 +147,45 @@ def _git_info(scm: Union["Git", "NoSCM"], untracked_files: str = "all") -> GitIn
     )


-def _diff_index_to_wtree(repo: "Repo", **kwargs: Any) -> dict[str, list[str]]:
+def filter_index(
+    index: Union["DataIndex", "DataIndexView"],
+    filter_keys: Optional[list["DataIndexKey"]] = None,
+) -> "BaseDataIndex":
+    if not filter_keys:
+        return index
+
+    if isinstance(index, DataIndexView):
+        orig_index = index._index
+        parent_filter_fn = index.filter_fn
+    else:
+        orig_index = index
+        parent_filter_fn = None
+
+    def filter_fn(key: "DataIndexKey") -> bool:
+        if parent_filter_fn is not None and not parent_filter_fn(key):
+            return False
+
+        for filter_key in filter_keys:
+            # e.g. if key is `dir/file` and filter_key is `dir/`, return True
+            if key[: len(filter_key)] == filter_key:
+                return True
+            # e.g. if key is `dir/` and filter_key is `dir/file`, also return True;
+            # intermediate parent prefixes must be retained so that the matching
+            # nested keys stay reachable in the view.
+            if filter_key[: len(key)] == key:
+                return True
+        return False
+
+    from dvc_data.index import view
+
+    return view(orig_index, filter_fn=filter_fn)
+
+
+def _diff_index_to_wtree(
+    repo: "Repo",
+    filter_keys: Optional[list["DataIndexKey"]] = None,
+    granular: bool = False,
+) -> dict[str, list[str]]:
     from .index import build_data_index

     with ui.progress(desc="Building workspace index", unit="entry") as pb:
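The two prefix checks in `filter_fn` keep both descendants and ancestors of a filter key, so the resulting view remains a traversable trie. A small self-contained sketch of that behaviour, using plain tuples in place of real index keys (illustrative only):

def keep(key: tuple[str, ...], filter_keys: list[tuple[str, ...]]) -> bool:
    # Keep keys under a filter key, plus the prefixes leading down to it.
    return any(
        key[: len(fk)] == fk or fk[: len(key)] == key for fk in filter_keys
    )

filter_keys = [("dir", "file")]
assert keep(("dir",), filter_keys)                 # ancestor: retained so the leaf stays reachable
assert keep(("dir", "file"), filter_keys)          # the target itself
assert keep(("dir", "file", "part"), filter_keys)  # anything nested under the target
assert not keep(("dir", "other"), filter_keys)     # sibling leaves are excluded from the view
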
@@ -147,30 +196,45 @@ def _diff_index_to_wtree(repo: "Repo", **kwargs: Any) -> dict[str, list[str]]:
             compute_hash=True,
             callback=pb.as_callback(),
         )
+    workspace_view = filter_index(workspace, filter_keys=filter_keys)

     with ui.progress(
         desc="Calculating diff between index/workspace",
         unit="entry",
     ) as pb:
+        index = repo.index.data["repo"]
+        view = filter_index(index, filter_keys=filter_keys)
         return _diff(
-            repo.index.data["repo"],
-            workspace,
+            view,
+            workspace_view,
+            filter_keys=filter_keys,
+            granular=granular,
             not_in_cache=True,
             callback=pb.as_callback(),
-            **kwargs,
         )


 def _diff_head_to_index(
-    repo: "Repo", head: str = "HEAD", **kwargs: Any
+    repo: "Repo",
+    head: str = "HEAD",
+    filter_keys: Optional[list["DataIndexKey"]] = None,
+    granular: bool = False,
 ) -> dict[str, list[str]]:
     index = repo.index.data["repo"]
+    index_view = filter_index(index, filter_keys=filter_keys)

     with repo.switch(head):
         head_index = repo.index.data["repo"]
+        head_view = filter_index(head_index, filter_keys=filter_keys)

     with ui.progress(desc="Calculating diff between head/index", unit="entry") as pb:
-        return _diff(head_index, index, callback=pb.as_callback(), **kwargs)
+        return _diff(
+            head_view,
+            index_view,
+            filter_keys=filter_keys,
+            granular=granular,
+            callback=pb.as_callback(),
+        )


 class Status(TypedDict):
@@ -204,46 +268,87 @@ def _transform_git_paths_to_dvc(repo: "Repo", files: Iterable[str]) -> list[str]

 def _get_entries_not_in_remote(
     repo: "Repo",
+    filter_keys: Optional[list["DataIndexKey"]] = None,
     granular: bool = False,
     remote_refresh: bool = False,
 ) -> list[str]:
     """Get entries that are not in remote storage."""
+    from dvc.repo.worktree import worktree_view
     from dvc_data.index import StorageKeyError

     # View into the index, with only pushable entries
-    view = worktree_view(repo.index, push=True).data["repo"]
+    index = worktree_view(repo.index, push=True)
+    data_index = index.data["repo"]

-    missing_entries = []
-    for key, entry in view.iteritems(shallow=not granular):
-        if not (entry and entry.hash_info):
-            continue
+    view = filter_index(data_index, filter_keys=filter_keys)  # type: ignore[arg-type]

-        k = (*key, "") if entry.meta and entry.meta.isdir else key
-        try:
-            if not view.storage_map.remote_exists(entry, refresh=remote_refresh):
-                missing_entries.append(os.path.sep.join(k))
-        except StorageKeyError:
-            pass
+    missing_entries = []
+    with ui.progress(desc="Checking remote", unit="entry") as pb:
+        for key, entry in view.iteritems(shallow=not granular):
+            if not (entry and entry.hash_info):
+                continue
+
+            # The index is a trie, so even when we filter by a specific path
+            # like `dir/file`, all parent nodes leading to that path (e.g. `dir/`)
+            # still appear in the view. As a result, keys like `dir/` will be present
+            # even if only `dir/file` matches the filter.
+            # Skip such entries to avoid showing the root of tracked directories.
+            if filter_keys and not any(
+                key[: len(filter_key)] == filter_key for filter_key in filter_keys
+            ):
+                continue
+
+            k = (*key, "") if entry.meta and entry.meta.isdir else key
+            try:
+                if not view.storage_map.remote_exists(entry, refresh=remote_refresh):
+                    missing_entries.append(os.path.sep.join(k))
+                pb.update()
+            except StorageKeyError:
+                pass

     return missing_entries


+def _matches_target(p: str, targets: Iterable[str]) -> bool:
+    sep = os.sep
+    return any(p == t or p.startswith(t + sep) for t in targets)
+
+
+def _prune_keys(filter_keys: list["DataIndexKey"]) -> list["DataIndexKey"]:
+    sorted_keys = sorted(set(filter_keys), key=len)
+    result: list[DataIndexKey] = []
+
+    for key in sorted_keys:
+        if not any(key[: len(prefix)] == prefix for prefix in result):
+            result.append(key)
+    return result
+
+
 def status(
     repo: "Repo",
+    targets: Optional[Iterable[Union[os.PathLike[str], str]]] = None,
+    *,
+    granular: bool = False,
     untracked_files: str = "no",
     not_in_remote: bool = False,
     remote_refresh: bool = False,
-    granular: bool = False,
     head: str = "HEAD",
 ) -> Status:
     from dvc.scm import NoSCMError, SCMError

-    uncommitted_diff = _diff_index_to_wtree(repo, granular=granular)
-    unchanged = set(uncommitted_diff.pop("unchanged", []))
+    targets = targets or []
+    filter_keys: list[DataIndexKey] = [repo.fs.relparts(os.fspath(t)) for t in targets]
+    # try to remove duplicate and overlapping keys
+    filter_keys = _prune_keys(filter_keys)

+    uncommitted_diff = _diff_index_to_wtree(
+        repo, filter_keys=filter_keys, granular=granular
+    )
+    unchanged = set(uncommitted_diff.pop("unchanged", []))
     entries_not_in_remote = (
         _get_entries_not_in_remote(
             repo,
+            filter_keys=filter_keys,
             granular=granular,
             remote_refresh=remote_refresh,
         )
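Because every change key is tested against every filter key, `_prune_keys` first drops targets that are already covered by a shorter prefix. A self-contained sketch of that pruning, mirroring the helper above with plain tuples (illustrative only):

def prune(keys: list[tuple[str, ...]]) -> list[tuple[str, ...]]:
    result: list[tuple[str, ...]] = []
    for key in sorted(set(keys), key=len):
        # Drop any key already covered by a shorter prefix kept earlier.
        if not any(key[: len(prefix)] == prefix for prefix in result):
            result.append(key)
    return result

# `dir/file` is redundant once `dir` is requested; exact duplicates collapse as well.
assert prune([("dir", "file"), ("dir",), ("dir",)]) == [("dir",)]
assert prune([("a", "b"), ("c",)]) == [("c",), ("a", "b")]
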
@@ -252,16 +357,24 @@ def status(
     )

     try:
-        committed_diff = _diff_head_to_index(repo, head=head, granular=granular)
+        committed_diff = _diff_head_to_index(
+            repo, filter_keys=filter_keys, head=head, granular=granular
+        )
     except (SCMError, NoSCMError):
         committed_diff = {}
     else:
         unchanged &= set(committed_diff.pop("unchanged", []))

     git_info = _git_info(repo.scm, untracked_files=untracked_files)
-    untracked = git_info.get("untracked", [])
-    untracked = _transform_git_paths_to_dvc(repo, untracked)
-
+    scm_filter_targets = {
+        os.path.relpath(os.path.abspath(t), repo.scm.root_dir) for t in targets
+    }
+    untracked_it: Iterable[str] = git_info.get("untracked", [])
+    if scm_filter_targets:
+        untracked_it = (
+            f for f in untracked_it if _matches_target(f, scm_filter_targets)
+        )
+    untracked = _transform_git_paths_to_dvc(repo, untracked_it)
     # order matters here
     return Status(
         not_in_cache=uncommitted_diff.pop("not_in_cache", []),
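Untracked paths reported by git are kept only when they equal a target or live underneath it; the `startswith(t + sep)` check avoids matching a sibling that merely shares a string prefix. A quick illustrative check of that matching rule (plain Python, not taken from the patch):

import os

def matches_target(p: str, targets: set[str]) -> bool:
    sep = os.sep
    return any(p == t or p.startswith(t + sep) for t in targets)

targets = {"dir"}
assert matches_target("dir", targets)                        # the target itself
assert matches_target(os.path.join("dir", "file"), targets)  # a path under the target
assert not matches_target("dir2", targets)                   # shared string prefix only: excluded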