Skip to content

Commit aac32e1

Browse files
authored
data status: support targets to limit scope of command (#10789)
1 parent 45fd102 commit aac32e1

File tree

4 files changed

+492
-33
lines changed

4 files changed

+492
-33
lines changed

dvc/commands/data.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from funcy import chunks, compact, log_durations
44

5-
from dvc.cli import formatter
5+
from dvc.cli import completion, formatter
66
from dvc.cli.command import CmdBase
77
from dvc.cli.utils import append_doc_link
88
from dvc.log import logger
@@ -108,6 +108,7 @@ def _show_status(cls, status: "DataStatus") -> int: # noqa: C901
108108
def run(self) -> int:
109109
with log_durations(logger.trace, "in data_status"):
110110
status = self.repo.data_status(
111+
targets=self.args.targets,
111112
granular=self.args.granular,
112113
untracked_files=self.args.untracked_files,
113114
not_in_remote=self.args.not_in_remote,
@@ -147,6 +148,14 @@ def add_parser(subparsers, parent_parser):
147148
formatter_class=formatter.RawDescriptionHelpFormatter,
148149
help=DATA_STATUS_HELP,
149150
)
151+
data_status_parser.add_argument(
152+
"targets",
153+
nargs="*",
154+
help=(
155+
"Limit command scope to these tracked files/directories, "
156+
".dvc files and stage names."
157+
),
158+
).complete = completion.FILE # type: ignore[attr-defined]
150159
data_status_parser.add_argument(
151160
"--json",
152161
action="store_true",

dvc/repo/data.py

Lines changed: 142 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
11
import os
22
import posixpath
33
from collections.abc import Iterable
4-
from typing import TYPE_CHECKING, Any, TypedDict, Union
4+
from typing import TYPE_CHECKING, Optional, TypedDict, Union
55

66
from dvc.fs.callbacks import DEFAULT_CALLBACK
7-
from dvc.repo.worktree import worktree_view
87
from dvc.ui import ui
8+
from dvc_data.index.view import DataIndexView
99

1010
if TYPE_CHECKING:
1111
from dvc.fs.callbacks import Callback
1212
from dvc.repo import Repo
1313
from dvc.scm import Git, NoSCM
14-
from dvc_data.index import DataIndex
14+
from dvc_data.index import BaseDataIndex, DataIndex, DataIndexKey
1515
from dvc_data.index.diff import Change
1616

1717

@@ -47,12 +47,13 @@ def _adapt_path(change: "Change") -> str:
4747

4848

4949
def _diff(
50-
old: "DataIndex",
51-
new: "DataIndex",
50+
old: "BaseDataIndex",
51+
new: "BaseDataIndex",
5252
*,
5353
granular: bool = False,
5454
not_in_cache: bool = False,
5555
callback: "Callback" = DEFAULT_CALLBACK,
56+
filter_keys: Optional[list["DataIndexKey"]] = None,
5657
) -> dict[str, list[str]]:
5758
from dvc_data.index.diff import UNCHANGED, UNKNOWN, diff
5859

@@ -74,6 +75,16 @@ def _add_change(typ, change):
7475
with_unknown=True,
7576
callback=callback,
7677
):
78+
# The index is a trie, so even when we filter by a specific path
79+
# like `dir/file`, all parent nodes leading to that path (e.g., `dir/`)
80+
# still appear in the view. As a result, keys like `dir/` will be present
81+
# even if only `dir/file` matches the filter.
82+
# We need to skip such entries to avoid showing root of tracked directories.
83+
if filter_keys and not any(
84+
change.key[: len(filter_key)] == filter_key for filter_key in filter_keys
85+
):
86+
continue
87+
7788
if (
7889
change.typ == UNCHANGED
7990
and (not change.old or not change.old.hash_info)
@@ -136,7 +147,45 @@ def _git_info(scm: Union["Git", "NoSCM"], untracked_files: str = "all") -> GitIn
136147
)
137148

138149

139-
def _diff_index_to_wtree(repo: "Repo", **kwargs: Any) -> dict[str, list[str]]:
150+
def filter_index(
151+
index: Union["DataIndex", "DataIndexView"],
152+
filter_keys: Optional[list["DataIndexKey"]] = None,
153+
) -> "BaseDataIndex":
154+
if not filter_keys:
155+
return index
156+
157+
if isinstance(index, DataIndexView):
158+
orig_index = index._index
159+
parent_filter_fn = index.filter_fn
160+
else:
161+
orig_index = index
162+
parent_filter_fn = None
163+
164+
def filter_fn(key: "DataIndexKey") -> bool:
165+
if parent_filter_fn is not None and not parent_filter_fn(key):
166+
return False
167+
168+
for filter_key in filter_keys:
169+
# eg: if key is "dir/file" and filter_key is "dir/", return True
170+
if key[: len(filter_key)] == filter_key:
171+
return True
172+
# eg: if key is `dir/` and filter_key is `dir/file`, also return True.
173+
# This ensures we include parent prefixes needed to reach matching leaves.
174+
# Intermediate prefixes must be retained to access nested keys.
175+
if filter_key[: len(key)] == key:
176+
return True
177+
return False
178+
179+
from dvc_data.index import view
180+
181+
return view(orig_index, filter_fn=filter_fn)
182+
183+
184+
def _diff_index_to_wtree(
185+
repo: "Repo",
186+
filter_keys: Optional[list["DataIndexKey"]] = None,
187+
granular: bool = False,
188+
) -> dict[str, list[str]]:
140189
from .index import build_data_index
141190

142191
with ui.progress(desc="Building workspace index", unit="entry") as pb:
@@ -147,30 +196,45 @@ def _diff_index_to_wtree(repo: "Repo", **kwargs: Any) -> dict[str, list[str]]:
147196
compute_hash=True,
148197
callback=pb.as_callback(),
149198
)
199+
workspace_view = filter_index(workspace, filter_keys=filter_keys)
150200

151201
with ui.progress(
152202
desc="Calculating diff between index/workspace",
153203
unit="entry",
154204
) as pb:
205+
index = repo.index.data["repo"]
206+
view = filter_index(index, filter_keys=filter_keys)
155207
return _diff(
156-
repo.index.data["repo"],
157-
workspace,
208+
view,
209+
workspace_view,
210+
filter_keys=filter_keys,
211+
granular=granular,
158212
not_in_cache=True,
159213
callback=pb.as_callback(),
160-
**kwargs,
161214
)
162215

163216

164217
def _diff_head_to_index(
165-
repo: "Repo", head: str = "HEAD", **kwargs: Any
218+
repo: "Repo",
219+
head: str = "HEAD",
220+
filter_keys: Optional[list["DataIndexKey"]] = None,
221+
granular: bool = False,
166222
) -> dict[str, list[str]]:
167223
index = repo.index.data["repo"]
224+
index_view = filter_index(index, filter_keys=filter_keys)
168225

169226
with repo.switch(head):
170227
head_index = repo.index.data["repo"]
228+
head_view = filter_index(head_index, filter_keys=filter_keys)
171229

172230
with ui.progress(desc="Calculating diff between head/index", unit="entry") as pb:
173-
return _diff(head_index, index, callback=pb.as_callback(), **kwargs)
231+
return _diff(
232+
head_view,
233+
index_view,
234+
filter_keys=filter_keys,
235+
granular=granular,
236+
callback=pb.as_callback(),
237+
)
174238

175239

176240
class Status(TypedDict):
@@ -204,46 +268,87 @@ def _transform_git_paths_to_dvc(repo: "Repo", files: Iterable[str]) -> list[str]
204268

205269
def _get_entries_not_in_remote(
206270
repo: "Repo",
271+
filter_keys: Optional[list["DataIndexKey"]] = None,
207272
granular: bool = False,
208273
remote_refresh: bool = False,
209274
) -> list[str]:
210275
"""Get entries that are not in remote storage."""
276+
from dvc.repo.worktree import worktree_view
211277
from dvc_data.index import StorageKeyError
212278

213279
# View into the index, with only pushable entries
214-
view = worktree_view(repo.index, push=True).data["repo"]
280+
index = worktree_view(repo.index, push=True)
281+
data_index = index.data["repo"]
215282

216-
missing_entries = []
217-
for key, entry in view.iteritems(shallow=not granular):
218-
if not (entry and entry.hash_info):
219-
continue
283+
view = filter_index(data_index, filter_keys=filter_keys) # type: ignore[arg-type]
220284

221-
k = (*key, "") if entry.meta and entry.meta.isdir else key
222-
try:
223-
if not view.storage_map.remote_exists(entry, refresh=remote_refresh):
224-
missing_entries.append(os.path.sep.join(k))
225-
except StorageKeyError:
226-
pass
285+
missing_entries = []
286+
with ui.progress(desc="Checking remote", unit="entry") as pb:
287+
for key, entry in view.iteritems(shallow=not granular):
288+
if not (entry and entry.hash_info):
289+
continue
290+
291+
# The index is a trie, so even when we filter by a specific path
292+
# like `dir/file`, all parent nodes leading to that path (e.g., `dir/`)
293+
# still appear in the view. As a result, keys like `dir/` will be present
294+
# even if only `dir/file` matches the filter.
295+
# We need to skip such entries to avoid showing root of tracked directories.
296+
if filter_keys and not any(
297+
key[: len(filter_key)] == filter_key for filter_key in filter_keys
298+
):
299+
continue
300+
301+
k = (*key, "") if entry.meta and entry.meta.isdir else key
302+
try:
303+
if not view.storage_map.remote_exists(entry, refresh=remote_refresh):
304+
missing_entries.append(os.path.sep.join(k))
305+
pb.update()
306+
except StorageKeyError:
307+
pass
227308

228309
return missing_entries
229310

230311

312+
def _matches_target(p: str, targets: Iterable[str]) -> bool:
313+
sep = os.sep
314+
return any(p == t or p.startswith(t + sep) for t in targets)
315+
316+
317+
def _prune_keys(filter_keys: list["DataIndexKey"]) -> list["DataIndexKey"]:
318+
sorted_keys = sorted(set(filter_keys), key=len)
319+
result: list[DataIndexKey] = []
320+
321+
for key in sorted_keys:
322+
if not any(key[: len(prefix)] == prefix for prefix in result):
323+
result.append(key)
324+
return result
325+
326+
231327
def status(
232328
repo: "Repo",
329+
targets: Optional[Iterable[Union[os.PathLike[str], str]]] = None,
330+
*,
331+
granular: bool = False,
233332
untracked_files: str = "no",
234333
not_in_remote: bool = False,
235334
remote_refresh: bool = False,
236-
granular: bool = False,
237335
head: str = "HEAD",
238336
) -> Status:
239337
from dvc.scm import NoSCMError, SCMError
240338

241-
uncommitted_diff = _diff_index_to_wtree(repo, granular=granular)
242-
unchanged = set(uncommitted_diff.pop("unchanged", []))
339+
targets = targets or []
340+
filter_keys: list[DataIndexKey] = [repo.fs.relparts(os.fspath(t)) for t in targets]
341+
# try to remove duplicate and overlapping keys
342+
filter_keys = _prune_keys(filter_keys)
243343

344+
uncommitted_diff = _diff_index_to_wtree(
345+
repo, filter_keys=filter_keys, granular=granular
346+
)
347+
unchanged = set(uncommitted_diff.pop("unchanged", []))
244348
entries_not_in_remote = (
245349
_get_entries_not_in_remote(
246350
repo,
351+
filter_keys=filter_keys,
247352
granular=granular,
248353
remote_refresh=remote_refresh,
249354
)
@@ -252,16 +357,24 @@ def status(
252357
)
253358

254359
try:
255-
committed_diff = _diff_head_to_index(repo, head=head, granular=granular)
360+
committed_diff = _diff_head_to_index(
361+
repo, filter_keys=filter_keys, head=head, granular=granular
362+
)
256363
except (SCMError, NoSCMError):
257364
committed_diff = {}
258365
else:
259366
unchanged &= set(committed_diff.pop("unchanged", []))
260367

261368
git_info = _git_info(repo.scm, untracked_files=untracked_files)
262-
untracked = git_info.get("untracked", [])
263-
untracked = _transform_git_paths_to_dvc(repo, untracked)
264-
369+
scm_filter_targets = {
370+
os.path.relpath(os.path.abspath(t), repo.scm.root_dir) for t in targets
371+
}
372+
untracked_it: Iterable[str] = git_info.get("untracked", [])
373+
if scm_filter_targets:
374+
untracked_it = (
375+
f for f in untracked_it if _matches_target(f, scm_filter_targets)
376+
)
377+
untracked = _transform_git_paths_to_dvc(repo, untracked_it)
265378
# order matters here
266379
return Status(
267380
not_in_cache=uncommitted_diff.pop("not_in_cache", []),

0 commit comments

Comments
 (0)