Skip to content

Commit 4308f0d

Browse files
authored
[TECH DEBT] Remove walked paths parameter from dependency graph walker (#3645)
## Changes Remove walked paths from dependency graph walker as it should not be used, the developer should relay on the `MockPathLookup.successfully_resolved_paths` as it is important that the paths are resolved not just added to the cache in the walker
1 parent 3583396 commit 4308f0d

File tree

6 files changed

+115
-74
lines changed

6 files changed

+115
-74
lines changed

src/databricks/labs/ucx/source_code/linters/folders.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def lint(
6262
stdout.write(f"{located_advice}\n")
6363
return located_advices
6464

65-
def lint_path(self, path: Path, linted_paths: set[Path] | None = None) -> Iterable[LocatedAdvice]:
65+
def lint_path(self, path: Path) -> Iterable[LocatedAdvice]:
6666
is_dir = path.is_dir()
6767
loader: DependencyLoader
6868
if is_a_notebook(path):
@@ -79,9 +79,7 @@ def lint_path(self, path: Path, linted_paths: set[Path] | None = None) -> Iterab
7979
problems = container.build_dependency_graph(graph)
8080
for problem in problems:
8181
yield problem.as_located_advice()
82-
if linted_paths is None:
83-
linted_paths = set()
84-
walker = LintingWalker(graph, linted_paths, self._path_lookup, path.name, self._context_factory)
82+
walker = LintingWalker(graph, self._path_lookup, self._context_factory)
8583
yield from walker
8684

8785

src/databricks/labs/ucx/source_code/linters/graph_walkers.py

Lines changed: 33 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -36,22 +36,38 @@
3636

3737

3838
class DependencyGraphWalker(abc.ABC, Generic[T]):
39+
"""Walks over the dependencies in a graph starting from the root dependencies going depth first.
3940
40-
def __init__(self, graph: DependencyGraph, walked_paths: set[Path], path_lookup: PathLookup):
41+
Implemented as an object to iterate over, for example:
42+
``` python
43+
walker = DependencyGraphWalker()
44+
for processed_dependency_output in walker:
45+
# Do something with output
46+
```
47+
"""
48+
49+
def __init__(self, graph: DependencyGraph, path_lookup: PathLookup):
4150
self._graph = graph
42-
self._walked_paths = walked_paths
4351
self._path_lookup = path_lookup
44-
self._lineage: list[Dependency] = []
52+
53+
self._walked_paths = set[Path]()
54+
self._lineage = list[Dependency]()
4555

4656
def __iter__(self) -> Iterator[T]:
57+
"""Iterate over the dependencies starting from the root."""
4758
for dependency in self._graph.root_dependencies:
4859
# the dependency is a root, so its path is the one to use
4960
# for computing lineage and building python global context
50-
root_path = dependency.path
51-
yield from self._iter_one(dependency, self._graph, root_path)
61+
yield from self._iter_one(dependency, self._graph, dependency.path)
5262

5363
def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: Path) -> Iterable[T]:
64+
"""Iterate over a single dependency going depth first."""
5465
if dependency.path in self._walked_paths:
66+
# TODO: Decide to not skip dependencies that have been walked already.
67+
# Open questions:
68+
# - Should this come before or after the lineage logging?
69+
# - When do we reach this? Also, it could mean that is coming from a different root,
70+
# which maybe needs to be processed.
5571
return
5672
self._lineage.append(dependency)
5773
self._walked_paths.add(dependency.path)
@@ -63,11 +79,13 @@ def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: P
6379
# missing graph problems have already been reported while building the graph
6480
if maybe_graph.graph:
6581
child_graph = maybe_graph.graph
82+
# This makes the implementation depth first
6683
for child_dependency in child_graph.local_dependencies:
6784
yield from self._iter_one(child_dependency, child_graph, root_path)
6885
self._lineage.pop()
6986

7087
def _log_walk_one(self, dependency: Dependency) -> None:
88+
"""Possibly overwrite this method in a subclass for more specific logging"""
7189
logger.debug(f'Analyzing dependency: {dependency}')
7290

7391
@abc.abstractmethod
@@ -76,37 +94,34 @@ def _process_dependency(
7694
dependency: Dependency,
7795
path_lookup: PathLookup,
7896
inherited_tree: Tree | None,
79-
) -> Iterable[T]: ...
97+
) -> Iterable[T]:
98+
"""Process a dependency."""
8099

81100
@property
82101
def lineage(self) -> list[LineageAtom]:
102+
"""The lineage for getting to the dependency."""
83103
lists: list[list[LineageAtom]] = [dependency.lineage for dependency in self._lineage]
84104
return list(itertools.chain(*lists))
85105

86106

87107
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):
108+
"""Lint the dependencies in the graph."""
88109

89-
def __init__(
90-
self,
91-
graph: DependencyGraph,
92-
walked_paths: set[Path],
93-
path_lookup: PathLookup,
94-
key: str,
95-
context_factory: Callable[[], LinterContext],
96-
):
97-
super().__init__(graph, walked_paths, path_lookup)
98-
self._key = key
110+
def __init__(self, graph: DependencyGraph, path_lookup: PathLookup, context_factory: Callable[[], LinterContext]):
111+
super().__init__(graph, path_lookup)
99112
self._context_factory = context_factory
100113

101114
def _log_walk_one(self, dependency: Dependency) -> None:
102-
logger.info(f'Linting {self._key} dependency: {dependency}')
115+
"""Log linting a dependency"""
116+
logger.info(f"Linting dependency: {dependency}")
103117

104118
def _process_dependency(
105119
self,
106120
dependency: Dependency,
107121
path_lookup: PathLookup,
108122
inherited_tree: Tree | None,
109123
) -> Iterable[LocatedAdvice]:
124+
"""Lint the dependency and yield the located advices."""
110125
# FileLinter determines which file/notebook linter to use
111126
linter = FileLinter(dependency, path_lookup, self._context_factory(), inherited_tree)
112127
for advice in linter.lint():
@@ -121,12 +136,11 @@ class _CollectorWalker(DependencyGraphWalker[S], abc.ABC):
121136
def __init__(
122137
self,
123138
graph: DependencyGraph,
124-
walked_paths: set[Path],
125139
path_lookup: PathLookup,
126140
session_state: CurrentSessionState,
127141
migration_index: TableMigrationIndex,
128142
):
129-
super().__init__(graph, walked_paths, path_lookup)
143+
super().__init__(graph, path_lookup)
130144
self._linter_context = LinterContext(migration_index, session_state)
131145

132146
def _process_dependency(

src/databricks/labs/ucx/source_code/linters/jobs.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -117,11 +117,10 @@ def _lint_job(self, job: jobs.Job) -> tuple[list[JobProblem], list[DirectFsAcces
117117
assert job.settings is not None
118118
assert job.settings.name is not None
119119
assert job.settings.tasks is not None
120-
linted_paths: set[Path] = set()
121120
for task in job.settings.tasks:
122121
graph, advices, session_state = self._build_task_dependency_graph(task, job)
123122
if not advices:
124-
advices = self._lint_task(task, graph, session_state, linted_paths)
123+
advices = self._lint_task(graph, session_state)
125124
for advice in advices:
126125
absolute_path = "UNKNOWN" if advice.has_missing_path() else advice.path.absolute().as_posix()
127126
job_problem = JobProblem(
@@ -173,20 +172,8 @@ def _build_task_dependency_graph(
173172
located_advices = [problem.as_located_advice() for problem in problems]
174173
return graph, located_advices, session_state
175174

176-
def _lint_task(
177-
self,
178-
task: jobs.Task,
179-
graph: DependencyGraph,
180-
session_state: CurrentSessionState,
181-
linted_paths: set[Path],
182-
) -> Iterable[LocatedAdvice]:
183-
walker = LintingWalker(
184-
graph,
185-
linted_paths,
186-
self._path_lookup,
187-
task.task_key,
188-
lambda: LinterContext(self._migration_index, session_state),
189-
)
175+
def _lint_task(self, graph: DependencyGraph, session_state: CurrentSessionState) -> Iterable[LocatedAdvice]:
176+
walker = LintingWalker(graph, self._path_lookup, lambda: LinterContext(self._migration_index, session_state))
190177
yield from walker
191178

192179
def _collect_task_dfsas(
@@ -199,7 +186,7 @@ def _collect_task_dfsas(
199186
# need to add lineage for job/task because walker doesn't register them
200187
job_id = str(job.job_id)
201188
job_name = job.settings.name if job.settings and job.settings.name else "<anonymous>"
202-
for dfsa in DfsaCollectorWalker(graph, set(), self._path_lookup, session_state, self._migration_index):
189+
for dfsa in DfsaCollectorWalker(graph, self._path_lookup, session_state, self._migration_index):
203190
atoms = [
204191
LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}),
205192
LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"),
@@ -216,7 +203,7 @@ def _collect_task_tables(
216203
# need to add lineage for job/task because walker doesn't register them
217204
job_id = str(job.job_id)
218205
job_name = job.settings.name if job.settings and job.settings.name else "<anonymous>"
219-
for used_table in TablesCollectorWalker(graph, set(), self._path_lookup, session_state, self._migration_index):
206+
for used_table in TablesCollectorWalker(graph, self._path_lookup, session_state, self._migration_index):
220207
atoms = [
221208
LineageAtom(object_type="WORKFLOW", object_id=job_id, other={"name": job_name}),
222209
LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"),

tests/integration/source_code/solacc.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from databricks.labs.ucx.framework.utils import run_command
1717
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
1818
from databricks.labs.ucx.source_code.base import LocatedAdvice
19+
from databricks.labs.ucx.source_code.files import FileLoader
1920
from databricks.labs.ucx.source_code.linters.context import LinterContext
2021
from databricks.labs.ucx.source_code.path_lookup import PathLookup
2122

@@ -106,7 +107,7 @@ class _SolaccContext:
106107
unparsed_files_path: Path | None = None
107108
files_to_skip: set[Path] | None = None
108109
total_count = 0
109-
parseable_count = 0
110+
unparseable_count = 0
110111
uninferrable_count = 0
111112
missing_imports: dict[str, dict[str, int]] = field(default_factory=dict)
112113
stats: list[_SolaccStats] = field(default_factory=list)
@@ -164,18 +165,17 @@ def clean_tmp_sys_paths(self):
164165
def _lint_dir(solacc: _SolaccContext, soldir: Path):
165166
path_lookup = _CleanablePathLookup()
166167
ws = WorkspaceClient(host='...', token='...')
168+
files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set()
167169
ctx = LocalCheckoutContext(ws).replace(
168170
linter_context_factory=lambda session_state: LinterContext(TableMigrationIndex([]), session_state),
169171
path_lookup=path_lookup,
172+
file_loader=FileLoader(exclude_paths=files_to_skip),
170173
)
171174
all_files = list(soldir.glob('**/*.py')) + list(soldir.glob('**/*.sql'))
172175
solacc.total_count += len(all_files)
173-
# pre-populate linted_files such that files to skip are not linted
174-
files_to_skip = set(solacc.files_to_skip) if solacc.files_to_skip else set()
175-
linted_files = set(files_to_skip)
176176
# lint solution
177177
start_timestamp = datetime.now(timezone.utc)
178-
advices = list(ctx.local_code_linter.lint_path(soldir, linted_files))
178+
advices = list(ctx.local_code_linter.lint_path(soldir))
179179
end_timestamp = datetime.now(timezone.utc)
180180
# record stats
181181
stats = _SolaccStats(
@@ -189,7 +189,7 @@ def _lint_dir(solacc: _SolaccContext, soldir: Path):
189189
solacc.stats.append(stats)
190190
# collect unparseable files
191191
unparseables = _collect_unparseable(advices)
192-
solacc.parseable_count += len(linted_files) - len(files_to_skip) - len(set(advice.path for advice in unparseables))
192+
solacc.unparseable_count += len(files_to_skip) + len(set(advice.path for advice in unparseables))
193193
if solacc.unparsed_files_path:
194194
for unparseable in unparseables:
195195
logger.error(f"Error during parsing of {unparseable.path}: {unparseable.advice.message}".replace("\n", " "))
@@ -227,11 +227,11 @@ def _lint_repos(clone_urls, sol_to_lint: str | None):
227227
if os.getenv("CI"):
228228
shutil.rmtree(sol_dir)
229229
all_files_len = solacc.total_count - (len(solacc.files_to_skip) if solacc.files_to_skip else 0)
230-
parseable_pct = int(solacc.parseable_count / all_files_len * 100)
230+
unparseable_pct = int(solacc.unparseable_count / all_files_len * 100)
231231
missing_imports_count = sum(sum(details.values()) for details in solacc.missing_imports.values())
232232
logger.info(
233233
f"Skipped: {len(solacc.files_to_skip or [])}, "
234-
f"parseable: {parseable_pct}% ({solacc.parseable_count}/{all_files_len}), "
234+
f"unparseable: {unparseable_pct}% ({solacc.unparseable_count}/{all_files_len}), "
235235
f"missing imports: {missing_imports_count}, "
236236
f"not computed: {solacc.uninferrable_count}"
237237
)
@@ -243,7 +243,7 @@ def _lint_repos(clone_urls, sol_to_lint: str | None):
243243
message = json.dumps(dataclasses.asdict(stats), default=str)
244244
stats_file.writelines([message])
245245
# fail the job if files are unparseable
246-
if parseable_pct < 100:
246+
if unparseable_pct > 0:
247247
sys.exit(1)
248248

249249

tests/unit/source_code/linters/test_files.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -130,19 +130,17 @@ def local_code_linter(mock_path_lookup, migration_index):
130130
def test_linter_walks_directory(mock_path_lookup, local_code_linter) -> None:
131131
mock_path_lookup.append_path(Path(_samples_path(SourceContainer)))
132132
path = Path(__file__).parent / "../samples" / "simulate-sys-path"
133-
paths: set[Path] = set()
134-
advices = list(local_code_linter.lint_path(path, paths))
135-
assert len(paths) > 10
133+
advices = list(local_code_linter.lint_path(path))
136134
assert not advices
135+
assert len(mock_path_lookup.successfully_resolved_paths) > 10
137136

138137

139138
def test_linter_lints_children_in_context(mock_path_lookup, local_code_linter) -> None:
140139
mock_path_lookup.append_path(Path(_samples_path(SourceContainer)))
141140
path = Path(__file__).parent.parent / "samples" / "parent-child-context"
142-
paths: set[Path] = set()
143-
advices = list(local_code_linter.lint_path(path, paths))
144-
assert len(paths) == 3
141+
advices = list(local_code_linter.lint_path(path))
145142
assert not advices
143+
assert mock_path_lookup.successfully_resolved_paths == {path, Path("parent.py"), Path("child.py")}
146144

147145

148146
def test_triple_dot_import() -> None:

0 commit comments

Comments
 (0)