Skip to content

Commit 3ca395c

Browse files
authored
Move graph walkers to separate module (#3581)
## Changes

Move all graph walkers into a separate module, `graph_walkers.py`, under the `linters` module. This centralizes them, since they are used only for linting, and allows them to be reused instead of being re-created.

### Linked issues

Progresses #3514

Breaks up #3520

### Functionality

- [x] modified existing command: `databricks labs ucx migrate-local-code`

### Tests

- [ ] manually tested
- [x] modified and added unit tests
- [x] modified and added integration tests
1 parent f8bf94c commit 3ca395c

File tree

8 files changed

+337
-312
lines changed

8 files changed

+337
-312
lines changed

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 1 addition & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
import logging
77
from dataclasses import dataclass
88
from pathlib import Path
9-
from collections.abc import Callable, Iterable, Iterator
10-
from typing import TypeVar, Generic
9+
from collections.abc import Callable, Iterable
1110

1211
from astroid import ( # type: ignore
1312
NodeNG,
@@ -601,56 +600,3 @@ def finalize(self) -> InheritedContext:
601600
return self
602601
tree = self.tree.renumber(-1)
603602
return InheritedContext(tree, self.found, [])
604-
605-
606-
T = TypeVar("T")
607-
608-
609-
class DependencyGraphWalker(abc.ABC, Generic[T]):
610-
611-
def __init__(self, graph: DependencyGraph, walked_paths: set[Path], path_lookup: PathLookup):
612-
self._graph = graph
613-
self._walked_paths = walked_paths
614-
self._path_lookup = path_lookup
615-
self._lineage: list[Dependency] = []
616-
617-
def __iter__(self) -> Iterator[T]:
618-
for dependency in self._graph.root_dependencies:
619-
# the dependency is a root, so its path is the one to use
620-
# for computing lineage and building python global context
621-
root_path = dependency.path
622-
yield from self._iter_one(dependency, self._graph, root_path)
623-
624-
def _iter_one(self, dependency: Dependency, graph: DependencyGraph, root_path: Path) -> Iterable[T]:
625-
if dependency.path in self._walked_paths:
626-
return
627-
self._lineage.append(dependency)
628-
self._walked_paths.add(dependency.path)
629-
self._log_walk_one(dependency)
630-
if dependency.path.is_file() or is_a_notebook(dependency.path):
631-
inherited_tree = graph.root.build_inherited_tree(root_path, dependency.path)
632-
path_lookup = self._path_lookup.change_directory(dependency.path.parent)
633-
yield from self._process_dependency(dependency, path_lookup, inherited_tree)
634-
maybe_graph = graph.locate_dependency(dependency.path)
635-
# missing graph problems have already been reported while building the graph
636-
if maybe_graph.graph:
637-
child_graph = maybe_graph.graph
638-
for child_dependency in child_graph.local_dependencies:
639-
yield from self._iter_one(child_dependency, child_graph, root_path)
640-
self._lineage.pop()
641-
642-
def _log_walk_one(self, dependency: Dependency) -> None:
643-
logger.debug(f'Analyzing dependency: {dependency}')
644-
645-
@abc.abstractmethod
646-
def _process_dependency(
647-
self,
648-
dependency: Dependency,
649-
path_lookup: PathLookup,
650-
inherited_tree: Tree | None,
651-
) -> Iterable[T]: ...
652-
653-
@property
654-
def lineage(self) -> list[LineageAtom]:
655-
lists: list[list[LineageAtom]] = [dependency.lineage for dependency in self._lineage]
656-
return list(itertools.chain(*lists))

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 12 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
import logging
44
import shutil
55
import tempfile
6-
from abc import ABC, abstractmethod
7-
from collections.abc import Generator, Iterable, Callable
6+
from collections.abc import Generator, Iterable
87
from contextlib import contextmanager
98
from dataclasses import dataclass
109
from datetime import datetime, timezone
1110
from importlib import metadata
1211
from pathlib import Path
13-
from typing import TypeVar
1412
from urllib import parse
1513

1614
from databricks.labs.blueprint.parallel import Threads
@@ -21,20 +19,15 @@
2119
from databricks.sdk.service import compute, jobs
2220
from databricks.sdk.service.compute import DataSecurityMode
2321
from databricks.sdk.service.jobs import Source
24-
from databricks.sdk.service.workspace import Language
2522

2623
from databricks.labs.ucx.assessment.crawlers import runtime_version_tuple
2724
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
2825
from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache, InvalidPath
2926
from databricks.labs.ucx.source_code.base import (
3027
CurrentSessionState,
3128
LocatedAdvice,
32-
is_a_notebook,
33-
file_language,
34-
SourceInfo,
3529
UsedTable,
3630
LineageAtom,
37-
safe_read_text,
3831
)
3932
from databricks.labs.ucx.source_code.directfs_access import (
4033
DirectFsAccessCrawler,
@@ -47,12 +40,13 @@
4740
DependencyResolver,
4841
SourceContainer,
4942
WrappingLoader,
50-
DependencyGraphWalker,
43+
)
44+
from databricks.labs.ucx.source_code.linters.graph_walkers import (
45+
LintingWalker,
46+
DfsaCollectorWalker,
47+
TablesCollectorWalker,
5148
)
5249
from databricks.labs.ucx.source_code.linters.context import LinterContext
53-
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
54-
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, Tree, PythonSequentialLinter
55-
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook
5650
from databricks.labs.ucx.source_code.path_lookup import PathLookup
5751
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
5852

@@ -532,7 +526,12 @@ def _lint_task(
532526
linted_paths: set[Path],
533527
) -> Iterable[LocatedAdvice]:
534528
walker = LintingWalker(
535-
graph, linted_paths, self._path_lookup, task.task_key, session_state, self._migration_index
529+
graph,
530+
linted_paths,
531+
self._path_lookup,
532+
task.task_key,
533+
session_state,
534+
lambda: LinterContext(self._migration_index, session_state),
536535
)
537536
yield from walker
538537

@@ -569,164 +568,3 @@ def _collect_task_tables(
569568
LineageAtom(object_type="TASK", object_id=f"{job_id}/{task.task_key}"),
570569
]
571570
yield dataclasses.replace(used_table, source_lineage=atoms + used_table.source_lineage)
572-
573-
574-
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):
575-
576-
def __init__(
577-
self,
578-
graph: DependencyGraph,
579-
walked_paths: set[Path],
580-
path_lookup: PathLookup,
581-
key: str,
582-
session_state: CurrentSessionState,
583-
migration_index: TableMigrationIndex,
584-
):
585-
super().__init__(graph, walked_paths, path_lookup)
586-
self._key = key
587-
self._session_state = session_state
588-
self._linter_context = LinterContext(migration_index, session_state)
589-
590-
def _log_walk_one(self, dependency: Dependency) -> None:
591-
logger.info(f'Linting {self._key} dependency: {dependency}')
592-
593-
def _process_dependency(
594-
self,
595-
dependency: Dependency,
596-
path_lookup: PathLookup,
597-
inherited_tree: Tree | None,
598-
) -> Iterable[LocatedAdvice]:
599-
# FileLinter determines which file/notebook linter to use
600-
linter = FileLinter(self._linter_context, path_lookup, self._session_state, dependency.path, inherited_tree)
601-
for advice in linter.lint():
602-
yield LocatedAdvice(advice, dependency.path)
603-
604-
605-
T = TypeVar("T", bound=SourceInfo)
606-
607-
608-
class _CollectorWalker(DependencyGraphWalker[T], ABC):
609-
610-
def __init__(
611-
self,
612-
graph: DependencyGraph,
613-
walked_paths: set[Path],
614-
path_lookup: PathLookup,
615-
session_state: CurrentSessionState,
616-
migration_index: TableMigrationIndex,
617-
):
618-
super().__init__(graph, walked_paths, path_lookup)
619-
self._session_state = session_state
620-
self._linter_context = LinterContext(migration_index, session_state)
621-
622-
def _process_dependency(
623-
self,
624-
dependency: Dependency,
625-
path_lookup: PathLookup,
626-
inherited_tree: Tree | None,
627-
) -> Iterable[T]:
628-
language = file_language(dependency.path)
629-
if not language:
630-
logger.warning(f"Unknown language for {dependency.path}")
631-
return
632-
cell_language = CellLanguage.of_language(language)
633-
source = safe_read_text(dependency.path)
634-
if not source:
635-
return
636-
if is_a_notebook(dependency.path):
637-
yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree)
638-
elif dependency.path.is_file():
639-
yield from self._collect_from_source(source, cell_language, dependency.path, inherited_tree)
640-
641-
def _collect_from_notebook(
642-
self,
643-
source: str,
644-
language: CellLanguage,
645-
path: Path,
646-
inherited_tree: Tree | None,
647-
) -> Iterable[T]:
648-
notebook = Notebook.parse(path, source, language.language)
649-
src_timestamp = datetime.fromtimestamp(path.stat().st_mtime, timezone.utc)
650-
src_id = str(path)
651-
for cell in notebook.cells:
652-
for item in self._collect_from_source(cell.original_code, cell.language, path, inherited_tree):
653-
yield item.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp)
654-
if cell.language is CellLanguage.PYTHON:
655-
if inherited_tree is None:
656-
inherited_tree = Tree.new_module()
657-
maybe_tree = MaybeTree.from_source_code(cell.original_code)
658-
if maybe_tree.failure:
659-
logger.warning(maybe_tree.failure.message)
660-
continue
661-
assert maybe_tree.tree is not None
662-
inherited_tree.attach_child_tree(maybe_tree.tree)
663-
664-
def _collect_from_source(
665-
self,
666-
source: str,
667-
language: CellLanguage,
668-
path: Path,
669-
inherited_tree: Tree | None,
670-
) -> Iterable[T]:
671-
if language is CellLanguage.PYTHON:
672-
iterable = self._collect_from_python(source, inherited_tree)
673-
else:
674-
fn: Callable[[str], Iterable[T]] | None = getattr(self, f"_collect_from_{language.name.lower()}", None)
675-
if not fn:
676-
raise ValueError(f"Language {language.name} not supported yet!")
677-
# the below is for disabling a false pylint positive
678-
# pylint: disable=not-callable
679-
iterable = fn(source)
680-
src_timestamp = datetime.fromtimestamp(path.stat().st_mtime, timezone.utc)
681-
src_id = str(path)
682-
for item in iterable:
683-
yield item.replace_source(source_id=src_id, source_lineage=self.lineage, source_timestamp=src_timestamp)
684-
685-
@abstractmethod
686-
def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[T]: ...
687-
688-
def _collect_from_sql(self, _source: str) -> Iterable[T]:
689-
return []
690-
691-
def _collect_from_r(self, _source: str) -> Iterable[T]:
692-
logger.warning("Language R not supported yet!")
693-
return []
694-
695-
def _collect_from_scala(self, _source: str) -> Iterable[T]:
696-
logger.warning("Language scala not supported yet!")
697-
return []
698-
699-
def _collect_from_shell(self, _source: str) -> Iterable[T]:
700-
return []
701-
702-
def _collect_from_markdown(self, _source: str) -> Iterable[T]:
703-
return []
704-
705-
def _collect_from_run(self, _source: str) -> Iterable[T]:
706-
return []
707-
708-
def _collect_from_pip(self, _source: str) -> Iterable[T]:
709-
return []
710-
711-
712-
class DfsaCollectorWalker(_CollectorWalker[DirectFsAccess]):
713-
714-
def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[DirectFsAccess]:
715-
collector = self._linter_context.dfsa_collector(Language.PYTHON)
716-
yield from collector.collect_dfsas(source)
717-
718-
def _collect_from_sql(self, source: str) -> Iterable[DirectFsAccess]:
719-
collector = self._linter_context.dfsa_collector(Language.SQL)
720-
yield from collector.collect_dfsas(source)
721-
722-
723-
class TablesCollectorWalker(_CollectorWalker[UsedTable]):
724-
725-
def _collect_from_python(self, source: str, inherited_tree: Tree | None) -> Iterable[UsedTable]:
726-
collector = self._linter_context.tables_collector(Language.PYTHON)
727-
assert isinstance(collector, PythonSequentialLinter)
728-
yield from collector.collect_tables(source)
729-
730-
def _collect_from_sql(self, source: str) -> Iterable[UsedTable]:
731-
collector = self._linter_context.tables_collector(Language.SQL)
732-
yield from collector.collect_tables(source)

src/databricks/labs/ucx/source_code/linters/files.py

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@
88
from typing import TextIO
99

1010
from databricks.labs.ucx.source_code.base import LocatedAdvice, CurrentSessionState, file_language, is_a_notebook
11-
from databricks.labs.ucx.source_code.python.python_ast import Tree
1211
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
13-
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter
1412
from databricks.labs.ucx.source_code.path_lookup import PathLookup
1513
from databricks.labs.ucx.source_code.known import KnownList
1614
from databricks.sdk.service.workspace import Language
@@ -29,8 +27,8 @@
2927
SourceContainer,
3028
DependencyResolver,
3129
InheritedContext,
32-
DependencyGraphWalker,
3330
)
31+
from databricks.labs.ucx.source_code.linters.graph_walkers import LintingWalker
3432

3533
logger = logging.getLogger(__name__)
3634

@@ -171,23 +169,11 @@ def lint_path(self, path: Path, linted_paths: set[Path] | None = None) -> Iterab
171169
problems = container.build_dependency_graph(graph)
172170
for problem in problems:
173171
yield problem.as_located_advice()
174-
context_factory = self._context_factory
175-
session_state = self._session_state
176-
177-
class LintingWalker(DependencyGraphWalker[LocatedAdvice]):
178-
179-
def _process_dependency(
180-
self, dependency: Dependency, path_lookup: PathLookup, inherited_tree: Tree | None
181-
) -> Iterable[LocatedAdvice]:
182-
ctx = context_factory()
183-
# FileLinter will determine which file/notebook linter to use
184-
linter = FileLinter(ctx, path_lookup, session_state, dependency.path, inherited_tree)
185-
for advice in linter.lint():
186-
yield LocatedAdvice(advice, dependency.path)
187-
188172
if linted_paths is None:
189173
linted_paths = set()
190-
walker = LintingWalker(graph, linted_paths, self._path_lookup)
174+
walker = LintingWalker(
175+
graph, linted_paths, self._path_lookup, path.name, self._session_state, self._context_factory
176+
)
191177
yield from walker
192178

193179

0 commit comments

Comments
 (0)