Skip to content

Commit ec1801c

Browse files
authored
Build dependency graph for local files (#1462)
## Changes Refactored dependency classes to clarify resolution vs loading Implemented LocalFileMigrator.build_dependency_graph (same pattern as NotebookMigrator) ### Linked issues #1202 Resolves #1360 ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [ ] modified existing command: `databricks labs ucx ...` - [ ] added a new workflow - [ ] modified existing workflow: `...` - [ ] added a new table - [ ] modified existing table: `...` ### Tests - [ ] manually tested - [x] added unit tests - [ ] added integration tests - [ ] verified on staging environment (screenshot attached)
1 parent 9cb9dbf commit ec1801c

File tree

11 files changed

+491
-335
lines changed

11 files changed

+491
-335
lines changed
Lines changed: 147 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,77 +1,99 @@
11
from __future__ import annotations
22

33
import abc
4+
import ast
45
from collections.abc import Callable, Iterable
6+
from pathlib import Path
57

6-
from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat, Language
8+
from databricks.sdk.service.workspace import ObjectType, ObjectInfo, ExportFormat
79
from databricks.sdk import WorkspaceClient
810

911
from databricks.labs.ucx.source_code.base import Advice, Deprecation
12+
from databricks.labs.ucx.source_code.python_linter import ASTLinter, PythonLinter
1013
from databricks.labs.ucx.source_code.whitelist import Whitelist, UCCompatibility
1114

1215

13-
class Dependency:
16+
class Dependency(abc.ABC):
1417

15-
@staticmethod
16-
def from_object_info(object_info: ObjectInfo):
17-
assert object_info.object_type is not None
18-
assert object_info.path is not None
19-
return Dependency(object_info.object_type, object_info.path)
20-
21-
def __init__(self, object_type: ObjectType | None, path: str):
22-
self._type = object_type
18+
def __init__(self, loader: DependencyLoader, path: Path):
19+
self._loader = loader
2320
self._path = path
2421

2522
@property
26-
def type(self) -> ObjectType | None:
27-
return self._type
28-
29-
@property
30-
def path(self) -> str:
23+
def path(self) -> Path:
3124
return self._path
3225

3326
def __hash__(self):
3427
return hash(self.path)
3528

3629
def __eq__(self, other):
37-
return isinstance(other, Dependency) and self.path == other.path
30+
return isinstance(other, type(self)) and self.path == other.path
31+
32+
def load(self) -> SourceContainer | None:
33+
return self._loader.load_dependency(self)
3834

3935

4036
class SourceContainer(abc.ABC):
4137

42-
@property
4338
@abc.abstractmethod
44-
def object_type(self) -> ObjectType:
39+
def build_dependency_graph(self, parent: DependencyGraph) -> None:
4540
raise NotImplementedError()
4641

42+
43+
class DependencyLoader(abc.ABC):
44+
4745
@abc.abstractmethod
48-
def build_dependency_graph(self, graph) -> None:
46+
def load_dependency(self, dependency: Dependency) -> SourceContainer | None:
4947
raise NotImplementedError()
5048

49+
@abc.abstractmethod
50+
def is_notebook(self, path: Path) -> bool:
51+
raise NotImplementedError()
52+
53+
54+
class LocalFileLoader(DependencyLoader):
55+
# see https://github.com/databrickslabs/ucx/issues/1499
56+
def load_dependency(self, dependency: Dependency) -> SourceContainer | None:
57+
raise NotImplementedError()
58+
59+
def is_file(self, path: Path) -> bool:
60+
raise NotImplementedError()
61+
62+
def is_notebook(self, path: Path) -> bool:
63+
raise NotImplementedError()
64+
65+
66+
class NotebookLoader(DependencyLoader, abc.ABC):
67+
pass
68+
5169

52-
class DependencyLoader:
70+
class LocalNotebookLoader(NotebookLoader, LocalFileLoader):
71+
# see https://github.com/databrickslabs/ucx/issues/1499
72+
pass
73+
74+
75+
class WorkspaceNotebookLoader(NotebookLoader):
5376

5477
def __init__(self, ws: WorkspaceClient):
5578
self._ws = ws
5679

80+
def is_notebook(self, path: Path):
81+
object_info = self._ws.workspace.get_status(str(path))
82+
# TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361
83+
return object_info is not None and object_info.object_type is ObjectType.NOTEBOOK
84+
5785
def load_dependency(self, dependency: Dependency) -> SourceContainer | None:
5886
object_info = self._load_object(dependency)
59-
if object_info.object_type is ObjectType.NOTEBOOK:
60-
return self._load_notebook(object_info)
61-
if object_info.object_type is ObjectType.FILE:
62-
return self._load_file(object_info)
63-
if object_info.object_type in [ObjectType.LIBRARY, ObjectType.DIRECTORY, ObjectType.DASHBOARD, ObjectType.REPO]:
64-
return None
65-
raise NotImplementedError(str(object_info.object_type))
87+
return self._load_notebook(object_info)
6688

6789
def _load_object(self, dependency: Dependency) -> ObjectInfo:
68-
object_info = self._ws.workspace.get_status(dependency.path)
90+
object_info = self._ws.workspace.get_status(str(dependency.path))
6991
# TODO check error conditions, see https://github.com/databrickslabs/ucx/issues/1361
7092
if object_info is None or object_info.object_type is None:
7193
raise ValueError(f"Could not locate object at '{dependency.path}'")
72-
if dependency.type is not None and object_info.object_type is not dependency.type:
94+
if object_info.object_type is not ObjectType.NOTEBOOK:
7395
raise ValueError(
74-
f"Invalid object at '{dependency.path}', expected a {str(dependency.type)}, got a {str(object_info.object_type)}"
96+
f"Invalid object at '{dependency.path}', expected a {ObjectType.NOTEBOOK.name}, got a {str(object_info.object_type)}"
7597
)
7698
return object_info
7799

@@ -85,19 +107,6 @@ def _load_notebook(self, object_info: ObjectInfo) -> SourceContainer:
85107
source = self._load_source(object_info)
86108
return Notebook.parse(object_info.path, source, object_info.language)
87109

88-
def _load_file(self, object_info: ObjectInfo) -> SourceContainer:
89-
# local import to avoid cyclic dependency
90-
# pylint: disable=import-outside-toplevel, cyclic-import
91-
from databricks.labs.ucx.source_code.files import WorkspaceFile
92-
93-
assert object_info.path is not None
94-
# TODO https://github.com/databrickslabs/ucx/issues/1363
95-
# the below assumes that the dependency was discovered whilst processing a Python notebook or cell
96-
# which is safe since Python is the only language supported as of writing
97-
language = Language.PYTHON if object_info.language is None else object_info.language
98-
source = self._load_source(object_info)
99-
return WorkspaceFile(object_info.path, source, language)
100-
101110
def _load_source(self, object_info: ObjectInfo) -> str:
102111
if not object_info.path:
103112
raise ValueError(f"Invalid ObjectInfo: {object_info}")
@@ -106,29 +115,46 @@ def _load_source(self, object_info: ObjectInfo) -> str:
106115

107116

108117
class DependencyResolver:
109-
def __init__(self, whitelist: Whitelist | None = None):
118+
def __init__(
119+
self,
120+
whitelist: Whitelist,
121+
file_loader: LocalFileLoader,
122+
notebook_loader: NotebookLoader,
123+
):
110124
self._whitelist = Whitelist() if whitelist is None else whitelist
125+
self._file_loader = file_loader
126+
self._notebook_loader = notebook_loader
111127
self._advices: list[Advice] = []
112128

113-
def resolve(self, dependency: Dependency) -> Dependency | None:
114-
if dependency.type == ObjectType.NOTEBOOK:
115-
return dependency
116-
compatibility = self._whitelist.compatibility(dependency.path)
129+
def resolve_notebook(self, path: Path) -> Dependency | None:
130+
if self._notebook_loader.is_notebook(path):
131+
return Dependency(self._notebook_loader, path)
132+
return None
133+
134+
def resolve_local_file(self, path: Path) -> Dependency | None:
135+
if self._file_loader.is_file(path) and not self._file_loader.is_notebook(path):
136+
return Dependency(self._file_loader, path)
137+
return None
138+
139+
def resolve_import(self, name: str) -> Dependency | None:
140+
compatibility = self._whitelist.compatibility(name)
117141
# TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382
118142
if compatibility is not None:
119143
if compatibility == UCCompatibility.NONE:
120144
self._advices.append(
121145
Deprecation(
122146
code="dependency-check",
123-
message=f"Use of dependency {dependency.path} is deprecated",
147+
message=f"Use of dependency {name} is deprecated",
124148
start_line=0,
125149
start_col=0,
126150
end_line=0,
127151
end_col=0,
128152
)
129153
)
130154
return None
131-
return dependency
155+
if self._file_loader.is_file(Path(name)):
156+
return Dependency(self._file_loader, Path(name))
157+
raise ValueError(f"Could not locate {name}")
132158

133159
def get_advices(self) -> Iterable[Advice]:
134160
yield from self._advices
@@ -140,13 +166,11 @@ def __init__(
140166
self,
141167
dependency: Dependency,
142168
parent: DependencyGraph | None,
143-
loader: DependencyLoader,
144-
resolver: DependencyResolver | None = None,
169+
resolver: DependencyResolver,
145170
):
146171
self._dependency = dependency
147172
self._parent = parent
148-
self._loader = loader
149-
self._resolver = resolver or DependencyResolver()
173+
self._resolver = resolver
150174
self._dependencies: dict[Dependency, DependencyGraph] = {}
151175

152176
@property
@@ -157,35 +181,49 @@ def dependency(self):
157181
def path(self):
158182
return self._dependency.path
159183

160-
def register_dependency(self, dependency: Dependency) -> DependencyGraph | None:
161-
resolved = self._resolver.resolve(dependency)
184+
def register_notebook(self, path: Path) -> DependencyGraph | None:
185+
resolved = self._resolver.resolve_notebook(path)
162186
if resolved is None:
163187
return None
188+
return self.register_dependency(resolved)
189+
190+
def register_import(self, name: str) -> DependencyGraph | None:
191+
resolved = self._resolver.resolve_import(name)
192+
if resolved is None:
193+
return None
194+
return self.register_dependency(resolved)
195+
196+
def register_dependency(self, dependency: Dependency):
164197
# already registered ?
165-
child_graph = self.locate_dependency(resolved)
198+
child_graph = self._locate_dependency(dependency)
166199
if child_graph is not None:
167-
self._dependencies[resolved] = child_graph
200+
self._dependencies[dependency] = child_graph
168201
return child_graph
169202
# nay, create the child graph and populate it
170-
child_graph = DependencyGraph(resolved, self, self._loader, self._resolver)
171-
self._dependencies[resolved] = child_graph
172-
container = self._loader.load_dependency(resolved)
203+
child_graph = DependencyGraph(dependency, self, self._resolver)
204+
self._dependencies[dependency] = child_graph
205+
container = dependency.load()
173206
if not container:
174207
return None
175208
container.build_dependency_graph(child_graph)
176209
return child_graph
177210

178-
def locate_dependency(self, dependency: Dependency) -> DependencyGraph | None:
211+
def _locate_dependency(self, dependency: Dependency) -> DependencyGraph | None:
212+
return self.locate_dependency(dependency.path)
213+
214+
def locate_dependency(self, path: Path) -> DependencyGraph | None:
179215
# need a list since unlike JS, Python won't let you assign closure variables
180216
found: list[DependencyGraph] = []
181-
path = dependency.path
182217
# TODO https://github.com/databrickslabs/ucx/issues/1287
183-
path = path[2:] if path.startswith('./') else path
218+
path_str = str(path)
219+
path_str = path_str[2:] if path_str.startswith('./') else path_str
184220

185221
def check_registered_dependency(graph):
186222
# TODO https://github.com/databrickslabs/ucx/issues/1287
187-
graph_path = graph.path[2:] if graph.path.startswith('./') else graph.path
188-
if graph_path == path:
223+
graph_path = str(graph.path)
224+
if graph_path.startswith('./'):
225+
graph_path = graph_path[2:]
226+
if graph_path == path_str:
189227
found.append(graph)
190228
return True
191229
return False
@@ -211,7 +249,7 @@ def add_to_dependencies(graph: DependencyGraph) -> bool:
211249
return dependencies
212250

213251
@property
214-
def paths(self) -> set[str]:
252+
def paths(self) -> set[Path]:
215253
return {d.path for d in self.dependencies}
216254

217255
# when visit_node returns True it interrupts the visit
@@ -222,3 +260,45 @@ def visit(self, visit_node: Callable[[DependencyGraph], bool | None]) -> bool |
222260
if dependency.visit(visit_node):
223261
return True
224262
return False
263+
264+
def build_graph_from_python_source(self, python_code: str):
265+
linter = ASTLinter.parse(python_code)
266+
calls = linter.locate(ast.Call, [("run", ast.Attribute), ("notebook", ast.Attribute), ("dbutils", ast.Name)])
267+
for call in calls:
268+
assert isinstance(call, ast.Call)
269+
path = PythonLinter.get_dbutils_notebook_run_path_arg(call)
270+
if isinstance(path, ast.Constant):
271+
path = path.value.strip().strip("'").strip('"')
272+
dependency = self.register_notebook(path)
273+
if dependency is None:
274+
# TODO raise Advice, see https://github.com/databrickslabs/ucx/issues/1439
275+
raise ValueError(f"Invalid notebook path in dbutils.notebook.run command: {path}")
276+
else:
277+
# TODO raise Advice, see https://github.com/databrickslabs/ucx/issues/1439
278+
pass
279+
for name in PythonLinter.list_import_sources(linter):
280+
self.register_import(name)
281+
282+
283+
class DependencyGraphBuilder:
284+
285+
def __init__(self, resolver: DependencyResolver):
286+
self._resolver = resolver
287+
288+
def build_local_file_dependency_graph(self, path: Path) -> DependencyGraph:
289+
dependency = self._resolver.resolve_local_file(path)
290+
assert dependency is not None
291+
graph = DependencyGraph(dependency, None, self._resolver)
292+
container = dependency.load()
293+
if container is not None:
294+
container.build_dependency_graph(graph)
295+
return graph
296+
297+
def build_notebook_dependency_graph(self, path: Path) -> DependencyGraph:
298+
dependency = self._resolver.resolve_notebook(path)
299+
assert dependency is not None
300+
graph = DependencyGraph(dependency, None, self._resolver)
301+
container = dependency.load()
302+
if container is not None:
303+
container.build_dependency_graph(graph)
304+
return graph

0 commit comments

Comments
 (0)