Skip to content

Commit 3583396

Browse files
authored
Support ignoring files and notebooks (#3653)
<!-- REMOVE IRRELEVANT COMMENTS BEFORE CREATING A PULL REQUEST --> ## Changes Support ignoring files and notebooks ### Linked issues Progresses #3645 ### Functionality - [x] modified file and notebook loading in the source code related modules ### Tests - [x] added unit tests
1 parent 7c46c53 commit 3583396

File tree

6 files changed

+163
-54
lines changed

6 files changed

+163
-54
lines changed

src/databricks/labs/ucx/source_code/files.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
BaseImportResolver,
1818
BaseFileResolver,
1919
MaybeDependency,
20+
StubContainer,
2021
)
2122
from databricks.labs.ucx.source_code.known import KnownList
2223
from databricks.labs.ucx.source_code.path_lookup import PathLookup
@@ -70,24 +71,30 @@ def __repr__(self):
7071
return f"<LocalFile {self._path}>"
7172

7273

73-
class StubContainer(SourceContainer):
74-
75-
def __init__(self, path: Path):
76-
super().__init__()
77-
self._path = path
74+
class FileLoader(DependencyLoader):
75+
"""Loader for a file dependency.
7876
79-
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
80-
return []
77+
Args:
78+
exclude_paths (set[Path] | None) : A set of paths to load as
79+
class:`StubContainer`. If None, no paths are excluded.
8180
81+
Note: The exclude paths are loaded as `StubContainer` to
82+
signal that the path is found, however, it should not be
83+
processed.
84+
"""
8285

83-
class FileLoader(DependencyLoader):
84-
"""Loader for a file dependency."""
86+
def __init__(self, *, exclude_paths: set[Path] | None = None):
87+
self._exclude_paths = exclude_paths or set[Path]()
8588

8689
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> LocalFile | StubContainer | None:
8790
"""Load the dependency."""
8891
resolved_path = path_lookup.resolve(dependency.path)
8992
if not resolved_path:
9093
return None
94+
if resolved_path in self._exclude_paths:
95+
# Paths are excluded from further processing by loading them as stub container.
96+
# Note we don't return `None`, as the path is found.
97+
return StubContainer(resolved_path)
9198
language = file_language(resolved_path)
9299
if not language:
93100
return StubContainer(resolved_path)

src/databricks/labs/ucx/source_code/graph.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,3 +600,13 @@ def finalize(self) -> InheritedContext:
600600
return self
601601
tree = self.tree.renumber(-1)
602602
return InheritedContext(tree, self.found, [])
603+
604+
605+
class StubContainer(SourceContainer):
606+
607+
def __init__(self, path: Path):
608+
super().__init__()
609+
self._path = path
610+
611+
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
612+
return []

src/databricks/labs/ucx/source_code/notebooks/loaders.py

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,18 @@
11
from __future__ import annotations
22

3-
import abc
43
import logging
54
from pathlib import Path
65
from typing import TypeVar
76

8-
from databricks.sdk.errors import NotFound
97
from databricks.sdk.service.workspace import Language
108

11-
from databricks.labs.ucx.source_code.base import is_a_notebook, file_language
9+
from databricks.labs.ucx.source_code.base import is_a_notebook, file_language, safe_read_text
1210
from databricks.labs.ucx.source_code.graph import (
1311
BaseNotebookResolver,
1412
Dependency,
1513
DependencyLoader,
1614
MaybeDependency,
17-
SourceContainer,
15+
StubContainer,
1816
)
1917
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
2018
from databricks.labs.ucx.source_code.notebooks.sources import Notebook
@@ -40,10 +38,32 @@ def resolve_notebook(self, path_lookup: PathLookup, path: Path, inherit_context:
4038
return MaybeDependency(dependency, [])
4139

4240

43-
class NotebookLoader(DependencyLoader, abc.ABC):
41+
class NotebookLoader(DependencyLoader):
42+
"""Load a notebook.
43+
44+
Args:
45+
exclude_paths (set[Path] | None) : A set of paths to load as
46+
class:`StubContainer`. If None, no paths are excluded.
47+
48+
Note: The exclude paths are loaded as `StubContainer` to
49+
signal that the path is found, however, it should not be
50+
processed.
51+
52+
TODO:
53+
Let `NotebookLoader` inherit from `FileLoader` and reuse the
54+
implementation of `load_dependency` to first load the file, then
55+
convert it to a `Notebook` if it is a notebook source.
56+
"""
57+
58+
def __init__(self, *, exclude_paths: set[Path] | None = None):
59+
self._exclude_paths = exclude_paths or set[Path]()
60+
4461
def resolve(self, path_lookup: PathLookup, path: Path) -> Path | None:
45-
"""If the path is a Python file, return the path to the Python file. If the path is neither,
46-
return None."""
62+
"""Resolve the notebook path.
63+
64+
If the path is a Python file, return the path to the Python file. If the path is neither,
65+
return None.
66+
"""
4767
# check current working directory first
4868
absolute_path = path_lookup.cwd / path
4969
absolute_path = absolute_path.resolve()
@@ -58,29 +78,27 @@ def resolve(self, path_lookup: PathLookup, path: Path) -> Path | None:
5878
return a_path
5979
return None
6080

61-
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
62-
absolute_path = self.resolve(path_lookup, dependency.path)
63-
if not absolute_path:
64-
return None
65-
try:
66-
content = absolute_path.read_text("utf-8")
67-
except NotFound:
68-
logger.warning(f"Path not found trying to read notebook from workspace: {absolute_path}")
81+
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> Notebook | StubContainer | None:
82+
"""Load the notebook dependency."""
83+
resolved_path = self.resolve(path_lookup, dependency.path)
84+
if not resolved_path:
6985
return None
70-
except PermissionError:
71-
logger.warning(
72-
f"Permission error while reading notebook from workspace: {absolute_path}",
73-
exc_info=True,
74-
)
86+
if resolved_path in self._exclude_paths:
87+
# Paths are excluded from further processing by loading them as stub container.
88+
# Note we don't return `None`, as the path is found.
89+
return StubContainer(resolved_path)
90+
content = safe_read_text(resolved_path)
91+
if content is None:
7592
return None
76-
except UnicodeDecodeError:
77-
logger.warning(f"Cannot decode non-UTF-8 encoded notebook from workspace: {absolute_path}")
78-
return None
79-
language = self._detect_language(absolute_path, content)
93+
language = self._detect_language(resolved_path, content)
8094
if not language:
81-
logger.warning(f"Could not detect language for {absolute_path}")
95+
logger.warning(f"Could not detect language for {resolved_path}")
96+
return None
97+
try:
98+
return Notebook.parse(resolved_path, content, language)
99+
except ValueError as e:
100+
logger.warning(f"Could not parse notebook for {resolved_path}", exc_info=e)
82101
return None
83-
return Notebook.parse(absolute_path, content, language)
84102

85103
@staticmethod
86104
def _detect_language(path: Path, content: str) -> Language | None:
Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
1+
import codecs
12
import logging
23
from pathlib import Path
34
from unittest.mock import create_autospec
45

56
import pytest
67
from databricks.sdk.service.workspace import Language
78

8-
from databricks.labs.ucx.source_code.graph import Dependency
9+
from databricks.labs.ucx.source_code.base import NOTEBOOK_HEADER
10+
from databricks.labs.ucx.source_code.graph import Dependency, StubContainer
911
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
12+
from databricks.labs.ucx.source_code.notebooks.sources import Notebook
1013
from databricks.labs.ucx.source_code.path_lookup import PathLookup
1114

1215

@@ -26,25 +29,85 @@ def detect_language(cls, path: Path, content: str):
2629

2730

2831
@pytest.mark.parametrize(
29-
"error, message",
32+
"error",
3033
[
31-
(PermissionError("Permission denied"), "Permission error while reading notebook from workspace"),
32-
(
33-
UnicodeDecodeError("utf-8", b"\x80\x81\x82", 0, 1, "invalid start byte"),
34-
"Cannot decode non-UTF-8 encoded notebook from workspace",
35-
),
34+
PermissionError("Permission denied"),
35+
FileNotFoundError("File not found"),
36+
UnicodeDecodeError("utf-8", b"\x80\x81\x82", 0, 1, "invalid start byte"),
3637
],
3738
)
38-
def test_notebook_loader_loads_dependency_raises_error(caplog, error: Exception, message: str) -> None:
39+
def test_notebook_loader_loads_dependency_raises_error(caplog, error: Exception) -> None:
3940
path = create_autospec(Path)
40-
path.read_text.side_effect = error
41+
path.suffix = ".py"
42+
path.open.side_effect = error
4143
path_lookup = create_autospec(PathLookup)
4244
path_lookup.resolve.return_value = path
43-
dependency = create_autospec(Dependency)
44-
dependency.path.return_value = path
45+
dependency = Dependency(NotebookLoader(), path)
4546

4647
with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.source_code.notebooks.loaders"):
47-
found = NotebookLoader().load_dependency(path_lookup, dependency)
48+
notebook = dependency.load(path_lookup)
4849

49-
assert f"{message}: {path}" in caplog.text
50-
assert found is None
50+
assert f"Could not read file: {path}" in caplog.text
51+
assert notebook is None
52+
53+
54+
def test_notebook_loader_ignores_loading_non_ascii_file(mock_path_lookup) -> None:
55+
dependency = Dependency(NotebookLoader(), Path("nonascii.py"))
56+
57+
notebook = dependency.load(mock_path_lookup)
58+
59+
# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
60+
assert notebook is None
61+
assert Path("nonascii.py") in mock_path_lookup.successfully_resolved_paths
62+
63+
64+
@pytest.mark.parametrize(
65+
"bom, encoding",
66+
[
67+
(codecs.BOM_UTF8, "utf-8"),
68+
(codecs.BOM_UTF16_LE, "utf-16-le"),
69+
(codecs.BOM_UTF16_BE, "utf-16-be"),
70+
(codecs.BOM_UTF32_LE, "utf-32-le"),
71+
(codecs.BOM_UTF32_BE, "utf-32-be"),
72+
],
73+
)
74+
def test_notebook_loader_loads_file_with_bom(tmp_path, bom, encoding) -> None:
75+
path = tmp_path / "file.py"
76+
path.write_bytes(bom + f"# {NOTEBOOK_HEADER}\na = 12".encode(encoding))
77+
dependency = Dependency(NotebookLoader(), path)
78+
path_lookup = create_autospec(PathLookup)
79+
path_lookup.resolve.return_value = path
80+
81+
notebook = dependency.load(path_lookup)
82+
83+
# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
84+
assert isinstance(notebook, Notebook)
85+
assert notebook.original_code == f"# {NOTEBOOK_HEADER}\na = 12"
86+
path_lookup.resolve.assert_called_once_with(path)
87+
88+
89+
def test_notebook_loader_cannot_load_empty_file(tmp_path) -> None:
90+
"""Empty file fails because it misses the notebook header."""
91+
path = tmp_path / "empty.py"
92+
path.write_text("")
93+
dependency = Dependency(NotebookLoader(), path)
94+
path_lookup = create_autospec(PathLookup)
95+
path_lookup.resolve.return_value = path
96+
97+
notebook = dependency.load(path_lookup)
98+
99+
# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
100+
assert not notebook
101+
path_lookup.resolve.assert_called_once_with(path)
102+
103+
104+
def test_notebook_loader_ignores_path_by_loading_it_as_stub_container(tmp_path) -> None:
105+
path = tmp_path / "path.py"
106+
dependency = Dependency(NotebookLoader(exclude_paths={path}), path)
107+
path_lookup = create_autospec(PathLookup)
108+
path_lookup.resolve.return_value = path
109+
110+
stub = dependency.load(path_lookup)
111+
112+
assert isinstance(stub, StubContainer)
113+
path_lookup.resolve.assert_called_once_with(path)

tests/unit/source_code/test_dependencies.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
Dependency,
66
DependencyProblem,
77
DependencyResolver,
8-
SourceContainer,
98
)
109
from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver
1110
from databricks.labs.ucx.source_code.notebooks.loaders import (
@@ -191,7 +190,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> No
191190
def test_dependency_resolver_raises_problem_with_unloadable_root_notebook(mock_path_lookup) -> None:
192191

193192
class FailingNotebookLoader(NotebookLoader):
194-
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
193+
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> None:
195194
return None
196195

197196
notebook_loader = FailingNotebookLoader()

tests/unit/source_code/test_files.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from databricks.sdk.service.workspace import Language
77

88
from databricks.labs.ucx.source_code.base import CurrentSessionState
9-
from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyProblem
9+
from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyProblem, StubContainer
1010
from databricks.labs.ucx.source_code.files import FileLoader, LocalFile
1111
from databricks.labs.ucx.source_code.path_lookup import PathLookup
1212

@@ -115,13 +115,13 @@ def test_file_loader_loads_file_without_permission() -> None:
115115
path_lookup.resolve.assert_called_once_with(path)
116116

117117

118-
def test_file_loader_loads_non_ascii_file(mock_path_lookup) -> None:
118+
def test_file_loader_ignores_loading_non_ascii_file(mock_path_lookup) -> None:
119119
dependency = Dependency(FileLoader(), Path("nonascii.py"))
120120

121121
local_file = dependency.load(mock_path_lookup)
122122

123123
# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
124-
assert local_file is None
124+
assert not local_file
125125
assert Path("nonascii.py") in mock_path_lookup.successfully_resolved_paths
126126

127127

@@ -177,3 +177,15 @@ def test_file_loader_loads_empty_file(tmp_path) -> None:
177177
# TODO: Test specific error while loading: https://github.com/databrickslabs/ucx/issues/3584
178178
assert local_file
179179
path_lookup.resolve.assert_called_once_with(path)
180+
181+
182+
def test_file_loader_ignores_path_by_loading_it_as_stub_container(tmp_path) -> None:
183+
path = tmp_path / "path.py"
184+
dependency = Dependency(FileLoader(exclude_paths={path}), path)
185+
path_lookup = create_autospec(PathLookup)
186+
path_lookup.resolve.return_value = path
187+
188+
stub = dependency.load(path_lookup)
189+
190+
assert isinstance(stub, StubContainer)
191+
path_lookup.resolve.assert_called_once_with(path)

0 commit comments

Comments
 (0)