Skip to content

Commit 08db6cb

Browse files
authored
Refactor linting related resources (#3633)
## Changes Refactor linting related resources by moving them into the `linters/` submodule **with the exception of lsp resources as I did not want to touch them**. This refactor PR should make the work on the linter easier as i) it is easier to find the linter resources, ii) those PR required small refactors because the resources were entangled. - Moved linting resources to the `linters` submodule. - Introduced `files.py` modules containing file related resources (files, notebooks, imports) - Introduced `folders.py` modules containing folder related resource (combining files and notebooks) - Introduced a `linters/base.py` for linter base resources. High level linter resources: - `WorkflowLinter` in `jobs.py` (kept `jobs.py` as that is consistent with other parts of the code base) - `LocalCodeLinter` in `folders.py`, lints a directory of files Mid level linter resources: - `FileLinter` and `FileMigrator` (will be merged later) - `NotebookLinter` and `NotebookMigrator` (will be merged later) Low level linter resources - `PythonAnalyzer` - All the linter and collectors that parse a Python tree
1 parent 8ff68ee commit 08db6cb

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+1350
-1270
lines changed

src/databricks/labs/ucx/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from databricks.labs.ucx.contexts.workspace_cli import WorkspaceContext, LocalCheckoutContext
2020
from databricks.labs.ucx.hive_metastore.tables import What
2121
from databricks.labs.ucx.install import AccountInstaller
22-
from databricks.labs.ucx.source_code.linters.files import LocalCodeLinter
22+
from databricks.labs.ucx.source_code.linters.folders import LocalCodeLinter
2323
from databricks.labs.ucx.workspace_access.groups import AccountGroupLookup
2424

2525
ucx = App(__file__)

src/databricks/labs/ucx/contexts/application.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,17 @@
6565
from databricks.labs.ucx.installer.workflows import DeployedWorkflows
6666
from databricks.labs.ucx.progress.install import VerifyProgressTracking
6767
from databricks.labs.ucx.source_code.graph import DependencyResolver
68-
from databricks.labs.ucx.source_code.jobs import WorkflowLinter
68+
from databricks.labs.ucx.source_code.linters.jobs import WorkflowLinter
6969
from databricks.labs.ucx.source_code.known import KnownList
70-
from databricks.labs.ucx.source_code.linters.files import FileLoader, FolderLoader, ImportFileResolver
70+
from databricks.labs.ucx.source_code.folders import FolderLoader
71+
from databricks.labs.ucx.source_code.files import FileLoader, ImportFileResolver
7172
from databricks.labs.ucx.source_code.notebooks.loaders import (
7273
NotebookResolver,
7374
NotebookLoader,
7475
)
7576
from databricks.labs.ucx.source_code.path_lookup import PathLookup
76-
from databricks.labs.ucx.source_code.queries import QueryLinter
77-
from databricks.labs.ucx.source_code.redash import Redash
77+
from databricks.labs.ucx.source_code.linters.queries import QueryLinter
78+
from databricks.labs.ucx.source_code.linters.redash import Redash
7879
from databricks.labs.ucx.workspace_access import generic, redash
7980
from databricks.labs.ucx.workspace_access.groups import GroupManager
8081
from databricks.labs.ucx.workspace_access.manager import PermissionManager

src/databricks/labs/ucx/contexts/workspace_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from databricks.labs.ucx.progress.install import ProgressTrackingInstallation
2424
from databricks.labs.ucx.source_code.base import CurrentSessionState
2525
from databricks.labs.ucx.source_code.linters.context import LinterContext
26-
from databricks.labs.ucx.source_code.linters.files import LocalFileMigrator, LocalCodeLinter
26+
from databricks.labs.ucx.source_code.linters.folders import LocalCodeLinter, LocalFileMigrator
2727
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
2828
from databricks.labs.ucx.workspace_access.clusters import ClusterAccess
2929

src/databricks/labs/ucx/install.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
from databricks.labs.ucx.source_code.base import UsedTable
7575
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccess
7676
from databricks.labs.ucx.source_code.jobs import JobProblem
77-
from databricks.labs.ucx.source_code.queries import QueryProblem
77+
from databricks.labs.ucx.source_code.linters.queries import QueryProblem
7878
from databricks.labs.ucx.workspace_access.base import Permissions
7979
from databricks.labs.ucx.workspace_access.generic import WorkspaceObjectInfo
8080
from databricks.labs.ucx.workspace_access.groups import AccountGroupLookup, ConfigureGroups, MigratedGroup

src/databricks/labs/ucx/progress/dashboards.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from databricks.labs.ucx.progress.history import ProgressEncoder
1212
from databricks.labs.ucx.progress.install import Historical
1313
from databricks.labs.ucx.source_code.base import UsedTable
14-
from databricks.labs.ucx.source_code.queries import QueryProblem
14+
from databricks.labs.ucx.source_code.linters.queries import QueryProblem
1515
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
1616

1717

src/databricks/labs/ucx/source_code/base.py

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@
1313
from typing import Any, BinaryIO, TextIO, TypeVar
1414

1515
from astroid import NodeNG # type: ignore
16-
from sqlglot import Expression, parse as parse_sql
17-
from sqlglot.errors import SqlglotError
1816

1917
from databricks.sdk.service import compute
2018
from databricks.sdk.service.workspace import Language
@@ -125,56 +123,6 @@ class Convention(Advice):
125123
"""A suggestion for a better way to write the code."""
126124

127125

128-
class Linter:
129-
@abstractmethod
130-
def lint(self, code: str) -> Iterable[Advice]: ...
131-
132-
133-
class SqlLinter(Linter):
134-
135-
def lint(self, code: str) -> Iterable[Advice]:
136-
try:
137-
# TODO: unify with SqlParser.walk_expressions(...)
138-
expressions = parse_sql(code, read='databricks')
139-
for expression in expressions:
140-
if not expression:
141-
continue
142-
yield from self.lint_expression(expression)
143-
except SqlglotError as e:
144-
logger.debug(f"Failed to parse SQL: {code}", exc_info=e)
145-
yield self.sql_parse_failure(code)
146-
147-
@staticmethod
148-
def sql_parse_failure(code: str) -> Failure:
149-
return Failure(
150-
code='sql-parse-error',
151-
message=f"SQL expression is not supported yet: {code}",
152-
# SQLGlot does not propagate tokens yet. See https://github.com/tobymao/sqlglot/issues/3159
153-
start_line=0,
154-
start_col=0,
155-
end_line=0,
156-
end_col=1024,
157-
)
158-
159-
@abstractmethod
160-
def lint_expression(self, expression: Expression) -> Iterable[Advice]: ...
161-
162-
163-
class Fixer(ABC):
164-
165-
@property
166-
@abstractmethod
167-
def diagnostic_code(self) -> str:
168-
"""The diagnostic code that this fixer fixes."""
169-
170-
def is_supported(self, diagnostic_code: str) -> bool:
171-
"""Indicate if the diagnostic code is supported by this fixer."""
172-
return self.diagnostic_code is not None and diagnostic_code == self.diagnostic_code
173-
174-
@abstractmethod
175-
def apply(self, code: str) -> str: ...
176-
177-
178126
@dataclass
179127
class LineageAtom:
180128

@@ -366,31 +314,6 @@ def parse_security_mode(mode_str: str | None) -> compute.DataSecurityMode | None
366314
return None
367315

368316

369-
class SqlSequentialLinter(SqlLinter, DfsaCollector, TableCollector):
370-
371-
def __init__(
372-
self,
373-
linters: list[SqlLinter],
374-
dfsa_collectors: list[DfsaSqlCollector],
375-
table_collectors: list[TableSqlCollector],
376-
):
377-
self._linters = linters
378-
self._dfsa_collectors = dfsa_collectors
379-
self._table_collectors = table_collectors
380-
381-
def lint_expression(self, expression: Expression) -> Iterable[Advice]:
382-
for linter in self._linters:
383-
yield from linter.lint_expression(expression)
384-
385-
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
386-
for collector in self._dfsa_collectors:
387-
yield from collector.collect_dfsas(source_code)
388-
389-
def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
390-
for collector in self._table_collectors:
391-
yield from collector.collect_tables(source_code)
392-
393-
394317
SUPPORTED_EXTENSION_LANGUAGES = {
395318
'.py': Language.PYTHON,
396319
'.sql': Language.SQL,
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
from __future__ import annotations
2+
3+
import dataclasses
4+
import logging
5+
from pathlib import Path
6+
7+
from databricks.sdk.service.workspace import Language
8+
9+
from databricks.labs.ucx.source_code.base import file_language
10+
from databricks.labs.ucx.source_code.graph import (
11+
SourceContainer,
12+
DependencyGraph,
13+
DependencyProblem,
14+
InheritedContext,
15+
DependencyLoader,
16+
Dependency,
17+
BaseImportResolver,
18+
BaseFileResolver,
19+
MaybeDependency,
20+
)
21+
from databricks.labs.ucx.source_code.known import KnownList
22+
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
23+
from databricks.labs.ucx.source_code.path_lookup import PathLookup
24+
from databricks.labs.ucx.source_code.linters.python import PythonCodeAnalyzer
25+
26+
27+
logger = logging.getLogger(__name__)
28+
29+
30+
class LocalFile(SourceContainer):
31+
32+
def __init__(self, path: Path, source: str, language: Language):
33+
self._path = path
34+
self._original_code = source
35+
# using CellLanguage so we can reuse the facilities it provides
36+
self._language = CellLanguage.of_language(language)
37+
38+
@property
39+
def path(self) -> Path:
40+
return self._path
41+
42+
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
43+
if self._language is CellLanguage.PYTHON:
44+
context = parent.new_dependency_graph_context()
45+
analyzer = PythonCodeAnalyzer(context, self._original_code)
46+
problems = analyzer.build_graph()
47+
for idx, problem in enumerate(problems):
48+
if problem.has_missing_path():
49+
problems[idx] = dataclasses.replace(problem, source_path=self._path)
50+
return problems
51+
# supported language that does not generate dependencies
52+
if self._language is CellLanguage.SQL:
53+
return []
54+
logger.warning(f"Unsupported language: {self._language.language}")
55+
return []
56+
57+
def build_inherited_context(self, graph: DependencyGraph, child_path: Path) -> InheritedContext:
58+
if self._language is CellLanguage.PYTHON:
59+
context = graph.new_dependency_graph_context()
60+
analyzer = PythonCodeAnalyzer(context, self._original_code)
61+
inherited = analyzer.build_inherited_context(child_path)
62+
problems = list(inherited.problems)
63+
for idx, problem in enumerate(problems):
64+
if problem.has_missing_path():
65+
problems[idx] = dataclasses.replace(problem, source_path=self._path)
66+
return dataclasses.replace(inherited, problems=problems)
67+
return InheritedContext(None, False, [])
68+
69+
def __repr__(self):
70+
return f"<LocalFile {self._path}>"
71+
72+
73+
class StubContainer(SourceContainer):
74+
75+
def __init__(self, path: Path):
76+
super().__init__()
77+
self._path = path
78+
79+
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
80+
return []
81+
82+
83+
class FileLoader(DependencyLoader):
84+
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
85+
absolute_path = path_lookup.resolve(dependency.path)
86+
if not absolute_path:
87+
return None
88+
language = file_language(absolute_path)
89+
if not language:
90+
return StubContainer(absolute_path)
91+
for encoding in ("utf-8", "ascii"):
92+
try:
93+
code = absolute_path.read_text(encoding)
94+
return LocalFile(absolute_path, code, language)
95+
except UnicodeDecodeError:
96+
pass
97+
return None
98+
99+
def exists(self, path: Path) -> bool:
100+
return path.exists()
101+
102+
def __repr__(self):
103+
return "FileLoader()"
104+
105+
106+
class ImportFileResolver(BaseImportResolver, BaseFileResolver):
107+
108+
def __init__(self, file_loader: FileLoader, allow_list: KnownList):
109+
super().__init__()
110+
self._allow_list = allow_list
111+
self._file_loader = file_loader
112+
113+
def resolve_file(self, path_lookup, path: Path) -> MaybeDependency:
114+
absolute_path = path_lookup.resolve(path)
115+
if absolute_path:
116+
return MaybeDependency(Dependency(self._file_loader, absolute_path), [])
117+
problem = DependencyProblem("file-not-found", f"File not found: {path.as_posix()}")
118+
return MaybeDependency(None, [problem])
119+
120+
def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
121+
maybe = self._resolve_allow_list(name)
122+
if maybe is not None:
123+
return maybe
124+
maybe = self._resolve_import(path_lookup, name)
125+
if maybe is not None:
126+
return maybe
127+
return self._fail('import-not-found', f"Could not locate import: {name}")
128+
129+
def _resolve_allow_list(self, name: str) -> MaybeDependency | None:
130+
compatibility = self._allow_list.module_compatibility(name)
131+
if not compatibility.known:
132+
logger.debug(f"Resolving unknown import: {name}")
133+
return None
134+
if not compatibility.problems:
135+
return MaybeDependency(None, [])
136+
# TODO move to linter, see https://github.com/databrickslabs/ucx/issues/1527
137+
return MaybeDependency(None, compatibility.problems)
138+
139+
def _resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency | None:
140+
if not name:
141+
return MaybeDependency(None, [DependencyProblem("ucx-bug", "Import name is empty")])
142+
parts = []
143+
# Relative imports use leading dots. A single leading dot indicates a relative import, starting with
144+
# the current package. Two or more leading dots indicate a relative import to the parent(s) of the current
145+
# package, one level per dot after the first.
146+
# see https://docs.python.org/3/reference/import.html#package-relative-imports
147+
for i, rune in enumerate(name):
148+
if not i and rune == '.': # leading single dot
149+
parts.append(path_lookup.cwd.as_posix())
150+
continue
151+
if rune != '.':
152+
parts.append(name[i:].replace('.', '/'))
153+
break
154+
parts.append("..")
155+
for candidate in (f'{"/".join(parts)}.py', f'{"/".join(parts)}/__init__.py'):
156+
relative_path = Path(candidate)
157+
absolute_path = path_lookup.resolve(relative_path)
158+
if not absolute_path:
159+
continue
160+
dependency = Dependency(self._file_loader, absolute_path, False)
161+
return MaybeDependency(dependency, [])
162+
return None
163+
164+
@staticmethod
165+
def _fail(code: str, message: str) -> MaybeDependency:
166+
return MaybeDependency(None, [DependencyProblem(code, message)])
167+
168+
def __repr__(self):
169+
return "ImportFileResolver()"
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
from collections.abc import Iterable
5+
6+
from databricks.labs.ucx.source_code.base import is_a_notebook
7+
from databricks.labs.ucx.source_code.files import FileLoader
8+
from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyGraph, DependencyProblem, Dependency
9+
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
10+
from databricks.labs.ucx.source_code.path_lookup import PathLookup
11+
12+
13+
class Folder(SourceContainer):
14+
15+
def __init__(
16+
self,
17+
path: Path,
18+
notebook_loader: NotebookLoader,
19+
file_loader: FileLoader,
20+
folder_loader: FolderLoader,
21+
):
22+
self._path = path
23+
self._notebook_loader = notebook_loader
24+
self._file_loader = file_loader
25+
self._folder_loader = folder_loader
26+
27+
@property
28+
def path(self) -> Path:
29+
return self._path
30+
31+
def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
32+
# don't directly scan non-source directories, let it be done for relevant imports only
33+
if self._path.name in {"__pycache__", ".git", ".github", ".venv", ".mypy_cache", "site-packages"}:
34+
return []
35+
return list(self._build_dependency_graph(parent))
36+
37+
def _build_dependency_graph(self, parent: DependencyGraph) -> Iterable[DependencyProblem]:
38+
for child_path in self._path.iterdir():
39+
is_file = child_path.is_file()
40+
is_notebook = is_a_notebook(child_path)
41+
loader = self._notebook_loader if is_notebook else self._file_loader if is_file else self._folder_loader
42+
dependency = Dependency(loader, child_path, inherits_context=is_notebook)
43+
yield from parent.register_dependency(dependency).problems
44+
45+
def __repr__(self):
46+
return f"<Folder {self._path}>"
47+
48+
49+
class FolderLoader(FileLoader):
50+
51+
def __init__(self, notebook_loader: NotebookLoader, file_loader: FileLoader):
52+
self._notebook_loader = notebook_loader
53+
self._file_loader = file_loader
54+
55+
def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> SourceContainer | None:
56+
absolute_path = path_lookup.resolve(dependency.path)
57+
if not absolute_path:
58+
return None
59+
return Folder(absolute_path, self._notebook_loader, self._file_loader, self)

0 commit comments

Comments
 (0)