Skip to content

Commit eb6009a

Browse files
authored
Fix notebook sources with NotebookLinter.apply (#3693)
## Changes Add notebook fixing to `NotebookLinter` - Implement `Notebook.apply` - Call `Notebook.apply` from `FileLinter.apply` for `Notebook` source container - Remove legacy `NotebookMigrator` - Introduce `PythonLinter` to run apply on an AST tree ### Linked issues Progresses #3514 Breaks up #3520 ### Functionality - [x] modified existing command: `databricks labs ucx migrate-local-code` ### Tests - [x] manually tested - [x] modified and added unit tests - [x] modified and added integration tests
1 parent 01c4263 commit eb6009a

File tree

16 files changed

+561
-135
lines changed

16 files changed

+561
-135
lines changed

src/databricks/labs/ucx/github.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from enum import Enum
2+
import urllib.parse
3+
4+
5+
DOCS_URL = "https://databrickslabs.github.io/ucx/docs/"
6+
GITHUB_URL = "https://github.com/databrickslabs/ucx"
7+
8+
9+
class IssueType(Enum):
10+
"""The issue type"""
11+
12+
FEATURE = "Feature"
13+
BUG = "Bug"
14+
TASK = "Task"
15+
16+
17+
def construct_new_issue_url(
18+
issue_type: IssueType,
19+
title: str,
20+
body: str,
21+
*,
22+
labels: set[str] | None = None,
23+
) -> str:
24+
"""Construct a new issue URL.
25+
26+
References:
27+
- https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query
28+
"""
29+
labels = labels or set()
30+
labels.add("needs-triage")
31+
parameters = {
32+
"type": issue_type.value,
33+
"title": title,
34+
"body": body,
35+
"labels": ",".join(sorted(labels)),
36+
}
37+
query = "&".join(f"{key}={urllib.parse.quote_plus(value)}" for key, value in parameters.items())
38+
return f"{GITHUB_URL}/issues/new?{query}"

src/databricks/labs/ucx/install.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
)
4949
from databricks.sdk.useragent import with_extra
5050

51+
from databricks.labs.ucx.github import DOCS_URL
5152
from databricks.labs.ucx.__about__ import __version__
5253
from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo
5354
from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo
@@ -218,7 +219,7 @@ def run(
218219
if isinstance(err.__cause__, RequestsConnectionError):
219220
logger.warning(
220221
f"Cannot connect with {self.workspace_client.config.host} see "
221-
f"https://github.com/databrickslabs/ucx#network-connectivity-issues for help: {err}"
222+
f"{DOCS_URL}#reference/common_challenges/#network-connectivity-issues for help: {err}"
222223
)
223224
raise err
224225
return config
@@ -573,7 +574,7 @@ def _create_database(self):
573574
if "Unable to load AWS credentials from any provider in the chain" in str(err):
574575
msg = (
575576
"The UCX installation is configured to use external metastore. There is issue with the external metastore connectivity.\n"
576-
"Please check the UCX installation instruction https://github.com/databrickslabs/ucx?tab=readme-ov-file#prerequisites"
577+
f"Please check the UCX installation instruction {DOCS_URL}/installation "
577578
"and re-run installation.\n"
578579
"Please Follow the Below Command to uninstall and Install UCX\n"
579580
"UCX Uninstall: databricks labs uninstall ucx.\n"

src/databricks/labs/ucx/source_code/known.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from databricks.labs.blueprint.entrypoint import get_logger
1616

17+
from databricks.labs.ucx.github import GITHUB_URL
1718
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
1819
from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState
1920
from databricks.labs.ucx.source_code.graph import (
@@ -24,6 +25,7 @@
2425
from databricks.labs.ucx.source_code.path_lookup import PathLookup
2526

2627
logger = logging.getLogger(__name__)
28+
KNOWN_URL = f"{GITHUB_URL}/blob/main/src/databricks/labs/ucx/source_code/known.json"
2729

2830
"""
2931
Known libraries that are not in known.json
@@ -282,10 +284,9 @@ class KnownDependency(Dependency):
282284
"""A dependency for known libraries, see :class:KnownList."""
283285

284286
def __init__(self, module_name: str, problems: list[KnownProblem]):
285-
known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json"
286287
# Note that Github does not support navigating JSON files, hence the #<module_name> does nothing.
287288
# https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github
288-
super().__init__(KnownLoader(), Path(f"{known_url}#{module_name}"), inherits_context=False)
289+
super().__init__(KnownLoader(), Path(f"{KNOWN_URL}#{module_name}"), inherits_context=False)
289290
self._module_name = module_name
290291
self.problems = problems
291292

src/databricks/labs/ucx/source_code/linters/base.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,35 @@ def lint(self, code: str) -> Iterable[Advice]:
114114
def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...
115115

116116

117+
class PythonFixer(Fixer):
    """Fix python source code."""

    def apply(self, code: str) -> str:
        """Parse *code*, run the fixes on its AST, and unparse the result.

        When the source does not parse, the code is returned unchanged:
        surfacing parse failures is the linter's job, not the fixer's.
        """
        maybe_tree = MaybeTree.from_source_code(code)
        if maybe_tree.failure:
            # Fixing does not yield parse failures, linting does
            logger.warning(f"Parsing source code resulted in failure `{maybe_tree.failure}`: {code}")
            return code
        assert maybe_tree.tree is not None
        fixed = self.apply_tree(maybe_tree.tree)
        return fixed.node.as_string()

    @abstractmethod
    def apply_tree(self, tree: Tree) -> Tree:
        """Apply the fixes to the AST tree.

        For Python, the fixes are applied to a Tree so that we
        - Can chain multiple fixers without transpiling back and forth between
          source code and AST tree
        - Can extend the tree with (brought into scope) nodes, e.g. to add imports
        """
145+
117146
class DfsaPyCollector(DfsaCollector, ABC):
118147

119148
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:

src/databricks/labs/ucx/source_code/linters/files.py

Lines changed: 43 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
from databricks.labs.ucx.source_code.files import LocalFile
1818
from databricks.labs.ucx.source_code.graph import Dependency
1919
from databricks.labs.ucx.source_code.known import KnownDependency
20-
from databricks.labs.ucx.source_code.linters.base import PythonLinter
20+
from databricks.labs.ucx.source_code.linters.base import PythonFixer, PythonLinter
2121
from databricks.labs.ucx.source_code.linters.context import LinterContext
2222
from databricks.labs.ucx.source_code.linters.imports import SysPathChange, UnresolvedPath
2323
from databricks.labs.ucx.source_code.notebooks.cells import (
@@ -26,7 +26,6 @@
2626
RunCell,
2727
RunCommand,
2828
)
29-
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader
3029
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
3130
from databricks.labs.ucx.source_code.notebooks.sources import Notebook
3231
from databricks.labs.ucx.source_code.path_lookup import PathLookup
@@ -42,7 +41,11 @@ class NotebookLinter:
4241
"""
4342

4443
def __init__(
45-
self, notebook: Notebook, path_lookup: PathLookup, context: LinterContext, parent_tree: Tree | None = None
44+
self,
45+
notebook: Notebook,
46+
path_lookup: PathLookup,
47+
context: LinterContext,
48+
parent_tree: Tree | None = None,
4649
):
4750
self._context: LinterContext = context
4851
self._path_lookup = path_lookup
@@ -76,6 +79,37 @@ def lint(self) -> Iterable[Advice]:
7679
)
7780
return
7881

82+
def apply(self) -> None:
    """Apply changes to the notebook."""
    # Bail out when the notebook does not parse as a whole; `lint` reports
    # the per-cell details, so a summary warning is enough here.
    maybe_tree = self._parse_notebook(self._notebook, parent_tree=self._parent_tree)
    if maybe_tree and maybe_tree.failure:
        logger.warning("Failed to parse the notebook, run linter for more details.")
        return
    for cell in self._notebook.cells:
        try:
            linter = self._context.linter(cell.language.language)
        except ValueError:  # Language is not supported (yet)
            continue
        fixed_code = cell.original_code  # For default fixing
        tree = self._python_tree_cache.get((self._notebook.path, cell))  # For Python fixing
        is_python_cell = isinstance(cell, PythonCell)
        # Python cells with a cached tree are linted on the tree; everything
        # else is linted from the raw source string.
        if is_python_cell and tree:
            advices = cast(PythonLinter, linter).lint_tree(tree)
        else:
            advices = linter.lint(cell.original_code)
        for advice in advices:
            fixer = self._context.fixer(cell.language.language, advice.code)
            if not fixer:
                continue
            if is_python_cell and tree:
                # By calling `apply_tree` instead of `apply`, we chain fixes on the same tree
                tree = cast(PythonFixer, fixer).apply_tree(tree)
            else:
                fixed_code = fixer.apply(fixed_code)
        # Python cells are unparsed from the (possibly fixed) tree; other
        # cells keep the incrementally fixed source string.
        cell.migrated_code = tree.node.as_string() if tree else fixed_code
    self._notebook.back_up_original_and_flush_migrated_code()
    return
112+
79113
def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree | None:
80114
"""Parse a notebook by parsing its cells.
81115
@@ -264,50 +298,16 @@ def apply(self) -> None:
264298
source_container = self._dependency.load(self._path_lookup)
265299
if isinstance(source_container, LocalFile):
266300
self._apply_file(source_container)
301+
elif isinstance(source_container, Notebook):
302+
self._apply_notebook(source_container)
267303

268304
def _apply_file(self, local_file: LocalFile) -> None:
269305
"""Apply changes to a local file."""
270306
fixed_code = self._context.apply_fixes(local_file.language, local_file.original_code)
271307
local_file.migrated_code = fixed_code
272308
local_file.back_up_original_and_flush_migrated_code()
273309

274-
275-
class NotebookMigrator:
276-
def __init__(self, languages: LinterContext):
277-
# TODO: move languages to `apply`
278-
self._languages = languages
279-
280-
def revert(self, path: Path) -> bool:
281-
backup_path = path.with_suffix(".bak")
282-
if not backup_path.exists():
283-
return False
284-
return path.write_text(backup_path.read_text()) > 0
285-
286-
def apply(self, path: Path) -> bool:
287-
if not path.exists():
288-
return False
289-
dependency = Dependency(NotebookLoader(), path)
290-
# TODO: the interface for this method has to be changed
291-
lookup = PathLookup.from_sys_path(Path.cwd())
292-
container = dependency.load(lookup)
293-
assert isinstance(container, Notebook)
294-
return self._apply(container)
295-
296-
def _apply(self, notebook: Notebook) -> bool:
297-
changed = False
298-
for cell in notebook.cells:
299-
# %run is not a supported language, so this needs to come first
300-
if isinstance(cell, RunCell):
301-
# TODO migration data, see https://github.com/databrickslabs/ucx/issues/1327
302-
continue
303-
if not self._languages.is_supported(cell.language.language):
304-
continue
305-
migrated_code = self._languages.apply_fixes(cell.language.language, cell.original_code)
306-
if migrated_code != cell.original_code:
307-
cell.migrated_code = migrated_code
308-
changed = True
309-
if changed:
310-
# TODO https://github.com/databrickslabs/ucx/issues/1327 store 'migrated' status
311-
notebook.path.replace(notebook.path.with_suffix(".bak"))
312-
notebook.path.write_text(notebook.to_migrated_code())
313-
return changed
310+
def _apply_notebook(self, notebook: Notebook) -> None:
    """Run the notebook linter's fixers over *notebook* and persist the result."""
    linter = NotebookLinter(notebook, self._path_lookup, self._context, self._inherited_tree)
    linter.apply()

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 22 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from typing import TypeVar
77

88
from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore
9+
10+
from databricks.labs.ucx.github import IssueType, construct_new_issue_url
911
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus
1012
from databricks.labs.ucx.source_code.base import (
1113
Advice,
@@ -20,6 +22,7 @@
2022
from databricks.labs.ucx.source_code.linters.base import (
2123
SqlLinter,
2224
Fixer,
25+
PythonFixer,
2326
PythonLinter,
2427
DfsaPyCollector,
2528
TablePyCollector,
@@ -33,7 +36,6 @@
3336
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter
3437
from databricks.labs.ucx.source_code.python.python_ast import (
3538
MatchingVisitor,
36-
MaybeTree,
3739
Tree,
3840
TreeHelper,
3941
)
@@ -170,8 +172,16 @@ def lint(
170172

171173
def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node: Call) -> None:
172174
table_arg = self._get_table_arg(node)
173-
assert isinstance(table_arg, Const)
174-
# TODO locate constant when value is inferred
175+
if not isinstance(table_arg, Const):
176+
# TODO: https://github.com/databrickslabs/ucx/issues/3695
177+
source_code = node.as_string()
178+
body = (
179+
"# Desired behaviour\n\nAutofix following Python code\n\n"
180+
f"``` python\nTODO: Add relevant source code\n{source_code}\n```"
181+
)
182+
url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body)
183+
logger.warning(f"Cannot fix the following Python code: {source_code}. Please report this issue at {url}")
184+
return
175185
info = UsedTable.parse(table_arg.value, from_table.schema)
176186
dst = self._find_dest(index, info)
177187
if dst is not None:
@@ -393,7 +403,7 @@ def matchers(self) -> dict[str, _TableNameMatcher]:
393403
return self._matchers
394404

395405

396-
class SparkTableNamePyLinter(PythonLinter, Fixer, TablePyCollector):
406+
class SparkTableNamePyLinter(PythonLinter, PythonFixer, TablePyCollector):
397407
"""Linter for table name references in PySpark
398408
399409
Examples:
@@ -427,21 +437,15 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
427437
assert isinstance(node, Call)
428438
yield from matcher.lint(self._from_table, self._index, self._session_state, node)
429439

430-
def apply(self, code: str) -> str:
431-
maybe_tree = MaybeTree.from_source_code(code)
432-
if not maybe_tree.tree:
433-
assert maybe_tree.failure is not None
434-
logger.warning(maybe_tree.failure.message)
435-
return code
436-
tree = maybe_tree.tree
437-
# we won't be doing it like this in production, but for the sake of the example
440+
def apply_tree(self, tree: Tree) -> Tree:
    """Rewrite matching table-name call nodes in-place and return the tree."""
    for node in tree.walk():
        matcher = self._find_matcher(node)
        if matcher is not None:
            # `_find_matcher` only matches Call nodes, hence the assert.
            assert isinstance(node, Call)
            matcher.apply(self._from_table, self._index, node)
    return tree
445449

446450
def _find_matcher(self, node: NodeNG) -> _TableNameMatcher | None:
447451
if not isinstance(node, Call):
@@ -476,7 +480,7 @@ def _visit_call_nodes(cls, tree: Tree) -> Iterable[tuple[Call, NodeNG]]:
476480
yield call_node, query
477481

478482

479-
class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, Fixer):
483+
class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, PythonFixer):
480484
"""Linter for SparkSQL used within PySpark."""
481485

482486
def __init__(self, sql_linter: SqlLinter, sql_fixer: Fixer | None):
@@ -503,23 +507,18 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
503507
code = self.diagnostic_code
504508
yield dataclasses.replace(advice.replace_from_node(call_node), code=code)
505509

506-
def apply(self, code: str) -> str:
510+
def apply_tree(self, tree: Tree) -> Tree:
    """Fix SQL string constants found in spark-sql call nodes of the tree."""
    if not self._sql_fixer:
        # No SQL fixer configured: nothing to rewrite.
        return tree
    for _call_node, query in self._visit_call_nodes(tree):
        if isinstance(query, Const) and isinstance(query.value, str):
            # TODO avoid applying same fix multiple times
            # this requires changing 'apply' API in order to check advice fragment location
            query.value = self._sql_fixer.apply(query.value)
    return tree
523522

524523

525524
class FromTableSqlPyLinter(_SparkSqlPyLinter):

0 commit comments

Comments
 (0)