Skip to content

Commit c64de1a

Browse files
authored
Make MaybeTree the main Python AST entrypoint for constructing the syntax tree (#3550)
## Changes Make `MaybeTree` the main Python AST entrypoint for constructing the syntax tree: - Move the `@classmethod`s and `@staticmethod`s that construct a `MaybeTree` from the `Tree` to the `MaybeTree` class as this uses the `@classmethod` properly by returning the initialization of the for `cls` argument - Make the `@classmethod` that normalizes the source code before parsing the only entrypoint by removing the other `@classmethod` that does not normalize the source code before parsing to enforce normalization (and resolve the linked issues below) - Rename `normalized_parse` to `from_source_code` to match the commonly used naming for `classmethod`s within UCX - Remove `MaybeTree`'s methods `walk` and `first_statement` as those are repetitions from `Tree`'s methods ### Linked issues Resolves #3457 Resolves #3213 ### Functionality - [x] modified Python linting related code ### Tests - [x] added unit tests
1 parent 5ac7143 commit c64de1a

File tree

13 files changed

+248
-276
lines changed

13 files changed

+248
-276
lines changed

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
)
5252
from databricks.labs.ucx.source_code.linters.context import LinterContext
5353
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
54-
from databricks.labs.ucx.source_code.python.python_ast import Tree, PythonSequentialLinter
54+
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, Tree, PythonSequentialLinter
5555
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook
5656
from databricks.labs.ucx.source_code.path_lookup import PathLookup
5757
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
@@ -657,7 +657,7 @@ def _collect_from_notebook(
657657
if cell.language is CellLanguage.PYTHON:
658658
if inherited_tree is None:
659659
inherited_tree = Tree.new_module()
660-
maybe_tree = Tree.maybe_normalized_parse(cell.original_code)
660+
maybe_tree = MaybeTree.from_source_code(cell.original_code)
661661
if maybe_tree.failure:
662662
logger.warning(maybe_tree.failure.message)
663663
continue

src/databricks/labs/ucx/source_code/linters/pyspark.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,13 @@
2222
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2323
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter
2424
from databricks.labs.ucx.source_code.python.python_ast import (
25-
Tree,
26-
TreeHelper,
25+
DfsaPyCollector,
2726
MatchingVisitor,
27+
MaybeTree,
2828
PythonLinter,
2929
TablePyCollector,
30-
DfsaPyCollector,
30+
Tree,
31+
TreeHelper,
3132
)
3233

3334
logger = logging.getLogger(__name__)
@@ -412,7 +413,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
412413
yield from matcher.lint(self._from_table, self._index, self._session_state, node)
413414

414415
def apply(self, code: str) -> str:
415-
maybe_tree = Tree.maybe_parse(code)
416+
maybe_tree = MaybeTree.from_source_code(code)
416417
if not maybe_tree.tree:
417418
assert maybe_tree.failure is not None
418419
logger.warning(maybe_tree.failure.message)
@@ -486,7 +487,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
486487
def apply(self, code: str) -> str:
487488
if not self._sql_fixer:
488489
return code
489-
maybe_tree = Tree.maybe_normalized_parse(code)
490+
maybe_tree = MaybeTree.from_source_code(code)
490491
if maybe_tree.failure:
491492
logger.warning(maybe_tree.failure.message)
492493
return code

src/databricks/labs/ucx/source_code/notebooks/sources.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
UnresolvedPath,
3737
)
3838
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
39-
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter
39+
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, NodeBase, PythonSequentialLinter, Tree
4040
from databricks.labs.ucx.source_code.notebooks.cells import (
4141
CellLanguage,
4242
Cell,
@@ -196,7 +196,7 @@ def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) ->
196196
continue
197197

198198
def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]:
199-
maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code)
199+
maybe_tree = MaybeTree.from_source_code(python_cell.original_code)
200200
if maybe_tree.failure:
201201
yield maybe_tree.failure
202202
tree = maybe_tree.tree

src/databricks/labs/ucx/source_code/python/python_analyzer.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
UnresolvedPath,
2222
)
2323
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
24-
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase
24+
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, Tree, NodeBase
2525

2626
logger = logging.getLogger(__name__)
2727

@@ -93,7 +93,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext:
9393

9494
def _parse_and_extract_nodes(self) -> tuple[Tree | None, list[NodeBase], Iterable[DependencyProblem]]:
9595
problems: list[DependencyProblem] = []
96-
maybe_tree = Tree.maybe_normalized_parse(self._python_code)
96+
maybe_tree = MaybeTree.from_source_code(self._python_code)
9797
if maybe_tree.failure:
9898
return None, [], [DependencyProblem(maybe_tree.failure.code, maybe_tree.failure.message)]
9999
assert maybe_tree.tree is not None

src/databricks/labs/ucx/source_code/python/python_ast.py

Lines changed: 70 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -49,88 +49,91 @@
4949

5050
@dataclass(frozen=True)
5151
class MaybeTree:
52-
tree: Tree | None
53-
failure: Failure | None
52+
"""A :class:`Tree` or a :class:`Failure`.
53+
54+
The `MaybeTree` is designed to either contain a `Tree` OR a `Failure`,
55+
never both or neither. Typically, a `Tree` is constructed using the
56+
`MaybeTree` class method(s) that yields a `Failure` if the `Tree` could
57+
NOT be constructed, otherwise it yields the `Tree`, resulting in code that
58+
looks like:
59+
60+
``` python
61+
maybe_tree = Tree.from_source_code("print(1)")
62+
if maybe_tree.failure:
63+
# Handle failure and return early
64+
assert maybe_tree.tree, "Tree should be not-None when Failure is None."
65+
# Use tree
66+
```
67+
"""
5468

55-
def walk(self) -> Iterable[NodeNG]:
56-
# mainly a helper method for unit testing
57-
if self.tree is None: # no cov
58-
assert self.failure is not None
59-
logger.warning(self.failure.message)
60-
return []
61-
return self.tree.walk()
69+
tree: Tree | None
70+
"""The UCX Python abstract syntax tree object"""
6271

63-
def first_statement(self) -> NodeNG | None:
64-
# mainly a helper method for unit testing
65-
if self.tree is None: # no cov
66-
assert self.failure is not None
67-
logger.warning(self.failure.message)
68-
return None
69-
return self.tree.first_statement()
72+
failure: Failure | None
73+
"""The failure during constructing the tree"""
7074

75+
def __post_init__(self):
76+
if self.tree is None and self.failure is None:
77+
raise ValueError(f"Tree and failure should not be both `None`: {self}")
78+
if self.tree is not None and self.failure is not None:
79+
raise ValueError(f"Tree and failure should not be both given: {self}")
7180

72-
class Tree: # pylint: disable=too-many-public-methods
81+
@classmethod
82+
def from_source_code(cls, code: str) -> MaybeTree:
83+
"""Normalize and parse the source code to get a `Tree` or parse `Failure`."""
84+
code = cls._normalize(code)
85+
return cls._maybe_parse(code)
7386

7487
@classmethod
75-
def maybe_parse(cls, code: str) -> MaybeTree:
88+
def _maybe_parse(cls, code: str) -> MaybeTree:
7689
try:
7790
root = parse(code)
7891
tree = Tree(root)
79-
return MaybeTree(tree, None)
92+
return cls(tree, None)
8093
except Exception as e: # pylint: disable=broad-exception-caught
8194
# see https://github.com/databrickslabs/ucx/issues/2976
82-
return cls._definitely_failure(code, e)
95+
failure = cls._failure_from_exception(code, e)
96+
return cls(None, failure)
8397

8498
@staticmethod
85-
def _definitely_failure(source_code: str, e: Exception) -> MaybeTree:
99+
def _failure_from_exception(source_code: str, e: Exception) -> Failure:
86100
if isinstance(e, AstroidSyntaxError) and isinstance(e.error, SyntaxError):
87-
return MaybeTree(
88-
None,
89-
Failure(
90-
code="python-parse-error",
91-
message=f"Failed to parse code due to invalid syntax: {source_code}",
92-
# Lines and columns are both 0-based: the first line is line 0.
93-
start_line=(e.error.lineno or 1) - 1,
94-
start_col=(e.error.offset or 1) - 1,
95-
end_line=(e.error.end_lineno or 2) - 1,
96-
end_col=(e.error.end_offset or 2) - 1,
97-
),
101+
return Failure(
102+
code="python-parse-error",
103+
message=f"Failed to parse code due to invalid syntax: {source_code}",
104+
# Lines and columns are both 0-based: the first line is line 0.
105+
start_line=(e.error.lineno or 1) - 1,
106+
start_col=(e.error.offset or 1) - 1,
107+
end_line=(e.error.end_lineno or 2) - 1,
108+
end_col=(e.error.end_offset or 2) - 1,
98109
)
99110
new_issue_url = (
100111
"https://github.com/databrickslabs/ucx/issues/new?title=[BUG]:+Python+parse+error"
101112
"&labels=migrate/code,needs-triage,bug"
102113
"&body=%23+Current+behaviour%0A%0ACannot+parse+the+following+Python+code"
103114
f"%0A%0A%60%60%60+python%0A{urllib.parse.quote_plus(source_code)}%0A%60%60%60"
104115
)
105-
return MaybeTree(
106-
None,
107-
Failure(
108-
code="python-parse-error",
109-
message=(
110-
f"Please report the following error as an issue on UCX GitHub: {new_issue_url}\n"
111-
f"Caught error `{type(e)} : {e}` while parsing code: {source_code}"
112-
),
113-
# Lines and columns are both 0-based: the first line is line 0.
114-
start_line=0,
115-
start_col=0,
116-
end_line=1,
117-
end_col=1,
116+
return Failure(
117+
code="python-parse-error",
118+
message=(
119+
f"Please report the following error as an issue on UCX GitHub: {new_issue_url}\n"
120+
f"Caught error `{type(e)} : {e}` while parsing code: {source_code}"
118121
),
122+
# Lines and columns are both 0-based: the first line is line 0.
123+
start_line=0,
124+
start_col=0,
125+
end_line=1,
126+
end_col=1,
119127
)
120128

121129
@classmethod
122-
def maybe_normalized_parse(cls, code: str) -> MaybeTree:
123-
code = cls.normalize(code)
124-
return cls.maybe_parse(code)
125-
126-
@classmethod
127-
def normalize(cls, code: str) -> str:
130+
def _normalize(cls, code: str) -> str:
128131
code = cls._normalize_indents(code)
129132
code = cls._convert_magic_lines_to_magic_commands(code)
130133
return code
131134

132-
@classmethod
133-
def _normalize_indents(cls, python_code: str) -> str:
135+
@staticmethod
136+
def _normalize_indents(python_code: str) -> str:
134137
lines = python_code.split("\n")
135138
for line in lines:
136139
# skip leading ws and comments
@@ -148,8 +151,8 @@ def _normalize_indents(cls, python_code: str) -> str:
148151
return "\n".join(lines)
149152
return python_code
150153

151-
@classmethod
152-
def _convert_magic_lines_to_magic_commands(cls, python_code: str) -> str:
154+
@staticmethod
155+
def _convert_magic_lines_to_magic_commands(python_code: str) -> str:
153156
lines = python_code.split("\n")
154157
magic_markers = {"%", "!"}
155158
in_multi_line_comment = False
@@ -165,10 +168,14 @@ def _convert_magic_lines_to_magic_commands(cls, python_code: str) -> str:
165168
in_multi_line_comment = not in_multi_line_comment
166169
return "\n".join(lines)
167170

171+
172+
class Tree:
173+
"""The UCX Python abstract syntax tree object"""
174+
168175
@classmethod
169176
def new_module(cls) -> Tree:
170177
node = Module("root")
171-
return Tree(node)
178+
return cls(node)
172179

173180
def __init__(self, node: NodeNG):
174181
self._node: NodeNG = node
@@ -187,11 +194,10 @@ def root(self) -> NodeNG:
187194
def walk(self) -> Iterable[NodeNG]:
188195
yield from self._walk(self._node)
189196

190-
@classmethod
191-
def _walk(cls, node: NodeNG) -> Iterable[NodeNG]:
197+
def _walk(self, node: NodeNG) -> Iterable[NodeNG]:
192198
yield node
193199
for child in node.get_children():
194-
yield from cls._walk(child)
200+
yield from self._walk(child)
195201

196202
def locate(self, node_type: type[T], match_nodes: list[tuple[str, type]]) -> list[T]:
197203
visitor = MatchingVisitor(node_type, match_nodes)
@@ -627,7 +633,7 @@ def __repr__(self):
627633
class PythonLinter(Linter):
628634

629635
def lint(self, code: str) -> Iterable[Advice]:
630-
maybe_tree = Tree.maybe_normalized_parse(code)
636+
maybe_tree = MaybeTree.from_source_code(code)
631637
if maybe_tree.failure:
632638
yield maybe_tree.failure
633639
return
@@ -641,7 +647,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...
641647
class TablePyCollector(TableCollector, ABC):
642648

643649
def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
644-
maybe_tree = Tree.maybe_normalized_parse(source_code)
650+
maybe_tree = MaybeTree.from_source_code(source_code)
645651
if maybe_tree.failure:
646652
logger.warning(maybe_tree.failure.message)
647653
return
@@ -656,7 +662,7 @@ def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: ...
656662
class DfsaPyCollector(DfsaCollector, ABC):
657663

658664
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
659-
maybe_tree = Tree.maybe_normalized_parse(source_code)
665+
maybe_tree = MaybeTree.from_source_code(source_code)
660666
if maybe_tree.failure:
661667
logger.warning(maybe_tree.failure.message)
662668
return
@@ -694,7 +700,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
694700
yield from linter.lint_tree(tree)
695701

696702
def _parse_and_append(self, code: str) -> MaybeTree:
697-
maybe_tree = Tree.maybe_normalized_parse(code)
703+
maybe_tree = MaybeTree.from_source_code(code)
698704
if maybe_tree.failure:
699705
return maybe_tree
700706
assert maybe_tree.tree is not None
@@ -712,7 +718,7 @@ def append_globals(self, globs: dict) -> None:
712718

713719
def process_child_cell(self, code: str) -> None:
714720
this_tree = self._make_tree()
715-
maybe_tree = Tree.maybe_normalized_parse(code)
721+
maybe_tree = MaybeTree.from_source_code(code)
716722
if maybe_tree.failure:
717723
# TODO: bubble up this error
718724
logger.warning(maybe_tree.failure.message)

tests/integration/source_code/message_codes.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from databricks.labs.blueprint.wheels import ProductInfo
33

44
from databricks.labs.ucx.source_code.base import Advice
5-
from databricks.labs.ucx.source_code.python.python_ast import Tree
5+
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree
66

77

88
def main():
@@ -11,7 +11,7 @@ def main():
1111
product_info = ProductInfo.from_class(Advice)
1212
source_code = product_info.version_file().parent
1313
for file in source_code.glob("**/*.py"):
14-
maybe_tree = Tree.maybe_parse(file.read_text())
14+
maybe_tree = MaybeTree.from_source_code(file.read_text())
1515
if not maybe_tree.tree:
1616
continue
1717
tree = maybe_tree.tree

0 commit comments

Comments
 (0)