Skip to content

Commit 62d2e27

Browse files
authored
Change logic of direct filesystem access linting (#2766)
## Changes Our current logic detects all string constants that match a DFSA pattern, excluding false positives on a per use case basis. This leaves many false positives. Practically, we only care about DFSA if called from `spark` or `dbutils` modules. This PR implements this change. ### Linked issues None ### Functionality None ### Tests - [x] added unit tests --------- Co-authored-by: Eric Vergnaud <eric.vergnaud@databricks.com>
1 parent 9a44fc1 commit 62d2e27

File tree

6 files changed

+82
-70
lines changed

6 files changed

+82
-70
lines changed

src/databricks/labs/ucx/source_code/linters/directfs.py

Lines changed: 16 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from abc import ABC
33
from collections.abc import Iterable
44

5-
from astroid import Attribute, Call, Const, InferenceError, JoinedStr, Name, NodeNG # type: ignore
5+
from astroid import Call, InferenceError, NodeNG # type: ignore
66
from sqlglot.expressions import Alter, Create, Delete, Drop, Expression, Identifier, Insert, Literal, Select
77

88
from databricks.labs.ucx.source_code.base import (
@@ -16,7 +16,7 @@
1616
DfsaSqlCollector,
1717
DirectFsAccess,
1818
)
19-
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor
19+
from databricks.labs.ucx.source_code.python.python_ast import Tree, TreeVisitor, TreeHelper
2020
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
2121
from databricks.labs.ucx.source_code.sql.sql_parser import SqlParser, SqlExpression
2222

@@ -68,43 +68,37 @@ class _DetectDirectFsAccessVisitor(TreeVisitor):
6868
def __init__(self, session_state: CurrentSessionState, prevent_spark_duplicates: bool) -> None:
6969
self._session_state = session_state
7070
self._directfs_nodes: list[DirectFsAccessNode] = []
71-
self._reported_locations: set[tuple[int, int]] = set()
7271
self._prevent_spark_duplicates = prevent_spark_duplicates
7372

7473
def visit_call(self, node: Call):
7574
for arg in node.args:
76-
self._visit_arg(arg)
75+
self._visit_arg(node, arg)
7776

78-
def _visit_arg(self, arg: NodeNG):
77+
def _visit_arg(self, call: Call, arg: NodeNG):
7978
try:
8079
for inferred in InferredValue.infer_from_node(arg, self._session_state):
8180
if not inferred.is_inferred():
8281
logger.debug(f"Could not infer value of {arg.as_string()}")
8382
continue
84-
self._check_str_constant(arg, inferred)
83+
self._check_str_arg(call, arg, inferred)
8584
except InferenceError as e:
8685
logger.debug(f"Could not infer value of {arg.as_string()}", exc_info=e)
8786

88-
def visit_const(self, node: Const):
89-
# Constant strings yield Advisories
90-
if isinstance(node.value, str):
91-
self._check_str_constant(node, InferredValue([node]))
92-
93-
def _check_str_constant(self, source_node: NodeNG, inferred: InferredValue):
94-
if self._already_reported(source_node, inferred):
95-
return
96-
# don't report on JoinedStr fragments
97-
if isinstance(source_node.parent, JoinedStr):
98-
return
87+
def _check_str_arg(self, call_node: Call, arg_node: NodeNG, inferred: InferredValue):
9988
value = inferred.as_string()
10089
for pattern in DIRECT_FS_ACCESS_PATTERNS:
10190
if not pattern.matches(value):
10291
continue
103-
# avoid false positives with relative URLs
104-
if self._is_http_call_parameter(source_node):
92+
# only capture 'open' calls or calls originating from spark or dbutils
93+
# because there is no other known way to manipulate data directly from file system
94+
tree = Tree(call_node)
95+
is_open = TreeHelper.get_call_name(call_node) == "open" and tree.is_builtin()
96+
is_from_db_utils = False if is_open else tree.is_from_module("dbutils")
97+
is_from_spark = False if is_open or is_from_db_utils else tree.is_from_module("spark")
98+
if not (is_open or is_from_db_utils or is_from_spark):
10599
return
106100
# avoid duplicate advices that are reported by SparkSqlPyLinter
107-
if self._prevent_spark_duplicates and Tree(source_node).is_from_module("spark"):
101+
if self._prevent_spark_duplicates and is_from_spark:
108102
return
109103
# since we're normally filtering out spark calls, we're dealing with dfsas we know little about
110104
# notably we don't know is_read or is_write
@@ -113,39 +107,8 @@ def _check_str_constant(self, source_node: NodeNG, inferred: InferredValue):
113107
is_read=True,
114108
is_write=False,
115109
)
116-
self._directfs_nodes.append(DirectFsAccessNode(dfsa, source_node))
117-
self._reported_locations.add((source_node.lineno, source_node.col_offset))
118-
119-
@classmethod
120-
def _is_http_call_parameter(cls, source_node: NodeNG):
121-
if not isinstance(source_node.parent, Call):
122-
return False
123-
# for now we only cater for ws.api_client.do
124-
return cls._is_ws_api_client_do_call(source_node)
125-
126-
@classmethod
127-
def _is_ws_api_client_do_call(cls, source_node: NodeNG):
128-
assert isinstance(source_node.parent, Call)
129-
func = source_node.parent.func
130-
if not isinstance(func, Attribute) or func.attrname != "do":
131-
return False
132-
expr = func.expr
133-
if not isinstance(expr, Attribute) or expr.attrname != "api_client":
134-
return False
135-
expr = expr.expr
136-
if not isinstance(expr, Name):
137-
return False
138-
for value in InferredValue.infer_from_node(expr):
139-
if not value.is_inferred():
140-
continue
141-
for node in value.nodes:
142-
return Tree(node).is_instance_of("WorkspaceClient")
143-
# at this point is seems safer to assume that expr.expr is a workspace than the opposite
144-
return True
145-
146-
def _already_reported(self, source_node: NodeNG, inferred: InferredValue):
147-
all_nodes = [source_node] + inferred.nodes
148-
return any((node.lineno, node.col_offset) in self._reported_locations for node in all_nodes)
110+
self._directfs_nodes.append(DirectFsAccessNode(dfsa, arg_node))
111+
return
149112

150113
@property
151114
def directfs_nodes(self):

src/databricks/labs/ucx/source_code/python/python_ast.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
from __future__ import annotations
22

3+
import builtins
4+
import sys
35
from abc import ABC
46
import logging
57
import re
@@ -25,7 +27,6 @@
2527
)
2628

2729
logger = logging.getLogger(__name__)
28-
2930
missing_handlers: set[str] = set()
3031

3132

@@ -289,6 +290,16 @@ def renumber_node(node: NodeNG, offset: int) -> None:
289290
start = start + num_lines if start > 0 else start - num_lines
290291
return self
291292

293+
def is_builtin(self) -> bool:
294+
if isinstance(self._node, Name):
295+
name = self._node.name
296+
return name in dir(builtins) or name in sys.stdlib_module_names or name in sys.builtin_module_names
297+
if isinstance(self._node, Call):
298+
return Tree(self._node.func).is_builtin()
299+
if isinstance(self._node, Attribute):
300+
return Tree(self._node.expr).is_builtin()
301+
return False # not supported yet
302+
292303

293304
class _LocalTree(Tree):
294305

@@ -298,6 +309,17 @@ def is_from_module_visited(self, name: str, visited_nodes: set[NodeNG]) -> bool:
298309

299310
class TreeHelper(ABC):
300311

312+
@classmethod
313+
def get_call_name(cls, call: Call) -> str:
314+
if not isinstance(call, Call):
315+
return ""
316+
func = call.func
317+
if isinstance(func, Name):
318+
return func.name
319+
if isinstance(func, Attribute):
320+
return func.attrname
321+
return "" # not supported yet
322+
301323
@classmethod
302324
def extract_call_by_name(cls, call: Call, name: str) -> Call | None:
303325
"""Given a call-chain, extract its sub-call by method name (if it has one)"""

tests/unit/source_code/linters/test_directfs.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,14 @@ def test_matches_dfsa_pattern(path, matches):
2929
"code, expected",
3030
[
3131
('SOME_CONSTANT = "not a file system path"', 0),
32-
('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 3),
32+
('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/")', 0),
3333
('# "/dbfs/mnt"', 0),
34-
('SOME_CONSTANT = "/dbfs/mnt"', 1),
35-
('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 1),
34+
('SOME_CONSTANT = "/dbfs/mnt"', 0),
35+
('SOME_CONSTANT = "/dbfs/mnt"; load_data(SOME_CONSTANT)', 0),
36+
('SOME_CONSTANT = "/dbfs/mnt"; spark.table(SOME_CONSTANT)', 1),
37+
('SOME_CONSTANT = ("/dbfs/mnt", "dbfs:/", "/mnt/"); [dbutils.fs(path) for path in SOME_CONSTANT]', 3),
3638
('SOME_CONSTANT = 42; load_data(SOME_CONSTANT)', 0),
39+
('SOME_CONSTANT = "/dbfs/mnt"; dbutils.fs(SOME_CONSTANT)', 1),
3740
],
3841
)
3942
def test_detects_dfsa_paths(code, expected):
@@ -47,9 +50,14 @@ def test_detects_dfsa_paths(code, expected):
4750
@pytest.mark.parametrize(
4851
"code, expected",
4952
[
50-
("load_data('/dbfs/mnt/data')", 1),
51-
("load_data('/data')", 1),
52-
("load_data('/dbfs/mnt/data', '/data')", 2),
53+
("load_data('/dbfs/mnt/data')", 0),
54+
(
55+
"""with open('/dbfs/mnt/data') as f:
56+
f.read()""",
57+
1,
58+
),
59+
("dbutils.fs('/data')", 1),
60+
("dbutils.fs('/dbfs/mnt/data', '/data')", 2),
5361
("# load_data('/dbfs/mnt/data', '/data')", 0),
5462
('spark.read.parquet("/mnt/foo/bar")', 1),
5563
('spark.read.parquet("dbfs:/mnt/foo/bar")', 1),

tests/unit/source_code/python/test_python_ast.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,35 @@ def test_counts_lines(source: str, line_count: int):
235235
assert tree.line_count() == line_count
236236

237237

238+
@pytest.mark.parametrize(
239+
"source, name, is_builtin",
240+
[
241+
("x = open()", "open", True),
242+
("import datetime; x = datetime.datetime.now()", "now", True),
243+
("import stuff; x = stuff()", "stuff", False),
244+
(
245+
"""def stuff():
246+
pass
247+
x = stuff()""",
248+
"stuff",
249+
False,
250+
),
251+
],
252+
)
253+
def test_is_builtin(source, name, is_builtin):
254+
tree = Tree.normalize_and_parse(source)
255+
nodes = list(tree.node.get_children())
256+
for node in nodes:
257+
if isinstance(node, Assign):
258+
call = node.value
259+
assert isinstance(call, Call)
260+
func_name = TreeHelper.get_call_name(call)
261+
assert func_name == name
262+
assert Tree(call).is_builtin() == is_builtin
263+
return
264+
assert False # could not locate call
265+
266+
238267
def test_first_statement_is_none():
239268
node = Const("xyz")
240269
assert not Tree(node).first_statement()

tests/unit/source_code/samples/functional/file-access/complex-sql-notebook.sql

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,12 @@
55
-- COMMAND ----------
66
-- DBTITLE 1,A Python cell that references DBFS
77
-- MAGIC %python
8-
-- ucx[direct-filesystem-access:+1:7:+1:18] The use of direct filesystem references is deprecated: dbfs:/...
98
-- MAGIC DBFS = "dbfs:/..."
10-
-- ucx[direct-filesystem-access:+1:7:+1:18] The use of direct filesystem references is deprecated: /dbfs/mnt
119
-- MAGIC DBFS = "/dbfs/mnt"
12-
-- ucx[direct-filesystem-access:+1:7:+1:14] The use of direct filesystem references is deprecated: /mnt/
1310
-- MAGIC DBFS = "/mnt/"
14-
-- ucx[direct-filesystem-access:+1:7:+1:18] The use of direct filesystem references is deprecated: dbfs:/...
1511
-- MAGIC DBFS = "dbfs:/..."
16-
-- ucx[direct-filesystem-access:+1:10:+1:26] The use of direct filesystem references is deprecated: /dbfs/mnt/data
1712
-- MAGIC load_data('/dbfs/mnt/data')
18-
-- ucx[direct-filesystem-access:+1:10:+1:17] The use of direct filesystem references is deprecated: /data
1913
-- MAGIC load_data('/data')
20-
-- ucx[direct-filesystem-access:+2:10:+2:26] The use of direct filesystem references is deprecated: /dbfs/mnt/data
21-
-- ucx[direct-filesystem-access:+1:28:+1:35] The use of direct filesystem references is deprecated: /data
2214
-- MAGIC load_data('/dbfs/mnt/data', '/data')
2315
-- MAGIC # load_data('/dbfs/mnt/data', '/data')
2416
-- ucx[direct-filesystem-access:+1:0:+1:34] The use of direct filesystem references is deprecated: /mnt/foo/bar

tests/unit/source_code/samples/functional/file-access/direct-fs3.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
1-
# ucx[direct-filesystem-access:+1:6:+1:26] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1
21
DBFS1="dbfs:/mnt/foo/bar1"
3-
# ucx[direct-filesystem-access:+1:16:+1:36] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar2
42
systems=[DBFS1, "dbfs:/mnt/foo/bar2"]
53
for system in systems:
64
# ucx[direct-filesystem-access:+2:4:+2:30] The use of direct filesystem references is deprecated: dbfs:/mnt/foo/bar1

0 commit comments

Comments
 (0)