Skip to content

Commit 9cea6a6

Browse files
authored
Remove the linting false positive for missing table format warning when using spark.table (#3589)
## Changes Remove the linting false positive for missing table format warning when using `spark.table` ### Linked issues Resolves #3545 ### Functionality - [x] modified linting related logic ### Tests - [x] modified unit tests
1 parent c64de1a commit 9cea6a6

17 files changed

+29
-58
lines changed

src/databricks/labs/ucx/source_code/linters/table_creation.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def lint(self, node: NodeNG) -> Iterator[Advice]:
9797

9898
class DBRv8d0PyLinter(PythonLinter):
9999
"""Performs Python linting for backwards incompatible changes in DBR version 8.0.
100+
100101
Specifically, it yields advice for table-creation with implicit format.
101102
"""
102103

@@ -106,10 +107,13 @@ def __init__(self, dbr_version: tuple[int, int] | None):
106107
version_cutoff = (8, 0)
107108
self._skip_dbr = dbr_version is not None and dbr_version >= version_cutoff
108109

110+
# A more precise match would check if the method names come from their respective parent classes. However, given
111+
# the (current) uniqueness of the names within the Spark module it is not required (yet).
109112
self._linter = NoFormatPythonLinter(
110113
[
114+
# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.writeTo.html
111115
NoFormatPythonMatcher("writeTo", 1, 1),
112-
NoFormatPythonMatcher("table", 1, 1),
116+
# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.saveAsTable.html
113117
NoFormatPythonMatcher("saveAsTable", 1, 4, 2, "format"),
114118
]
115119
)

tests/unit/source_code/linters/test_files.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pytest
55
from databricks.labs.blueprint.tui import MockPrompts
66

7-
from databricks.labs.ucx.source_code.base import CurrentSessionState, LocatedAdvice, Advice
7+
from databricks.labs.ucx.source_code.base import CurrentSessionState
88
from databricks.labs.ucx.source_code.graph import DependencyResolver, SourceContainer
99
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader
1010
from databricks.labs.ucx.source_code.notebooks.migrator import NotebookMigrator
@@ -147,19 +147,7 @@ def test_linter_lints_children_in_context(mock_path_lookup, local_code_linter) -
147147
paths: set[Path] = set()
148148
advices = list(local_code_linter.lint_path(path, paths))
149149
assert len(paths) == 3
150-
assert advices == [
151-
LocatedAdvice(
152-
advice=Advice(
153-
code='default-format-changed-in-dbr8',
154-
message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta',
155-
start_line=3,
156-
start_col=0,
157-
end_line=3,
158-
end_col=33,
159-
),
160-
path=path / "child.py",
161-
)
162-
]
150+
assert not advices
163151

164152

165153
def test_triple_dot_import() -> None:

tests/unit/source_code/linters/test_table_creation.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
METHOD_NAMES = [
1010
"writeTo",
11-
"table",
1211
"saveAsTable",
1312
]
1413
ASSIGN = [True, False]
@@ -80,18 +79,21 @@ def test_no_format(migration_index, method_name, assign) -> None:
8079

8180

8281
@pytest.mark.parametrize(
83-
"params",
82+
"statement",
8483
[
85-
{"stmt": 'spark.foo().bar().table().baz()', "expected": False},
86-
{"stmt": 'spark.foo().bar().table("catalog.db.table").baz()', "expected": True},
87-
{"stmt": 'spark.foo().bar().table("catalog.db.table", "xyz").baz()', "expected": False},
88-
{"stmt": 'spark.foo().bar().table("catalog.db.table", fmt="xyz").baz()', "expected": False},
84+
"spark.foo().bar().table().baz()",
85+
"spark.foo().bar().table('catalog.db.table').baz()",
86+
"spark.foo().bar().table('catalog.db.table', 'xyz').baz()",
87+
"spark.foo().bar().table('catalog.db.table', fmt='xyz').baz()",
8988
],
9089
)
91-
def test_no_format_args_count(migration_index, params) -> None:
92-
"""Tests that the number of arguments to table creation call is considered in matching"""
93-
old_code = get_code(False, params["stmt"])
94-
assert (not params["expected"]) == (not lint(old_code))
90+
def test_reading_table_yields_no_advice(statement: str) -> None:
91+
"""Tests that reading a table with `.table()` does not yield an advice.
92+
93+
Regression test kept for false positive advice on default table format change when reading a table.
94+
"""
95+
old_code = get_code(False, statement)
96+
assert not lint(old_code)
9597

9698

9799
@pytest.mark.parametrize("assign", ASSIGN)
@@ -118,7 +120,7 @@ def test_has_format_arg_none(migration_index, assign) -> None:
118120
@pytest.mark.parametrize("dbr_version", DBR_VERSIONS)
119121
def test_dbr_version_filter(migration_index, dbr_version) -> None:
120122
"""Tests the DBR version cutoff filter"""
121-
old_code = get_code(False, 'spark.foo().bar().table("catalog.db.table").baz()')
122-
expected = [] if dbr_version["suppress"] else [get_advice(False, 'table', 18)]
123+
old_code = get_code(False, 'spark.foo().bar().writeTo("catalog.db.table").baz()')
124+
expected = [] if dbr_version["suppress"] else [get_advice(False, 'writeTo', 18)]
123125
actual = lint(old_code, dbr_version["version"])
124126
assert actual == expected
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Databricks notebook source
22

3-
# ucx[cannot-autofix-table-reference:+2:0:+2:33] Can't migrate 'spark.table(f'{some_table_name}')' because its table name argument cannot be computed
4-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
3+
# ucx[cannot-autofix-table-reference:+1:0:+1:33] Can't migrate 'spark.table(f'{some_table_name}')' because its table name argument cannot be computed
54
spark.table(f"{some_table_name}")
65
x = 2
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
# Databricks notebook source
22

3-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
spark.table(f"{some_table_name}")

tests/unit/source_code/samples/functional/es-1285042.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import pyspark.sql.functions as F
22

3-
# ucx[default-format-changed-in-dbr8:+1:17:+1:41] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
churn_features = spark.table("something")
54
churn_features = (churn_features.withColumn("random", F.rand(seed=42)).withColumn("split",F.when(F.col("random") < train_ratio, "train").when(F.col("random") < train_ratio + val_ratio, "validate").otherwise("test")).drop("random"))
65

76
# ucx[default-format-changed-in-dbr8:+1:1:+1:109] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
87
(churn_features.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("mlops_churn_training"))
98

10-
# ucx[default-format-changed-in-dbr8:+1:21:+1:74] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
119
sdf_system_columns = spark.read.table("system.information_schema.columns")
1210

1311
# ucx[sql-parse-error:+1:14:+1:140] SQL expression is not supported yet: SELECT 1 AS col1, 2 AS col2, 3 AS col3 FROM {sdf_system_columns} LIMIT 5

tests/unit/source_code/samples/functional/pyspark/spark-table.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,11 @@
33
for i in range(10):
44

55
## Check a literal reference to a known table that is migrated.
6-
# ucx[table-migrated-to-uc:+3:9:+3:34] Table old.things is migrated to brand.new.stuff in Unity Catalog
7-
# TODO: Fix false positive:
8-
# ucx[default-format-changed-in-dbr8:+1:9:+1:34] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
6+
# ucx[table-migrated-to-uc:+1:9:+1:34] Table old.things is migrated to brand.new.stuff in Unity Catalog
97
df = spark.table("old.things")
108
do_stuff_with(df)
119

1210
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
13-
# TODO: Fix false positive:
14-
# ucx[default-format-changed-in-dbr8:+1:9:+1:51] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
1511
df = spark.table("table.we.know.nothing.about")
1612
do_stuff_with(df)
1713

@@ -20,14 +16,10 @@
2016
do_stuff_with(df)
2117

2218
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
23-
# ucx[cannot-autofix-table-reference:+3:9:+3:26] Can't migrate 'spark.table(name)' because its table name argument cannot be computed
24-
# TODO: Fix false positive:
25-
# ucx[default-format-changed-in-dbr8:+1:9:+1:26] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
19+
# ucx[cannot-autofix-table-reference:+1:9:+1:26] Can't migrate 'spark.table(name)' because its table name argument cannot be computed
2620
df = spark.table(name)
2721
do_stuff_with(df)
28-
# ucx[cannot-autofix-table-reference:+3:9:+3:36] Can't migrate 'spark.table(f'boop{stuff}')' because its table name argument cannot be computed
29-
# TODO: Fix false positive:
30-
# ucx[default-format-changed-in-dbr8:+1:9:+1:36] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
22+
# ucx[cannot-autofix-table-reference:+1:9:+1:36] Can't migrate 'spark.table(f'boop{stuff}')' because its table name argument cannot be computed
3123
df = spark.table(f"boop{stuff}")
3224
do_stuff_with(df)
3325

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
# Databricks notebook source
22

3-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
spark.table(f"{some_table_name}")

tests/unit/source_code/samples/functional/table-access.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Databricks notebook source
2-
# ucx[default-format-changed-in-dbr8:+1:0:+1:18] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
32
spark.table("a.b").count()
43
spark.sql("SELECT * FROM b.c LEFT JOIN c.d USING (e)")
54
%sql SELECT * FROM b.c LEFT JOIN c.d USING (e)

tests/unit/source_code/samples/functional/table-access.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ SELECT * FROM b.c LEFT JOIN c.d USING (e)
55
-- COMMAND ----------
66

77
-- MAGIC %python
8-
-- ucx[default-format-changed-in-dbr8:+1:0:+1:18] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
98
-- MAGIC spark.table("a.b").count()
109
-- MAGIC spark.sql("SELECT * FROM b.c LEFT JOIN c.d USING (e)")
1110

0 commit comments

Comments
 (0)