Skip to content

Commit 9cea6a6

Browse files
authored
Remove the linting false positive for missing table format warning when using spark.table (#3589)
## Changes Remove the linting false positive for missing table format warning when using `spark.table` ### Linked issues Resolves #3545 ### Functionality - [x] modified linting related logic ### Tests - [x] modified unit tests
1 parent c64de1a commit 9cea6a6

17 files changed

+29
-58
lines changed

src/databricks/labs/ucx/source_code/linters/table_creation.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def lint(self, node: NodeNG) -> Iterator[Advice]:
9797

9898
class DBRv8d0PyLinter(PythonLinter):
9999
"""Performs Python linting for backwards incompatible changes in DBR version 8.0.
100+
100101
Specifically, it yields advice for table-creation with implicit format.
101102
"""
102103

@@ -106,10 +107,13 @@ def __init__(self, dbr_version: tuple[int, int] | None):
106107
version_cutoff = (8, 0)
107108
self._skip_dbr = dbr_version is not None and dbr_version >= version_cutoff
108109

110+
# A more precise match would check if the method names come from their respective parent classes. However, given
111+
# the (current) uniqueness of the names within the Spark module it is not required (yet).
109112
self._linter = NoFormatPythonLinter(
110113
[
114+
# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.writeTo.html
111115
NoFormatPythonMatcher("writeTo", 1, 1),
112-
NoFormatPythonMatcher("table", 1, 1),
116+
# https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.saveAsTable.html
113117
NoFormatPythonMatcher("saveAsTable", 1, 4, 2, "format"),
114118
]
115119
)

tests/unit/source_code/linters/test_files.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import pytest
55
from databricks.labs.blueprint.tui import MockPrompts
66

7-
from databricks.labs.ucx.source_code.base import CurrentSessionState, LocatedAdvice, Advice
7+
from databricks.labs.ucx.source_code.base import CurrentSessionState
88
from databricks.labs.ucx.source_code.graph import DependencyResolver, SourceContainer
99
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader
1010
from databricks.labs.ucx.source_code.notebooks.migrator import NotebookMigrator
@@ -147,19 +147,7 @@ def test_linter_lints_children_in_context(mock_path_lookup, local_code_linter) -
147147
paths: set[Path] = set()
148148
advices = list(local_code_linter.lint_path(path, paths))
149149
assert len(paths) == 3
150-
assert advices == [
151-
LocatedAdvice(
152-
advice=Advice(
153-
code='default-format-changed-in-dbr8',
154-
message='The default format changed in Databricks Runtime 8.0, from Parquet to Delta',
155-
start_line=3,
156-
start_col=0,
157-
end_line=3,
158-
end_col=33,
159-
),
160-
path=path / "child.py",
161-
)
162-
]
150+
assert not advices
163151

164152

165153
def test_triple_dot_import() -> None:

tests/unit/source_code/linters/test_table_creation.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88

99
METHOD_NAMES = [
1010
"writeTo",
11-
"table",
1211
"saveAsTable",
1312
]
1413
ASSIGN = [True, False]
@@ -80,18 +79,21 @@ def test_no_format(migration_index, method_name, assign) -> None:
8079

8180

8281
@pytest.mark.parametrize(
83-
"params",
82+
"statement",
8483
[
85-
{"stmt": 'spark.foo().bar().table().baz()', "expected": False},
86-
{"stmt": 'spark.foo().bar().table("catalog.db.table").baz()', "expected": True},
87-
{"stmt": 'spark.foo().bar().table("catalog.db.table", "xyz").baz()', "expected": False},
88-
{"stmt": 'spark.foo().bar().table("catalog.db.table", fmt="xyz").baz()', "expected": False},
84+
"spark.foo().bar().table().baz()",
85+
"spark.foo().bar().table('catalog.db.table').baz()",
86+
"spark.foo().bar().table('catalog.db.table', 'xyz').baz()",
87+
"spark.foo().bar().table('catalog.db.table', fmt='xyz').baz()",
8988
],
9089
)
91-
def test_no_format_args_count(migration_index, params) -> None:
92-
"""Tests that the number of arguments to table creation call is considered in matching"""
93-
old_code = get_code(False, params["stmt"])
94-
assert (not params["expected"]) == (not lint(old_code))
90+
def test_reading_table_yields_no_advice(statement: str) -> None:
91+
"""Tests that reading a table with `.table()` does not yield an advice.
92+
93+
Regression test kept for false positive advice on default table format change when reading a table.
94+
"""
95+
old_code = get_code(False, statement)
96+
assert not lint(old_code)
9597

9698

9799
@pytest.mark.parametrize("assign", ASSIGN)
@@ -118,7 +120,7 @@ def test_has_format_arg_none(migration_index, assign) -> None:
118120
@pytest.mark.parametrize("dbr_version", DBR_VERSIONS)
119121
def test_dbr_version_filter(migration_index, dbr_version) -> None:
120122
"""Tests the DBR version cutoff filter"""
121-
old_code = get_code(False, 'spark.foo().bar().table("catalog.db.table").baz()')
122-
expected = [] if dbr_version["suppress"] else [get_advice(False, 'table', 18)]
123+
old_code = get_code(False, 'spark.foo().bar().writeTo("catalog.db.table").baz()')
124+
expected = [] if dbr_version["suppress"] else [get_advice(False, 'writeTo', 18)]
123125
actual = lint(old_code, dbr_version["version"])
124126
assert actual == expected
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Databricks notebook source
22

3-
# ucx[cannot-autofix-table-reference:+2:0:+2:33] Can't migrate 'spark.table(f'{some_table_name}')' because its table name argument cannot be computed
4-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
3+
# ucx[cannot-autofix-table-reference:+1:0:+1:33] Can't migrate 'spark.table(f'{some_table_name}')' because its table name argument cannot be computed
54
spark.table(f"{some_table_name}")
65
x = 2
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
# Databricks notebook source
22

3-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
spark.table(f"{some_table_name}")

tests/unit/source_code/samples/functional/es-1285042.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
import pyspark.sql.functions as F
22

3-
# ucx[default-format-changed-in-dbr8:+1:17:+1:41] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
churn_features = spark.table("something")
54
churn_features = (churn_features.withColumn("random", F.rand(seed=42)).withColumn("split",F.when(F.col("random") < train_ratio, "train").when(F.col("random") < train_ratio + val_ratio, "validate").otherwise("test")).drop("random"))
65

76
# ucx[default-format-changed-in-dbr8:+1:1:+1:109] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
87
(churn_features.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("mlops_churn_training"))
98

10-
# ucx[default-format-changed-in-dbr8:+1:21:+1:74] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
119
sdf_system_columns = spark.read.table("system.information_schema.columns")
1210

1311
# ucx[sql-parse-error:+1:14:+1:140] SQL expression is not supported yet: SELECT 1 AS col1, 2 AS col2, 3 AS col3 FROM {sdf_system_columns} LIMIT 5

tests/unit/source_code/samples/functional/pyspark/spark-table.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,11 @@
33
for i in range(10):
44

55
## Check a literal reference to a known table that is migrated.
6-
# ucx[table-migrated-to-uc:+3:9:+3:34] Table old.things is migrated to brand.new.stuff in Unity Catalog
7-
# TODO: Fix false positive:
8-
# ucx[default-format-changed-in-dbr8:+1:9:+1:34] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
6+
# ucx[table-migrated-to-uc:+1:9:+1:34] Table old.things is migrated to brand.new.stuff in Unity Catalog
97
df = spark.table("old.things")
108
do_stuff_with(df)
119

1210
## Check a literal reference to an unknown table (that is not migrated); we expect no warning.
13-
# TODO: Fix false positive:
14-
# ucx[default-format-changed-in-dbr8:+1:9:+1:51] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
1511
df = spark.table("table.we.know.nothing.about")
1612
do_stuff_with(df)
1713

@@ -20,14 +16,10 @@
2016
do_stuff_with(df)
2117

2218
## Some calls that use a variable whose value is unknown: they could potentially reference a migrated table.
23-
# ucx[cannot-autofix-table-reference:+3:9:+3:26] Can't migrate 'spark.table(name)' because its table name argument cannot be computed
24-
# TODO: Fix false positive:
25-
# ucx[default-format-changed-in-dbr8:+1:9:+1:26] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
19+
# ucx[cannot-autofix-table-reference:+1:9:+1:26] Can't migrate 'spark.table(name)' because its table name argument cannot be computed
2620
df = spark.table(name)
2721
do_stuff_with(df)
28-
# ucx[cannot-autofix-table-reference:+3:9:+3:36] Can't migrate 'spark.table(f'boop{stuff}')' because its table name argument cannot be computed
29-
# TODO: Fix false positive:
30-
# ucx[default-format-changed-in-dbr8:+1:9:+1:36] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
22+
# ucx[cannot-autofix-table-reference:+1:9:+1:36] Can't migrate 'spark.table(f'boop{stuff}')' because its table name argument cannot be computed
3123
df = spark.table(f"boop{stuff}")
3224
do_stuff_with(df)
3325

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
# Databricks notebook source
22

3-
# ucx[default-format-changed-in-dbr8:+1:0:+1:33] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
43
spark.table(f"{some_table_name}")

tests/unit/source_code/samples/functional/table-access.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
# Databricks notebook source
2-
# ucx[default-format-changed-in-dbr8:+1:0:+1:18] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
32
spark.table("a.b").count()
43
spark.sql("SELECT * FROM b.c LEFT JOIN c.d USING (e)")
54
%sql SELECT * FROM b.c LEFT JOIN c.d USING (e)

tests/unit/source_code/samples/functional/table-access.sql

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ SELECT * FROM b.c LEFT JOIN c.d USING (e)
55
-- COMMAND ----------
66

77
-- MAGIC %python
8-
-- ucx[default-format-changed-in-dbr8:+1:0:+1:18] The default format changed in Databricks Runtime 8.0, from Parquet to Delta
98
-- MAGIC spark.table("a.b").count()
109
-- MAGIC spark.sql("SELECT * FROM b.c LEFT JOIN c.d USING (e)")
1110

0 commit comments

Comments
 (0)