Skip to content

Commit 1054e35

Browse files
authored
Fixed snapshot loading for DFSA and used-table crawlers (#3046)
## Changes

This PR fixes an issue with the DFSA and used-table crawlers that could prevent loading of the snapshots. When loading, they convert the rows to dictionaries using `.as_dict()`, which isn't available on rows provided by the Spark-based lsql backend. Instead, `.asDict()` needs to be used.

Incidental changes:
- An existing integration test was updated to also test snapshot loading for these crawlers.
- Another test was renamed to fix a typo in its name.

### Linked issues

Relates to #3036, #3039.

### Tests

- existing unit tests
- existing integration tests
1 parent 6165454 commit 1054e35

File tree

4 files changed

+11
-5
lines changed

4 files changed

+11
-5
lines changed

src/databricks/labs/ucx/source_code/directfs_access.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ def dump_all(self, dfsas: Sequence[DirectFsAccess]) -> None:
4848
def _try_fetch(self) -> Iterable[DirectFsAccess]:
4949
sql = f"SELECT * FROM {escape_sql_identifier(self.full_name)}"
5050
for row in self._backend.fetch(sql):
51-
yield self._klass.from_dict(row.as_dict())
51+
yield self._klass.from_dict(row.asDict())
5252

5353
def _crawl(self) -> Iterable[DirectFsAccess]:
5454
return []

src/databricks/labs/ucx/source_code/used_table.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ def dump_all(self, tables: Sequence[UsedTable]) -> None:
4747
def _try_fetch(self) -> Iterable[UsedTable]:
4848
sql = f"SELECT * FROM {escape_sql_identifier(self.full_name)}"
4949
for row in self._backend.fetch(sql):
50-
yield self._klass.from_dict(row.as_dict())
50+
yield self._klass.from_dict(row.asDict())
5151

5252
def _crawl(self) -> Iterable[UsedTable]:
5353
return []

tests/integration/source_code/test_jobs.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,17 +36,23 @@
3636
def test_linter_from_context(simple_ctx, make_job) -> None:
3737
# This code is similar to test_running_real_workflow_linter_job, but it's executed on the caller side and is easier
3838
# to debug.
39-
# Ensure we have at least 1 job that fails
40-
job = make_job(content="import xyz")
39+
# Ensure we have at least 1 job that fails: "Deprecated file system path in call to: /mnt/things/e/f/g"
40+
job = make_job(content="spark.read.table('a_table').write.csv('/mnt/things/e/f/g')\n")
4141
simple_ctx.config.include_job_ids = [job.job_id]
4242
simple_ctx.workflow_linter.refresh_report(simple_ctx.sql_backend, simple_ctx.inventory_database)
4343

44+
# Verify that the 'problems' table has content.
4445
cursor = simple_ctx.sql_backend.fetch(
4546
f"SELECT COUNT(*) AS count FROM {simple_ctx.inventory_database}.workflow_problems"
4647
)
4748
result = next(cursor)
4849
assert result['count'] > 0
4950

51+
# Verify that the other data produced snapshot can be loaded.
52+
dfsa_records = simple_ctx.directfs_access_crawler_for_paths.snapshot()
53+
used_table_records = simple_ctx.used_tables_crawler_for_paths.snapshot()
54+
assert dfsa_records and used_table_records
55+
5056

5157
def test_job_linter_no_problems(simple_ctx, make_job) -> None:
5258
j = make_job()

tests/unit/source_code/test_queries.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -39,7 +39,7 @@ def test_query_linter_collects_dfsas_from_queries(name, query, dfsa_paths, is_re
3939
assert all(dfsa.is_write == is_write for dfsa in dfsas)
4040

4141

42-
def test_query_liner_refresh_report_writes_query_problems(migration_index, mock_backend) -> None:
42+
def test_query_linter_refresh_report_writes_query_problems(migration_index, mock_backend) -> None:
4343
ws = create_autospec(WorkspaceClient)
4444
dfsa_crawler = create_autospec(DirectFsAccessCrawler)
4545
used_tables_crawler = create_autospec(UsedTablesCrawler)

0 commit comments

Comments (0)