From 1599803c021bf2501fddffc97362eeaf62694000 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 10:12:34 +0200 Subject: [PATCH 1/7] Lint cached property --- src/databricks/labs/ucx/contexts/application.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 95944a3d2a..53e936de83 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -422,7 +422,7 @@ def dependency_resolver(self): ) @cached_property - def workflow_linter(self): + def workflow_linter(self) -> WorkflowLinter: return WorkflowLinter( self.workspace_client, self.dependency_resolver, From 3a17a564130bf8587b0b1801bca08229f9061abf Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 10:19:56 +0200 Subject: [PATCH 2/7] Add missing type hints to UsedTablesCrawler --- src/databricks/labs/ucx/source_code/used_table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/used_table.py b/src/databricks/labs/ucx/source_code/used_table.py index 0b77897832..f5cb9be08a 100644 --- a/src/databricks/labs/ucx/source_code/used_table.py +++ b/src/databricks/labs/ucx/source_code/used_table.py @@ -16,14 +16,14 @@ class UsedTablesCrawler(CrawlerBase[UsedTable]): @classmethod - def for_paths(cls, backend: SqlBackend, schema) -> UsedTablesCrawler: + def for_paths(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: return UsedTablesCrawler(backend, schema, "used_tables_in_paths") @classmethod - def for_queries(cls, backend: SqlBackend, schema) -> UsedTablesCrawler: + def for_queries(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: return UsedTablesCrawler(backend, schema, "used_tables_in_queries") - def __init__(self, backend: SqlBackend, schema: str, table: str): + def __init__(self, backend: SqlBackend, schema: str, table: str) -> None: """ Initializes a DFSACrawler instance. @@ -33,7 +33,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): """ super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=UsedTable) - def dump_all(self, tables: Sequence[UsedTable]): + def dump_all(self, tables: Sequence[UsedTable]) -> None: """This crawler doesn't follow the pull model because the fetcher fetches data for 3 crawlers, not just one It's not **bad** because all records are pushed at once. Providing a multi-entity crawler is out-of-scope of this PR From ac2fefe5c578d0acb539c6e3381678beae0e6e19 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 10:20:16 +0200 Subject: [PATCH 3/7] Move __init__ to top of class --- .../labs/ucx/source_code/used_table.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/used_table.py b/src/databricks/labs/ucx/source_code/used_table.py index f5cb9be08a..5b45a96864 100644 --- a/src/databricks/labs/ucx/source_code/used_table.py +++ b/src/databricks/labs/ucx/source_code/used_table.py @@ -15,14 +15,6 @@ class UsedTablesCrawler(CrawlerBase[UsedTable]): - @classmethod - def for_paths(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: - return UsedTablesCrawler(backend, schema, "used_tables_in_paths") - - @classmethod - def for_queries(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: - return UsedTablesCrawler(backend, schema, "used_tables_in_queries") - def __init__(self, backend: SqlBackend, schema: str, table: str) -> None: """ Initializes a DFSACrawler instance. @@ -33,6 +25,14 @@ def __init__(self, backend: SqlBackend, schema: str, table: str) -> None: """ super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=UsedTable) + @classmethod + def for_paths(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: + return UsedTablesCrawler(backend, schema, "used_tables_in_paths") + + @classmethod + def for_queries(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: + return UsedTablesCrawler(backend, schema, "used_tables_in_queries") + def dump_all(self, tables: Sequence[UsedTable]) -> None: """This crawler doesn't follow the pull model because the fetcher fetches data for 3 crawlers, not just one It's not **bad** because all records are pushed at once. From f334cfac25f4c6a149c4df3e1f3b89a50fe9d16d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 10:21:02 +0200 Subject: [PATCH 4/7] Add used tables crawlers for paths and queries to global context --- src/databricks/labs/ucx/contexts/application.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 53e936de83..6fad07cd54 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -17,6 +17,7 @@ from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver +from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.service import sql @@ -485,6 +486,14 @@ def migration_recon(self): self.config.recon_tolerance_percent, ) + @cached_property + def used_tables_crawler_for_paths(self) -> UsedTablesCrawler: + return UsedTablesCrawler.for_paths(self.sql_backend, self.config.ucx_catalog) + + @cached_property + def used_tables_crawler_for_queries(self) -> UsedTablesCrawler: + return UsedTablesCrawler.for_paths(self.sql_backend, self.config.ucx_catalog) + class CliContext(GlobalContext, abc.ABC): @cached_property From b3590b51eb620fdc29e11386b56b2dde8daf4eaa Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 10:23:18 +0200 Subject: [PATCH 5/7] Pass used tables crawlers for paths to workflow linter --- src/databricks/labs/ucx/contexts/application.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 6fad07cd54..e1bdf3c894 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -430,6 +430,7 @@ def workflow_linter(self) -> WorkflowLinter: self.path_lookup, TableMigrationIndex([]), # TODO: bring back self.tables_migrator.index() self.directfs_access_crawler_for_paths, + self.used_tables_crawler_for_paths, self.config.include_job_ids, ) From a76d7fef224cadd37f5a35b8bd5a7bbd57b7c940 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 14:22:54 +0200 Subject: [PATCH 6/7] Improve test coverage --- tests/unit/contexts/test_application.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/unit/contexts/test_application.py b/tests/unit/contexts/test_application.py index d283a784ff..5f0448f09f 100644 --- a/tests/unit/contexts/test_application.py +++ b/tests/unit/contexts/test_application.py @@ -1,6 +1,7 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.contexts.workspace_cli import LocalCheckoutContext @@ -11,18 +12,28 @@ @pytest.mark.parametrize( - "attribute", ["dependency_resolver", "pip_resolver", "site_packages_path", "notebook_loader", "folder_loader"] + "attribute", + [ + "dependency_resolver", + "pip_resolver", + "site_packages_path", + "notebook_loader", + "folder_loader", + "workflow_linter", + "used_tables_crawler_for_paths", + "used_tables_crawler_for_queries", + ], ) -def test_global_context_attributes_not_none(attribute: str): +def test_global_context_attributes_not_none(attribute: str) -> None: """Attributes should be not None""" # Goal is to improve test coverage - ctx = GlobalContext() + ctx = GlobalContext().replace(workspace_client=mock_workspace_client(), sql_backend=MockBackend()) assert hasattr(ctx, attribute) assert getattr(ctx, attribute) is not None @pytest.mark.parametrize("attribute", ["local_file_migrator", "local_code_linter"]) -def test_local_context_attributes_not_none(attribute: str): +def test_local_context_attributes_not_none(attribute: str) -> None: """Attributes should be not None""" # Goal is to improve test coverage client = mock_workspace_client() From 767b0726b97fefc31fd7e4db106592e283aa64d6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 3 Oct 2024 17:14:26 +0200 Subject: [PATCH 7/7] Remove duplicate cached properties --- src/databricks/labs/ucx/contexts/application.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 7536af958b..6e2e66c5c1 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -56,7 +56,6 @@ from databricks.labs.ucx.source_code.known import KnownList from databricks.labs.ucx.source_code.queries import QueryLinter from databricks.labs.ucx.source_code.redash import Redash -from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler from databricks.labs.ucx.workspace_access import generic, redash from databricks.labs.ucx.workspace_access.groups import GroupManager from databricks.labs.ucx.workspace_access.manager import PermissionManager @@ -496,14 +495,6 @@ def migration_recon(self): self.config.recon_tolerance_percent, ) - @cached_property - def used_tables_crawler_for_paths(self) -> UsedTablesCrawler: - return UsedTablesCrawler.for_paths(self.sql_backend, self.config.ucx_catalog) - - @cached_property - def used_tables_crawler_for_queries(self) -> UsedTablesCrawler: - return UsedTablesCrawler.for_paths(self.sql_backend, self.config.ucx_catalog) - class CliContext(GlobalContext, abc.ABC): @cached_property