From cc50b87f2ec49f8055248be71b7be4add38cc368 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 24 Oct 2024 19:03:43 +0200 Subject: [PATCH 1/3] implementation --- .../labs/ucx/assessment/clusters.py | 8 +++++- src/databricks/labs/ucx/assessment/jobs.py | 5 +++- .../labs/ucx/assessment/pipelines.py | 5 +++- .../labs/ucx/contexts/application.py | 19 +++++++++++++- src/databricks/labs/ucx/framework/owners.py | 14 +++++++--- .../labs/ucx/hive_metastore/grants.py | 5 +++- .../hive_metastore/table_migration_status.py | 2 +- .../labs/ucx/hive_metastore/tables.py | 5 +++- .../labs/ucx/hive_metastore/udfs.py | 5 +++- .../labs/ucx/source_code/directfs_access.py | 2 +- tests/unit/contexts/test_application.py | 26 ++++++++++++++++++- tests/unit/framework/test_owners.py | 2 +- 12 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index e31284a1bb..ca7322632c 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -30,7 +30,7 @@ ) from databricks.labs.ucx.assessment.init_scripts import CheckInitScriptMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -191,6 +191,9 @@ class ClusterOwnership(Ownership[ClusterInfo]): This is the cluster creator (if known). """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, ClusterInfo) + def _maybe_direct_owner(self, record: ClusterInfo) -> str | None: return record.creator @@ -256,5 +259,8 @@ class ClusterPolicyOwnership(Ownership[PolicyInfo]): This is the creator of the cluster policy (if known). """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, PolicyInfo) + def _maybe_direct_owner(self, record: PolicyInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 924b33bd9f..c5e458cb18 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -26,7 +26,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.assessment.crawlers import spark_version_compatibility from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -158,6 +158,9 @@ class JobOwnership(Ownership[JobInfo]): This is the job creator (if known). """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, JobInfo) + def _maybe_direct_owner(self, record: JobInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 40163b7dff..c45183a5b2 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -10,7 +10,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -86,5 +86,8 @@ class PipelineOwnership(Ownership[PipelineInfo]): This is the pipeline creator (if known). """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, PipelineInfo) + def _maybe_direct_owner(self, record: PipelineInfo) -> str | None: return record.creator_name diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 300051c7e8..5be6084f3c 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -28,7 +28,11 @@ from databricks.labs.ucx.assessment.export import AssessmentExporter from databricks.labs.ucx.aws.credentials import CredentialManager from databricks.labs.ucx.config import WorkspaceConfig -from databricks.labs.ucx.framework.owners import AdministratorLocator, WorkspacePathOwnership +from databricks.labs.ucx.framework.owners import ( + AdministratorLocator, + WorkspacePathOwnership, + Ownership, +) from databricks.labs.ucx.hive_metastore import ExternalLocations, MountsCrawler, TablesCrawler from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema from databricks.labs.ucx.hive_metastore.grants import ( @@ -559,6 +563,19 @@ def migration_recon(self) -> MigrationRecon: def administrator_locator(self) -> AdministratorLocator: return AdministratorLocator(self.workspace_client) + @cached_property + def ownership_factory(self) -> Callable[[type], Ownership]: + # ensure registration of Ownerships + names_with_ownership = [name for name in dir(GlobalContext) if "ownership" in name] + for name in names_with_ownership: + if name == "ownership_factory": + continue + prop = getattr(GlobalContext, name) + if not isinstance(prop, cached_property): + continue + _ = getattr(self, name) + return Ownership.for_record_type + class CliContext(GlobalContext, abc.ABC): @cached_property diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 0350ecfd60..ea1b8c6fb8 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -1,9 +1,10 @@ +from __future__ import annotations import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence from datetime import timedelta from functools import cached_property -from typing import Generic, TypeVar, final +from typing import Generic, TypeVar, final, cast from databricks.labs.blueprint.paths import WorkspacePath from databricks.sdk import WorkspaceClient @@ -169,8 +170,15 @@ def get_workspace_administrator(self) -> str: class Ownership(ABC, Generic[Record]): """Determine an owner for a given type of object.""" - def __init__(self, administrator_locator: AdministratorLocator) -> None: + _factories: dict[type, Ownership] = {} + + @classmethod + def for_record_type(cls, record_type: type) -> Ownership[Record]: + return cast(Ownership[Record], cls._factories[record_type]) + + def __init__(self, administrator_locator: AdministratorLocator, record_type: type) -> None: self._administrator_locator = administrator_locator + self._factories[record_type] = self @final def owner_of(self, record: Record) -> str: @@ -198,7 +206,7 @@ def _maybe_direct_owner(self, record: Record) -> str | None: class WorkspacePathOwnership(Ownership[WorkspacePath]): def __init__(self, administrator_locator: AdministratorLocator, ws: WorkspaceClient) -> None: - super().__init__(administrator_locator) + super().__init__(administrator_locator, WorkspacePath) self._ws = ws @retried(on=[InternalError], timeout=timedelta(minutes=1)) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index de9106ce40..bb9d6a0bf5 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -32,7 +32,7 @@ StoragePermissionMapping, ) from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.locations import ( @@ -404,6 +404,9 @@ class GrantOwnership(Ownership[Grant]): At the present we can't determine a specific owner for grants. """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, Grant) + def _maybe_direct_owner(self, record: Grant) -> None: return None diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 1a15be28f0..321f8a6362 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -171,7 +171,7 @@ class TableMigrationOwnership(Ownership[TableMigrationStatus]): """ def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None: - super().__init__(table_ownership._administrator_locator) + super().__init__(table_ownership._administrator_locator, TableMigrationStatus) self._tables_crawler = tables_crawler self._table_ownership = table_ownership self._indexed_tables: dict[tuple[str, str], Table] | None = None diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 8f4adacde5..ce66150b69 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -16,7 +16,7 @@ from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -668,5 +668,8 @@ class TableOwnership(Ownership[Table]): At the present we don't determine a specific owner for tables. """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, Table) + def _maybe_direct_owner(self, record: Table) -> None: return None diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 81ed350838..4dc8e96df7 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -9,7 +9,7 @@ from databricks.sdk.errors import Unknown, NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -151,5 +151,8 @@ class UdfOwnership(Ownership[Udf]): At the present we don't determine a specific owner for UDFs. """ + def __init__(self, administrator_locator: AdministratorLocator): + super().__init__(administrator_locator, Udf) + def _maybe_direct_owner(self, record: Udf) -> None: return None diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 42406d3b13..e5b749cede 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -73,7 +73,7 @@ def __init__( workspace_path_ownership: WorkspacePathOwnership, workspace_client: WorkspaceClient, ) -> None: - super().__init__(administrator_locator) + super().__init__(administrator_locator, DirectFsAccess) self._workspace_path_ownership = workspace_path_ownership self._workspace_client = workspace_client diff --git a/tests/unit/contexts/test_application.py b/tests/unit/contexts/test_application.py index 06e620c342..29fa392986 100644 --- a/tests/unit/contexts/test_application.py +++ b/tests/unit/contexts/test_application.py @@ -1,12 +1,18 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.blueprint.paths import WorkspacePath from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.contexts.workspace_cli import LocalCheckoutContext -from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex +from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus from databricks.labs.ucx.hive_metastore.table_migrate import TablesMigrator +from databricks.labs.ucx.hive_metastore.tables import Table +from databricks.labs.ucx.hive_metastore.udfs import Udf +from databricks.labs.ucx.source_code.base import DirectFsAccess from databricks.labs.ucx.source_code.linters.context import LinterContext from tests.unit import mock_workspace_client @@ -23,6 +29,7 @@ "used_tables_crawler_for_paths", "used_tables_crawler_for_queries", "verify_has_ucx_catalog", + "ownership_factory", ], ) def test_global_context_attributes_not_none(attribute: str) -> None: @@ -44,3 +51,20 @@ def test_local_context_attributes_not_none(attribute: str) -> None: ctx.replace(languages=LinterContext(TableMigrationIndex([])), tables_migrator=tables_migrator) assert hasattr(ctx, attribute) assert getattr(ctx, attribute) is not None + + +@pytest.mark.parametrize( + "record_type", + [ + DirectFsAccess, + WorkspacePath, + Grant, + Table, + Udf, + TableMigrationStatus, + ], +) +def test_ownership_factory_succeeds(record_type: type): + ctx = GlobalContext().replace(workspace_client=mock_workspace_client(), sql_backend=MockBackend()) + ownership = ctx.ownership_factory(record_type) + assert isinstance(ownership, Ownership) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 25dd465b6f..b5665a6f70 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -23,7 +23,7 @@ def __init__( owner_fn: Callable[[Record], str | None] = lambda _: None, ): mock_admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - super().__init__(mock_admin_locator) + super().__init__(mock_admin_locator, object) self._owner_fn = owner_fn self.mock_admin_locator = mock_admin_locator From 5991123f57b378072b6b00f9e3f49e5437368f07 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Fri, 25 Oct 2024 16:06:27 +0200 Subject: [PATCH 2/3] Use LegacyQueryPath dataclass as record for LegacyQueryOwnership (#3080) ## Changes `LegacyQueryOwnership` takes a `str` as record type which obfuscates the meaning of that str and creates risk of collision in the ownership factory This PR fixes that by introducing a `LegacyQueryPath` data class as a wrapper around the `str` ### Linked issues None ### Functionality None ### Tests - [x] ran unit tests Co-authored-by: Eric Vergnaud --- src/databricks/labs/ucx/framework/owners.py | 14 ++++++++++---- .../labs/ucx/hive_metastore/ownership.py | 3 ++- .../labs/ucx/source_code/directfs_access.py | 3 ++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index c9d16fca1c..71698ab1de 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -2,6 +2,7 @@ import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence +from dataclasses import dataclass from datetime import timedelta from functools import cached_property from typing import Generic, TypeVar, final, cast @@ -250,14 +251,19 @@ def _infer_from_first_can_manage(object_permissions): return None -class LegacyQueryOwnership(Ownership[str]): +@dataclass +class LegacyQueryPath: + path: str + + +class LegacyQueryOwnership(Ownership[LegacyQueryPath]): def __init__(self, administrator_locator: AdministratorLocator, workspace_client: WorkspaceClient) -> None: - super().__init__(administrator_locator, str) + super().__init__(administrator_locator, LegacyQueryPath) self._workspace_client = workspace_client - def _maybe_direct_owner(self, record: str) -> str | None: + def _maybe_direct_owner(self, record: LegacyQueryPath) -> str | None: try: - legacy_query = self._workspace_client.queries.get(record) + legacy_query = self._workspace_client.queries.get(record.path) return legacy_query.owner_user_name except NotFound: return None diff --git a/src/databricks/labs/ucx/hive_metastore/ownership.py b/src/databricks/labs/ucx/hive_metastore/ownership.py index 2a2530ef68..e1e76a629d 100644 --- a/src/databricks/labs/ucx/hive_metastore/ownership.py +++ b/src/databricks/labs/ucx/hive_metastore/ownership.py @@ -6,6 +6,7 @@ AdministratorLocator, LegacyQueryOwnership, WorkspacePathOwnership, + LegacyQueryPath, ) from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler @@ -54,7 +55,7 @@ def _maybe_from_sources(self, record: Table) -> str | None: if not used_table.is_write: return None if used_table.source_type == 'QUERY' and used_table.query_id: - return self._legacy_query_ownership.owner_of(used_table.query_id) + return self._legacy_query_ownership.owner_of(LegacyQueryPath(used_table.query_id)) if used_table.source_type in {'NOTEBOOK', 'FILE'}: return self._workspace_path_ownership.owner_of_path(used_table.source_id) logger.warning(f"Unknown source type {used_table.source_type} for {used_table.source_id}") diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 2e794d4b26..6e450021fc 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -15,6 +15,7 @@ AdministratorLocator, WorkspacePathOwnership, LegacyQueryOwnership, + LegacyQueryPath, ) from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.base import DirectFsAccess @@ -88,7 +89,7 @@ def __init__( def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None: if record.source_type == 'QUERY' and record.query_id: - return self._legacy_query_ownership.owner_of(record.query_id) + return self._legacy_query_ownership.owner_of(LegacyQueryPath(record.query_id)) if record.source_type in {'NOTEBOOK', 'FILE'}: return self._notebook_owner(record) logger.warning(f"Unknown source type {record.source_type} for {record.source_id}") From 37e29e9c2c016a07dbb2f1a69d0c62635364ad87 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 4 Nov 2024 18:30:36 +0100 Subject: [PATCH 3/3] address comments --- .../labs/ucx/assessment/clusters.py | 12 ++++---- src/databricks/labs/ucx/assessment/jobs.py | 8 +++--- .../labs/ucx/assessment/pipelines.py | 8 +++--- .../labs/ucx/contexts/application.py | 22 ++++++++------- src/databricks/labs/ucx/framework/owners.py | 28 +++++++++++++------ .../labs/ucx/hive_metastore/grants.py | 8 +++--- .../labs/ucx/hive_metastore/ownership.py | 12 ++++++-- .../labs/ucx/hive_metastore/udfs.py | 8 +++--- .../labs/ucx/source_code/directfs_access.py | 6 +++- tests/unit/contexts/test_application.py | 18 ++++++------ tests/unit/framework/test_owners.py | 6 +++- 11 files changed, 83 insertions(+), 53 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 10d9453715..aea1bbbeb4 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -3,7 +3,7 @@ import logging from collections.abc import Iterable from dataclasses import dataclass -from typing import ClassVar +from typing import ClassVar, Any from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -30,7 +30,7 @@ ) from databricks.labs.ucx.assessment.init_scripts import CheckInitScriptMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -191,8 +191,8 @@ class ClusterOwnership(Ownership[ClusterInfo]): This is the cluster creator (if known). """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, ClusterInfo) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, ClusterInfo) def _maybe_direct_owner(self, record: ClusterInfo) -> str | None: return record.creator @@ -259,8 +259,8 @@ class ClusterPolicyOwnership(Ownership[PolicyInfo]): This is the creator of the cluster policy (if known). """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, PolicyInfo) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, PolicyInfo) def _maybe_direct_owner(self, record: PolicyInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 3dffce7f03..e751677429 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from datetime import datetime, timedelta, timezone from hashlib import sha256 -from typing import ClassVar +from typing import ClassVar, Any from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -26,7 +26,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.assessment.crawlers import spark_version_compatibility from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -158,8 +158,8 @@ class JobOwnership(Ownership[JobInfo]): This is the job creator (if known). """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, JobInfo) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, JobInfo) def _maybe_direct_owner(self, record: JobInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index a17a95bf41..fd75c4f8f7 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -2,7 +2,7 @@ import logging from collections.abc import Iterable from dataclasses import dataclass -from typing import ClassVar +from typing import ClassVar, Any from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -10,7 +10,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -86,8 +86,8 @@ class PipelineOwnership(Ownership[PipelineInfo]): This is the pipeline creator (if known). """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, PipelineInfo) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, PipelineInfo) def _maybe_direct_owner(self, record: PipelineInfo) -> str | None: return record.creator_name diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 3e91066575..9c1a86ce92 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -33,6 +33,7 @@ WorkspacePathOwnership, Ownership, LegacyQueryOwnership, + Record, ) from databricks.labs.ucx.hive_metastore import ExternalLocations, MountsCrawler, TablesCrawler from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema @@ -577,17 +578,18 @@ def administrator_locator(self) -> AdministratorLocator: return AdministratorLocator(self.workspace_client) @cached_property - def ownership_factory(self) -> Callable[[type], Ownership]: + def ownership_factory(self) -> Callable[[Record], Ownership]: # ensure registration of Ownerships - names_with_ownership = [name for name in dir(GlobalContext) if "ownership" in name] - for name in names_with_ownership: - if name == "ownership_factory": - continue - prop = getattr(GlobalContext, name) - if not isinstance(prop, cached_property): - continue - _ = getattr(self, name) - return Ownership.for_record_type + _ = [ + self.directfs_access_ownership, + self.grant_ownership, + self.legacy_query_ownership, + self.table_migration_ownership, + self.table_ownership, + self.udf_ownership, + self.workspace_path_ownership, + ] + return Ownership.for_record class CliContext(GlobalContext, abc.ABC): diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 71698ab1de..05fe72a9d9 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from datetime import timedelta from functools import cached_property -from typing import Generic, TypeVar, final, cast +from typing import Generic, TypeVar, final, Any from databricks.labs.blueprint.paths import WorkspacePath from databricks.sdk import WorkspaceClient @@ -171,15 +171,21 @@ def get_workspace_administrator(self) -> str: class Ownership(ABC, Generic[Record]): """Determine an owner for a given type of object.""" - _factories: dict[type, Ownership] = {} + _ownerships: set[Ownership] = set() @classmethod - def for_record_type(cls, record_type: type) -> Ownership[Record]: - return cast(Ownership[Record], cls._factories[record_type]) + def for_record(cls, record: Any) -> Ownership[Record]: + for ownership in cls._ownerships: + if ownership.is_applicable_to(record): + return ownership + raise ValueError(f"Ownership not implemented or not registered for {type(record).__name__}") - def __init__(self, administrator_locator: AdministratorLocator, record_type: type) -> None: + def __init__(self, administrator_locator: AdministratorLocator) -> None: self._administrator_locator = administrator_locator - self._factories[record_type] = self + self._ownerships.add(self) + + @abstractmethod + def is_applicable_to(self, record: Any) -> bool: ... @final def owner_of(self, record: Record) -> str: @@ -207,9 +213,12 @@ def _maybe_direct_owner(self, record: Record) -> str | None: class WorkspacePathOwnership(Ownership[WorkspacePath]): def __init__(self, administrator_locator: AdministratorLocator, ws: WorkspaceClient) -> None: - super().__init__(administrator_locator, WorkspacePath) + super().__init__(administrator_locator) self._ws = ws + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, WorkspacePath) + def owner_of_path(self, path: str) -> str: return self.owner_of(WorkspacePath(self._ws, path)) @@ -258,9 +267,12 @@ class LegacyQueryPath: class LegacyQueryOwnership(Ownership[LegacyQueryPath]): def __init__(self, administrator_locator: AdministratorLocator, workspace_client: WorkspaceClient) -> None: - super().__init__(administrator_locator, LegacyQueryPath) + super().__init__(administrator_locator) self._workspace_client = workspace_client + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, LegacyQueryPath) + def _maybe_direct_owner(self, record: LegacyQueryPath) -> str | None: try: legacy_query = self._workspace_client.queries.get(record.path) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index d7df815360..eee172acf4 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -3,7 +3,7 @@ from collections.abc import Callable, Iterable from dataclasses import dataclass, replace from functools import partial, cached_property -from typing import ClassVar, Protocol +from typing import ClassVar, Protocol, Any from databricks.labs.blueprint.installation import Installation from databricks.labs.blueprint.parallel import ManyError, Threads @@ -32,7 +32,7 @@ StoragePermissionMapping, ) from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.locations import ( @@ -404,8 +404,8 @@ class GrantOwnership(Ownership[Grant]): At the present we can't determine a specific owner for grants. """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, Grant) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, Grant) def _maybe_direct_owner(self, record: Grant) -> None: return None diff --git a/src/databricks/labs/ucx/hive_metastore/ownership.py b/src/databricks/labs/ucx/hive_metastore/ownership.py index e1e76a629d..46e47bda9a 100644 --- a/src/databricks/labs/ucx/hive_metastore/ownership.py +++ b/src/databricks/labs/ucx/hive_metastore/ownership.py @@ -1,5 +1,6 @@ import logging from functools import cached_property +from typing import Any from databricks.labs.ucx.framework.owners import ( Ownership, @@ -34,13 +35,16 @@ def __init__( legacy_query_ownership: LegacyQueryOwnership, workspace_path_ownership: WorkspacePathOwnership, ) -> None: - super().__init__(administrator_locator, Table) + super().__init__(administrator_locator) self._grants_crawler = grants_crawler self._used_tables_in_paths = used_tables_in_paths self._used_tables_in_queries = used_tables_in_queries self._legacy_query_ownership = legacy_query_ownership self._workspace_path_ownership = workspace_path_ownership + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, Table) + def _maybe_direct_owner(self, record: Table) -> str | None: owner = self._maybe_from_grants(record) if owner: @@ -93,12 +97,16 @@ class TableMigrationOwnership(Ownership[TableMigrationStatus]): """ def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None: - super().__init__(table_ownership._administrator_locator, TableMigrationStatus) # TODO: Fix this + super().__init__(table_ownership._administrator_locator) # TODO: Fix this self._tables_crawler = tables_crawler self._table_ownership = table_ownership self._indexed_tables: dict[tuple[str, str], Table] | None = None + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, TableMigrationStatus) + def _tables_snapshot_index(self, reindex: bool = False) -> dict[tuple[str, str], Table]: + index = self._indexed_tables if index is None or reindex: snapshot = self._tables_crawler.snapshot() diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index bc914900ad..427a51be4d 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -2,14 +2,14 @@ from collections.abc import Iterable from dataclasses import dataclass, replace from functools import partial -from typing import ClassVar +from typing import ClassVar, Any from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import Unknown, NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase -from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -151,8 +151,8 @@ class UdfOwnership(Ownership[Udf]): At the present we don't determine a specific owner for UDFs. """ - def __init__(self, administrator_locator: AdministratorLocator): - super().__init__(administrator_locator, Udf) + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, Udf) def _maybe_direct_owner(self, record: Udf) -> None: return None diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 6e450021fc..5b58d9c0a2 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -2,6 +2,7 @@ import logging from collections.abc import Sequence, Iterable +from typing import Any from databricks.labs.blueprint.paths import WorkspacePath from databricks.sdk import WorkspaceClient @@ -82,11 +83,14 @@ def __init__( legacy_query_ownership: LegacyQueryOwnership, workspace_client: WorkspaceClient, ) -> None: - super().__init__(administrator_locator, DirectFsAccess) + super().__init__(administrator_locator) self._workspace_path_ownership = workspace_path_ownership self._legacy_query_ownership = legacy_query_ownership self._workspace_client = workspace_client + def is_applicable_to(self, record: Any) -> bool: + return isinstance(record, DirectFsAccess) + def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None: if record.source_type == 'QUERY' and record.query_id: return self._legacy_query_ownership.owner_of(LegacyQueryPath(record.query_id)) diff --git a/tests/unit/contexts/test_application.py b/tests/unit/contexts/test_application.py index 29fa392986..de47ce35dd 100644 --- a/tests/unit/contexts/test_application.py +++ b/tests/unit/contexts/test_application.py @@ -54,17 +54,17 @@ def test_local_context_attributes_not_none(attribute: str) -> None: @pytest.mark.parametrize( - "record_type", + "record", [ - DirectFsAccess, - WorkspacePath, - Grant, - Table, - Udf, - TableMigrationStatus, + DirectFsAccess(), + WorkspacePath(mock_workspace_client()), + Grant("x", "y"), + Table("a", "b", "c", "d", "e"), + Udf("a", "b", "c", "d", "e", "a", False, "c", "d", "e"), + TableMigrationStatus("x", "y"), ], ) -def test_ownership_factory_succeeds(record_type: type): +def test_ownership_factory_succeeds(record: type): ctx = GlobalContext().replace(workspace_client=mock_workspace_client(), sql_backend=MockBackend()) - ownership = ctx.ownership_factory(record_type) + ownership = ctx.ownership_factory(record) assert isinstance(ownership, Ownership) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index b5665a6f70..b9512f9d75 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -1,5 +1,6 @@ import re from collections.abc import Callable, Sequence +from typing import Any from unittest.mock import create_autospec, Mock import pytest @@ -23,10 +24,13 @@ def __init__( owner_fn: Callable[[Record], str | None] = lambda _: None, ): mock_admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - super().__init__(mock_admin_locator, object) + super().__init__(mock_admin_locator) self._owner_fn = owner_fn self.mock_admin_locator = mock_admin_locator + def is_applicable_to(self, record: Any) -> bool: + return True + def _maybe_direct_owner(self, record: Record) -> str | None: return self._owner_fn(record)