From 573fca294c6c3ad3709bc4173f86c9b24091f99b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 1 Oct 2024 16:02:41 +0200 Subject: [PATCH 01/58] Crawler support for object ownership. This involves wiring up all the crawlers to have a workspace client, needed to locate the workspace administrator when an owner for the object cannot be found. --- src/databricks/labs/ucx/assessment/azure.py | 3 +- .../labs/ucx/assessment/clusters.py | 6 +- .../labs/ucx/assessment/init_scripts.py | 3 +- src/databricks/labs/ucx/assessment/jobs.py | 6 +- .../labs/ucx/assessment/pipelines.py | 3 +- .../labs/ucx/contexts/application.py | 20 ++++-- .../labs/ucx/contexts/workflow_task.py | 4 +- src/databricks/labs/ucx/framework/crawlers.py | 66 ++++++++++++++++++- src/databricks/labs/ucx/framework/utils.py | 55 ++++++++++++++++ .../labs/ucx/hive_metastore/grants.py | 3 +- .../labs/ucx/hive_metastore/locations.py | 11 ++-- .../hive_metastore/table_migration_status.py | 3 +- .../labs/ucx/hive_metastore/table_size.py | 21 +++--- .../labs/ucx/hive_metastore/tables.py | 10 +-- .../labs/ucx/hive_metastore/udfs.py | 12 +++- .../labs/ucx/recon/migration_recon.py | 4 +- .../labs/ucx/source_code/directfs_access.py | 21 ++++-- .../labs/ucx/workspace_access/generic.py | 2 +- .../labs/ucx/workspace_access/groups.py | 3 +- .../labs/ucx/workspace_access/manager.py | 5 +- tests/integration/conftest.py | 11 +++- tests/integration/source_code/test_queries.py | 2 +- .../test_permissions_manager.py | 2 +- tests/unit/azure/test_locations.py | 2 +- tests/unit/conftest.py | 20 ++++-- tests/unit/framework/test_crawlers.py | 44 +++++++------ tests/unit/hive_metastore/test_grants.py | 60 ++++++++--------- tests/unit/hive_metastore/test_mapping.py | 4 +- .../unit/hive_metastore/test_migrate_acls.py | 8 --- .../unit/hive_metastore/test_table_migrate.py | 41 +++++------- tests/unit/hive_metastore/test_table_size.py | 25 +++---- tests/unit/hive_metastore/test_tables.py | 39 +++++------ tests/unit/hive_metastore/test_udfs.py | 8 +-- tests/unit/recon/test_migration_recon.py | 15 +---- .../unit/source_code/test_directfs_access.py | 4 +- tests/unit/workspace_access/test_manager.py | 44 ++++++------- tests/unit/workspace_access/test_tacl.py | 66 +++++++++---------- 37 files changed, 393 insertions(+), 263 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index 81c99e784b..ed5c34bf3f 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -42,8 +42,7 @@ class ServicePrincipalClusterMapping: class AzureServicePrincipalCrawler(CrawlerBase[AzureServicePrincipalInfo], JobsMixin, SecretsMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo) def _try_fetch(self) -> Iterable[AzureServicePrincipalInfo]: for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 02badb64ec..b69862b9a6 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -143,8 +143,7 @@ def _check_cluster_failures(self, cluster: ClusterDetails, source: str) -> list[ class ClustersCrawler(CrawlerBase[ClusterInfo], 
CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str): - super().__init__(sbe, "hive_metastore", schema, "clusters", ClusterInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "clusters", ClusterInfo) def _crawl(self) -> Iterable[ClusterInfo]: all_clusters = list(self._ws.clusters.list()) @@ -192,8 +191,7 @@ class PolicyInfo: class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(sbe, "hive_metastore", schema, "policies", PolicyInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "policies", PolicyInfo) def _crawl(self) -> Iterable[PolicyInfo]: all_policices = list(self._ws.cluster_policies.list()) diff --git a/src/databricks/labs/ucx/assessment/init_scripts.py b/src/databricks/labs/ucx/assessment/init_scripts.py index 909015b678..b1add2e9dc 100644 --- a/src/databricks/labs/ucx/assessment/init_scripts.py +++ b/src/databricks/labs/ucx/assessment/init_scripts.py @@ -42,8 +42,7 @@ def check_init_script(self, init_script_data: str | None, source: str) -> list[s class GlobalInitScriptCrawler(CrawlerBase[GlobalInitScriptInfo], CheckInitScriptMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo) def _crawl(self) -> Iterable[GlobalInitScriptInfo]: all_global_init_scripts = list(self._ws.global_init_scripts.list()) diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index d5b77d68e0..9f7e3cb0e9 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -72,8 +72,7 @@ def _job_clusters(job): class JobsCrawler(CrawlerBase[JobInfo], JobsMixin, CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(sbe, "hive_metastore", schema, "jobs", JobInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "jobs", JobInfo) def _crawl(self) -> Iterable[JobInfo]: all_jobs = list(self._ws.jobs.list(expand_tasks=True)) @@ -159,8 +158,7 @@ class SubmitRunsCrawler(CrawlerBase[SubmitRunInfo], JobsMixin, CheckClusterMixin ] def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str, num_days_history: int): - super().__init__(sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo) self._num_days_history = num_days_history @staticmethod diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 8421e53084..329215c804 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -24,8 +24,7 @@ class PipelineInfo: class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(sbe, "hive_metastore", schema, "pipelines", PipelineInfo) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "pipelines", PipelineInfo) def _crawl(self) -> Iterable[PipelineInfo]: all_pipelines = list(self._ws.pipelines.list_pipelines()) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 
95944a3d2a..d06017e8f4 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -201,6 +201,7 @@ def legacy_table_acl_support(self): @cached_property def permission_manager(self): return PermissionManager( + self.workspace_client, self.sql_backend, self.inventory_database, [ @@ -232,11 +233,21 @@ def grants_crawler(self): @cached_property def udfs_crawler(self): - return UdfsCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + return UdfsCrawler( + self.workspace_client, + self.sql_backend, + self.inventory_database, + self.config.include_databases, + ) @cached_property def tables_crawler(self): - return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + return TablesCrawler( + self.workspace_client, + self.sql_backend, + self.inventory_database, + self.config.include_databases, + ) @cached_property def tables_migrator(self): @@ -443,11 +454,11 @@ def query_linter(self): @cached_property def directfs_access_crawler_for_paths(self): - return DirectFsAccessCrawler.for_paths(self.sql_backend, self.inventory_database) + return DirectFsAccessCrawler.for_paths(self.workspace_client, self.sql_backend, self.inventory_database) @cached_property def directfs_access_crawler_for_queries(self): - return DirectFsAccessCrawler.for_queries(self.sql_backend, self.inventory_database) + return DirectFsAccessCrawler.for_queries(self.workspace_client, self.sql_backend, self.inventory_database) @cached_property def redash(self): @@ -476,6 +487,7 @@ def data_comparator(self): @cached_property def migration_recon(self): return MigrationRecon( + self.workspace_client, self.sql_backend, self.inventory_database, self.migration_status_refresher, diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 488c224243..1a8d30d4e6 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -72,7 +72,7 @@ def pipelines_crawler(self): @cached_property def table_size_crawler(self): - return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + return TableSizeCrawler(self.tables_crawler) @cached_property def policies_crawler(self): @@ -84,7 +84,7 @@ def global_init_scripts_crawler(self): @cached_property def tables_crawler(self): - return FasterTableScanCrawler(self.sql_backend, self.inventory_database) + return FasterTableScanCrawler(self.workspace_client, self.sql_backend, self.inventory_database) @cached_property def tables_in_mounts(self): diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index 48d774d403..d224ea8743 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -1,12 +1,14 @@ import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence -from typing import ClassVar, Generic, Literal, Protocol, TypeVar +from functools import cached_property +from typing import ClassVar, Generic, Literal, Protocol, TypeVar, final from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound -from databricks.labs.ucx.framework.utils import escape_sql_identifier +from databricks.labs.ucx.framework.utils import escape_sql_identifier, find_an_admin logger = logging.getLogger(__name__) @@ -21,17 +23,25 @@ class 
DataclassInstance(Protocol): class CrawlerBase(ABC, Generic[Result]): - def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]): + + _cached_workspace_admins: dict[int, str | RuntimeError] = {} + """Cached user names of workspace administrators, keyed by workspace id.""" + + def __init__( + self, ws: WorkspaceClient, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result] + ): """ Initializes a CrawlerBase instance. Args: + ws (WorkspaceClient): A client for the current workspace. backend (SqlBackend): The backend that executes SQL queries: Statement Execution API or Databricks Runtime. catalog (str): The catalog name for the inventory persistence. schema: The schema name for the inventory persistence. table: The table name for the inventory persistence. """ + self._ws = ws self._catalog = self._valid(catalog) self._schema = self._valid(schema) self._table = self._valid(table) @@ -107,6 +117,56 @@ def snapshot(self, *, force_refresh: bool = False) -> Iterable[Result]: """ return self._snapshot(self._try_fetch, self._crawl, force_refresh=force_refresh) + @final + def owner_of(self, result: Result) -> str: + """Obtain the user-name of a user that is responsible for the given record. + + This is intended to be a point of contact, and is either: + + - The user that originally created the resource associated with the result; or + - An active administrator for the current workspace. + + Args: + result (Result): The record for which an associated user-name is sought. + Returns: + A string containing the user-name attribute of the user considered to own the resource. + Raises: + RuntimeError if there are no active administrators for the current workspace. + """ + return self._result_owner(result) or self._workspace_admin + + @cached_property + def _workspace_admin(self) -> str: + # Avoid repeatedly hitting the shared cache. + return self._find_administrator_for(self._ws) + + @classmethod + @final + def _find_administrator_for(cls, ws: WorkspaceClient) -> str: + # Finding an administrator is quite expensive, so we ensure that for a given workspace we only + # do it once. + workspace_id = ws.get_workspace_id() + found_admin_or_error = cls._cached_workspace_admins.get(workspace_id, None) + if isinstance(found_admin_or_error, str): + return found_admin_or_error + if isinstance(found_admin_or_error, RuntimeError): + raise found_admin_or_error + + found_admin = find_an_admin(ws) + if found_admin is None or not found_admin.user_name: + msg = f"No active workspace or account administrator can be found for workspace: {workspace_id}" + error = RuntimeError(msg) + cls._cached_workspace_admins[workspace_id] = error + raise error + user_name = found_admin.user_name + cls._cached_workspace_admins[workspace_id] = user_name + return user_name + + @classmethod + def _result_owner(cls, result: Result) -> str | None: # pylint: disable=unused-argument + """Obtain the record-specific user-name associated with the given result, if any.""" + return None + @abstractmethod def _try_fetch(self) -> Iterable[Result]: """Fetch existing data that has (previously) been crawled by this crawler. 
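A minimal sketch (not part of the patch) of how the new ownership hooks in `CrawlerBase` are intended to be used: a crawler subclass overrides `_result_owner` to surface a record-specific creator, and `owner_of` falls back to an active workspace (or account) administrator, which is looked up once per workspace and cached. `Widget` and `WidgetCrawler` are hypothetical names for illustration only; `CrawlerBase`, `owner_of`, `_result_owner`, `_crawl`, `_try_fetch`, `_fetch` and `full_name` are the pieces introduced or relied on above.

from collections.abc import Iterable
from dataclasses import dataclass

from databricks.labs.lsql.backends import SqlBackend
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.ucx.framework.utils import escape_sql_identifier


@dataclass
class Widget:
    widget_id: str
    creator: str | None = None  # user-name of whoever created the widget, when known


class WidgetCrawler(CrawlerBase[Widget]):
    def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema: str):
        # The workspace client is now passed through to CrawlerBase so that an
        # administrator can be located when a record has no known creator.
        super().__init__(ws, backend, "hive_metastore", schema, "widgets", Widget)

    def _crawl(self) -> Iterable[Widget]:
        yield Widget("w1", creator="alice@example.com")
        yield Widget("w2")  # creator unknown

    def _try_fetch(self) -> Iterable[Widget]:
        for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
            yield Widget(*row)

    @classmethod
    def _result_owner(cls, result: Widget) -> str | None:
        # Record-specific owner; returning None triggers the administrator fallback.
        return result.creator


# crawler = WidgetCrawler(ws, sql_backend, "ucx")
# crawler.owner_of(Widget("w1"))  # -> "alice@example.com"
# crawler.owner_of(Widget("w2"))  # -> user-name of an active workspace/account admin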
diff --git a/src/databricks/labs/ucx/framework/utils.py b/src/databricks/labs/ucx/framework/utils.py index d428447911..348f08b935 100644 --- a/src/databricks/labs/ucx/framework/utils.py +++ b/src/databricks/labs/ucx/framework/utils.py @@ -1,5 +1,11 @@ +import functools import logging import subprocess +from collections.abc import Iterable + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.iam import User + logger = logging.getLogger(__name__) @@ -22,6 +28,55 @@ def escape_sql_identifier(path: str, *, maxsplit: int = 2) -> str: return ".".join(escaped) +def _has_role(user: User, role: str) -> bool: + return user.roles is not None and any(r.value == role for r in user.roles) + + +def find_workspace_admins(ws: WorkspaceClient) -> Iterable[User]: + """Enumerate the active workspace administrators in a given workspace. + + Arguments: + ws (WorkspaceClient): The client for the workspace whose administrators should be enumerated. + Returns: + Iterable[User]: The active workspace administrators, if any. + """ + all_users = ws.users.list(attributes="id,active,userName,roles") + return (user for user in all_users if user.active and _has_role(user, "workspace_admin")) + + +def find_account_admins(ws: WorkspaceClient) -> Iterable[User]: + """Enumerate the active account administrators associated with a given workspace. + + Arguments: + ws (WorkspaceClient): The client for the workspace whose account administrators should be enumerated. + Returns: + Iterable[User]: The active account administrators, if any. + """ + response = ws.api_client.do( + "GET", "/api/2.0/account/scim/v2/Users", query={"attributes": "id,active,userName,roles"} + ) + assert isinstance(response, dict) + all_users = (User.from_dict(resource) for resource in response.get("Resources", [])) + return (user for user in all_users if user.active and _has_role(user, "account_admin")) + + +def find_an_admin(ws: WorkspaceClient) -> User | None: + """Locate an active administrator for the current workspace. + + If an active workspace administrator can be located, this is returned. When there are multiple, they are sorted + alphabetically by user-name and the first is returned. If there are no workspace administrators then an active + account administrator is sought, again returning the first alphabetically by user-name if there is more than one. + + Arguments: + ws (WorkspaceClient): The client for the workspace for which an administrator should be located. + Returns: + the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can be + found. 
+ """ + first_user = functools.partial(min, default=None, key=lambda user: user.name) + return first_user(find_workspace_admins(ws)) or first_user(find_account_admins(ws)) + + def run_command(command: str | list[str]) -> tuple[int, str, str]: args = command.split() if isinstance(command, str) else command logger.info(f"Invoking command: {args!r}") diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 8673779697..5c6575eddb 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -199,10 +199,11 @@ class GrantsCrawler(CrawlerBase[Grant]): """Crawler that captures access controls that relate to data and other securable objects.""" def __init__(self, tc: TablesCrawler, udf: UdfsCrawler, include_databases: list[str] | None = None): + assert tc._ws == udf._ws assert tc._backend == udf._backend assert tc._catalog == udf._catalog assert tc._schema == udf._schema - super().__init__(tc._backend, tc._catalog, tc._schema, "grants", Grant) + super().__init__(tc._ws, tc._backend, tc._catalog, tc._schema, "grants", Grant) self._tc = tc self._udf = udf self._include_databases = include_databases diff --git a/src/databricks/labs/ucx/hive_metastore/locations.py b/src/databricks/labs/ucx/hive_metastore/locations.py index 05802153b4..33a0a90d07 100644 --- a/src/databricks/labs/ucx/hive_metastore/locations.py +++ b/src/databricks/labs/ucx/hive_metastore/locations.py @@ -117,8 +117,7 @@ class ExternalLocations(CrawlerBase[ExternalLocation]): _prefix_size: ClassVar[list[int]] = [1, 12] def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str): - super().__init__(sbe, "hive_metastore", schema, "external_locations", ExternalLocation) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "external_locations", ExternalLocation) def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLocation]: min_slash = 2 @@ -301,8 +300,7 @@ def save_as_terraform_definitions_on_workspace(self, installation: Installation) class Mounts(CrawlerBase[Mount]): def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str): - super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount) - self._dbutils = ws.dbutils + super().__init__(ws, backend, "hive_metastore", inventory_database, "mounts", Mount) @staticmethod def _deduplicate_mounts(mounts: list) -> list: @@ -320,7 +318,7 @@ def _deduplicate_mounts(mounts: list) -> list: def _crawl(self) -> Iterable[Mount]: mounts = [] - for mount_point, source, _ in self._dbutils.fs.mounts(): + for mount_point, source, _ in self._ws.dbutils.fs.mounts(): mounts.append(Mount(mount_point, source)) return self._deduplicate_mounts(mounts) @@ -356,11 +354,10 @@ def __init__( exclude_paths_in_mount: list[str] | None = None, include_paths_in_mount: list[str] | None = None, ): - super().__init__(backend, "hive_metastore", inventory_database, "tables", Table) + super().__init__(ws, backend, "hive_metastore", inventory_database, "tables", Table) self._dbutils = ws.dbutils self._mounts_crawler = mc self._include_mounts = include_mounts - self._ws = ws self._include_paths_in_mount = include_paths_in_mount irrelevant_patterns = {'_SUCCESS', '_committed_', '_started_'} diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 283be4f717..640068931d 100644 --- 
a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -76,8 +76,7 @@ class TableMigrationStatusRefresher(CrawlerBase[TableMigrationStatus]): """ def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema, table_crawler: TablesCrawler): - super().__init__(sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus) - self._ws = ws + super().__init__(ws, sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus) self._table_crawler = table_crawler def index(self, *, force_refresh: bool = False) -> TableMigrationIndex: diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index 3e5c61f81c..eb9bd2c23c 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -4,12 +4,11 @@ from functools import partial from databricks.labs.blueprint.parallel import Threads -from databricks.labs.lsql.backends import SqlBackend from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler -from databricks.labs.ucx.hive_metastore.tables import Table +from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler, Table logger = logging.getLogger(__name__) @@ -23,20 +22,26 @@ class TableSize: class TableSizeCrawler(CrawlerBase[TableSize]): - def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): + # TODO: Ensure TablesCrawler and FasterTableScanCrawler share a common interface. + def __init__(self, tables_crawler: TablesCrawler | FasterTableScanCrawler) -> None: """ Initializes a TablesSizeCrawler instance. Args: - backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) - schema: The schema name for the inventory persistence. + tables_crawler (TablesCrawler): The crawler to use to obtain the table inventory. """ # pylint: disable-next=import-error,import-outside-toplevel from pyspark.sql.session import SparkSession # type: ignore[import-not-found] - self._backend = backend - super().__init__(backend, "hive_metastore", schema, "table_size", TableSize) - self._tables_crawler = TablesCrawler(backend, schema, include_databases) + super().__init__( + tables_crawler._ws, + tables_crawler._backend, + "hive_metastore", + tables_crawler._schema, + "table_size", + TableSize, + ) + self._tables_crawler = tables_crawler self._spark = SparkSession.builder.getOrCreate() def _crawl(self) -> Iterable[TableSize]: diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index f935aada95..9c5810f467 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -13,6 +13,7 @@ from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase @@ -341,15 +342,16 @@ class MigrationCount: class TablesCrawler(CrawlerBase[Table]): - def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): + def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema, include_databases: list[str] | None = None): """ Initializes a TablesCrawler instance. 
Args: + ws (WorkspaceClient): A client for the current workspace. backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ - super().__init__(backend, "hive_metastore", schema, "tables", Table) + super().__init__(ws, backend, "hive_metastore", schema, "tables", Table) self._include_database = include_databases def _all_databases(self) -> list[str]: @@ -486,14 +488,14 @@ class FasterTableScanCrawler(CrawlerBase[Table]): Databricks workspace. """ - def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): + def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema, include_databases: list[str] | None = None): self._backend = backend self._include_database = include_databases # pylint: disable-next=import-error,import-outside-toplevel from pyspark.sql.session import SparkSession # type: ignore[import-not-found] - super().__init__(backend, "hive_metastore", schema, "tables", Table) + super().__init__(ws, backend, "hive_metastore", schema, "tables", Table) self._spark = SparkSession.builder.getOrCreate() @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 6ee1eefd38..7f272696dc 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -5,6 +5,7 @@ from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import Unknown, NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase @@ -34,15 +35,22 @@ def key(self) -> str: class UdfsCrawler(CrawlerBase[Udf]): - def __init__(self, backend: SqlBackend, schema: str, include_databases: list[str] | None = None): + def __init__( + self, + ws: WorkspaceClient, + backend: SqlBackend, + schema: str, + include_databases: list[str] | None = None, + ): """ Initializes a UdfsCrawler instance. Args: + ws (WorkspaceClient): The client for the current workspace. backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. 
""" - super().__init__(backend, "hive_metastore", schema, "udfs", Udf) + super().__init__(ws, backend, "hive_metastore", schema, "udfs", Udf) self._include_database = include_databases def _all_databases(self) -> list[str]: diff --git a/src/databricks/labs/ucx/recon/migration_recon.py b/src/databricks/labs/ucx/recon/migration_recon.py index 404fd8f1ba..24d435328a 100644 --- a/src/databricks/labs/ucx/recon/migration_recon.py +++ b/src/databricks/labs/ucx/recon/migration_recon.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from functools import partial +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend @@ -38,6 +39,7 @@ class ReconResult: class MigrationRecon(CrawlerBase[ReconResult]): def __init__( self, + ws: WorkspaceClient, sbe: SqlBackend, schema: str, migration_status_refresher: TableMigrationStatusRefresher, @@ -46,7 +48,7 @@ def __init__( data_comparator: DataComparator, default_threshold: float, ): - super().__init__(sbe, "hive_metastore", schema, "recon_results", ReconResult) + super().__init__(ws, sbe, "hive_metastore", schema, "recon_results", ReconResult) self._migration_status_refresher = migration_status_refresher self._table_mapping = table_mapping self._schema_comparator = schema_comparator diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 9ad65d5d0b..132e9880b9 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -10,6 +10,7 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -80,22 +81,30 @@ def replace_assessment_infos( class DirectFsAccessCrawler(CrawlerBase[DirectFsAccess]): @classmethod - def for_paths(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(backend, schema, "directfs_in_paths") + def for_paths(cls, ws: WorkspaceClient, backend: SqlBackend, schema) -> DirectFsAccessCrawler: + return DirectFsAccessCrawler(ws, backend, schema, "directfs_in_paths") @classmethod - def for_queries(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(backend, schema, "directfs_in_queries") + def for_queries(cls, ws: WorkspaceClient, backend: SqlBackend, schema) -> DirectFsAccessCrawler: + return DirectFsAccessCrawler(ws, backend, schema, "directfs_in_queries") - def __init__(self, backend: SqlBackend, schema: str, table: str): + def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema: str, table: str): """ Initializes a DFSACrawler instance. Args: + ws (WorkspaceClient): The client associated with this workspace. sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. 
""" - super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess) + super().__init__( + ws=ws, + backend=backend, + catalog="hive_metastore", + schema=schema, + table=table, + klass=DirectFsAccess, + ) def dump_all(self, dfsas: Sequence[DirectFsAccess]): """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one diff --git a/src/databricks/labs/ucx/workspace_access/generic.py b/src/databricks/labs/ucx/workspace_access/generic.py index 0fd06db6d9..0d37fa76d9 100644 --- a/src/databricks/labs/ucx/workspace_access/generic.py +++ b/src/databricks/labs/ucx/workspace_access/generic.py @@ -332,13 +332,13 @@ def __init__( Listing.__init__(self, lambda: [], "_", "_") CrawlerBase.__init__( self, + ws=ws, backend=sql_backend, catalog="hive_metastore", schema=inventory_database, table="workspace_objects", klass=WorkspaceObjectInfo, ) - self._ws = ws self._num_threads = num_threads self._start_path = start_path self._sql_backend = sql_backend diff --git a/src/databricks/labs/ucx/workspace_access/groups.py b/src/databricks/labs/ucx/workspace_access/groups.py index 75d59a8d61..cc6c397aa8 100644 --- a/src/databricks/labs/ucx/workspace_access/groups.py +++ b/src/databricks/labs/ucx/workspace_access/groups.py @@ -418,11 +418,10 @@ def __init__( # pylint: disable=too-many-arguments *, external_id_match: bool = False, ): - super().__init__(sql_backend, "hive_metastore", inventory_database, "groups", MigratedGroup) + super().__init__(ws, sql_backend, "hive_metastore", inventory_database, "groups", MigratedGroup) if not renamed_group_prefix: renamed_group_prefix = "db-temp-" - self._ws = ws self._include_group_names = include_group_names self._renamed_group_prefix = renamed_group_prefix self._workspace_group_regex = workspace_group_regex diff --git a/src/databricks/labs/ucx/workspace_access/manager.py b/src/databricks/labs/ucx/workspace_access/manager.py index 50eba51d95..cfdb36f445 100644 --- a/src/databricks/labs/ucx/workspace_access/manager.py +++ b/src/databricks/labs/ucx/workspace_access/manager.py @@ -4,6 +4,7 @@ from databricks.labs.blueprint.parallel import ManyError, Threads from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -23,8 +24,8 @@ class PermissionManager(CrawlerBase[Permissions]): ERRORS_TO_IGNORE = ["FEATURE_DISABLED"] - def __init__(self, backend: SqlBackend, inventory_database: str, crawlers: list[AclSupport]): - super().__init__(backend, "hive_metastore", inventory_database, "permissions", Permissions) + def __init__(self, ws: WorkspaceClient, backend: SqlBackend, inventory_database: str, crawlers: list[AclSupport]): + super().__init__(ws, backend, "hive_metastore", inventory_database, "permissions", Permissions) self._acl_support = crawlers def _crawl(self) -> Iterable[Permissions]: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c4dc8f4c33..98a8968c08 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -331,8 +331,8 @@ def get_azure_spark_conf(): class StaticTablesCrawler(TablesCrawler): - def __init__(self, sb: SqlBackend, schema: str, tables: list[TableInfo]): - super().__init__(sb, schema) + def __init__(self, ws: WorkspaceClient, sb: SqlBackend, schema: str, tables: list[TableInfo]): + super().__init__(ws, sb, schema) self._tables = [ Table( 
catalog=_.catalog_name, @@ -570,7 +570,12 @@ def tables_crawler(self) -> TablesCrawler: Overrides the FasterTableScanCrawler with TablesCrawler used as DBR is not available while running integration tests :return: TablesCrawler """ - return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + return TablesCrawler( + self.workspace_client, + self.sql_backend, + self.inventory_database, + self.config.include_databases, + ) def save_tables(self, is_hiveserde: bool = False): # populate the tables crawled, as it is used by get_tables_to_migrate in the migrate-tables workflow diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 10d4ded773..029af876eb 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -11,7 +11,7 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend all_problems = sql_backend.fetch("SELECT * FROM query_problems", schema=simple_ctx.inventory_database) problems = [row for row in all_problems if row["query_name"] == query.name] assert len(problems) == 1 - crawler = DirectFsAccessCrawler.for_queries(sql_backend, simple_ctx.inventory_database) + crawler = DirectFsAccessCrawler.for_queries(ws, sql_backend, simple_ctx.inventory_database) all_dfsas = crawler.snapshot() source_id = f"{_dashboard.id}/{query.id}" dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == source_id] diff --git a/tests/integration/workspace_access/test_permissions_manager.py b/tests/integration/workspace_access/test_permissions_manager.py index 9868923f48..0a672d06cd 100644 --- a/tests/integration/workspace_access/test_permissions_manager.py +++ b/tests/integration/workspace_access/test_permissions_manager.py @@ -16,7 +16,7 @@ def get_verify_task(self, item: Permissions) -> Callable[[], bool] | None: ... def object_types(self) -> set[str]: return {"bcd", "fgh"} - permission_manager = PermissionManager(sql_backend, inventory_schema, [StubbedCrawler()]) + permission_manager = PermissionManager(ws, sql_backend, inventory_schema, [StubbedCrawler()]) snapshot = list(permission_manager.snapshot()) # Snapshotting is multithreaded, meaning the order of results is non-deterministic. 
snapshot.sort(key=lambda x: x.object_id) diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index f1b901638b..7e4401f439 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -28,7 +28,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azurerm = azurerm or AzureResources(azure_api_client(), azure_api_client()) location_crawler = ExternalLocations(ws, mock_backend, "location_test") azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) - tables_crawler = TablesCrawler(mock_backend, 'ucx') + tables_crawler = TablesCrawler(ws, mock_backend, 'ucx') mounts_crawler = Mounts(mock_backend, ws, 'ucx') principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, lambda: []) external_locations_migration = ExternalLocationsMigration( diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 8f828a417b..e628603c60 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -120,7 +120,7 @@ def product_element_side_effect(index): @pytest.fixture -def run_workflow(mocker, mock_installation, spark_table_crawl_mocker): +def run_workflow(mocker, mock_installation, ws, spark_table_crawl_mocker): def inner(cb, **replace) -> RuntimeContext: with _lock, patch.dict(os.environ, {"DATABRICKS_RUNTIME_VERSION": "14.0"}): pyspark_sql_session = mocker.Mock() @@ -128,16 +128,17 @@ def inner(cb, **replace) -> RuntimeContext: if 'installation' not in replace: replace['installation'] = mock_installation if 'workspace_client' not in replace: - ws = create_autospec(WorkspaceClient) - ws.api_client.do.return_value = {} - ws.permissions.get.return_value = {} replace['workspace_client'] = ws if 'sql_backend' not in replace: replace['sql_backend'] = MockBackend() if 'config' not in replace: replace['config'] = mock_installation.load(WorkspaceConfig) if 'tables_crawler' not in replace: - replace['tables_crawler'] = TablesCrawler(replace['sql_backend'], replace['config'].inventory_database) + replace['tables_crawler'] = TablesCrawler( + replace['workspace_client'], + replace['sql_backend'], + replace['config'].inventory_database, + ) module = __import__(cb.__module__, fromlist=[cb.__name__]) klass, method = cb.__qualname__.split('.', 1) @@ -197,3 +198,12 @@ def mock_notebook_resolver(): @pytest.fixture def mock_backend() -> MockBackend: return MockBackend() + + +@pytest.fixture +def ws() -> WorkspaceClient: + client = create_autospec(WorkspaceClient) + client.api_client.do.return_value = {} + client.permissions.get.return_value = {} + client.get_workspace_id.return_value = "12345" + return client diff --git a/tests/unit/framework/test_crawlers.py b/tests/unit/framework/test_crawlers.py index 2fa5c9bfc9..f83461db3e 100644 --- a/tests/unit/framework/test_crawlers.py +++ b/tests/unit/framework/test_crawlers.py @@ -5,6 +5,7 @@ import pytest from databricks.labs.lsql import Row from databricks.labs.lsql.backends import MockBackend +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result, ResultFn @@ -32,6 +33,7 @@ class Bar: class _CrawlerFixture(CrawlerBase[Result]): def __init__( self, + ws: WorkspaceClient, backend: MockBackend, catalog: str, schema: str, @@ -41,7 +43,7 @@ def __init__( fetcher: ResultFn = lambda: [], loader: ResultFn = lambda: [], ): - super().__init__(backend, catalog, schema, table, klass) + 
super().__init__(ws, backend, catalog, schema, table, klass) self._fetcher = fetcher self._loader = loader @@ -52,22 +54,22 @@ def _crawl(self) -> Iterable[Result]: return self._loader() -def test_invalid(): +def test_invalid(ws): with pytest.raises(ValueError): - _CrawlerFixture(MockBackend(), "a.a.a", "b", "c", Bar) + _CrawlerFixture(ws, MockBackend(), "a.a.a", "b", "c", Bar) -def test_full_name(): - cb = _CrawlerFixture(MockBackend(), "a", "b", "c", Bar) +def test_full_name(ws): + cb = _CrawlerFixture(ws, MockBackend(), "a", "b", "c", Bar) assert cb.full_name == "a.b.c" -def test_snapshot_crawls_when_no_prior_crawl() -> None: +def test_snapshot_crawls_when_no_prior_crawl(ws) -> None: """Check that the crawler is invoked when the fetcher reports that the inventory doesn't exist.""" mock_backend = MockBackend() mock_fetcher = Mock(side_effect=NotFound(".. TABLE_OR_VIEW_NOT_FOUND ..")) mock_loader = Mock(return_value=[Baz(first="first")]) - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -76,12 +78,12 @@ def test_snapshot_crawls_when_no_prior_crawl() -> None: assert [Baz(first="first")] == result -def test_snapshot_crawls_when_prior_crawl_yielded_no_data() -> None: +def test_snapshot_crawls_when_prior_crawl_yielded_no_data(ws) -> None: """Check that the crawler is invoked when the fetcher reports that the inventory exists but doesn't contain data.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[]) mock_loader = Mock(return_value=[Baz(first="first")]) - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -90,12 +92,12 @@ def test_snapshot_crawls_when_prior_crawl_yielded_no_data() -> None: assert [Baz(first="first")] == result -def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data() -> None: +def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data(ws) -> None: """Check that existing data is used (with no crawl) if the fetcher can load the snapshot data.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[Baz(first="first")]) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -104,12 +106,12 @@ def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data() -> None: assert [Baz(first="first")] == result -def test_snapshot_crawls_if_refresh_forced() -> None: +def test_snapshot_crawls_if_refresh_forced(ws) -> None: """Check that a crawl happens (without even checking existing data) if a refresh is forced.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[Baz(first="first")]) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot(force_refresh=True) @@ -118,12 +120,12 @@ def test_snapshot_crawls_if_refresh_forced() -> None: assert [Baz(first="second")] == result -def 
test_snapshot_force_refresh_replaces_prior_data() -> None: +def test_snapshot_force_refresh_replaces_prior_data(ws) -> None: """Check that when refreshing the new data replaces (via overwrite) any existing data.""" mock_backend = MockBackend() mock_fetcher = Mock(side_effect=RuntimeError("never called")) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) cb.snapshot(force_refresh=True) @@ -132,9 +134,9 @@ def test_snapshot_force_refresh_replaces_prior_data() -> None: assert [Row(first="second", second=None)] == mock_backend.rows_written_for("a.b.c", mode="overwrite") -def test_snapshot_updates_existing_table() -> None: +def test_snapshot_updates_existing_table(ws) -> None: mock_backend = MockBackend() - cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) + cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) result = cb.snapshot() @@ -142,7 +144,7 @@ def test_snapshot_updates_existing_table() -> None: assert [Row(first="first", second=None)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_updates_new_table() -> None: +def test_snapshot_updates_new_table(ws) -> None: mock_backend = MockBackend() def fetcher(): @@ -150,7 +152,7 @@ def fetcher(): raise NotFound(msg) cb = _CrawlerFixture[Foo]( - mock_backend, "a", "b", "c", Foo, fetcher=fetcher, loader=lambda: [Foo(first="first", second=True)] + ws, mock_backend, "a", "b", "c", Foo, fetcher=fetcher, loader=lambda: [Foo(first="first", second=True)] ) result = cb.snapshot() @@ -159,14 +161,14 @@ def fetcher(): assert [Row(first="first", second=True)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_wrong_error() -> None: +def test_snapshot_wrong_error(ws) -> None: sql_backend = MockBackend() def fetcher(): msg = "always fails" raise ValueError(msg) - cb = _CrawlerFixture[Bar](sql_backend, "a", "b", "c", Bar, fetcher=fetcher) + cb = _CrawlerFixture[Bar](ws, sql_backend, "a", "b", "c", Bar, fetcher=fetcher) with pytest.raises(ValueError): cb.snapshot() diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 101f1dd602..2985343d05 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -174,16 +174,16 @@ def test_uc_sql(grant, query): } -def test_crawler_no_data(): +def test_crawler_no_data(ws): sql_backend = MockBackend() - table = TablesCrawler(sql_backend, "schema") - udf = UdfsCrawler(sql_backend, "schema") + table = TablesCrawler(ws, sql_backend, "schema") + udf = UdfsCrawler(ws, sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == 0 -def test_crawler_crawl(): +def test_crawler_crawl(ws): sql_backend = MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[ @@ -238,14 +238,14 @@ def test_crawler_crawl(): action_type="SELECT", ), } - table = TablesCrawler(sql_backend, "schema") - udf = UdfsCrawler(sql_backend, "schema") + table = TablesCrawler(ws, sql_backend, "schema") + udf = UdfsCrawler(ws, sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_udf_crawl(): +def test_crawler_udf_crawl(ws): sql_backend = 
MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[("database_one",),], @@ -287,33 +287,33 @@ def test_crawler_udf_crawl(): ), } - table = TablesCrawler(sql_backend, "schema") - udf = UdfsCrawler(sql_backend, "schema") + table = TablesCrawler(ws, sql_backend, "schema") + udf = UdfsCrawler(ws, sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_snapshot_when_no_data(): +def test_crawler_snapshot_when_no_data(ws): sql_backend = MockBackend() - table = TablesCrawler(sql_backend, "schema") - udf = UdfsCrawler(sql_backend, "schema") + table = TablesCrawler(ws, sql_backend, "schema") + udf = UdfsCrawler(ws, sql_backend, "schema") crawler = GrantsCrawler(table, udf) snapshot = list(crawler.snapshot()) assert len(snapshot) == 0 -def test_crawler_snapshot_with_data(): +def test_crawler_snapshot_with_data(ws): sql_backend = MockBackend(rows=ROWS) - table = TablesCrawler(sql_backend, "schema") - udf = UdfsCrawler(sql_backend, "schema") + table = TablesCrawler(ws, sql_backend, "schema") + udf = UdfsCrawler(ws, sql_backend, "schema") crawler = GrantsCrawler(table, udf) snapshot = list(crawler.snapshot()) assert len(snapshot) == 3 -def test_grants_returning_error_when_showing_grants(): +def test_grants_returning_error_when_showing_grants(ws): errors = {"SHOW GRANTS ON TABLE `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -334,8 +334,8 @@ def test_grants_returning_error_when_showing_grants(): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "default") - udf = UdfsCrawler(backend, "default") + table_crawler = TablesCrawler(ws, backend, "default") + udf = UdfsCrawler(ws, backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -352,7 +352,7 @@ def test_grants_returning_error_when_showing_grants(): ] -def test_grants_returning_error_when_describing(): +def test_grants_returning_error_when_describing(ws): errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("test_database",),], @@ -370,8 +370,8 @@ def test_grants_returning_error_when_describing(): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "default") - udf = UdfsCrawler(backend, "default") + table_crawler = TablesCrawler(ws, backend, "default") + udf = UdfsCrawler(ws, backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -388,7 +388,7 @@ def test_grants_returning_error_when_describing(): ] -def test_udf_grants_returning_error_when_showing_grants(): +def test_udf_grants_returning_error_when_showing_grants(ws): errors = {"SHOW GRANTS ON FUNCTION `hive_metastore`.`test_database`.`function_bad`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -409,8 +409,8 @@ def test_udf_grants_returning_error_when_showing_grants(): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "default") - udf = UdfsCrawler(backend, "default") + table_crawler = TablesCrawler(ws, backend, "default") + udf = UdfsCrawler(ws, backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -427,7 +427,7 @@ def test_udf_grants_returning_error_when_showing_grants(): ] -def test_udf_grants_returning_error_when_describing(): +def 
test_udf_grants_returning_error_when_describing(ws): errors = {"DESCRIBE FUNCTION EXTENDED `hive_metastore`.`test_database`.`function_bad`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("test_database",),], @@ -445,8 +445,8 @@ def test_udf_grants_returning_error_when_describing(): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "default") - udf = UdfsCrawler(backend, "default") + table_crawler = TablesCrawler(ws, backend, "default") + udf = UdfsCrawler(ws, backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -463,7 +463,7 @@ def test_udf_grants_returning_error_when_describing(): ] -def test_crawler_should_filter_databases(): +def test_crawler_should_filter_databases(ws): sql_backend = MockBackend( rows={ "SHOW TABLES FROM `hive_metastore`\\.`database_one`": SHOW_TABLES[("database_one", "table_one", "true"),], @@ -490,8 +490,8 @@ def test_crawler_should_filter_databases(): ), } - table = TablesCrawler(sql_backend, "schema", include_databases=["database_one"]) - udf = UdfsCrawler(sql_backend, "schema", include_databases=["database_one"]) + table = TablesCrawler(ws, sql_backend, "schema", include_databases=["database_one"]) + udf = UdfsCrawler(ws, sql_backend, "schema", include_databases=["database_one"]) crawler = GrantsCrawler(table, udf, include_databases=["database_one"]) grants = list(crawler.snapshot()) diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index e0ac9f56ad..94b5ec9aaa 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -299,11 +299,11 @@ def test_skip_missing_table(caplog): assert [rec.message for rec in caplog.records if "table not found" in rec.message.lower()] -def test_extract_database_skip_property(): +def test_extract_database_skip_property(ws): errors = {} rows = {} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "ucx") + table_crawler = TablesCrawler(ws, backend, "ucx") assert "databricks.labs.ucx.skip" in table_crawler.parse_database_props("(databricks.labs.ucx.skip,true)") diff --git a/tests/unit/hive_metastore/test_migrate_acls.py b/tests/unit/hive_metastore/test_migrate_acls.py index 656a764dee..d90c8e9bc3 100644 --- a/tests/unit/hive_metastore/test_migrate_acls.py +++ b/tests/unit/hive_metastore/test_migrate_acls.py @@ -2,7 +2,6 @@ from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.account.workspaces import WorkspaceInfo from databricks.labs.ucx.hive_metastore.grants import MigrateGrants, ACLMigrator, Grant @@ -16,13 +15,6 @@ logger = logging.getLogger(__name__) -@pytest.fixture -def ws(): - client = create_autospec(WorkspaceClient) - client.get_workspace_id.return_value = "12345" - return client - - @pytest.fixture def ws_info(): info = create_autospec(WorkspaceInfo) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 1187fb011d..4a096ad125 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -38,18 +38,11 @@ logger = logging.getLogger(__name__) -@pytest.fixture -def ws(): - client = create_autospec(WorkspaceClient) - client.get_workspace_id.return_value = "12345" - return client - - def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws): 
errors = {} rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "managed_mnt", "managed_other"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -100,7 +93,7 @@ def test_dbfs_non_delta_tables_should_produce_proper_queries(ws): ] } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["dbfs_parquet"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -136,7 +129,7 @@ def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") + table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -158,7 +151,7 @@ def test_migrate_external_tables_should_produce_proper_queries(ws): rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") + table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -189,7 +182,7 @@ def test_migrate_external_table_failed_sync(ws, caplog): rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("LOCATION_OVERLAP", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) crawler_backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") + table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -294,7 +287,7 @@ def test_migrate_external_hiveserde_table_in_place( }, fails_on_first=errors, ) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["external_hiveserde"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) mount_crawler = create_autospec(Mounts) @@ -346,7 +339,7 @@ def test_migrate_external_hiveserde_table_in_place( ) def test_migrate_external_tables_ctas_should_produce_proper_queries(ws, what, test_table, expected_query): backend = MockBackend() - 
table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping([test_table]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) mounts_crawler = create_autospec(Mounts) @@ -371,7 +364,7 @@ def test_migrate_already_upgraded_table_should_produce_no_queries(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") + table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") ws.catalogs.list.return_value = [CatalogInfo(name="cat1")] ws.schemas.list.return_value = [ SchemaInfo(catalog_name="cat1", name="test_schema1"), @@ -414,7 +407,7 @@ def test_migrate_unsupported_format_table_should_produce_no_queries(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") + table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src_unsupported"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -439,7 +432,7 @@ def test_migrate_view_should_produce_proper_queries(ws): ) rows = {"SHOW CREATE TABLE": [{"createtab_stmt": original_view}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -487,7 +480,7 @@ def test_migrate_view_with_columns(ws): create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs" rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -1034,7 +1027,7 @@ def test_migrate_views_should_be_properly_sequenced(ws): assert next((key for key in table_keys if key == "hive_metastore.db1_src.t1_src"), None) is None -def test_table_in_mount_mapping_with_table_owner(): +def test_table_in_mount_mapping_with_table_owner(ws): client = create_autospec(WorkspaceClient) client.tables.get.side_effect = NotFound() backend = MockBackend( @@ -1055,7 +1048,7 @@ def test_table_in_mount_mapping_with_table_owner(): Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), ) ] - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(client, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) table_migrate = TablesMigrator( @@ -1074,7 +1067,7 @@ def test_table_in_mount_mapping_with_table_owner(): 
migrate_grants.apply.assert_called() -def test_table_in_mount_mapping_with_partition_information(): +def test_table_in_mount_mapping_with_partition_information(ws): client = create_autospec(WorkspaceClient) client.tables.get.side_effect = NotFound() backend = MockBackend( @@ -1098,7 +1091,7 @@ def test_table_in_mount_mapping_with_partition_information(): Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), ) ] - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(client, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) table_migrate = TablesMigrator( @@ -1122,7 +1115,7 @@ def test_migrate_view_failed(ws, caplog): create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs" rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -1155,7 +1148,7 @@ def test_migrate_view_failed(ws, caplog): def test_migrate_dbfs_root_tables_failed(ws, caplog): errors = {"CREATE TABLE IF NOT EXISTS": "error"} backend = MockBackend(fails_on_first=errors, rows={}) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index e23d3a7a35..540eb66e19 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -2,6 +2,7 @@ import sys from databricks.labs.lsql.backends import MockBackend +from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.table_size import TableSize, TableSizeCrawler @@ -12,7 +13,7 @@ class SparkSession: pass -def test_table_size_crawler(mocker): +def test_table_size_crawler(ws, mocker): errors = {} rows = { "table_size": [], @@ -32,7 +33,7 @@ def test_table_size_crawler(mocker): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = [100, 200, 300] results = tsc.snapshot() assert "ANALYZE table `hive_metastore`.`db1`.`table1` compute STATISTICS NOSCAN" in backend.queries @@ -42,7 +43,7 @@ def test_table_size_crawler(mocker): assert TableSize("hive_metastore", "db1", "table2", 200) in results -def test_table_size_unknown_error(mocker, caplog): +def test_table_size_unknown_error(ws, mocker, caplog): errors = {} rows = { "table_size": [], @@ -54,7 +55,7 @@ def test_table_size_unknown_error(mocker, caplog): backend = MockBackend(fails_on_first=errors, 
rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception(...) with caplog.at_level(logging.WARNING): @@ -63,7 +64,7 @@ def test_table_size_unknown_error(mocker, caplog): assert len(results) == 0 -def test_table_size_table_or_view_not_found(mocker, caplog): +def test_table_size_table_or_view_not_found(ws, mocker, caplog): errors = {} rows = { "table_size": [], @@ -75,7 +76,7 @@ def test_table_size_table_or_view_not_found(mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) # table removed after crawling tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( @@ -89,7 +90,7 @@ def test_table_size_table_or_view_not_found(mocker, caplog): assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_delta_table_not_found(mocker, caplog): +def test_table_size_delta_table_not_found(ws, mocker, caplog): errors = {} rows = { "table_size": [], @@ -101,7 +102,7 @@ def test_table_size_delta_table_not_found(mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) # table removed after crawling tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( @@ -115,7 +116,7 @@ def test_table_size_delta_table_not_found(mocker, caplog): assert "Failed to evaluate hive_metastore.db1.table1 table size. 
Table not found" in caplog.text -def test_table_size_when_table_corrupted(mocker, caplog): +def test_table_size_when_table_corrupted(ws, mocker, caplog): errors = {} rows = { "table_size": [], @@ -127,7 +128,7 @@ def test_table_size_when_table_corrupted(mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( "[DELTA_MISSING_TRANSACTION_LOG]" @@ -140,7 +141,7 @@ def test_table_size_when_table_corrupted(mocker, caplog): assert "Delta table hive_metastore.db1.table1 is corrupt: missing transaction log" in caplog.text -def test_table_size_when_delta_invalid_format_error(mocker, caplog): +def test_table_size_when_delta_invalid_format_error(ws, mocker, caplog): errors = {} rows = { "table_size": [], @@ -152,7 +153,7 @@ def test_table_size_when_delta_invalid_format_error(mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(backend, "inventory_database") + tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( "[DELTA_INVALID_FORMAT]" diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index c48698f53c..5c53e18b81 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -158,7 +158,7 @@ def test_uc_sql_when_table_is_in_mount(schema, partitions, table_schema): assert table.sql_migrate_table_in_mount(target, table_schema) == expected -def test_tables_returning_error_when_describing(): +def test_tables_returning_error_when_describing(ws): errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`database`.`table1`": "error"} rows = { "SHOW DATABASES": [("database",)], @@ -174,18 +174,18 @@ def test_tables_returning_error_when_describing(): ], } backend = MockBackend(fails_on_first=errors, rows=rows) - tables_crawler = TablesCrawler(backend, "default") + tables_crawler = TablesCrawler(ws, backend, "default") results = tables_crawler.snapshot() assert len(results) == 1 first = results[0] assert first.upgraded_to == 'fake_cat.fake_ext.fake_delta' -def test_tables_returning_error_when_show_tables(caplog): +def test_tables_returning_error_when_show_tables(ws, caplog): errors = {"SHOW TABLES FROM `hive_metastore`.`database`": "SCHEMA_NOT_FOUND"} rows = {"SHOW DATABASES": [("database",)]} backend = MockBackend(fails_on_first=errors, rows=rows) - tables_crawler = TablesCrawler(backend, "default") + tables_crawler = TablesCrawler(ws, backend, "default") results = tables_crawler.snapshot() assert len(results) == 0 assert "Schema hive_metastore.database no longer exists" in caplog.text @@ -285,13 +285,13 @@ def test_table_what(table, what): assert table.what == what -def test_tables_crawler_should_filter_by_database(): +def test_tables_crawler_should_filter_by_database(ws): rows = { "SHOW TABLES FROM `hive_metastore`.`database`": [("", "table1", ""), ("", "table2", "")], "SHOW TABLES FROM `hive_metastore`.`database_2`": [("", "table1", "")], } backend = MockBackend(rows=rows) - tables_crawler = TablesCrawler(backend, 
"default", ["database"]) + tables_crawler = TablesCrawler(ws, backend, "default", ["database"]) results = tables_crawler.snapshot() assert len(results) == 2 assert sorted(backend.queries) == sorted( @@ -304,7 +304,7 @@ def test_tables_crawler_should_filter_by_database(): ) -def test_is_partitioned_flag(): +def test_is_partitioned_flag(ws): rows = { "SHOW DATABASES": [("database",)], "SHOW TABLES FROM `hive_metastore`.`database`": [("", "table1", ""), ("", "table2", "")], @@ -325,10 +325,7 @@ def test_is_partitioned_flag(): ], } backend = MockBackend(rows=rows) - tables_crawler = TablesCrawler( - backend, - "default", - ) + tables_crawler = TablesCrawler(ws, backend, "default") results = tables_crawler.snapshot() assert len(results) == 2 assert ( @@ -534,7 +531,7 @@ def test_in_place_migrate_hiveserde_sql_parsing_failure(caplog, ddl, expected_lo assert expected_log in caplog.text -def test_fast_table_scan_crawler_already_crawled(mocker): +def test_fast_table_scan_crawler_already_crawled(ws, mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -547,12 +544,12 @@ def test_fast_table_scan_crawler_already_crawled(mocker): ], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") results = ftsc.snapshot() assert len(results) == 3 -def test_fast_table_scan_crawler_crawl_new(caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_new(ws, caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -561,7 +558,7 @@ def test_fast_table_scan_crawler_crawl_new(caplog, mocker, spark_table_crawl_moc "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") mock_list_databases_iterator, mock_list_tables_iterator, get_table_mock = spark_table_crawl_mocker # pylint: disable=protected-access @@ -583,7 +580,7 @@ def test_fast_table_scan_crawler_crawl_new(caplog, mocker, spark_table_crawl_moc ) -def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(ws, caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -593,7 +590,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(caplog, mock "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") # pylint: disable=protected-access ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.side_effect = Exception( @@ -605,7 +602,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(caplog, mock assert "Test listDatabases warning" in caplog.text -def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(ws, caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = 
pyspark_sql_session @@ -615,7 +612,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(caplog, mocker, "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") mock_list_databases_iterator, _, _ = spark_table_crawl_mocker @@ -630,7 +627,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(caplog, mocker, assert "Test listTables warning" in caplog.text -def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_get_table(ws, caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -640,7 +637,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, s "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") mock_list_databases_iterator, mock_list_tables_iterator, _ = spark_table_crawl_mocker diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index b3ba27a63e..5dc5b7070c 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -23,23 +23,23 @@ def test_key(): SHOW_FUNCTIONS = MockBackend.rows("function") -def test_udfs_returning_error_when_describing(): +def test_udfs_returning_error_when_describing(ws): errors = {"DESCRIBE FUNCTION EXTENDED hive_metastore.database.function1": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("database",),], "SHOW USER FUNCTIONS FROM hive_metastore.database": SHOW_FUNCTIONS[("hive_metastore.database.function1",),], } backend = MockBackend(fails_on_first=errors, rows=rows) - udf_crawler = UdfsCrawler(backend, "default") + udf_crawler = UdfsCrawler(ws, backend, "default") results = udf_crawler.snapshot() assert len(results) == 0 -def test_tables_crawler_should_filter_by_database(): +def test_tables_crawler_should_filter_by_database(ws): rows = { "SHOW USER FUNCTIONS FROM `hive_metastore`.`database`": SHOW_FUNCTIONS[("hive_metastore.database.function1",),], } backend = MockBackend(rows=rows) - udf_crawler = UdfsCrawler(backend, "default", ["database"]) + udf_crawler = UdfsCrawler(ws, backend, "default", ["database"]) results = udf_crawler.snapshot() assert len(results) == 1 diff --git a/tests/unit/recon/test_migration_recon.py b/tests/unit/recon/test_migration_recon.py index c8460f4feb..e8ce64d9c5 100644 --- a/tests/unit/recon/test_migration_recon.py +++ b/tests/unit/recon/test_migration_recon.py @@ -1,8 +1,4 @@ -from unittest.mock import create_autospec - -import pytest from databricks.labs.lsql.backends import MockBackend -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationStatusRefresher @@ -14,14 +10,6 @@ from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator from tests.unit import mock_table_mapping - -@pytest.fixture -def ws(): - client = create_autospec(WorkspaceClient) - client.get_workspace_id.return_value = "12345" - return client - - MIGRATION_STATUS = MockBackend.rows( "src_schema", 
"src_table", @@ -74,11 +62,12 @@ def test_migrate_recon_should_produce_proper_queries( "WITH compare_results": data_comp_row_factory[(102, 100, 2),], } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(backend, "inventory_database") + table_crawler = TablesCrawler(ws, backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) metadata_retriever = DatabricksTableMetadataRetriever(backend) data_profiler = StandardDataProfiler(backend, metadata_retriever) migration_recon = MigrationRecon( + ws, backend, "inventory_database", migration_status_refresher, diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 75961390ae..a4a7a0c71a 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -9,9 +9,9 @@ ) -def test_crawler_appends_dfsas(): +def test_crawler_appends_dfsas(ws): backend = MockBackend() - crawler = DirectFsAccessCrawler.for_paths(backend, "schema") + crawler = DirectFsAccessCrawler.for_paths(ws, backend, "schema") dfsas = list( DirectFsAccess( path=path, diff --git a/tests/unit/workspace_access/test_manager.py b/tests/unit/workspace_access/test_manager.py index 8a1d7d85cc..c99bdf307b 100644 --- a/tests/unit/workspace_access/test_manager.py +++ b/tests/unit/workspace_access/test_manager.py @@ -4,7 +4,6 @@ import pytest from databricks.labs.lsql import Row from databricks.labs.lsql.backends import MockBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError from databricks.sdk.service import iam @@ -14,8 +13,8 @@ from databricks.labs.ucx.workspace_access.manager import PermissionManager, Permissions -def test_inventory_permission_manager_init(mock_backend): - permission_manager = PermissionManager(mock_backend, "test_database", []) +def test_inventory_permission_manager_init(ws, mock_backend): + permission_manager = PermissionManager(ws, mock_backend, "test_database", []) assert permission_manager.full_name == "hive_metastore.test_database.permissions" @@ -23,7 +22,7 @@ def test_inventory_permission_manager_init(mock_backend): _PermissionsRow = Row.factory(["object_id", "object_type", "raw"]) -def test_snapshot_fetch() -> None: +def test_snapshot_fetch(ws) -> None: """Verify that the snapshot will load existing data from the inventory.""" sql_backend = MockBackend( rows={ @@ -32,18 +31,18 @@ def test_snapshot_fetch() -> None: ], } ) - permission_manager = PermissionManager(sql_backend, "test_database", []) + permission_manager = PermissionManager(ws, sql_backend, "test_database", []) output = list(permission_manager.snapshot()) assert output[0] == Permissions(object_id="object1", object_type="clusters", raw="test acl") -def test_snapshot_crawl_fallback(mocker) -> None: +def test_snapshot_crawl_fallback(ws, mocker) -> None: """Verify that the snapshot will first attempt to load the (empty) inventory and then crawl.""" some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: None, lambda: Permissions("a", "b", "c"), lambda: None] sql_backend = MockBackend(rows={"SELECT object_id, object_type, raw FROM ": []}) - permission_manager = PermissionManager(sql_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(ws, sql_backend, "test_database", [some_crawler]) permission_manager.snapshot() @@ -52,7 +51,7 @@ def test_snapshot_crawl_fallback(mocker) -> None: ) -def 
test_manager_snapshot_crawl_ignore_disabled_features(mock_backend, mocker): +def test_manager_snapshot_crawl_ignore_disabled_features(ws, mock_backend, mocker): def raise_error(): raise DatabricksError( "Model serving is not enabled for your shard. " @@ -62,7 +61,7 @@ def raise_error(): some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: None, lambda: Permissions("a", "b", "c"), raise_error] - permission_manager = PermissionManager(mock_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(ws, mock_backend, "test_database", [some_crawler]) permission_manager.snapshot() @@ -71,7 +70,7 @@ def raise_error(): ) -def test_manager_snapshot_crawl_with_error(mock_backend, mocker): +def test_manager_snapshot_crawl_with_error(ws, mock_backend, mocker): def raise_error(): raise DatabricksError( "Fail the job", @@ -83,14 +82,14 @@ def raise_error_no_code(): some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: Permissions("a", "b", "c"), raise_error, raise_error_no_code] - permission_manager = PermissionManager(mock_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(ws, mock_backend, "test_database", [some_crawler]) with pytest.raises(ManyError) as expected_err: permission_manager.snapshot() assert len(expected_err.value.errs) == 2 -def test_manager_apply(mocker): +def test_manager_apply(ws, mocker): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -142,7 +141,7 @@ def test_manager_apply(mocker): # this emulates a real applier and call to an API mock_applier.get_apply_task = lambda item, _: lambda: applied_items.add(f"{item.object_id} {item.object_id}") - permission_manager = PermissionManager(sql_backend, "test_database", [mock_applier]) + permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_applier]) group_migration_state = MigrationState( [ MigratedGroup( @@ -163,7 +162,7 @@ def test_manager_apply(mocker): assert {"test2 test2", "test test"} == applied_items -def test_unregistered_support(): +def test_unregistered_support(ws): sql_backend = MockBackend( rows={ "SELECT": [ @@ -171,11 +170,11 @@ def test_unregistered_support(): ] } ) - permission_manager = PermissionManager(sql_backend, "test", []) + permission_manager = PermissionManager(ws, sql_backend, "test", []) permission_manager.apply_group_permissions(migration_state=MigrationState([])) -def test_manager_verify(): +def test_manager_verify(ws): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -208,14 +207,14 @@ def test_manager_verify(): # this emulates a real verifier and call to an API mock_verifier.get_verify_task = lambda item: lambda: items.add(f"{item.object_id} {item.object_id}") - permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) result = permission_manager.verify_group_permissions() assert result assert {"test test"} == items -def test_manager_verify_not_supported_type(): +def test_manager_verify_not_supported_type(ws): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -243,13 +242,13 @@ def test_manager_verify_not_supported_type(): mock_verifier = create_autospec(AclSupport) # pylint: disable=mock-no-usage mock_verifier.object_types = lambda: {"not_supported"} - permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) with 
pytest.raises(ValueError): permission_manager.verify_group_permissions() -def test_manager_verify_no_tasks(): +def test_manager_verify_no_tasks(ws): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -280,14 +279,13 @@ def test_manager_verify_no_tasks(): # this emulates a real verifier and call to an API mock_verifier.get_verify_task = lambda item: None - permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) result = permission_manager.verify_group_permissions() assert result -def test_manager_apply_experimental_no_tasks(caplog): - ws = create_autospec(WorkspaceClient) +def test_manager_apply_experimental_no_tasks(ws, caplog): group_migration_state = MigrationState([]) with caplog.at_level("INFO"): diff --git a/tests/unit/workspace_access/test_tacl.py b/tests/unit/workspace_access/test_tacl.py index cfa1a2bdc2..9afb6f0c05 100644 --- a/tests/unit/workspace_access/test_tacl.py +++ b/tests/unit/workspace_access/test_tacl.py @@ -22,7 +22,7 @@ SHOW_TABLES = MockBackend.rows("databaseName", "tableName", "isTmp") -def test_tacl_crawler(): +def test_tacl_crawler(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -30,8 +30,8 @@ def test_tacl_crawler(): ] } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -43,7 +43,7 @@ def test_tacl_crawler(): assert obj.object_id == "catalog_a.database_b.table_c" -def test_tacl_udf_crawler(): +def test_tacl_udf_crawler(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -51,8 +51,8 @@ def test_tacl_udf_crawler(): ] } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -64,7 +64,7 @@ def test_tacl_udf_crawler(): assert obj.object_id == "catalog_a.database_b.function_c" -def test_tacl_crawler_multiple_permissions(): +def test_tacl_crawler_multiple_permissions(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -92,8 +92,8 @@ def test_tacl_crawler_multiple_permissions(): ] } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -244,7 +244,7 @@ def test_tacl_crawler_multiple_permissions(): ) == Grant(**json.loads(permissions.raw)) -def test_tacl_applier(): +def test_tacl_applier(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -255,8 +255,8 @@ def test_tacl_applier(): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = 
UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -296,10 +296,10 @@ def test_tacl_applier(): assert validation_res -def test_tacl_applier_not_applied(): +def test_tacl_applier_not_applied(ws): sql_backend = MockBackend(rows={"SELECT \\* FROM `hive_metastore`.`test`.`grants`": []}) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -339,7 +339,7 @@ def test_tacl_applier_not_applied(): assert not validation_res -def test_tacl_udf_applier(mocker): +def test_tacl_udf_applier(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -350,8 +350,8 @@ def test_tacl_udf_applier(mocker): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -391,7 +391,7 @@ def test_tacl_udf_applier(mocker): assert validation_res -def test_tacl_applier_multiple_actions(): +def test_tacl_applier_multiple_actions(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -403,8 +403,8 @@ def test_tacl_applier_multiple_actions(): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -444,7 +444,7 @@ def test_tacl_applier_multiple_actions(): assert validation_res -def test_tacl_applier_deny_and_grant(): +def test_tacl_applier_deny_and_grant(ws): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -457,8 +457,8 @@ def test_tacl_applier_deny_and_grant(): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -535,7 +535,7 @@ def test_tacl_applier_no_target_principal(mocker): assert not sql_backend.queries -def test_verify_task_should_return_true_if_permissions_applied(): +def test_verify_task_should_return_true_if_permissions_applied(ws): sql_backend = MockBackend( rows={ "SHOW GRANTS ON TABLE `catalog_a`.`database_b`.`table_c`": SHOW_GRANTS[ @@ -543,8 +543,8 @@ def test_verify_task_should_return_true_if_permissions_applied(): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -567,7 +567,7 @@ def 
test_verify_task_should_return_true_if_permissions_applied(): assert result -def test_verify_task_should_fail_if_permissions_not_applied(): +def test_verify_task_should_fail_if_permissions_not_applied(ws): sql_backend = MockBackend( rows={ "SHOW GRANTS ON TABLE `catalog_a`.`database_b`.`table_c`": SHOW_GRANTS[ @@ -575,8 +575,8 @@ def test_verify_task_should_fail_if_permissions_not_applied(): ], } ) - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -599,10 +599,10 @@ def test_verify_task_should_fail_if_permissions_not_applied(): task() -def test_verify_task_should_return_false_if_not_grants_present(): +def test_verify_task_should_return_false_if_not_grants_present(ws): sql_backend = MockBackend() - tables_crawler = TablesCrawler(sql_backend, "test") - udf_crawler = UdfsCrawler(sql_backend, "test") + tables_crawler = TablesCrawler(ws, sql_backend, "test") + udf_crawler = UdfsCrawler(ws, sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) From b8f7e69a3fa7525c53c1a65bfd0201f1ab225d53 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 10:37:28 +0200 Subject: [PATCH 02/58] Move the ownership code into its own module, and stub unit tests. --- src/databricks/labs/ucx/framework/crawlers.py | 59 +----- src/databricks/labs/ucx/framework/owners.py | 168 ++++++++++++++++++ src/databricks/labs/ucx/framework/utils.py | 54 ------ tests/unit/conftest.py | 2 +- tests/unit/framework/test_owners.py | 80 +++++++++ 5 files changed, 251 insertions(+), 112 deletions(-) create mode 100644 src/databricks/labs/ucx/framework/owners.py create mode 100644 tests/unit/framework/test_owners.py diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index d224ea8743..52cd92d12d 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -1,14 +1,13 @@ import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence -from functools import cached_property -from typing import ClassVar, Generic, Literal, Protocol, TypeVar, final +from typing import ClassVar, Generic, Literal, Protocol, TypeVar from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound -from databricks.labs.ucx.framework.utils import escape_sql_identifier, find_an_admin +from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -23,10 +22,6 @@ class DataclassInstance(Protocol): class CrawlerBase(ABC, Generic[Result]): - - _cached_workspace_admins: dict[int, str | RuntimeError] = {} - """Cached user names of workspace administrators, keyed by workspace id.""" - def __init__( self, ws: WorkspaceClient, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result] ): @@ -117,56 +112,6 @@ def snapshot(self, *, force_refresh: bool = False) -> Iterable[Result]: """ return self._snapshot(self._try_fetch, self._crawl, force_refresh=force_refresh) - @final - def owner_of(self, result: Result) -> str: - """Obtain the user-name of a user that is responsible for the given record. 
- - This is intended to be a point of contact, and is either: - - - The user that originally created the resource associated with the result; or - - An active administrator for the current workspace. - - Args: - result (Result): The record for which an associated user-name is sought. - Returns: - A string containing the user-name attribute of the user considered to own the resource. - Raises: - RuntimeError if there are no active administrators for the current workspace. - """ - return self._result_owner(result) or self._workspace_admin - - @cached_property - def _workspace_admin(self) -> str: - # Avoid repeatedly hitting the shared cache. - return self._find_administrator_for(self._ws) - - @classmethod - @final - def _find_administrator_for(cls, ws: WorkspaceClient) -> str: - # Finding an administrator is quite expensive, so we ensure that for a given workspace we only - # do it once. - workspace_id = ws.get_workspace_id() - found_admin_or_error = cls._cached_workspace_admins.get(workspace_id, None) - if isinstance(found_admin_or_error, str): - return found_admin_or_error - if isinstance(found_admin_or_error, RuntimeError): - raise found_admin_or_error - - found_admin = find_an_admin(ws) - if found_admin is None or not found_admin.user_name: - msg = f"No active workspace or account administrator can be found for workspace: {workspace_id}" - error = RuntimeError(msg) - cls._cached_workspace_admins[workspace_id] = error - raise error - user_name = found_admin.user_name - cls._cached_workspace_admins[workspace_id] = user_name - return user_name - - @classmethod - def _result_owner(cls, result: Result) -> str | None: # pylint: disable=unused-argument - """Obtain the record-specific user-name associated with the given result, if any.""" - return None - @abstractmethod def _try_fetch(self) -> Iterable[Result]: """Fetch existing data that has (previously) been crawled by this crawler. 
diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py new file mode 100644 index 0000000000..d099329b89 --- /dev/null +++ b/src/databricks/labs/ucx/framework/owners.py @@ -0,0 +1,168 @@ +import functools +import logging +from abc import ABC, abstractmethod +from collections.abc import Iterable +from functools import cached_property +from typing import ClassVar, Generic, Protocol, TypeVar, final + +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import DatabricksError, NotFound +from databricks.sdk.service.iam import User + +logger = logging.getLogger(__name__) + + +class DataclassInstance(Protocol): + __dataclass_fields__: ClassVar[dict] + + +Record = TypeVar("Record") + + +class Ownership(ABC, Generic[Record]): + """Determine an owner for a given type of object.""" + + _cached_workspace_admins: dict[int, str | Exception] = {} + """Cached user names of workspace administrators, keyed by workspace id.""" + + def __init__(self, ws: WorkspaceClient) -> None: + self._ws = ws + + @staticmethod + def _has_role(user: User, role: str) -> bool: + """Determine whether a user has a given role or not.""" + return user.roles is not None and any(r.value == role for r in user.roles) + + @staticmethod + def _member_of_group_named(user: User, group_name: str) -> bool: + """Determine whether a user belongs to a group with the given name or not.""" + return user.groups is not None and any(g.display == group_name for g in user.groups) + + @staticmethod + def _member_of_group(user: User, group_id: str) -> bool: + """Determine whether a user belongs to a group with the given identifier or not.""" + return user.groups is not None and any(g.value == group_id for g in user.groups) + + def _filter_workspace_groups(self, identifiers: Iterable[str]) -> Iterable[str]: + """Limit a set of identifiers to those that are workspace groups.""" + seen = set() + for group_id in identifiers: + if group_id in seen: + continue + seen.add(group_id) + try: + group = self._ws.groups.get(group_id) + except NotFound: + continue + if group.meta and group.meta.resource_type == "WorkspaceGroup": + yield group_id + + def _find_workspace_admins(self) -> Iterable[User]: + """Enumerate the active workspace administrators in a given workspace. + + Returns: + Iterable[User]: The active workspace administrators, if any. + """ + logger.debug("Enumerating users to locate active workspace administrators...") + all_users = self._ws.users.list(attributes="id,active,userName,groups") + # The groups attribute is a flattened list of groups a user belongs to; hunt for the 'admins' workspace group. + admin_users = [user for user in all_users if user.active and self._member_of_group_named(user, "admins")] + logger.debug(f"Verifying membership of the 'admins' workspace group for users: {admin_users}") + candidate_group_ids = ( + group.value + for user in admin_users + if user.groups + for group in user.groups + if group.display == "admins" and group.value + ) + admin_groups = self._filter_workspace_groups(candidate_group_ids) + match list(admin_groups): + case []: + return () + case [admin_group]: + return (user for user in admin_users if self._member_of_group(user, admin_group)) + case _: + msg = f"Multiple 'admins' workspace groups found; something is wrong: {admin_groups}" + raise RuntimeError(msg) + + def _find_account_admins(self) -> Iterable[User]: + """Enumerate the active account administrators associated with a given workspace. 
+ + Returns: + Iterable[User]: The active account administrators, if any. + """ + logger.debug("Enumerating account users to locate active administrators...") + response = self._ws.api_client.do( + "GET", "/api/2.0/account/scim/v2/Users", query={"attributes": "id,active,userName,roles"} + ) + assert isinstance(response, dict) + all_users = (User.from_dict(resource) for resource in response.get("Resources", [])) + return (user for user in all_users if user.active and self._has_role(user, "account_admin")) + + def _find_an_admin(self) -> User | None: + """Locate an active administrator for the current workspace. + + If an active workspace administrator can be located, this is returned. When there are multiple, they are sorted + alphabetically by user-name and the first is returned. If there are no workspace administrators then an active + account administrator is sought, again returning the first alphabetically by user-name if there is more than one. + + Returns: + the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can + be found. + """ + first_user = functools.partial(min, default=None, key=lambda user: user.name) + return first_user(self._find_workspace_admins()) or first_user(self._find_account_admins()) + + @final + def owner_of(self, record: Record) -> str: + """Obtain the user-name of a user that is responsible for the given record. + + This is intended to be a point of contact, and is either: + + - The user that originally created the resource associated with the result; or + - An active administrator for the current workspace. + + Args: + record (Record): The record for which an associated user-name is sought. + Returns: + A string containing the user-name attribute of the user considered to own the resource. + Raises: + RuntimeError if there are no active administrators for the current workspace. + """ + return self._get_owner(record) or self._workspace_admin + + @cached_property + def _workspace_admin(self) -> str: + # Avoid repeatedly hitting the shared cache. + return self._find_an_administrator() + + @final + def _find_an_administrator(self) -> str: + # Finding an administrator is quite expensive, so we ensure that for a given workspace we only do it once. + # Found administrators are cached on a class attribute. The method here: + # - is thread-safe, with the compromise that we might perform some redundant lookups during init. + # - no administrator is converted into an error. + # - an error during lookup is preserved and raised for subsequent requests, to avoid too many REST calls. + workspace_id = self._ws.get_workspace_id() + found_admin_or_error = self._cached_workspace_admins.get(workspace_id, None) + if found_admin_or_error is None: + logger.debug(f"Locating an active workspace or account administrator for workspace: {workspace_id}") + try: + user = self._find_an_admin() + except DatabricksError as e: + found_admin_or_error = e + else: + found_admin_or_error = user.user_name if user is not None else None + # If not found, convert once into the error that we will raise each time. 
+ if found_admin_or_error is None: + msg = f"No active workspace or account administrator can be found for workspace: {workspace_id}" + found_admin_or_error = RuntimeError(msg) # pylint: disable=redefined-variable-type + self._cached_workspace_admins[workspace_id] = found_admin_or_error + if isinstance(found_admin_or_error, Exception): + raise found_admin_or_error + return found_admin_or_error + + @abstractmethod + def _get_owner(self, record: Record) -> str | None: + """Obtain the record-specific user-name associated with the given result, if any.""" + return None diff --git a/src/databricks/labs/ucx/framework/utils.py b/src/databricks/labs/ucx/framework/utils.py index 348f08b935..0a291960f6 100644 --- a/src/databricks/labs/ucx/framework/utils.py +++ b/src/databricks/labs/ucx/framework/utils.py @@ -1,10 +1,5 @@ -import functools import logging import subprocess -from collections.abc import Iterable - -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.iam import User logger = logging.getLogger(__name__) @@ -28,55 +23,6 @@ def escape_sql_identifier(path: str, *, maxsplit: int = 2) -> str: return ".".join(escaped) -def _has_role(user: User, role: str) -> bool: - return user.roles is not None and any(r.value == role for r in user.roles) - - -def find_workspace_admins(ws: WorkspaceClient) -> Iterable[User]: - """Enumerate the active workspace administrators in a given workspace. - - Arguments: - ws (WorkspaceClient): The client for the workspace whose administrators should be enumerated. - Returns: - Iterable[User]: The active workspace administrators, if any. - """ - all_users = ws.users.list(attributes="id,active,userName,roles") - return (user for user in all_users if user.active and _has_role(user, "workspace_admin")) - - -def find_account_admins(ws: WorkspaceClient) -> Iterable[User]: - """Enumerate the active account administrators associated with a given workspace. - - Arguments: - ws (WorkspaceClient): The client for the workspace whose account administrators should be enumerated. - Returns: - Iterable[User]: The active account administrators, if any. - """ - response = ws.api_client.do( - "GET", "/api/2.0/account/scim/v2/Users", query={"attributes": "id,active,userName,roles"} - ) - assert isinstance(response, dict) - all_users = (User.from_dict(resource) for resource in response.get("Resources", [])) - return (user for user in all_users if user.active and _has_role(user, "account_admin")) - - -def find_an_admin(ws: WorkspaceClient) -> User | None: - """Locate an active administrator for the current workspace. - - If an active workspace administrator can be located, this is returned. When there are multiple, they are sorted - alphabetically by user-name and the first is returned. If there are no workspace administrators then an active - account administrator is sought, again returning the first alphabetically by user-name if there is more than one. - - Arguments: - ws (WorkspaceClient): The client for the workspace for which an administrator should be located. - Returns: - the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can be - found. 
- """ - first_user = functools.partial(min, default=None, key=lambda user: user.name) - return first_user(find_workspace_admins(ws)) or first_user(find_account_admins(ws)) - - def run_command(command: str | list[str]) -> tuple[int, str, str]: args = command.split() if isinstance(command, str) else command logger.info(f"Invoking command: {args!r}") diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index e628603c60..c8fe88cb09 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -201,7 +201,7 @@ def mock_backend() -> MockBackend: @pytest.fixture -def ws() -> WorkspaceClient: +def ws(): client = create_autospec(WorkspaceClient) client.api_client.do.return_value = {} client.permissions.get.return_value = {} diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py new file mode 100644 index 0000000000..486959a334 --- /dev/null +++ b/tests/unit/framework/test_owners.py @@ -0,0 +1,80 @@ +from collections.abc import Callable + +import pytest +from databricks.sdk import WorkspaceClient + +from databricks.labs.ucx.framework.owners import Ownership, Record + + +class _OwnershipFixture(Ownership[Record]): + def __init__( + self, + ws: WorkspaceClient, + *, + owner_fn: Callable[[Record], str | None] = lambda _: None, + ): + super().__init__(ws) + self._owner_fn = owner_fn + + def _get_owner(self, record: Record) -> str | None: + return self._owner_fn(record) + + +def test_ownership_prefers_record_owner(ws) -> None: + """Verify that if an owner for the record can be found, that is used.""" + ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") + + assert ownership.owner_of("school") == "bob" + ws.get_workspace_id.assert_not_called() + +def test_ownership_admin_user_fallback(ws) -> None: + """Verify that if no owner for the record can be found, an admin user is returned instead.""" + ownership = _OwnershipFixture[str](ws) + pytest.xfail("Not yet implemented") + + +def test_ownership_workspace_admin_preferred_over_account_admin(ws) -> None: + """Verify that when both workspace and account administrators are configured, the workspace admin is preferred.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_workspace_admin_prefer_first_alphabetically(ws) -> None: + """Verify that when multiple workspace administrators can found, the first alphabetically preferred is used.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_account_admin_prefer_first_alphabetically(ws) -> None: + """Verify that when multiple account administrators can found, the first alphabetically preferred is used.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_error_when_no_owner_can_be_located(ws) -> None: + """Verify that an error is raised when no workspace or account administrators can be found.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_fallback_instance_cache(ws) -> None: + """Verify that the fallback owner is cached on each instance to avoid many REST calls.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_fallback_class_cache(ws) -> None: + """Verify that the fallback owner for a workspace is cached at class level to avoid many REST calls.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_fallback_class_cache_multiple_workspaces(ws) -> None: + """Verify that cache of workspace administrators supports multiple workspaces.""" + pytest.xfail("Not yet implemented") + + + +def test_ownership_fallback_error_handling(ws) -> None: + """Verify that the class-level 
owner-cache and tracks errors to avoid many REST calls.""" + pytest.xfail("Not yet implemented") From 07fa875b7a9e096c15a113568632e7066aaef13c Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:48:16 +0200 Subject: [PATCH 03/58] Skip users that don't have a user-name. --- src/databricks/labs/ucx/framework/owners.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index d099329b89..a62c437794 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -66,7 +66,9 @@ def _find_workspace_admins(self) -> Iterable[User]: logger.debug("Enumerating users to locate active workspace administrators...") all_users = self._ws.users.list(attributes="id,active,userName,groups") # The groups attribute is a flattened list of groups a user belongs to; hunt for the 'admins' workspace group. - admin_users = [user for user in all_users if user.active and self._member_of_group_named(user, "admins")] + admin_users = [ + user for user in all_users if user.active and user.user_name and self._member_of_group_named(user, "admins") + ] logger.debug(f"Verifying membership of the 'admins' workspace group for users: {admin_users}") candidate_group_ids = ( group.value @@ -97,7 +99,7 @@ def _find_account_admins(self) -> Iterable[User]: ) assert isinstance(response, dict) all_users = (User.from_dict(resource) for resource in response.get("Resources", [])) - return (user for user in all_users if user.active and self._has_role(user, "account_admin")) + return (user for user in all_users if user.active and user.user_name and self._has_role(user, "account_admin")) def _find_an_admin(self) -> User | None: """Locate an active administrator for the current workspace. From 28daa7e981ee1a9cd11a3c06c72f1235bec17761 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:48:55 +0200 Subject: [PATCH 04/58] Sort by the user-name attribute, not name. --- src/databricks/labs/ucx/framework/owners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index a62c437794..7095d3878b 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -112,7 +112,7 @@ def _find_an_admin(self) -> User | None: the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can be found. """ - first_user = functools.partial(min, default=None, key=lambda user: user.name) + first_user = functools.partial(min, default=None, key=lambda user: user.user_name) return first_user(self._find_workspace_admins()) or first_user(self._find_account_admins()) @final From f4e247ed9de94b164275f58f6cc850e59176be35 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:49:20 +0200 Subject: [PATCH 05/58] Materialize list earlier, to aid debugging. 
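A sketch of why this helps (illustrative only, not part of the patch): interpolating a generator into an f-string prints just its repr, so building the list up front lets the later "multiple 'admins' groups" error message show the actual group identifiers.

    # Hypothetical ids; mirrors the error-message interpolation in _find_workspace_admins().
    candidate_group_ids = (group_id for group_id in ("123", "456"))
    print(f"admin groups: {candidate_group_ids}")        # admin groups: <generator object ...>
    print(f"admin groups: {list(candidate_group_ids)}")  # admin groups: ['123', '456']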
--- src/databricks/labs/ucx/framework/owners.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 7095d3878b..b32071b363 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -77,8 +77,8 @@ def _find_workspace_admins(self) -> Iterable[User]: for group in user.groups if group.display == "admins" and group.value ) - admin_groups = self._filter_workspace_groups(candidate_group_ids) - match list(admin_groups): + admin_groups = list(self._filter_workspace_groups(candidate_group_ids)) + match admin_groups: case []: return () case [admin_group]: From d0c22dbe561c2ce7bc218762c4d372781a5b6b48 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:50:00 +0200 Subject: [PATCH 06/58] Documentation references for how administrators are marked for workspace and account users. --- src/databricks/labs/ucx/framework/owners.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index b32071b363..0548b9d9bd 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -66,6 +66,7 @@ def _find_workspace_admins(self) -> Iterable[User]: logger.debug("Enumerating users to locate active workspace administrators...") all_users = self._ws.users.list(attributes="id,active,userName,groups") # The groups attribute is a flattened list of groups a user belongs to; hunt for the 'admins' workspace group. + # Reference: https://learn.microsoft.com/en-us/azure/databricks/admin/users-groups/groups#account-vs-workspace-group admin_users = [ user for user in all_users if user.active and user.user_name and self._member_of_group_named(user, "admins") ] @@ -99,6 +100,7 @@ def _find_account_admins(self) -> Iterable[User]: ) assert isinstance(response, dict) all_users = (User.from_dict(resource) for resource in response.get("Resources", [])) + # Reference: https://learn.microsoft.com/en-us/azure/databricks/admin/users-groups/groups#account-admin return (user for user in all_users if user.active and user.user_name and self._has_role(user, "account_admin")) def _find_an_admin(self) -> User | None: From 467f912ff96258f393786b3ab7ecf50eaac2e953 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:51:22 +0200 Subject: [PATCH 07/58] Ensure that unit tests reset the (class-level) cache before they start. --- src/databricks/labs/ucx/framework/owners.py | 6 ++++++ tests/unit/framework/test_owners.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 0548b9d9bd..5490350597 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -25,6 +25,12 @@ class Ownership(ABC, Generic[Record]): _cached_workspace_admins: dict[int, str | Exception] = {} """Cached user names of workspace administrators, keyed by workspace id.""" + @classmethod + def reset_cache(cls) -> None: + """Reset the cache of discovered administrators that we maintain at class level.""" + # Intended for use by tests. 
+ cls._cached_workspace_admins = {} + def __init__(self, ws: WorkspaceClient) -> None: self._ws = ws diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 486959a334..386b953681 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -20,6 +20,12 @@ def _get_owner(self, record: Record) -> str | None: return self._owner_fn(record) +@pytest.fixture(autouse=True) +def _clear_ownership_cache() -> None: + """Ensure that the class-level cache of workspace owners is cleared before each test.""" + Ownership.reset_cache() + + def test_ownership_prefers_record_owner(ws) -> None: """Verify that if an owner for the record can be found, that is used.""" ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") From 33cb8412f938eec76d29fc1aabd5873b98c69ead Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:51:56 +0200 Subject: [PATCH 08/58] Fix mock workspace identifier to have the correct type. --- tests/unit/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index c8fe88cb09..b404a81d7f 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -205,5 +205,5 @@ def ws(): client = create_autospec(WorkspaceClient) client.api_client.do.return_value = {} client.permissions.get.return_value = {} - client.get_workspace_id.return_value = "12345" + client.get_workspace_id.return_value = 12345 return client From 3a1868c22df9d385de63b1ca9091c20a334a8d36 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:52:18 +0200 Subject: [PATCH 09/58] Trivial integration test for locating an administrator. --- tests/integration/framework/test_owners.py | 27 ++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tests/integration/framework/test_owners.py diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py new file mode 100644 index 0000000000..777d3d75f4 --- /dev/null +++ b/tests/integration/framework/test_owners.py @@ -0,0 +1,27 @@ +from collections.abc import Callable + +from databricks.sdk import WorkspaceClient + +from databricks.labs.ucx.framework.owners import Ownership, Record + + +class _OwnershipFixture(Ownership[Record]): + def __init__( + self, + ws: WorkspaceClient, + *, + owner_fn: Callable[[Record], str | None] = lambda _: None, + ): + super().__init__(ws) + self._owner_fn = owner_fn + + def _get_owner(self, record: Record) -> str | None: + return self._owner_fn(record) + + +def test_fallback_workspace_admin(ws) -> None: + """Verify that a workspace administrator can be found for our integration environment.""" + ownership = _OwnershipFixture[str](ws) + owner = ownership.owner_of("anything") + + assert owner From ec23bb020d093d3344b6908a23853503de6bc48a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:53:21 +0200 Subject: [PATCH 10/58] Start implementing unit tests for the Ownership component. 
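These tests drive `Ownership` through a tiny concrete subclass, the same shape a real implementation would take. A minimal sketch, assuming only the `Ownership` API introduced above (the `EndpointInfo` record and its `creator` field are illustrative, not UCX code):

    from dataclasses import dataclass

    from databricks.sdk import WorkspaceClient

    from databricks.labs.ucx.framework.owners import Ownership


    @dataclass
    class EndpointInfo:
        name: str
        creator: str | None = None


    class EndpointOwnership(Ownership[EndpointInfo]):
        def _get_owner(self, record: EndpointInfo) -> str | None:
            # Prefer the recorded creator; returning None makes owner_of() fall back
            # to an active workspace (or account) administrator.
            return record.creator


    def show_owners(ws: WorkspaceClient) -> None:
        ownership = EndpointOwnership(ws)
        print(ownership.owner_of(EndpointInfo("dev", creator="bob")))  # -> "bob"
        print(ownership.owner_of(EndpointInfo("prod")))                # -> admin user-name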
--- tests/unit/framework/test_owners.py | 135 +++++++++++++++++++++++++--- 1 file changed, 124 insertions(+), 11 deletions(-) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 386b953681..b3f6031ecf 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -1,7 +1,10 @@ -from collections.abc import Callable +import re +from collections.abc import Callable, Sequence import pytest from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound +from databricks.sdk.service import iam from databricks.labs.ucx.framework.owners import Ownership, Record @@ -20,6 +23,62 @@ def _get_owner(self, record: Record) -> str | None: return self._owner_fn(record) +def _setup_workspace_users(ws, workspace_users: list[iam.User]) -> None: + ws.users.list.return_value = workspace_users + + +def _setup_account_users(ws, account_users: Sequence[iam.User]) -> None: + def stub_rest_call(method: str, path: str | None = None, query: dict | None = None) -> dict: + if method == "GET" and path == "/api/2.0/account/scim/v2/Users" and query: + return {"Resources": [user.as_dict() for user in account_users]} + msg = f"Call not mocked: {method} {path}" + raise NotImplementedError(msg) + + ws.api_client.do.side_effect = stub_rest_call + + +def _setup_groups(ws, groups: list[iam.Group]) -> None: + groups_by_id = {group.id: group for group in groups} + + def stub_groups_get(group_id: str) -> iam.Group: + try: + return groups_by_id[group_id] + except KeyError as e: + msg = f"Group not found: {group_id}" + raise NotFound(msg) from e + + ws.groups.get.side_effect = stub_groups_get + ws.groups.list.return_value = groups + + +def _setup_accounts( + ws, + *, + account_users: Sequence[iam.User] = (), + workspace_users: Sequence[iam.User] = (), + groups: Sequence[iam.Group] = (), +) -> None: + _setup_workspace_users(ws, list(workspace_users)) + _setup_account_users(ws, account_users) + _setup_groups(ws, list(groups)) + + +def _create_workspace_admin(user_name: str, admins_group_id: str) -> iam.User: + return iam.User( + user_name=user_name, + active=True, + groups=[iam.ComplexValue(display="admins", ref=f"Groups/{admins_group_id}", value=admins_group_id)], + ) + + +def _create_account_admin(user_name: str) -> iam.User: + return iam.User(user_name=user_name, active=True, roles=[iam.ComplexValue(value="account_admin")]) + + +def _create_workspace_group(display_name: str, group_id: str) -> iam.Group: + return iam.Group(display_name=display_name, id=group_id, meta=iam.ResourceMeta(resource_type="WorkspaceGroup")) + + @pytest.fixture(autouse=True) def _clear_ownership_cache() -> None: """Ensure that the class-level cache of workspace owners is cleared before each test.""" @@ -29,38 +88,95 @@ def _clear_ownership_cache() -> None: def test_ownership_prefers_record_owner(ws) -> None: """Verify that if an owner for the record can be found, that is used.""" ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") + owner = ownership.owner_of("school") - assert ownership.owner_of("school") == "bob" + assert owner == "bob" ws.get_workspace_id.assert_not_called() + def test_ownership_admin_user_fallback(ws) -> None: """Verify that if no owner for the record can be found, an admin user is returned instead.""" + account_users = [iam.User(user_name="jane", active=True, roles=[iam.ComplexValue(value="account_admin")])] + _setup_account_users(ws, account_users) + ownership = _OwnershipFixture[str](ws) - pytest.xfail("Not yet implemented") + 
owner = ownership.owner_of("school") + + assert owner == "jane" def test_ownership_workspace_admin_preferred_over_account_admin(ws) -> None: """Verify that when both workspace and account administrators are configured, the workspace admin is preferred.""" - pytest.xfail("Not yet implemented") + admins_group = _create_workspace_group("admins", group_id="1") + assert admins_group.id + workspace_users = [_create_workspace_admin("bob", admins_group_id=admins_group.id)] + account_users = [_create_account_admin("jane")] + _setup_accounts(ws, account_users=account_users, workspace_users=workspace_users, groups=[admins_group]) + + ownership = _OwnershipFixture[str](ws) + owner = ownership.owner_of("school") + + assert owner == "bob" +def test_ownership_admin_ignore_inactive(ws) -> None: + """Verify that inactive workspace administrators are ignored when locating an administrator.""" + admins_group = _create_workspace_group("admins", group_id="1") + assert admins_group.id + bob = _create_workspace_admin("bob", admins_group_id=admins_group.id) + bob.active = False + jane = _create_account_admin("jane") + jane.active = False + _setup_accounts(ws, account_users=[jane], workspace_users=[bob], groups=[admins_group]) + + ownership = _OwnershipFixture[str](ws) + # All admins are inactive, so an exception should be raised. + with pytest.raises(RuntimeError, match="No active workspace or account administrator"): + _ = ownership.owner_of("school") + def test_ownership_workspace_admin_prefer_first_alphabetically(ws) -> None: - """Verify that when multiple workspace administrators can found, the first alphabetically preferred is used.""" - pytest.xfail("Not yet implemented") + """Verify that when multiple workspace administrators can found, the first alphabetically is used.""" + admins_group = _create_workspace_group("admins", group_id="1") + assert admins_group.id + workspace_users = [ + _create_workspace_admin("bob", admins_group_id=admins_group.id), + _create_workspace_admin("andrew", admins_group_id=admins_group.id), + _create_workspace_admin("jane", admins_group_id=admins_group.id), + ] + _setup_accounts(ws, workspace_users=workspace_users, groups=[admins_group]) + ownership = _OwnershipFixture[str](ws) + owner = ownership.owner_of("school") + + assert owner == "andrew" def test_ownership_account_admin_prefer_first_alphabetically(ws) -> None: """Verify that when multiple account administrators can found, the first alphabetically preferred is used.""" - pytest.xfail("Not yet implemented") + account_users = [ + _create_account_admin("bob"), + _create_account_admin("andrew"), + _create_account_admin("jane"), + ] + _setup_accounts(ws, account_users=account_users) + ownership = _OwnershipFixture[str](ws) + owner = ownership.owner_of("school") + + assert owner == "andrew" def test_ownership_error_when_no_owner_can_be_located(ws) -> None: """Verify that an error is raised when no workspace or account administrators can be found.""" - pytest.xfail("Not yet implemented") + _setup_accounts(ws) + ownership = _OwnershipFixture[str](ws) + # No admins. 
+ workspace_id = ws.get_workspace_id() + expected_message = f"No active workspace or account administrator can be found for workspace: {workspace_id}" + with pytest.raises(RuntimeError, match=re.escape(expected_message)): + _ = ownership.owner_of("school") def test_ownership_fallback_instance_cache(ws) -> None: @@ -68,19 +184,16 @@ def test_ownership_fallback_instance_cache(ws) -> None: pytest.xfail("Not yet implemented") - def test_ownership_fallback_class_cache(ws) -> None: """Verify that the fallback owner for a workspace is cached at class level to avoid many REST calls.""" pytest.xfail("Not yet implemented") - def test_ownership_fallback_class_cache_multiple_workspaces(ws) -> None: """Verify that cache of workspace administrators supports multiple workspaces.""" pytest.xfail("Not yet implemented") - def test_ownership_fallback_error_handling(ws) -> None: """Verify that the class-level owner-cache and tracks errors to avoid many REST calls.""" pytest.xfail("Not yet implemented") From b9dd2a3456404428c067bed6603b000112bac067 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 13:58:32 +0200 Subject: [PATCH 11/58] Refactor fixture code for mocking accounts and groups. --- tests/unit/framework/test_owners.py | 47 +++++++++++------------------ 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index b3f6031ecf..973cf37f72 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -23,44 +23,34 @@ def _get_owner(self, record: Record) -> str | None: return self._owner_fn(record) -def _setup_workspace_users(ws, workspace_users: list[iam.User]) -> None: - ws.users.list.return_value = workspace_users - - -def _setup_account_users(ws, account_users: Sequence[iam.User]) -> None: - def stub_rest_call(method: str, path: str | None = None, query: dict | None = None) -> dict: - if method == "GET" and path == "/api/2.0/account/scim/v2/Users" and query: - return {"Resources": [user.as_dict() for user in account_users]} - msg = f"Call not mocked: {method} {path}" - raise NotImplementedError(msg) - - ws.api_client.do.side_effect = stub_rest_call - +def _setup_accounts( + ws, + *, + account_users: Sequence[iam.User] = (), + workspace_users: Sequence[iam.User] = (), + groups: Sequence[iam.Group] = (), +) -> None: + # Stub for the workspace users. + ws.users.list.return_value = list(workspace_users) -def _setup_groups(ws, groups: list[iam.Group]) -> None: + # Stub for the groups. groups_by_id = {group.id: group for group in groups} - def stub_groups_get(group_id: str) -> iam.Group: try: return groups_by_id[group_id] except KeyError as e: msg = f"Group not found: {group_id}" raise NotFound(msg) from e - ws.groups.get.side_effect = stub_groups_get ws.groups.list.return_value = groups - -def _setup_accounts( - ws, - *, - account_users: Sequence[iam.User] = (), - workspace_users: Sequence[iam.User] = (), - groups: Sequence[iam.Group] = (), -) -> None: - _setup_workspace_users(ws, list(workspace_users)) - _setup_account_users(ws, account_users) - _setup_groups(ws, list(groups)) + # Stub for the account users. 
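+    # Account-level users are only reachable through the SCIM REST endpoint, so the low-level api_client.do call is stubbed rather than a higher-level SDK method.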
+ def stub_rest_call(method: str, path: str | None = None, query: dict | None = None) -> dict: + if method == "GET" and path == "/api/2.0/account/scim/v2/Users" and query: + return {"Resources": [user.as_dict() for user in account_users]} + msg = f"Call not mocked: {method} {path}" + raise NotImplementedError(msg) + ws.api_client.do.side_effect = stub_rest_call def _create_workspace_admin(user_name: str, admins_group_id: str) -> iam.User: @@ -96,8 +86,7 @@ def test_ownership_prefers_record_owner(ws) -> None: def test_ownership_admin_user_fallback(ws) -> None: """Verify that if no owner for the record can be found, an admin user is returned instead.""" - account_users = [iam.User(user_name="jane", active=True, roles=[iam.ComplexValue(value="account_admin")])] - _setup_account_users(ws, account_users) + _setup_accounts(ws, account_users=[_create_account_admin("jane")]) ownership = _OwnershipFixture[str](ws) owner = ownership.owner_of("school") From 57bf8c37bba6358c174e77ac2c481422d439e704 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 16:48:33 +0200 Subject: [PATCH 12/58] Revert plumbing the workspace client into CrawlerBase --- src/databricks/labs/ucx/assessment/azure.py | 3 +- .../labs/ucx/assessment/clusters.py | 6 +- .../labs/ucx/assessment/init_scripts.py | 3 +- src/databricks/labs/ucx/assessment/jobs.py | 6 +- .../labs/ucx/assessment/pipelines.py | 3 +- .../labs/ucx/contexts/application.py | 20 ++---- .../labs/ucx/contexts/workflow_task.py | 2 +- src/databricks/labs/ucx/framework/crawlers.py | 7 +- .../labs/ucx/hive_metastore/grants.py | 3 +- .../labs/ucx/hive_metastore/locations.py | 11 ++-- .../hive_metastore/table_migration_status.py | 3 +- .../labs/ucx/hive_metastore/table_size.py | 1 - .../labs/ucx/hive_metastore/tables.py | 10 ++- .../labs/ucx/hive_metastore/udfs.py | 5 +- .../labs/ucx/recon/migration_recon.py | 4 +- .../labs/ucx/source_code/directfs_access.py | 13 ++-- .../labs/ucx/workspace_access/generic.py | 2 +- .../labs/ucx/workspace_access/groups.py | 3 +- .../labs/ucx/workspace_access/manager.py | 5 +- tests/integration/conftest.py | 11 +--- tests/integration/source_code/test_queries.py | 2 +- .../test_permissions_manager.py | 4 +- tests/unit/azure/test_locations.py | 2 +- tests/unit/conftest.py | 6 +- tests/unit/framework/test_crawlers.py | 42 ++++++------ tests/unit/framework/test_owners.py | 3 + tests/unit/hive_metastore/test_grants.py | 60 ++++++++--------- tests/unit/hive_metastore/test_mapping.py | 4 +- .../unit/hive_metastore/test_table_migrate.py | 30 ++++----- tests/unit/hive_metastore/test_table_size.py | 24 +++---- tests/unit/hive_metastore/test_tables.py | 34 +++++----- tests/unit/hive_metastore/test_udfs.py | 8 +-- tests/unit/recon/test_migration_recon.py | 3 +- .../unit/source_code/test_directfs_access.py | 4 +- tests/unit/workspace_access/test_manager.py | 32 ++++----- tests/unit/workspace_access/test_tacl.py | 66 +++++++++---------- 36 files changed, 209 insertions(+), 236 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index ed5c34bf3f..81c99e784b 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -42,7 +42,8 @@ class ServicePrincipalClusterMapping: class AzureServicePrincipalCrawler(CrawlerBase[AzureServicePrincipalInfo], JobsMixin, SecretsMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(ws, sbe, "hive_metastore", schema, "azure_service_principals", 
AzureServicePrincipalInfo) + super().__init__(sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo) + self._ws = ws def _try_fetch(self) -> Iterable[AzureServicePrincipalInfo]: for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index b69862b9a6..02badb64ec 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -143,7 +143,8 @@ def _check_cluster_failures(self, cluster: ClusterDetails, source: str) -> list[ class ClustersCrawler(CrawlerBase[ClusterInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str): - super().__init__(ws, sbe, "hive_metastore", schema, "clusters", ClusterInfo) + super().__init__(sbe, "hive_metastore", schema, "clusters", ClusterInfo) + self._ws = ws def _crawl(self) -> Iterable[ClusterInfo]: all_clusters = list(self._ws.clusters.list()) @@ -191,7 +192,8 @@ class PolicyInfo: class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(ws, sbe, "hive_metastore", schema, "policies", PolicyInfo) + super().__init__(sbe, "hive_metastore", schema, "policies", PolicyInfo) + self._ws = ws def _crawl(self) -> Iterable[PolicyInfo]: all_policices = list(self._ws.cluster_policies.list()) diff --git a/src/databricks/labs/ucx/assessment/init_scripts.py b/src/databricks/labs/ucx/assessment/init_scripts.py index b1add2e9dc..909015b678 100644 --- a/src/databricks/labs/ucx/assessment/init_scripts.py +++ b/src/databricks/labs/ucx/assessment/init_scripts.py @@ -42,7 +42,8 @@ def check_init_script(self, init_script_data: str | None, source: str) -> list[s class GlobalInitScriptCrawler(CrawlerBase[GlobalInitScriptInfo], CheckInitScriptMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(ws, sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo) + super().__init__(sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo) + self._ws = ws def _crawl(self) -> Iterable[GlobalInitScriptInfo]: all_global_init_scripts = list(self._ws.global_init_scripts.list()) diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 9f7e3cb0e9..d5b77d68e0 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -72,7 +72,8 @@ def _job_clusters(job): class JobsCrawler(CrawlerBase[JobInfo], JobsMixin, CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(ws, sbe, "hive_metastore", schema, "jobs", JobInfo) + super().__init__(sbe, "hive_metastore", schema, "jobs", JobInfo) + self._ws = ws def _crawl(self) -> Iterable[JobInfo]: all_jobs = list(self._ws.jobs.list(expand_tasks=True)) @@ -158,7 +159,8 @@ class SubmitRunsCrawler(CrawlerBase[SubmitRunInfo], JobsMixin, CheckClusterMixin ] def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str, num_days_history: int): - super().__init__(ws, sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo) + super().__init__(sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo) + self._ws = ws self._num_days_history = num_days_history @staticmethod diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 
329215c804..8421e53084 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -24,7 +24,8 @@ class PipelineInfo: class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): - super().__init__(ws, sbe, "hive_metastore", schema, "pipelines", PipelineInfo) + super().__init__(sbe, "hive_metastore", schema, "pipelines", PipelineInfo) + self._ws = ws def _crawl(self) -> Iterable[PipelineInfo]: all_pipelines = list(self._ws.pipelines.list_pipelines()) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index d06017e8f4..95944a3d2a 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -201,7 +201,6 @@ def legacy_table_acl_support(self): @cached_property def permission_manager(self): return PermissionManager( - self.workspace_client, self.sql_backend, self.inventory_database, [ @@ -233,21 +232,11 @@ def grants_crawler(self): @cached_property def udfs_crawler(self): - return UdfsCrawler( - self.workspace_client, - self.sql_backend, - self.inventory_database, - self.config.include_databases, - ) + return UdfsCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) @cached_property def tables_crawler(self): - return TablesCrawler( - self.workspace_client, - self.sql_backend, - self.inventory_database, - self.config.include_databases, - ) + return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) @cached_property def tables_migrator(self): @@ -454,11 +443,11 @@ def query_linter(self): @cached_property def directfs_access_crawler_for_paths(self): - return DirectFsAccessCrawler.for_paths(self.workspace_client, self.sql_backend, self.inventory_database) + return DirectFsAccessCrawler.for_paths(self.sql_backend, self.inventory_database) @cached_property def directfs_access_crawler_for_queries(self): - return DirectFsAccessCrawler.for_queries(self.workspace_client, self.sql_backend, self.inventory_database) + return DirectFsAccessCrawler.for_queries(self.sql_backend, self.inventory_database) @cached_property def redash(self): @@ -487,7 +476,6 @@ def data_comparator(self): @cached_property def migration_recon(self): return MigrationRecon( - self.workspace_client, self.sql_backend, self.inventory_database, self.migration_status_refresher, diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index f61306aa99..daa090393a 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -85,7 +85,7 @@ def global_init_scripts_crawler(self): @cached_property def tables_crawler(self): - return FasterTableScanCrawler(self.workspace_client, self.sql_backend, self.inventory_database) + return FasterTableScanCrawler(self.sql_backend, self.inventory_database) @cached_property def tables_in_mounts(self): diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index 52cd92d12d..4c89cde902 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -4,7 +4,6 @@ from typing import ClassVar, Generic, Literal, Protocol, TypeVar from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from 
databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -22,21 +21,17 @@ class DataclassInstance(Protocol): class CrawlerBase(ABC, Generic[Result]): - def __init__( - self, ws: WorkspaceClient, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result] - ): + def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]) -> None: """ Initializes a CrawlerBase instance. Args: - ws (WorkspaceClient): A client for the current workspace. backend (SqlBackend): The backend that executes SQL queries: Statement Execution API or Databricks Runtime. catalog (str): The catalog name for the inventory persistence. schema: The schema name for the inventory persistence. table: The table name for the inventory persistence. """ - self._ws = ws self._catalog = self._valid(catalog) self._schema = self._valid(schema) self._table = self._valid(table) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 5c6575eddb..8673779697 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -199,11 +199,10 @@ class GrantsCrawler(CrawlerBase[Grant]): """Crawler that captures access controls that relate to data and other securable objects.""" def __init__(self, tc: TablesCrawler, udf: UdfsCrawler, include_databases: list[str] | None = None): - assert tc._ws == udf._ws assert tc._backend == udf._backend assert tc._catalog == udf._catalog assert tc._schema == udf._schema - super().__init__(tc._ws, tc._backend, tc._catalog, tc._schema, "grants", Grant) + super().__init__(tc._backend, tc._catalog, tc._schema, "grants", Grant) self._tc = tc self._udf = udf self._include_databases = include_databases diff --git a/src/databricks/labs/ucx/hive_metastore/locations.py b/src/databricks/labs/ucx/hive_metastore/locations.py index 33a0a90d07..05802153b4 100644 --- a/src/databricks/labs/ucx/hive_metastore/locations.py +++ b/src/databricks/labs/ucx/hive_metastore/locations.py @@ -117,7 +117,8 @@ class ExternalLocations(CrawlerBase[ExternalLocation]): _prefix_size: ClassVar[list[int]] = [1, 12] def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str): - super().__init__(ws, sbe, "hive_metastore", schema, "external_locations", ExternalLocation) + super().__init__(sbe, "hive_metastore", schema, "external_locations", ExternalLocation) + self._ws = ws def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLocation]: min_slash = 2 @@ -300,7 +301,8 @@ def save_as_terraform_definitions_on_workspace(self, installation: Installation) class Mounts(CrawlerBase[Mount]): def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str): - super().__init__(ws, backend, "hive_metastore", inventory_database, "mounts", Mount) + super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount) + self._dbutils = ws.dbutils @staticmethod def _deduplicate_mounts(mounts: list) -> list: @@ -318,7 +320,7 @@ def _deduplicate_mounts(mounts: list) -> list: def _crawl(self) -> Iterable[Mount]: mounts = [] - for mount_point, source, _ in self._ws.dbutils.fs.mounts(): + for mount_point, source, _ in self._dbutils.fs.mounts(): mounts.append(Mount(mount_point, source)) return self._deduplicate_mounts(mounts) @@ -354,10 +356,11 @@ def __init__( exclude_paths_in_mount: list[str] | None = None, include_paths_in_mount: list[str] | None = None, ): - super().__init__(ws, backend, "hive_metastore", 
inventory_database, "tables", Table) + super().__init__(backend, "hive_metastore", inventory_database, "tables", Table) self._dbutils = ws.dbutils self._mounts_crawler = mc self._include_mounts = include_mounts + self._ws = ws self._include_paths_in_mount = include_paths_in_mount irrelevant_patterns = {'_SUCCESS', '_committed_', '_started_'} diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 640068931d..283be4f717 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -76,7 +76,8 @@ class TableMigrationStatusRefresher(CrawlerBase[TableMigrationStatus]): """ def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema, table_crawler: TablesCrawler): - super().__init__(ws, sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus) + super().__init__(sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus) + self._ws = ws self._table_crawler = table_crawler def index(self, *, force_refresh: bool = False) -> TableMigrationIndex: diff --git a/src/databricks/labs/ucx/hive_metastore/table_size.py b/src/databricks/labs/ucx/hive_metastore/table_size.py index eb9bd2c23c..243c4e3418 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_size.py +++ b/src/databricks/labs/ucx/hive_metastore/table_size.py @@ -34,7 +34,6 @@ def __init__(self, tables_crawler: TablesCrawler | FasterTableScanCrawler) -> No from pyspark.sql.session import SparkSession # type: ignore[import-not-found] super().__init__( - tables_crawler._ws, tables_crawler._backend, "hive_metastore", tables_crawler._schema, diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 9c5810f467..f935aada95 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -13,7 +13,6 @@ from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase @@ -342,16 +341,15 @@ class MigrationCount: class TablesCrawler(CrawlerBase[Table]): - def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema, include_databases: list[str] | None = None): + def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): """ Initializes a TablesCrawler instance. Args: - ws (WorkspaceClient): A client for the current workspace. backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ - super().__init__(ws, backend, "hive_metastore", schema, "tables", Table) + super().__init__(backend, "hive_metastore", schema, "tables", Table) self._include_database = include_databases def _all_databases(self) -> list[str]: @@ -488,14 +486,14 @@ class FasterTableScanCrawler(CrawlerBase[Table]): Databricks workspace. 
""" - def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema, include_databases: list[str] | None = None): + def __init__(self, backend: SqlBackend, schema, include_databases: list[str] | None = None): self._backend = backend self._include_database = include_databases # pylint: disable-next=import-error,import-outside-toplevel from pyspark.sql.session import SparkSession # type: ignore[import-not-found] - super().__init__(ws, backend, "hive_metastore", schema, "tables", Table) + super().__init__(backend, "hive_metastore", schema, "tables", Table) self._spark = SparkSession.builder.getOrCreate() @cached_property diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 7f272696dc..40992d0524 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -5,7 +5,6 @@ from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import Unknown, NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase @@ -37,7 +36,6 @@ def key(self) -> str: class UdfsCrawler(CrawlerBase[Udf]): def __init__( self, - ws: WorkspaceClient, backend: SqlBackend, schema: str, include_databases: list[str] | None = None, @@ -46,11 +44,10 @@ def __init__( Initializes a UdfsCrawler instance. Args: - ws (WorkspaceClient): The client for the current workspace. backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ - super().__init__(ws, backend, "hive_metastore", schema, "udfs", Udf) + super().__init__(backend, "hive_metastore", schema, "udfs", Udf) self._include_database = include_databases def _all_databases(self) -> list[str]: diff --git a/src/databricks/labs/ucx/recon/migration_recon.py b/src/databricks/labs/ucx/recon/migration_recon.py index 24d435328a..404fd8f1ba 100644 --- a/src/databricks/labs/ucx/recon/migration_recon.py +++ b/src/databricks/labs/ucx/recon/migration_recon.py @@ -4,7 +4,6 @@ from dataclasses import dataclass from functools import partial -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend @@ -39,7 +38,6 @@ class ReconResult: class MigrationRecon(CrawlerBase[ReconResult]): def __init__( self, - ws: WorkspaceClient, sbe: SqlBackend, schema: str, migration_status_refresher: TableMigrationStatusRefresher, @@ -48,7 +46,7 @@ def __init__( data_comparator: DataComparator, default_threshold: float, ): - super().__init__(ws, sbe, "hive_metastore", schema, "recon_results", ReconResult) + super().__init__(sbe, "hive_metastore", schema, "recon_results", ReconResult) self._migration_status_refresher = migration_status_refresher self._table_mapping = table_mapping self._schema_comparator = schema_comparator diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 6fda51e521..372b15e464 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -5,7 +5,6 @@ from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError from 
databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -17,24 +16,22 @@ class DirectFsAccessCrawler(CrawlerBase[DirectFsAccess]): @classmethod - def for_paths(cls, ws: WorkspaceClient, backend: SqlBackend, schema) -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(ws, backend, schema, "directfs_in_paths") + def for_paths(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler: + return DirectFsAccessCrawler(backend, schema, "directfs_in_paths") @classmethod - def for_queries(cls, ws: WorkspaceClient, backend: SqlBackend, schema) -> DirectFsAccessCrawler: - return DirectFsAccessCrawler(ws, backend, schema, "directfs_in_queries") + def for_queries(cls, backend: SqlBackend, schema) -> DirectFsAccessCrawler: + return DirectFsAccessCrawler(backend, schema, "directfs_in_queries") - def __init__(self, ws: WorkspaceClient, backend: SqlBackend, schema: str, table: str): + def __init__(self, backend: SqlBackend, schema: str, table: str): """ Initializes a DFSACrawler instance. Args: - ws (WorkspaceClient): The client associated with this workspace. sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ super().__init__( - ws=ws, backend=backend, catalog="hive_metastore", schema=schema, diff --git a/src/databricks/labs/ucx/workspace_access/generic.py b/src/databricks/labs/ucx/workspace_access/generic.py index 0d37fa76d9..0fd06db6d9 100644 --- a/src/databricks/labs/ucx/workspace_access/generic.py +++ b/src/databricks/labs/ucx/workspace_access/generic.py @@ -332,13 +332,13 @@ def __init__( Listing.__init__(self, lambda: [], "_", "_") CrawlerBase.__init__( self, - ws=ws, backend=sql_backend, catalog="hive_metastore", schema=inventory_database, table="workspace_objects", klass=WorkspaceObjectInfo, ) + self._ws = ws self._num_threads = num_threads self._start_path = start_path self._sql_backend = sql_backend diff --git a/src/databricks/labs/ucx/workspace_access/groups.py b/src/databricks/labs/ucx/workspace_access/groups.py index cc6c397aa8..75d59a8d61 100644 --- a/src/databricks/labs/ucx/workspace_access/groups.py +++ b/src/databricks/labs/ucx/workspace_access/groups.py @@ -418,10 +418,11 @@ def __init__( # pylint: disable=too-many-arguments *, external_id_match: bool = False, ): - super().__init__(ws, sql_backend, "hive_metastore", inventory_database, "groups", MigratedGroup) + super().__init__(sql_backend, "hive_metastore", inventory_database, "groups", MigratedGroup) if not renamed_group_prefix: renamed_group_prefix = "db-temp-" + self._ws = ws self._include_group_names = include_group_names self._renamed_group_prefix = renamed_group_prefix self._workspace_group_regex = workspace_group_regex diff --git a/src/databricks/labs/ucx/workspace_access/manager.py b/src/databricks/labs/ucx/workspace_access/manager.py index cfdb36f445..50eba51d95 100644 --- a/src/databricks/labs/ucx/workspace_access/manager.py +++ b/src/databricks/labs/ucx/workspace_access/manager.py @@ -4,7 +4,6 @@ from databricks.labs.blueprint.parallel import ManyError, Threads from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -24,8 +23,8 @@ class PermissionManager(CrawlerBase[Permissions]): ERRORS_TO_IGNORE = ["FEATURE_DISABLED"] - def __init__(self, ws: WorkspaceClient, backend: SqlBackend, inventory_database: str, crawlers: list[AclSupport]): - 
super().__init__(ws, backend, "hive_metastore", inventory_database, "permissions", Permissions) + def __init__(self, backend: SqlBackend, inventory_database: str, crawlers: list[AclSupport]): + super().__init__(backend, "hive_metastore", inventory_database, "permissions", Permissions) self._acl_support = crawlers def _crawl(self) -> Iterable[Permissions]: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 89a9bec546..2fc3f47b08 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -331,8 +331,8 @@ def get_azure_spark_conf(): class StaticTablesCrawler(TablesCrawler): - def __init__(self, ws: WorkspaceClient, sb: SqlBackend, schema: str, tables: list[TableInfo]): - super().__init__(ws, sb, schema) + def __init__(self, sb: SqlBackend, schema: str, tables: list[TableInfo]): + super().__init__(sb, schema) self._tables = [ Table( catalog=_.catalog_name, @@ -570,12 +570,7 @@ def tables_crawler(self) -> TablesCrawler: Overrides the FasterTableScanCrawler with TablesCrawler used as DBR is not available while running integration tests :return: TablesCrawler """ - return TablesCrawler( - self.workspace_client, - self.sql_backend, - self.inventory_database, - self.config.include_databases, - ) + return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) def save_tables(self, is_hiveserde: bool = False): # populate the tables crawled, as it is used by get_tables_to_migrate in the migrate-tables workflow diff --git a/tests/integration/source_code/test_queries.py b/tests/integration/source_code/test_queries.py index 029af876eb..10d4ded773 100644 --- a/tests/integration/source_code/test_queries.py +++ b/tests/integration/source_code/test_queries.py @@ -11,7 +11,7 @@ def test_query_linter_lints_queries_and_stores_dfsas(simple_ctx, ws, sql_backend all_problems = sql_backend.fetch("SELECT * FROM query_problems", schema=simple_ctx.inventory_database) problems = [row for row in all_problems if row["query_name"] == query.name] assert len(problems) == 1 - crawler = DirectFsAccessCrawler.for_queries(ws, sql_backend, simple_ctx.inventory_database) + crawler = DirectFsAccessCrawler.for_queries(sql_backend, simple_ctx.inventory_database) all_dfsas = crawler.snapshot() source_id = f"{_dashboard.id}/{query.id}" dfsas = [dfsa for dfsa in all_dfsas if dfsa.source_id == source_id] diff --git a/tests/integration/workspace_access/test_permissions_manager.py b/tests/integration/workspace_access/test_permissions_manager.py index 0a672d06cd..42cfe14c6a 100644 --- a/tests/integration/workspace_access/test_permissions_manager.py +++ b/tests/integration/workspace_access/test_permissions_manager.py @@ -5,7 +5,7 @@ from databricks.labs.ucx.workspace_access.manager import PermissionManager -def test_permissions_snapshot(ws, sql_backend, inventory_schema): +def test_permissions_snapshot(sql_backend, inventory_schema): class StubbedCrawler(AclSupport): def get_crawler_tasks(self) -> Iterable[Callable[..., Permissions | None]]: yield lambda: Permissions(object_id="abc", object_type="bcd", raw="def") @@ -16,7 +16,7 @@ def get_verify_task(self, item: Permissions) -> Callable[[], bool] | None: ... 
def object_types(self) -> set[str]: return {"bcd", "fgh"} - permission_manager = PermissionManager(ws, sql_backend, inventory_schema, [StubbedCrawler()]) + permission_manager = PermissionManager(sql_backend, inventory_schema, [StubbedCrawler()]) snapshot = list(permission_manager.snapshot()) # Snapshotting is multithreaded, meaning the order of results is non-deterministic. snapshot.sort(key=lambda x: x.object_id) diff --git a/tests/unit/azure/test_locations.py b/tests/unit/azure/test_locations.py index 7e4401f439..f1b901638b 100644 --- a/tests/unit/azure/test_locations.py +++ b/tests/unit/azure/test_locations.py @@ -28,7 +28,7 @@ def location_migration_for_test(ws, mock_backend, mock_installation, azurerm=Non azurerm = azurerm or AzureResources(azure_api_client(), azure_api_client()) location_crawler = ExternalLocations(ws, mock_backend, "location_test") azure_resource_permissions = AzureResourcePermissions(mock_installation, ws, azurerm, location_crawler) - tables_crawler = TablesCrawler(ws, mock_backend, 'ucx') + tables_crawler = TablesCrawler(mock_backend, 'ucx') mounts_crawler = Mounts(mock_backend, ws, 'ucx') principal_acl = PrincipalACL(ws, mock_backend, mock_installation, tables_crawler, mounts_crawler, lambda: []) external_locations_migration = ExternalLocationsMigration( diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index b404a81d7f..24c8491c8c 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -134,11 +134,7 @@ def inner(cb, **replace) -> RuntimeContext: if 'config' not in replace: replace['config'] = mock_installation.load(WorkspaceConfig) if 'tables_crawler' not in replace: - replace['tables_crawler'] = TablesCrawler( - replace['workspace_client'], - replace['sql_backend'], - replace['config'].inventory_database, - ) + replace['tables_crawler'] = TablesCrawler(replace['sql_backend'], replace['config'].inventory_database) module = __import__(cb.__module__, fromlist=[cb.__name__]) klass, method = cb.__qualname__.split('.', 1) diff --git a/tests/unit/framework/test_crawlers.py b/tests/unit/framework/test_crawlers.py index f83461db3e..1547841bdf 100644 --- a/tests/unit/framework/test_crawlers.py +++ b/tests/unit/framework/test_crawlers.py @@ -5,7 +5,6 @@ import pytest from databricks.labs.lsql import Row from databricks.labs.lsql.backends import MockBackend -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result, ResultFn @@ -33,7 +32,6 @@ class Bar: class _CrawlerFixture(CrawlerBase[Result]): def __init__( self, - ws: WorkspaceClient, backend: MockBackend, catalog: str, schema: str, @@ -43,7 +41,7 @@ def __init__( fetcher: ResultFn = lambda: [], loader: ResultFn = lambda: [], ): - super().__init__(ws, backend, catalog, schema, table, klass) + super().__init__(backend, catalog, schema, table, klass) self._fetcher = fetcher self._loader = loader @@ -54,22 +52,22 @@ def _crawl(self) -> Iterable[Result]: return self._loader() -def test_invalid(ws): +def test_invalid(): with pytest.raises(ValueError): - _CrawlerFixture(ws, MockBackend(), "a.a.a", "b", "c", Bar) + _CrawlerFixture(MockBackend(), "a.a.a", "b", "c", Bar) -def test_full_name(ws): - cb = _CrawlerFixture(ws, MockBackend(), "a", "b", "c", Bar) +def test_full_name(): + cb = _CrawlerFixture(MockBackend(), "a", "b", "c", Bar) assert cb.full_name == "a.b.c" -def test_snapshot_crawls_when_no_prior_crawl(ws) -> None: +def test_snapshot_crawls_when_no_prior_crawl() -> None: """Check that the crawler 
is invoked when the fetcher reports that the inventory doesn't exist.""" mock_backend = MockBackend() mock_fetcher = Mock(side_effect=NotFound(".. TABLE_OR_VIEW_NOT_FOUND ..")) mock_loader = Mock(return_value=[Baz(first="first")]) - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -83,7 +81,7 @@ def test_snapshot_crawls_when_prior_crawl_yielded_no_data(ws) -> None: mock_backend = MockBackend() mock_fetcher = Mock(return_value=[]) mock_loader = Mock(return_value=[Baz(first="first")]) - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -92,12 +90,12 @@ def test_snapshot_crawls_when_prior_crawl_yielded_no_data(ws) -> None: assert [Baz(first="first")] == result -def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data(ws) -> None: +def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data() -> None: """Check that existing data is used (with no crawl) if the fetcher can load the snapshot data.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[Baz(first="first")]) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot() @@ -106,12 +104,12 @@ def test_snapshot_doesnt_crawl_if_previous_crawl_yielded_data(ws) -> None: assert [Baz(first="first")] == result -def test_snapshot_crawls_if_refresh_forced(ws) -> None: +def test_snapshot_crawls_if_refresh_forced() -> None: """Check that a crawl happens (without even checking existing data) if a refresh is forced.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[Baz(first="first")]) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) result = cb.snapshot(force_refresh=True) @@ -120,12 +118,12 @@ def test_snapshot_crawls_if_refresh_forced(ws) -> None: assert [Baz(first="second")] == result -def test_snapshot_force_refresh_replaces_prior_data(ws) -> None: +def test_snapshot_force_refresh_replaces_prior_data() -> None: """Check that when refreshing the new data replaces (via overwrite) any existing data.""" mock_backend = MockBackend() mock_fetcher = Mock(side_effect=RuntimeError("never called")) mock_loader = Mock(return_value=[Baz(first="second")]) - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) + cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, fetcher=mock_fetcher, loader=mock_loader) cb.snapshot(force_refresh=True) @@ -134,9 +132,9 @@ def test_snapshot_force_refresh_replaces_prior_data(ws) -> None: assert [Row(first="second", second=None)] == mock_backend.rows_written_for("a.b.c", mode="overwrite") -def test_snapshot_updates_existing_table(ws) -> None: +def test_snapshot_updates_existing_table() -> None: mock_backend = MockBackend() - cb = _CrawlerFixture[Baz](ws, mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) + cb 
= _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) result = cb.snapshot() @@ -144,7 +142,7 @@ def test_snapshot_updates_existing_table(ws) -> None: assert [Row(first="first", second=None)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_updates_new_table(ws) -> None: +def test_snapshot_updates_new_table() -> None: mock_backend = MockBackend() def fetcher(): @@ -152,7 +150,7 @@ def fetcher(): raise NotFound(msg) cb = _CrawlerFixture[Foo]( - ws, mock_backend, "a", "b", "c", Foo, fetcher=fetcher, loader=lambda: [Foo(first="first", second=True)] + mock_backend, "a", "b", "c", Foo, fetcher=fetcher, loader=lambda: [Foo(first="first", second=True)] ) result = cb.snapshot() @@ -161,14 +159,14 @@ def fetcher(): assert [Row(first="first", second=True)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_wrong_error(ws) -> None: +def test_snapshot_wrong_error() -> None: sql_backend = MockBackend() def fetcher(): msg = "always fails" raise ValueError(msg) - cb = _CrawlerFixture[Bar](ws, sql_backend, "a", "b", "c", Bar, fetcher=fetcher) + cb = _CrawlerFixture[Bar](sql_backend, "a", "b", "c", Bar, fetcher=fetcher) with pytest.raises(ValueError): cb.snapshot() diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 973cf37f72..6ee03bc753 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -35,12 +35,14 @@ def _setup_accounts( # Stub for the groups. groups_by_id = {group.id: group for group in groups} + def stub_groups_get(group_id: str) -> iam.Group: try: return groups_by_id[group_id] except KeyError as e: msg = f"Group not found: {group_id}" raise NotFound(msg) from e + ws.groups.get.side_effect = stub_groups_get ws.groups.list.return_value = groups @@ -50,6 +52,7 @@ def stub_rest_call(method: str, path: str | None = None, query: dict | None = No return {"Resources": [user.as_dict() for user in account_users]} msg = f"Call not mocked: {method} {path}" raise NotImplementedError(msg) + ws.api_client.do.side_effect = stub_rest_call diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 2985343d05..101f1dd602 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -174,16 +174,16 @@ def test_uc_sql(grant, query): } -def test_crawler_no_data(ws): +def test_crawler_no_data(): sql_backend = MockBackend() - table = TablesCrawler(ws, sql_backend, "schema") - udf = UdfsCrawler(ws, sql_backend, "schema") + table = TablesCrawler(sql_backend, "schema") + udf = UdfsCrawler(sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == 0 -def test_crawler_crawl(ws): +def test_crawler_crawl(): sql_backend = MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[ @@ -238,14 +238,14 @@ def test_crawler_crawl(ws): action_type="SELECT", ), } - table = TablesCrawler(ws, sql_backend, "schema") - udf = UdfsCrawler(ws, sql_backend, "schema") + table = TablesCrawler(sql_backend, "schema") + udf = UdfsCrawler(sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_udf_crawl(ws): +def test_crawler_udf_crawl(): sql_backend = MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[("database_one",),], @@ -287,33 +287,33 @@ def test_crawler_udf_crawl(ws): ), } - table = TablesCrawler(ws, 
sql_backend, "schema") - udf = UdfsCrawler(ws, sql_backend, "schema") + table = TablesCrawler(sql_backend, "schema") + udf = UdfsCrawler(sql_backend, "schema") crawler = GrantsCrawler(table, udf) grants = list(crawler.snapshot()) assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_snapshot_when_no_data(ws): +def test_crawler_snapshot_when_no_data(): sql_backend = MockBackend() - table = TablesCrawler(ws, sql_backend, "schema") - udf = UdfsCrawler(ws, sql_backend, "schema") + table = TablesCrawler(sql_backend, "schema") + udf = UdfsCrawler(sql_backend, "schema") crawler = GrantsCrawler(table, udf) snapshot = list(crawler.snapshot()) assert len(snapshot) == 0 -def test_crawler_snapshot_with_data(ws): +def test_crawler_snapshot_with_data(): sql_backend = MockBackend(rows=ROWS) - table = TablesCrawler(ws, sql_backend, "schema") - udf = UdfsCrawler(ws, sql_backend, "schema") + table = TablesCrawler(sql_backend, "schema") + udf = UdfsCrawler(sql_backend, "schema") crawler = GrantsCrawler(table, udf) snapshot = list(crawler.snapshot()) assert len(snapshot) == 3 -def test_grants_returning_error_when_showing_grants(ws): +def test_grants_returning_error_when_showing_grants(): errors = {"SHOW GRANTS ON TABLE `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -334,8 +334,8 @@ def test_grants_returning_error_when_showing_grants(ws): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "default") - udf = UdfsCrawler(ws, backend, "default") + table_crawler = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -352,7 +352,7 @@ def test_grants_returning_error_when_showing_grants(ws): ] -def test_grants_returning_error_when_describing(ws): +def test_grants_returning_error_when_describing(): errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("test_database",),], @@ -370,8 +370,8 @@ def test_grants_returning_error_when_describing(ws): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "default") - udf = UdfsCrawler(ws, backend, "default") + table_crawler = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -388,7 +388,7 @@ def test_grants_returning_error_when_describing(ws): ] -def test_udf_grants_returning_error_when_showing_grants(ws): +def test_udf_grants_returning_error_when_showing_grants(): errors = {"SHOW GRANTS ON FUNCTION `hive_metastore`.`test_database`.`function_bad`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -409,8 +409,8 @@ def test_udf_grants_returning_error_when_showing_grants(ws): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "default") - udf = UdfsCrawler(ws, backend, "default") + table_crawler = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -427,7 +427,7 @@ def test_udf_grants_returning_error_when_showing_grants(ws): ] -def test_udf_grants_returning_error_when_describing(ws): +def test_udf_grants_returning_error_when_describing(): errors = {"DESCRIBE FUNCTION EXTENDED `hive_metastore`.`test_database`.`function_bad`": "error"} rows = { "SHOW 
DATABASES": SHOW_DATABASES[("test_database",),], @@ -445,8 +445,8 @@ def test_udf_grants_returning_error_when_describing(ws): } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "default") - udf = UdfsCrawler(ws, backend, "default") + table_crawler = TablesCrawler(backend, "default") + udf = UdfsCrawler(backend, "default") crawler = GrantsCrawler(table_crawler, udf) results = list(crawler.snapshot()) @@ -463,7 +463,7 @@ def test_udf_grants_returning_error_when_describing(ws): ] -def test_crawler_should_filter_databases(ws): +def test_crawler_should_filter_databases(): sql_backend = MockBackend( rows={ "SHOW TABLES FROM `hive_metastore`\\.`database_one`": SHOW_TABLES[("database_one", "table_one", "true"),], @@ -490,8 +490,8 @@ def test_crawler_should_filter_databases(ws): ), } - table = TablesCrawler(ws, sql_backend, "schema", include_databases=["database_one"]) - udf = UdfsCrawler(ws, sql_backend, "schema", include_databases=["database_one"]) + table = TablesCrawler(sql_backend, "schema", include_databases=["database_one"]) + udf = UdfsCrawler(sql_backend, "schema", include_databases=["database_one"]) crawler = GrantsCrawler(table, udf, include_databases=["database_one"]) grants = list(crawler.snapshot()) diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index 94b5ec9aaa..e0ac9f56ad 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -299,11 +299,11 @@ def test_skip_missing_table(caplog): assert [rec.message for rec in caplog.records if "table not found" in rec.message.lower()] -def test_extract_database_skip_property(ws): +def test_extract_database_skip_property(): errors = {} rows = {} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "ucx") + table_crawler = TablesCrawler(backend, "ucx") assert "databricks.labs.ucx.skip" in table_crawler.parse_database_props("(databricks.labs.ucx.skip,true)") diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 4a096ad125..a3b1926975 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -42,7 +42,7 @@ def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws): errors = {} rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "managed_mnt", "managed_other"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -93,7 +93,7 @@ def test_dbfs_non_delta_tables_should_produce_proper_queries(ws): ] } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["dbfs_parquet"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -129,7 +129,7 @@ def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, 
rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") + table_crawler = TablesCrawler(crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -151,7 +151,7 @@ def test_migrate_external_tables_should_produce_proper_queries(ws): rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") + table_crawler = TablesCrawler(crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -182,7 +182,7 @@ def test_migrate_external_table_failed_sync(ws, caplog): rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("LOCATION_OVERLAP", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) crawler_backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") + table_crawler = TablesCrawler(crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -287,7 +287,7 @@ def test_migrate_external_hiveserde_table_in_place( }, fails_on_first=errors, ) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["external_hiveserde"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) mount_crawler = create_autospec(Mounts) @@ -339,7 +339,7 @@ def test_migrate_external_hiveserde_table_in_place( ) def test_migrate_external_tables_ctas_should_produce_proper_queries(ws, what, test_table, expected_query): backend = MockBackend() - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping([test_table]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) mounts_crawler = create_autospec(Mounts) @@ -364,7 +364,7 @@ def test_migrate_already_upgraded_table_should_produce_no_queries(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") + table_crawler = TablesCrawler(crawler_backend, "inventory_database") ws.catalogs.list.return_value = [CatalogInfo(name="cat1")] ws.schemas.list.return_value = [ SchemaInfo(catalog_name="cat1", name="test_schema1"), @@ -407,7 +407,7 @@ def test_migrate_unsupported_format_table_should_produce_no_queries(ws): rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, crawler_backend, "inventory_database") + table_crawler = 
TablesCrawler(crawler_backend, "inventory_database") table_mapping = mock_table_mapping(["external_src_unsupported"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) @@ -432,7 +432,7 @@ def test_migrate_view_should_produce_proper_queries(ws): ) rows = {"SHOW CREATE TABLE": [{"createtab_stmt": original_view}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -480,7 +480,7 @@ def test_migrate_view_with_columns(ws): create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs" rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -1048,7 +1048,7 @@ def test_table_in_mount_mapping_with_table_owner(ws): Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), ) ] - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(client, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) table_migrate = TablesMigrator( @@ -1091,7 +1091,7 @@ def test_table_in_mount_mapping_with_partition_information(ws): Rule("prod", "tgt_catalog", "mounted_datalake", "tgt_db", "abfss://bucket@msft/path/test", "test"), ) ] - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(client, backend, "inventory_database", table_crawler) migrate_grants = create_autospec(MigrateGrants) table_migrate = TablesMigrator( @@ -1115,7 +1115,7 @@ def test_migrate_view_failed(ws, caplog): create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs" rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]} backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs", "view"]) migration_status_refresher = create_autospec(TableMigrationStatusRefresher) migration_status_refresher.get_seen_tables.return_value = { @@ -1148,7 +1148,7 @@ def test_migrate_view_failed(ws, caplog): def test_migrate_dbfs_root_tables_failed(ws, caplog): errors = {"CREATE TABLE IF NOT EXISTS": "error"} backend = MockBackend(fails_on_first=errors, rows={}) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") table_mapping = mock_table_mapping(["managed_dbfs"]) migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", 
table_crawler) migrate_grants = create_autospec(MigrateGrants) diff --git a/tests/unit/hive_metastore/test_table_size.py b/tests/unit/hive_metastore/test_table_size.py index 540eb66e19..29a470bf56 100644 --- a/tests/unit/hive_metastore/test_table_size.py +++ b/tests/unit/hive_metastore/test_table_size.py @@ -13,7 +13,7 @@ class SparkSession: pass -def test_table_size_crawler(ws, mocker): +def test_table_size_crawler(mocker): errors = {} rows = { "table_size": [], @@ -33,7 +33,7 @@ def test_table_size_crawler(ws, mocker): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = [100, 200, 300] results = tsc.snapshot() assert "ANALYZE table `hive_metastore`.`db1`.`table1` compute STATISTICS NOSCAN" in backend.queries @@ -43,7 +43,7 @@ def test_table_size_crawler(ws, mocker): assert TableSize("hive_metastore", "db1", "table2", 200) in results -def test_table_size_unknown_error(ws, mocker, caplog): +def test_table_size_unknown_error(mocker, caplog): errors = {} rows = { "table_size": [], @@ -55,7 +55,7 @@ def test_table_size_unknown_error(ws, mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception(...) with caplog.at_level(logging.WARNING): @@ -64,7 +64,7 @@ def test_table_size_unknown_error(ws, mocker, caplog): assert len(results) == 0 -def test_table_size_table_or_view_not_found(ws, mocker, caplog): +def test_table_size_table_or_view_not_found(mocker, caplog): errors = {} rows = { "table_size": [], @@ -76,7 +76,7 @@ def test_table_size_table_or_view_not_found(ws, mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) # table removed after crawling tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( @@ -90,7 +90,7 @@ def test_table_size_table_or_view_not_found(ws, mocker, caplog): assert "Failed to evaluate hive_metastore.db1.table1 table size. 
Table not found" in caplog.text -def test_table_size_delta_table_not_found(ws, mocker, caplog): +def test_table_size_delta_table_not_found(mocker, caplog): errors = {} rows = { "table_size": [], @@ -102,7 +102,7 @@ def test_table_size_delta_table_not_found(ws, mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) # table removed after crawling tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( @@ -116,7 +116,7 @@ def test_table_size_delta_table_not_found(ws, mocker, caplog): assert "Failed to evaluate hive_metastore.db1.table1 table size. Table not found" in caplog.text -def test_table_size_when_table_corrupted(ws, mocker, caplog): +def test_table_size_when_table_corrupted(mocker, caplog): errors = {} rows = { "table_size": [], @@ -128,7 +128,7 @@ def test_table_size_when_table_corrupted(ws, mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( "[DELTA_MISSING_TRANSACTION_LOG]" @@ -141,7 +141,7 @@ def test_table_size_when_table_corrupted(ws, mocker, caplog): assert "Delta table hive_metastore.db1.table1 is corrupt: missing transaction log" in caplog.text -def test_table_size_when_delta_invalid_format_error(ws, mocker, caplog): +def test_table_size_when_delta_invalid_format_error(mocker, caplog): errors = {} rows = { "table_size": [], @@ -153,7 +153,7 @@ def test_table_size_when_delta_invalid_format_error(ws, mocker, caplog): backend = MockBackend(fails_on_first=errors, rows=rows) pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session - tsc = TableSizeCrawler(TablesCrawler(ws, backend, "inventory_database")) + tsc = TableSizeCrawler(TablesCrawler(backend, "inventory_database")) tsc._spark._jsparkSession.table().queryExecution().analyzed().stats().sizeInBytes.side_effect = Exception( "[DELTA_INVALID_FORMAT]" diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 5c53e18b81..f8b02a3b88 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -174,18 +174,18 @@ def test_tables_returning_error_when_describing(ws): ], } backend = MockBackend(fails_on_first=errors, rows=rows) - tables_crawler = TablesCrawler(ws, backend, "default") + tables_crawler = TablesCrawler(backend, "default") results = tables_crawler.snapshot() assert len(results) == 1 first = results[0] assert first.upgraded_to == 'fake_cat.fake_ext.fake_delta' -def test_tables_returning_error_when_show_tables(ws, caplog): +def test_tables_returning_error_when_show_tables(caplog): errors = {"SHOW TABLES FROM `hive_metastore`.`database`": "SCHEMA_NOT_FOUND"} rows = {"SHOW DATABASES": [("database",)]} backend = MockBackend(fails_on_first=errors, rows=rows) - tables_crawler = TablesCrawler(ws, backend, "default") + tables_crawler = TablesCrawler(backend, "default") results = tables_crawler.snapshot() assert len(results) == 0 assert "Schema 
hive_metastore.database no longer exists" in caplog.text @@ -285,13 +285,13 @@ def test_table_what(table, what): assert table.what == what -def test_tables_crawler_should_filter_by_database(ws): +def test_tables_crawler_should_filter_by_database(): rows = { "SHOW TABLES FROM `hive_metastore`.`database`": [("", "table1", ""), ("", "table2", "")], "SHOW TABLES FROM `hive_metastore`.`database_2`": [("", "table1", "")], } backend = MockBackend(rows=rows) - tables_crawler = TablesCrawler(ws, backend, "default", ["database"]) + tables_crawler = TablesCrawler(backend, "default", ["database"]) results = tables_crawler.snapshot() assert len(results) == 2 assert sorted(backend.queries) == sorted( @@ -304,7 +304,7 @@ def test_tables_crawler_should_filter_by_database(ws): ) -def test_is_partitioned_flag(ws): +def test_is_partitioned_flag(): rows = { "SHOW DATABASES": [("database",)], "SHOW TABLES FROM `hive_metastore`.`database`": [("", "table1", ""), ("", "table2", "")], @@ -325,7 +325,7 @@ def test_is_partitioned_flag(ws): ], } backend = MockBackend(rows=rows) - tables_crawler = TablesCrawler(ws, backend, "default") + tables_crawler = TablesCrawler(backend, "default") results = tables_crawler.snapshot() assert len(results) == 2 assert ( @@ -531,7 +531,7 @@ def test_in_place_migrate_hiveserde_sql_parsing_failure(caplog, ddl, expected_lo assert expected_log in caplog.text -def test_fast_table_scan_crawler_already_crawled(ws, mocker): +def test_fast_table_scan_crawler_already_crawled(mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -544,12 +544,12 @@ def test_fast_table_scan_crawler_already_crawled(ws, mocker): ], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") results = ftsc.snapshot() assert len(results) == 3 -def test_fast_table_scan_crawler_crawl_new(ws, caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_new(caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -558,7 +558,7 @@ def test_fast_table_scan_crawler_crawl_new(ws, caplog, mocker, spark_table_crawl "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") mock_list_databases_iterator, mock_list_tables_iterator, get_table_mock = spark_table_crawl_mocker # pylint: disable=protected-access @@ -580,7 +580,7 @@ def test_fast_table_scan_crawler_crawl_new(ws, caplog, mocker, spark_table_crawl ) -def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(ws, caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -590,7 +590,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(ws, caplog, "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") # pylint: disable=protected-access 
ftsc._spark._jsparkSession.sharedState().externalCatalog().listDatabases.side_effect = Exception( @@ -602,7 +602,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_databases(ws, caplog, assert "Test listDatabases warning" in caplog.text -def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(ws, caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -612,7 +612,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(ws, caplog, moc "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") mock_list_databases_iterator, _, _ = spark_table_crawl_mocker @@ -627,7 +627,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_list_tables(ws, caplog, moc assert "Test listTables warning" in caplog.text -def test_fast_table_scan_crawler_crawl_test_warnings_get_table(ws, caplog, mocker, spark_table_crawl_mocker): +def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, spark_table_crawl_mocker): pyspark_sql_session = mocker.Mock() sys.modules["pyspark.sql.session"] = pyspark_sql_session @@ -637,7 +637,7 @@ def test_fast_table_scan_crawler_crawl_test_warnings_get_table(ws, caplog, mocke "hive_metastore.inventory_database.tables": [], } sql_backend = MockBackend(fails_on_first=errors, rows=rows) - ftsc = FasterTableScanCrawler(ws, sql_backend, "inventory_database") + ftsc = FasterTableScanCrawler(sql_backend, "inventory_database") mock_list_databases_iterator, mock_list_tables_iterator, _ = spark_table_crawl_mocker diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 5dc5b7070c..b3ba27a63e 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -23,23 +23,23 @@ def test_key(): SHOW_FUNCTIONS = MockBackend.rows("function") -def test_udfs_returning_error_when_describing(ws): +def test_udfs_returning_error_when_describing(): errors = {"DESCRIBE FUNCTION EXTENDED hive_metastore.database.function1": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("database",),], "SHOW USER FUNCTIONS FROM hive_metastore.database": SHOW_FUNCTIONS[("hive_metastore.database.function1",),], } backend = MockBackend(fails_on_first=errors, rows=rows) - udf_crawler = UdfsCrawler(ws, backend, "default") + udf_crawler = UdfsCrawler(backend, "default") results = udf_crawler.snapshot() assert len(results) == 0 -def test_tables_crawler_should_filter_by_database(ws): +def test_tables_crawler_should_filter_by_database(): rows = { "SHOW USER FUNCTIONS FROM `hive_metastore`.`database`": SHOW_FUNCTIONS[("hive_metastore.database.function1",),], } backend = MockBackend(rows=rows) - udf_crawler = UdfsCrawler(ws, backend, "default", ["database"]) + udf_crawler = UdfsCrawler(backend, "default", ["database"]) results = udf_crawler.snapshot() assert len(results) == 1 diff --git a/tests/unit/recon/test_migration_recon.py b/tests/unit/recon/test_migration_recon.py index e8ce64d9c5..febfda4092 100644 --- a/tests/unit/recon/test_migration_recon.py +++ b/tests/unit/recon/test_migration_recon.py @@ -62,12 +62,11 @@ def test_migrate_recon_should_produce_proper_queries( "WITH compare_results": data_comp_row_factory[(102, 100, 
2),], } backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(ws, backend, "inventory_database") + table_crawler = TablesCrawler(backend, "inventory_database") migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) metadata_retriever = DatabricksTableMetadataRetriever(backend) data_profiler = StandardDataProfiler(backend, metadata_retriever) migration_recon = MigrationRecon( - ws, backend, "inventory_database", migration_status_refresher, diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index f89dd2f882..0c1063b820 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -9,9 +9,9 @@ ) -def test_crawler_appends_dfsas(ws): +def test_crawler_appends_dfsas(): backend = MockBackend() - crawler = DirectFsAccessCrawler.for_paths(ws, backend, "schema") + crawler = DirectFsAccessCrawler.for_paths(backend, "schema") existing = list(crawler.snapshot()) assert not existing dfsas = list( diff --git a/tests/unit/workspace_access/test_manager.py b/tests/unit/workspace_access/test_manager.py index c99bdf307b..b4cff1f5e5 100644 --- a/tests/unit/workspace_access/test_manager.py +++ b/tests/unit/workspace_access/test_manager.py @@ -13,8 +13,8 @@ from databricks.labs.ucx.workspace_access.manager import PermissionManager, Permissions -def test_inventory_permission_manager_init(ws, mock_backend): - permission_manager = PermissionManager(ws, mock_backend, "test_database", []) +def test_inventory_permission_manager_init(mock_backend): + permission_manager = PermissionManager(mock_backend, "test_database", []) assert permission_manager.full_name == "hive_metastore.test_database.permissions" @@ -22,7 +22,7 @@ def test_inventory_permission_manager_init(ws, mock_backend): _PermissionsRow = Row.factory(["object_id", "object_type", "raw"]) -def test_snapshot_fetch(ws) -> None: +def test_snapshot_fetch() -> None: """Verify that the snapshot will load existing data from the inventory.""" sql_backend = MockBackend( rows={ @@ -31,18 +31,18 @@ def test_snapshot_fetch(ws) -> None: ], } ) - permission_manager = PermissionManager(ws, sql_backend, "test_database", []) + permission_manager = PermissionManager(sql_backend, "test_database", []) output = list(permission_manager.snapshot()) assert output[0] == Permissions(object_id="object1", object_type="clusters", raw="test acl") -def test_snapshot_crawl_fallback(ws, mocker) -> None: +def test_snapshot_crawl_fallback(mocker) -> None: """Verify that the snapshot will first attempt to load the (empty) inventory and then crawl.""" some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: None, lambda: Permissions("a", "b", "c"), lambda: None] sql_backend = MockBackend(rows={"SELECT object_id, object_type, raw FROM ": []}) - permission_manager = PermissionManager(ws, sql_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(sql_backend, "test_database", [some_crawler]) permission_manager.snapshot() @@ -51,7 +51,7 @@ def test_snapshot_crawl_fallback(ws, mocker) -> None: ) -def test_manager_snapshot_crawl_ignore_disabled_features(ws, mock_backend, mocker): +def test_manager_snapshot_crawl_ignore_disabled_features(mock_backend, mocker): def raise_error(): raise DatabricksError( "Model serving is not enabled for your shard. 
" @@ -61,7 +61,7 @@ def raise_error(): some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: None, lambda: Permissions("a", "b", "c"), raise_error] - permission_manager = PermissionManager(ws, mock_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(mock_backend, "test_database", [some_crawler]) permission_manager.snapshot() @@ -70,7 +70,7 @@ def raise_error(): ) -def test_manager_snapshot_crawl_with_error(ws, mock_backend, mocker): +def test_manager_snapshot_crawl_with_error(mock_backend, mocker): def raise_error(): raise DatabricksError( "Fail the job", @@ -82,14 +82,14 @@ def raise_error_no_code(): some_crawler = mocker.Mock() some_crawler.get_crawler_tasks = lambda: [lambda: Permissions("a", "b", "c"), raise_error, raise_error_no_code] - permission_manager = PermissionManager(ws, mock_backend, "test_database", [some_crawler]) + permission_manager = PermissionManager(mock_backend, "test_database", [some_crawler]) with pytest.raises(ManyError) as expected_err: permission_manager.snapshot() assert len(expected_err.value.errs) == 2 -def test_manager_apply(ws, mocker): +def test_manager_apply(mocker): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -141,7 +141,7 @@ def test_manager_apply(ws, mocker): # this emulates a real applier and call to an API mock_applier.get_apply_task = lambda item, _: lambda: applied_items.add(f"{item.object_id} {item.object_id}") - permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_applier]) + permission_manager = PermissionManager(sql_backend, "test_database", [mock_applier]) group_migration_state = MigrationState( [ MigratedGroup( @@ -170,7 +170,7 @@ def test_unregistered_support(ws): ] } ) - permission_manager = PermissionManager(ws, sql_backend, "test", []) + permission_manager = PermissionManager(sql_backend, "test", []) permission_manager.apply_group_permissions(migration_state=MigrationState([])) @@ -207,7 +207,7 @@ def test_manager_verify(ws): # this emulates a real verifier and call to an API mock_verifier.get_verify_task = lambda item: lambda: items.add(f"{item.object_id} {item.object_id}") - permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) result = permission_manager.verify_group_permissions() assert result @@ -242,7 +242,7 @@ def test_manager_verify_not_supported_type(ws): mock_verifier = create_autospec(AclSupport) # pylint: disable=mock-no-usage mock_verifier.object_types = lambda: {"not_supported"} - permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) with pytest.raises(ValueError): permission_manager.verify_group_permissions() @@ -279,7 +279,7 @@ def test_manager_verify_no_tasks(ws): # this emulates a real verifier and call to an API mock_verifier.get_verify_task = lambda item: None - permission_manager = PermissionManager(ws, sql_backend, "test_database", [mock_verifier]) + permission_manager = PermissionManager(sql_backend, "test_database", [mock_verifier]) result = permission_manager.verify_group_permissions() assert result diff --git a/tests/unit/workspace_access/test_tacl.py b/tests/unit/workspace_access/test_tacl.py index 9afb6f0c05..fa6d4614bc 100644 --- a/tests/unit/workspace_access/test_tacl.py +++ b/tests/unit/workspace_access/test_tacl.py @@ -22,7 +22,7 @@ SHOW_TABLES = 
MockBackend.rows("databaseName", "tableName", "isTmp") -def test_tacl_crawler(ws): +def test_tacl_crawler(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -30,8 +30,8 @@ def test_tacl_crawler(ws): ] } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -43,7 +43,7 @@ def test_tacl_crawler(ws): assert obj.object_id == "catalog_a.database_b.table_c" -def test_tacl_udf_crawler(ws): +def test_tacl_udf_crawler(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -51,8 +51,8 @@ def test_tacl_udf_crawler(ws): ] } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -64,7 +64,7 @@ def test_tacl_udf_crawler(ws): assert obj.object_id == "catalog_a.database_b.function_c" -def test_tacl_crawler_multiple_permissions(ws): +def test_tacl_crawler_multiple_permissions(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -92,8 +92,8 @@ def test_tacl_crawler_multiple_permissions(ws): ] } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -244,7 +244,7 @@ def test_tacl_crawler_multiple_permissions(ws): ) == Grant(**json.loads(permissions.raw)) -def test_tacl_applier(ws): +def test_tacl_applier(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -255,8 +255,8 @@ def test_tacl_applier(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -296,10 +296,10 @@ def test_tacl_applier(ws): assert validation_res -def test_tacl_applier_not_applied(ws): +def test_tacl_applier_not_applied(): sql_backend = MockBackend(rows={"SELECT \\* FROM `hive_metastore`.`test`.`grants`": []}) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -339,7 +339,7 @@ def test_tacl_applier_not_applied(ws): assert not validation_res -def test_tacl_udf_applier(ws): +def test_tacl_udf_applier(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -350,8 +350,8 @@ def test_tacl_udf_applier(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, 
"test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -391,7 +391,7 @@ def test_tacl_udf_applier(ws): assert validation_res -def test_tacl_applier_multiple_actions(ws): +def test_tacl_applier_multiple_actions(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -403,8 +403,8 @@ def test_tacl_applier_multiple_actions(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -444,7 +444,7 @@ def test_tacl_applier_multiple_actions(ws): assert validation_res -def test_tacl_applier_deny_and_grant(ws): +def test_tacl_applier_deny_and_grant(): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ @@ -457,8 +457,8 @@ def test_tacl_applier_deny_and_grant(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -535,7 +535,7 @@ def test_tacl_applier_no_target_principal(mocker): assert not sql_backend.queries -def test_verify_task_should_return_true_if_permissions_applied(ws): +def test_verify_task_should_return_true_if_permissions_applied(): sql_backend = MockBackend( rows={ "SHOW GRANTS ON TABLE `catalog_a`.`database_b`.`table_c`": SHOW_GRANTS[ @@ -543,8 +543,8 @@ def test_verify_task_should_return_true_if_permissions_applied(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -567,7 +567,7 @@ def test_verify_task_should_return_true_if_permissions_applied(ws): assert result -def test_verify_task_should_fail_if_permissions_not_applied(ws): +def test_verify_task_should_fail_if_permissions_not_applied(): sql_backend = MockBackend( rows={ "SHOW GRANTS ON TABLE `catalog_a`.`database_b`.`table_c`": SHOW_GRANTS[ @@ -575,8 +575,8 @@ def test_verify_task_should_fail_if_permissions_not_applied(ws): ], } ) - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) @@ -599,10 +599,10 @@ def test_verify_task_should_fail_if_permissions_not_applied(ws): task() -def test_verify_task_should_return_false_if_not_grants_present(ws): +def test_verify_task_should_return_false_if_not_grants_present(): sql_backend = MockBackend() - tables_crawler = TablesCrawler(ws, sql_backend, "test") - udf_crawler = UdfsCrawler(ws, sql_backend, "test") + 
tables_crawler = TablesCrawler(sql_backend, "test") + udf_crawler = UdfsCrawler(sql_backend, "test") grants_crawler = GrantsCrawler(tables_crawler, udf_crawler) table_acl_support = TableAclSupport(grants_crawler, sql_backend) From 7db7aa096173025c123be083adf324574030ede2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 16:58:18 +0200 Subject: [PATCH 13/58] More reverting. --- tests/unit/framework/test_crawlers.py | 2 +- tests/unit/hive_metastore/test_table_migrate.py | 4 ++-- tests/unit/hive_metastore/test_tables.py | 2 +- tests/unit/workspace_access/test_manager.py | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit/framework/test_crawlers.py b/tests/unit/framework/test_crawlers.py index 1547841bdf..2fa5c9bfc9 100644 --- a/tests/unit/framework/test_crawlers.py +++ b/tests/unit/framework/test_crawlers.py @@ -76,7 +76,7 @@ def test_snapshot_crawls_when_no_prior_crawl() -> None: assert [Baz(first="first")] == result -def test_snapshot_crawls_when_prior_crawl_yielded_no_data(ws) -> None: +def test_snapshot_crawls_when_prior_crawl_yielded_no_data() -> None: """Check that the crawler is invoked when the fetcher reports that the inventory exists but doesn't contain data.""" mock_backend = MockBackend() mock_fetcher = Mock(return_value=[]) diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index a3b1926975..686eedcd98 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1027,7 +1027,7 @@ def test_migrate_views_should_be_properly_sequenced(ws): assert next((key for key in table_keys if key == "hive_metastore.db1_src.t1_src"), None) is None -def test_table_in_mount_mapping_with_table_owner(ws): +def test_table_in_mount_mapping_with_table_owner(): client = create_autospec(WorkspaceClient) client.tables.get.side_effect = NotFound() backend = MockBackend( @@ -1067,7 +1067,7 @@ def test_table_in_mount_mapping_with_table_owner(ws): migrate_grants.apply.assert_called() -def test_table_in_mount_mapping_with_partition_information(ws): +def test_table_in_mount_mapping_with_partition_information(): client = create_autospec(WorkspaceClient) client.tables.get.side_effect = NotFound() backend = MockBackend( diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index f8b02a3b88..ced4be1501 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -158,7 +158,7 @@ def test_uc_sql_when_table_is_in_mount(schema, partitions, table_schema): assert table.sql_migrate_table_in_mount(target, table_schema) == expected -def test_tables_returning_error_when_describing(ws): +def test_tables_returning_error_when_describing(): errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`database`.`table1`": "error"} rows = { "SHOW DATABASES": [("database",)], diff --git a/tests/unit/workspace_access/test_manager.py b/tests/unit/workspace_access/test_manager.py index b4cff1f5e5..327defda1a 100644 --- a/tests/unit/workspace_access/test_manager.py +++ b/tests/unit/workspace_access/test_manager.py @@ -162,7 +162,7 @@ def test_manager_apply(mocker): assert {"test2 test2", "test test"} == applied_items -def test_unregistered_support(ws): +def test_unregistered_support(): sql_backend = MockBackend( rows={ "SELECT": [ @@ -174,7 +174,7 @@ def test_unregistered_support(ws): permission_manager.apply_group_permissions(migration_state=MigrationState([])) -def test_manager_verify(ws): 
+def test_manager_verify(): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -214,7 +214,7 @@ def test_manager_verify(ws): assert {"test test"} == items -def test_manager_verify_not_supported_type(ws): +def test_manager_verify_not_supported_type(): sql_backend = MockBackend( rows={ "SELECT object_id": [ @@ -248,7 +248,7 @@ def test_manager_verify_not_supported_type(ws): permission_manager.verify_group_permissions() -def test_manager_verify_no_tasks(ws): +def test_manager_verify_no_tasks(): sql_backend = MockBackend( rows={ "SELECT object_id": [ From 7676f7cf9e4e6357788c7fcf8eefc9d221b6eba4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 16:58:32 +0200 Subject: [PATCH 14/58] Whitespace. --- src/databricks/labs/ucx/framework/utils.py | 1 - src/databricks/labs/ucx/hive_metastore/udfs.py | 7 +------ src/databricks/labs/ucx/source_code/directfs_access.py | 8 +------- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/framework/utils.py b/src/databricks/labs/ucx/framework/utils.py index 0a291960f6..d428447911 100644 --- a/src/databricks/labs/ucx/framework/utils.py +++ b/src/databricks/labs/ucx/framework/utils.py @@ -1,7 +1,6 @@ import logging import subprocess - logger = logging.getLogger(__name__) diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 40992d0524..6ee1eefd38 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -34,12 +34,7 @@ def key(self) -> str: class UdfsCrawler(CrawlerBase[Udf]): - def __init__( - self, - backend: SqlBackend, - schema: str, - include_databases: list[str] | None = None, - ): + def __init__(self, backend: SqlBackend, schema: str, include_databases: list[str] | None = None): """ Initializes a UdfsCrawler instance. diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 372b15e464..26acf95215 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -31,13 +31,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): sql_backend (SqlBackend): The SQL Execution Backend abstraction (either REST API or Spark) schema: The schema name for the inventory persistence. """ - super().__init__( - backend=backend, - catalog="hive_metastore", - schema=schema, - table=table, - klass=DirectFsAccess, - ) + super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess) def dump_all(self, dfsas: Sequence[DirectFsAccess]): """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one From 70102375861f9a95cb51ce36927cde1ee6e80f0b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 17:01:02 +0200 Subject: [PATCH 15/58] Implement more unit tests. 
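
The two new tests pin down the caching of the fallback administrator lookup: once per Ownership instance (via a cached property) and once per workspace (via the class-level cache). A simplified, self-contained illustration of the per-instance part follows; the class and attribute names below are hypothetical stand-ins rather than UCX code:

    from functools import cached_property
    from unittest.mock import Mock

    class _Example:
        """Stand-in for an object that caches an expensive REST lookup per instance."""

        def __init__(self, client) -> None:
            self._client = client

        @cached_property
        def admin(self) -> str:
            # The underlying call is made at most once per instance.
            return self._client.get_workspace_id()

    client = Mock()
    client.get_workspace_id.return_value = "admin@example.com"
    example = _Example(client)
    assert example.admin is example.admin  # same cached object on repeated access
    client.get_workspace_id.assert_called_once()

The class-level cache test follows the same idea, but asserts that a second instance for the same workspace does not trigger another users.list() call.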
--- tests/unit/framework/test_owners.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 6ee03bc753..3d2cf93547 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -173,12 +173,25 @@ def test_ownership_error_when_no_owner_can_be_located(ws) -> None: def test_ownership_fallback_instance_cache(ws) -> None: """Verify that the fallback owner is cached on each instance to avoid many REST calls.""" - pytest.xfail("Not yet implemented") + _setup_accounts(ws, account_users=[_create_account_admin("jane")]) + + ownership = _OwnershipFixture[str](ws) + owner1 = ownership.owner_of("school") + owner2 = ownership.owner_of("school") + + assert owner1 is owner2 + ws.get_workspace_id.assert_called_once() def test_ownership_fallback_class_cache(ws) -> None: """Verify that the fallback owner for a workspace is cached at class level to avoid many REST calls.""" - pytest.xfail("Not yet implemented") + _setup_accounts(ws, account_users=[_create_account_admin("jane")]) + + owner1 = _OwnershipFixture[str](ws).owner_of("school") + owner2 = _OwnershipFixture[str](ws).owner_of("school") + + assert owner1 is owner2 + ws.users.list.assert_called_once() def test_ownership_fallback_class_cache_multiple_workspaces(ws) -> None: @@ -187,5 +200,5 @@ def test_ownership_fallback_class_cache_multiple_workspaces(ws) -> None: def test_ownership_fallback_error_handling(ws) -> None: - """Verify that the class-level owner-cache and tracks errors to avoid many REST calls.""" + """Verify that the class-level administrator-cache and tracks errors to avoid many REST calls.""" pytest.xfail("Not yet implemented") From d1e24eb5bdac04c862314cd8d12d8a7b71b21cfe Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 3 Oct 2024 18:13:51 +0200 Subject: [PATCH 16/58] Refactor workspace/account admin lookup into separate components. 
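
The administrator lookup now lives in dedicated finder classes, with AdministratorLocator trying each in turn: active members of the 'admins' workspace group first, then account administrators. A rough usage sketch, assuming only the names introduced by this change (the client construction and printing are illustrative, not part of UCX):

    from databricks.sdk import WorkspaceClient

    from databricks.labs.ucx.framework.owners import (
        AccountAdministratorFinder,
        AdministratorLocator,
        WorkspaceAdministratorFinder,
    )

    ws = WorkspaceClient()

    # Default wiring: workspace admins are preferred over account admins.
    locator = AdministratorLocator(ws)
    print(locator.workspace_administrator)  # raises RuntimeError if no admin is found

    # The finders can also be used directly, e.g. to enumerate all candidates.
    for finder in (WorkspaceAdministratorFinder(ws), AccountAdministratorFinder(ws)):
        for user in finder.find_admin_users():
            print(user.user_name)

Keeping the finders separate from the locator means each lookup strategy can be unit-tested on its own, while Ownership only depends on the locator.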
--- .../labs/ucx/contexts/application.py | 5 + src/databricks/labs/ucx/framework/owners.py | 142 ++++++++++-------- tests/integration/framework/test_owners.py | 19 +-- tests/unit/framework/test_owners.py | 44 +----- 4 files changed, 92 insertions(+), 118 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 95944a3d2a..75ba9af9e6 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -24,6 +24,7 @@ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler from databricks.labs.ucx.aws.credentials import CredentialManager from databricks.labs.ucx.config import WorkspaceConfig +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore import ExternalLocations, Mounts, TablesCrawler from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema from databricks.labs.ucx.hive_metastore.grants import ( @@ -485,6 +486,10 @@ def migration_recon(self): self.config.recon_tolerance_percent, ) + @cached_property + def administrator_locator(self) -> AdministratorLocator: + return AdministratorLocator(self.workspace_client) + class CliContext(GlobalContext, abc.ABC): @cached_property diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 5490350597..865c2e7495 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -1,12 +1,12 @@ import functools import logging from abc import ABC, abstractmethod -from collections.abc import Iterable +from collections.abc import Callable, Iterable, Sequence from functools import cached_property from typing import ClassVar, Generic, Protocol, TypeVar, final from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import DatabricksError, NotFound +from databricks.sdk.errors import NotFound from databricks.sdk.service.iam import User logger = logging.getLogger(__name__) @@ -19,25 +19,18 @@ class DataclassInstance(Protocol): Record = TypeVar("Record") -class Ownership(ABC, Generic[Record]): - """Determine an owner for a given type of object.""" +class _AdministratorFinder(ABC): + def __init__(self, ws: WorkspaceClient): + self._ws = ws - _cached_workspace_admins: dict[int, str | Exception] = {} - """Cached user names of workspace administrators, keyed by workspace id.""" + @abstractmethod + def find_admin_users(self) -> Iterable[User]: + """Locate active admin users.""" + raise NotImplementedError() - @classmethod - def reset_cache(cls) -> None: - """Reset the cache of discovered administrators that we maintain at class level.""" - # Intended for use by tests. 
- cls._cached_workspace_admins = {} - def __init__(self, ws: WorkspaceClient) -> None: - self._ws = ws - - @staticmethod - def _has_role(user: User, role: str) -> bool: - """Determine whether a user has a given role or not.""" - return user.roles is not None and any(r.value == role for r in user.roles) +class WorkspaceAdministratorFinder(_AdministratorFinder): + """Locate the users that are in the 'admin' workspace group for a given workspace.""" @staticmethod def _member_of_group_named(user: User, group_name: str) -> bool: @@ -63,7 +56,7 @@ def _filter_workspace_groups(self, identifiers: Iterable[str]) -> Iterable[str]: if group.meta and group.meta.resource_type == "WorkspaceGroup": yield group_id - def _find_workspace_admins(self) -> Iterable[User]: + def find_admin_users(self) -> Iterable[User]: """Enumerate the active workspace administrators in a given workspace. Returns: @@ -94,7 +87,16 @@ def _find_workspace_admins(self) -> Iterable[User]: msg = f"Multiple 'admins' workspace groups found; something is wrong: {admin_groups}" raise RuntimeError(msg) - def _find_account_admins(self) -> Iterable[User]: + +class AccountAdministratorFinder(_AdministratorFinder): + """Locate the users that are account administrators for this workspace.""" + + @staticmethod + def _has_role(user: User, role: str) -> bool: + """Determine whether a user has a given role or not.""" + return user.roles is not None and any(r.value == role for r in user.roles) + + def find_admin_users(self) -> Iterable[User]: """Enumerate the active account administrators associated with a given workspace. Returns: @@ -109,19 +111,68 @@ def _find_account_admins(self) -> Iterable[User]: # Reference: https://learn.microsoft.com/en-us/azure/databricks/admin/users-groups/groups#account-admin return (user for user in all_users if user.active and user.user_name and self._has_role(user, "account_admin")) - def _find_an_admin(self) -> User | None: - """Locate an active administrator for the current workspace. - If an active workspace administrator can be located, this is returned. When there are multiple, they are sorted - alphabetically by user-name and the first is returned. If there are no workspace administrators then an active - account administrator is sought, again returning the first alphabetically by user-name if there is more than one. +class AdministratorLocator: + """Locate a workspace administrator, if possible. - Returns: - the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can - be found. + This will first try to find an active workspace administrator. If there are multiple, the first (alphabetically + sorted by user-name) will be used. If no active workspace administrators can be found then an account administrator + is sought, again returning the first alphabetically by user-name if more than one is found. + """ + + def __init__( + self, + ws: WorkspaceClient, + *, + finders: Sequence[Callable[[WorkspaceClient], _AdministratorFinder]] = ( + WorkspaceAdministratorFinder, + AccountAdministratorFinder, + ), + ) -> None: + """ + Initialize the instance, which will try to locate administrators using the workspace for the supplied client. + + Args: + ws (WorkspaceClient): the client for workspace in which to locate admin users. + finders: a sequence of factories that will be instantiated on demand to locate admin users. """ + self._ws = ws + self._finders = finders + + @cached_property + def _workspace_id(self) -> int: + # Makes a REST call, so we cache it. 
+ return self._ws.get_workspace_id() + + @cached_property + def _found_admin(self) -> str | None: + # Lazily instantiate and query the finders in an attempt to locate an admin user. + finders = (finder(self._ws) for finder in self._finders) + # If a finder returns multiple admin users, use the first (alphabetically by user-name). first_user = functools.partial(min, default=None, key=lambda user: user.user_name) - return first_user(self._find_workspace_admins()) or first_user(self._find_account_admins()) + found_admin_users: Iterable[User | None] = (first_user(finder.find_admin_users()) for finder in finders) + return next((user.user_name for user in found_admin_users if user), None) + + @property + def workspace_administrator(self) -> str: + """The user-name of an admin user for the workspace. + + Raises: + RuntimeError if an admin user cannot be found in the current workspace. + """ + found_admin = self._found_admin + if found_admin is None: + msg = f"No active workspace or account administrator can be found for workspace: {self._workspace_id}" + raise RuntimeError(msg) + return found_admin + + +class Ownership(ABC, Generic[Record]): + """Determine an owner for a given type of object.""" + + def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator) -> None: + self._ws = ws + self._admin_locator = admin_locator @final def owner_of(self, record: Record) -> str: @@ -139,38 +190,7 @@ def owner_of(self, record: Record) -> str: Raises: RuntimeError if there are no active administrators for the current workspace. """ - return self._get_owner(record) or self._workspace_admin - - @cached_property - def _workspace_admin(self) -> str: - # Avoid repeatedly hitting the shared cache. - return self._find_an_administrator() - - @final - def _find_an_administrator(self) -> str: - # Finding an administrator is quite expensive, so we ensure that for a given workspace we only do it once. - # Found administrators are cached on a class attribute. The method here: - # - is thread-safe, with the compromise that we might perform some redundant lookups during init. - # - no administrator is converted into an error. - # - an error during lookup is preserved and raised for subsequent requests, to avoid too many REST calls. - workspace_id = self._ws.get_workspace_id() - found_admin_or_error = self._cached_workspace_admins.get(workspace_id, None) - if found_admin_or_error is None: - logger.debug(f"Locating an active workspace or account administrator for workspace: {workspace_id}") - try: - user = self._find_an_admin() - except DatabricksError as e: - found_admin_or_error = e - else: - found_admin_or_error = user.user_name if user is not None else None - # If not found, convert once into the error that we will raise each time. 
- if found_admin_or_error is None: - msg = f"No active workspace or account administrator can be found for workspace: {workspace_id}" - found_admin_or_error = RuntimeError(msg) # pylint: disable=redefined-variable-type - self._cached_workspace_admins[workspace_id] = found_admin_or_error - if isinstance(found_admin_or_error, Exception): - raise found_admin_or_error - return found_admin_or_error + return self._get_owner(record) or self._admin_locator.workspace_administrator @abstractmethod def _get_owner(self, record: Record) -> str | None: diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index 777d3d75f4..9d4ff6e4ca 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -1,27 +1,14 @@ -from collections.abc import Callable - -from databricks.sdk import WorkspaceClient - from databricks.labs.ucx.framework.owners import Ownership, Record class _OwnershipFixture(Ownership[Record]): - def __init__( - self, - ws: WorkspaceClient, - *, - owner_fn: Callable[[Record], str | None] = lambda _: None, - ): - super().__init__(ws) - self._owner_fn = owner_fn - def _get_owner(self, record: Record) -> str | None: - return self._owner_fn(record) + return None -def test_fallback_workspace_admin(ws) -> None: +def test_fallback_workspace_admin(installation_ctx, ws) -> None: """Verify that a workspace administrator can be found for our integration environment.""" - ownership = _OwnershipFixture[str](ws) + ownership = _OwnershipFixture[str](ws, installation_ctx.administrator_locator) owner = ownership.owner_of("anything") assert owner diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 3d2cf93547..c451c53f63 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -6,7 +6,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.service import iam -from databricks.labs.ucx.framework.owners import Ownership, Record +from databricks.labs.ucx.framework.owners import AdministratorLocator, Ownership, Record class _OwnershipFixture(Ownership[Record]): @@ -15,8 +15,9 @@ def __init__( ws: WorkspaceClient, *, owner_fn: Callable[[Record], str | None] = lambda _: None, + admin_locator: AdministratorLocator | None = None, ): - super().__init__(ws) + super().__init__(ws, admin_locator if admin_locator is not None else AdministratorLocator(ws)) self._owner_fn = owner_fn def _get_owner(self, record: Record) -> str | None: @@ -72,12 +73,6 @@ def _create_workspace_group(display_name: str, group_id: str) -> iam.Group: return iam.Group(display_name=display_name, id=group_id, meta=iam.ResourceMeta(resource_type="WorkspaceGroup")) -@pytest.fixture(autouse=True) -def _clear_ownership_cache() -> None: - """Ensure that the class-level cache of workspace owners is cleared before each test.""" - Ownership.reset_cache() - - def test_ownership_prefers_record_owner(ws) -> None: """Verify that if an owner for the record can be found, that is used.""" ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") @@ -169,36 +164,3 @@ def test_ownership_error_when_no_owner_can_be_located(ws) -> None: expected_message = f"No active workspace or account administrator can be found for workspace: {workspace_id}" with pytest.raises(RuntimeError, match=re.escape(expected_message)): _ = ownership.owner_of("school") - - -def test_ownership_fallback_instance_cache(ws) -> None: - """Verify that the fallback owner is cached on each instance to avoid many REST 
calls.""" - _setup_accounts(ws, account_users=[_create_account_admin("jane")]) - - ownership = _OwnershipFixture[str](ws) - owner1 = ownership.owner_of("school") - owner2 = ownership.owner_of("school") - - assert owner1 is owner2 - ws.get_workspace_id.assert_called_once() - - -def test_ownership_fallback_class_cache(ws) -> None: - """Verify that the fallback owner for a workspace is cached at class level to avoid many REST calls.""" - _setup_accounts(ws, account_users=[_create_account_admin("jane")]) - - owner1 = _OwnershipFixture[str](ws).owner_of("school") - owner2 = _OwnershipFixture[str](ws).owner_of("school") - - assert owner1 is owner2 - ws.users.list.assert_called_once() - - -def test_ownership_fallback_class_cache_multiple_workspaces(ws) -> None: - """Verify that cache of workspace administrators supports multiple workspaces.""" - pytest.xfail("Not yet implemented") - - -def test_ownership_fallback_error_handling(ws) -> None: - """Verify that the class-level administrator-cache and tracks errors to avoid many REST calls.""" - pytest.xfail("Not yet implemented") From 9155e19fc5766be9fdbe8af70524a9ac649dedd0 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 13:13:47 +0200 Subject: [PATCH 17/58] Update integration test for locating a workspace admin to test the locator directly. --- tests/integration/framework/test_owners.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index 9d4ff6e4ca..904ffcd1e0 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -1,14 +1,8 @@ -from databricks.labs.ucx.framework.owners import Ownership, Record +from databricks.labs.ucx.contexts.workflow_task import RuntimeContext -class _OwnershipFixture(Ownership[Record]): - def _get_owner(self, record: Record) -> str | None: - return None - - -def test_fallback_workspace_admin(installation_ctx, ws) -> None: +def test_fallback_workspace_admin(installation_ctx: RuntimeContext) -> None: """Verify that a workspace administrator can be found for our integration environment.""" - ownership = _OwnershipFixture[str](ws, installation_ctx.administrator_locator) - owner = ownership.owner_of("anything") + an_admin = installation_ctx.administrator_locator.workspace_administrator - assert owner + assert "@" in an_admin From 9980c20341ce34c2717c314db4756d7f76c5a597 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 13:18:59 +0200 Subject: [PATCH 18/58] Refactor unit tests for the ownership-related classes. 
--- src/databricks/labs/ucx/framework/owners.py | 8 +- tests/unit/framework/test_owners.py | 273 ++++++++++++++++---- 2 files changed, 232 insertions(+), 49 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 865c2e7495..e8e940687a 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -19,7 +19,7 @@ class DataclassInstance(Protocol): Record = TypeVar("Record") -class _AdministratorFinder(ABC): +class AdministratorFinder(ABC): def __init__(self, ws: WorkspaceClient): self._ws = ws @@ -29,7 +29,7 @@ def find_admin_users(self) -> Iterable[User]: raise NotImplementedError() -class WorkspaceAdministratorFinder(_AdministratorFinder): +class WorkspaceAdministratorFinder(AdministratorFinder): """Locate the users that are in the 'admin' workspace group for a given workspace.""" @staticmethod @@ -88,7 +88,7 @@ def find_admin_users(self) -> Iterable[User]: raise RuntimeError(msg) -class AccountAdministratorFinder(_AdministratorFinder): +class AccountAdministratorFinder(AdministratorFinder): """Locate the users that are account administrators for this workspace.""" @staticmethod @@ -124,7 +124,7 @@ def __init__( self, ws: WorkspaceClient, *, - finders: Sequence[Callable[[WorkspaceClient], _AdministratorFinder]] = ( + finders: Sequence[Callable[[WorkspaceClient], AdministratorFinder]] = ( WorkspaceAdministratorFinder, AccountAdministratorFinder, ), diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index c451c53f63..bd7addd37f 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -1,12 +1,20 @@ import re from collections.abc import Callable, Sequence +from unittest.mock import create_autospec, Mock, PropertyMock import pytest from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import iam -from databricks.labs.ucx.framework.owners import AdministratorLocator, Ownership, Record +from databricks.labs.ucx.framework.owners import ( + AccountAdministratorFinder, + AdministratorFinder, + AdministratorLocator, + Ownership, + Record, + WorkspaceAdministratorFinder, +) class _OwnershipFixture(Ownership[Record]): @@ -15,10 +23,11 @@ def __init__( ws: WorkspaceClient, *, owner_fn: Callable[[Record], str | None] = lambda _: None, - admin_locator: AdministratorLocator | None = None, ): - super().__init__(ws, admin_locator if admin_locator is not None else AdministratorLocator(ws)) + mock_admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + super().__init__(ws, mock_admin_locator) self._owner_fn = owner_fn + self.mock_admin_locator = mock_admin_locator def _get_owner(self, record: Record) -> str | None: return self._owner_fn(record) @@ -73,56 +82,151 @@ def _create_workspace_group(display_name: str, group_id: str) -> iam.Group: return iam.Group(display_name=display_name, id=group_id, meta=iam.ResourceMeta(resource_type="WorkspaceGroup")) -def test_ownership_prefers_record_owner(ws) -> None: - """Verify that if an owner for the record can be found, that is used.""" - ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") - owner = ownership.owner_of("school") +def test_workspace_admin_finder_active_with_username(ws) -> None: + """Verify that the workspace admin finder only reports active users with a user-name.""" + admins_group = _create_workspace_group("admins", group_id="1") + inactive_admin = 
_create_workspace_admin("inactive_admin_1", admins_group_id="1") + inactive_admin.active = False + users = [ + _create_workspace_admin("only_real_admin", admins_group_id="1"), + inactive_admin, + _create_workspace_admin("", admins_group_id="1"), + ] + _setup_accounts(ws, workspace_users=users, groups=[admins_group]) - assert owner == "bob" - ws.get_workspace_id.assert_not_called() + finder = WorkspaceAdministratorFinder(ws) + admins = list(finder.find_admin_users()) + assert [admin.user_name for admin in admins] == ["only_real_admin"] -def test_ownership_admin_user_fallback(ws) -> None: - """Verify that if no owner for the record can be found, an admin user is returned instead.""" - _setup_accounts(ws, account_users=[_create_account_admin("jane")]) - ownership = _OwnershipFixture[str](ws) - owner = ownership.owner_of("school") +def test_workspace_admin_finder_admins_members(ws) -> None: + """Verify that the workspace admin finder only reports members of the 'admins' workspace group.""" + groups = [ + _create_workspace_group("admins", group_id="1"), + _create_workspace_group("users", group_id="2"), + _create_workspace_group("not_admins", group_id="3"), + iam.Group(display_name="admins", id="4", meta=iam.ResourceMeta(resource_type="Group")), + ] + users = [ + _create_workspace_admin("admin_1", admins_group_id="1"), + iam.User( + user_name="admin_2", + active=True, + groups=[ + iam.ComplexValue(display="admins", ref="Groups/1", value="1"), + iam.ComplexValue(display="users", ref="Groups/2", value="2"), + ], + ), + iam.User( + user_name="not_admin_1", + active=True, + groups=[ + iam.ComplexValue(display="users", ref="Groups/2", value="2"), + iam.ComplexValue(display="not_admins", ref="Groups/3", value="3"), + ], + ), + iam.User( + user_name="not_admin_2", + active=True, + groups=[ + iam.ComplexValue(display="admins", ref="Groups/4", value="4"), + ], + ), + ] + _setup_accounts(ws, workspace_users=users, groups=groups) - assert owner == "jane" + finder = WorkspaceAdministratorFinder(ws) + admins = list(finder.find_admin_users()) + expected_admins = {"admin_1", "admin_2"} + assert len(admins) == len(expected_admins) + assert set(admin.user_name for admin in admins) == expected_admins -def test_ownership_workspace_admin_preferred_over_account_admin(ws) -> None: - """Verify that when both workspace and account administrators are configured, the workspace admin is preferred.""" + +def test_workspace_admin_finder_no_admins(ws) -> None: + """Verify that the workspace admin finder handles no admins as a normal situation.""" admins_group = _create_workspace_group("admins", group_id="1") - assert admins_group.id - workspace_users = [_create_workspace_admin("bob", admins_group_id=admins_group.id)] - account_users = [_create_account_admin("jane")] - _setup_accounts(ws, account_users=account_users, workspace_users=workspace_users, groups=[admins_group]) + _setup_accounts(ws, workspace_users=[], groups=[admins_group]) - ownership = _OwnershipFixture[str](ws) - owner = ownership.owner_of("school") + finder = WorkspaceAdministratorFinder(ws) + admins = list(finder.find_admin_users()) + + assert not admins + + +def testa_accounts_admin_finder_active_with_username(ws) -> None: + """Verify that the account admin finder only reports active users with a user-name.""" + inactive_admin = _create_account_admin("inactive_admin") + inactive_admin.active = False + users = [ + _create_account_admin("only_real_admin"), + inactive_admin, + _create_account_admin(""), + ] + _setup_accounts(ws, account_users=users) + + finder 
= AccountAdministratorFinder(ws) + admins = list(finder.find_admin_users()) + + assert [admin.user_name for admin in admins] == ["only_real_admin"] + + +def test_accounts_admin_finder_role(ws) -> None: + """Verify that the account admin finder only reports users with the 'account_admin' role.""" + users = [ + _create_account_admin("admin_1"), + iam.User( + user_name="admin_2", + active=True, + roles=[ + iam.ComplexValue(value="account_admin"), + iam.ComplexValue(value="another_role"), + ], + ), + iam.User( + user_name="not_admin", + active=True, + roles=[ + iam.ComplexValue(value="another_role"), + ], + ), + ] + _setup_accounts(ws, account_users=users) + + finder = AccountAdministratorFinder(ws) + admins = list(finder.find_admin_users()) + + expected_admins = {"admin_1", "admin_2"} + assert len(admins) == len(expected_admins) + assert set(admin.user_name for admin in admins) == expected_admins - assert owner == "bob" +def test_accounts_admin_finder_no_admins(ws) -> None: + """Verify that the workspace admin finder handles no admins as a normal situation.""" + finder = AccountAdministratorFinder(ws) + admins = list(finder.find_admin_users()) -def test_ownership_admin_ignore_inactive(ws) -> None: - """Verify that inactive workspace administrators are ignored when locating an administrator.""" + assert not admins + + +def test_admin_locator_prefers_workspace_admin_over_account_admin(ws) -> None: + """Verify that when both workspace and account administrators are configured, the workspace admin is preferred.""" admins_group = _create_workspace_group("admins", group_id="1") assert admins_group.id - bob = _create_workspace_admin("bob", admins_group_id=admins_group.id) - bob.active = False - jane = _create_account_admin("jane") - jane.active = False - _setup_accounts(ws, account_users=[jane], workspace_users=[bob], groups=[admins_group]) + workspace_users = [_create_workspace_admin("bob", admins_group_id=admins_group.id)] + account_users = [_create_account_admin("jane")] + _setup_accounts(ws, account_users=account_users, workspace_users=workspace_users, groups=[admins_group]) - ownership = _OwnershipFixture[str](ws) - # All admins are inactive, so an exception should be raised. - with pytest.raises(RuntimeError, match="No active workspace or account administrator"): - _ = ownership.owner_of("school") + locator = AdministratorLocator(ws) + the_admin = locator.workspace_administrator + assert the_admin == "bob" + # Also verify that we didn't attempt to look up account admins. 
+ ws.api_client.do.assert_not_called() -def test_ownership_workspace_admin_prefer_first_alphabetically(ws) -> None: + +def test_admin_locator_prefer_first_workspace_admin_alphabetically(ws) -> None: """Verify that when multiple workspace administrators can found, the first alphabetically is used.""" admins_group = _create_workspace_group("admins", group_id="1") assert admins_group.id @@ -133,13 +237,13 @@ def test_ownership_workspace_admin_prefer_first_alphabetically(ws) -> None: ] _setup_accounts(ws, workspace_users=workspace_users, groups=[admins_group]) - ownership = _OwnershipFixture[str](ws) - owner = ownership.owner_of("school") + locator = AdministratorLocator(ws) + the_admin = locator.workspace_administrator - assert owner == "andrew" + assert the_admin == "andrew" -def test_ownership_account_admin_prefer_first_alphabetically(ws) -> None: +def test_admin_locator_prefer_first_account_admin_alphabetically(ws) -> None: """Verify that when multiple account administrators can found, the first alphabetically preferred is used.""" account_users = [ _create_account_admin("bob"), @@ -148,19 +252,98 @@ def test_ownership_account_admin_prefer_first_alphabetically(ws) -> None: ] _setup_accounts(ws, account_users=account_users) - ownership = _OwnershipFixture[str](ws) - owner = ownership.owner_of("school") + locator = AdministratorLocator(ws) + the_admin = locator.workspace_administrator - assert owner == "andrew" + assert the_admin == "andrew" -def test_ownership_error_when_no_owner_can_be_located(ws) -> None: +def test_admin_locator_error_when_no_admin(ws) -> None: """Verify that an error is raised when no workspace or account administrators can be found.""" _setup_accounts(ws) - ownership = _OwnershipFixture[str](ws) + locator = AdministratorLocator(ws) # No admins. 
workspace_id = ws.get_workspace_id() expected_message = f"No active workspace or account administrator can be found for workspace: {workspace_id}" with pytest.raises(RuntimeError, match=re.escape(expected_message)): + _ = locator.workspace_administrator + + +def test_admin_locator_is_lazy(ws) -> None: + """Verify that we don't attempt to locate an administrator until it's needed.""" + mock_finder = create_autospec(AdministratorFinder) + mock_finder.find_admin_users.return_value = (_create_account_admin("bob"),) + mock_finder_factory = Mock() + mock_finder_factory.return_value = mock_finder + locator = AdministratorLocator(ws, finders=[mock_finder_factory]) + + mock_finder_factory.assert_not_called() + mock_finder.assert_not_called() + + _ = locator.workspace_administrator + + mock_finder_factory.assert_called_once_with(ws) + mock_finder.find_admin_users.assert_called_once() + + +def test_admin_locator_caches_result(ws) -> None: + """Verify that locating an administrator only happens once.""" + mock_finder = create_autospec(AdministratorFinder) + mock_finder.find_admin_users.return_value = (_create_account_admin("bob"),) + mock_finder_factory = Mock() + mock_finder_factory.return_value = mock_finder + + locator = AdministratorLocator(ws, finders=[mock_finder_factory]) + _ = locator.workspace_administrator + _ = locator.workspace_administrator + + mock_finder_factory.assert_called_once_with(ws) + mock_finder.find_admin_users.assert_called_once() + + +def test_admin_locator_caches_negative_result(ws) -> None: + """Verify that locating an administrator only happens once, even if it couldn't locate an admin.""" + mock_finder = create_autospec(AdministratorFinder) + mock_finder.find_admin_users.return_value = () + mock_finder_factory = Mock() + mock_finder_factory.return_value = mock_finder + + locator = AdministratorLocator(ws, finders=[mock_finder_factory]) + with pytest.raises(RuntimeError): + _ = locator.workspace_administrator + with pytest.raises(RuntimeError): + _ = locator.workspace_administrator + + mock_finder_factory.assert_called_once_with(ws) + mock_finder.find_admin_users.assert_called_once() + + +def test_ownership_prefers_record_owner(ws) -> None: + """Verify that if an owner for the record can be found, that is used.""" + ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") + owner = ownership.owner_of("school") + + assert owner == "bob" + ownership.mock_admin_locator.workspace_administrator.assert_not_called() + + +def test_ownership_admin_user_fallback(ws) -> None: + """Verify that if no owner for the record can be found, an admin user is returned instead.""" + ownership = _OwnershipFixture[str](ws) + type(ownership.mock_admin_locator).workspace_administrator = PropertyMock(return_value="jane") + + owner = ownership.owner_of("school") + + assert owner == "jane" + + +def test_ownership_no_fallback_admin_user_error(ws) -> None: + """Verify that if no owner can be determined, an error is raised.""" + ownership = _OwnershipFixture[str](ws) + type(ownership.mock_admin_locator).workspace_administrator = PropertyMock( + side_effect=RuntimeError("Mocked admin lookup failure.") + ) + + with pytest.raises(RuntimeError, match="Mocked admin lookup failure."): _ = ownership.owner_of("school") From 53da23de7a1cc637c1a2f731ee1691a98b56450e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 13:36:46 +0200 Subject: [PATCH 19/58] Deal with some comprehension issues. 
--- src/databricks/labs/ucx/framework/owners.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index e8e940687a..95efe83c6f 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -42,6 +42,10 @@ def _member_of_group(user: User, group_id: str) -> bool: """Determine whether a user belongs to a group with the given identifier or not.""" return user.groups is not None and any(g.value == group_id for g in user.groups) + def _is_active_admin(self, user: User) -> bool: + """Determine if a user is an active administrator.""" + return bool(user.active) and self._member_of_group_named(user, "admins") + def _filter_workspace_groups(self, identifiers: Iterable[str]) -> Iterable[str]: """Limit a set of identifiers to those that are workspace groups.""" seen = set() @@ -66,17 +70,15 @@ def find_admin_users(self) -> Iterable[User]: all_users = self._ws.users.list(attributes="id,active,userName,groups") # The groups attribute is a flattened list of groups a user belongs to; hunt for the 'admins' workspace group. # Reference: https://learn.microsoft.com/en-us/azure/databricks/admin/users-groups/groups#account-vs-workspace-group - admin_users = [ - user for user in all_users if user.active and user.user_name and self._member_of_group_named(user, "admins") - ] + admin_users = [user for user in all_users if user.user_name and self._is_active_admin(user)] logger.debug(f"Verifying membership of the 'admins' workspace group for users: {admin_users}") - candidate_group_ids = ( - group.value - for user in admin_users - if user.groups - for group in user.groups - if group.display == "admins" and group.value - ) + candidate_group_ids = set() + for user in admin_users: + if not user.groups: + continue + for group in user.groups: + if group.display == "admins" and group.value: + candidate_group_ids.add(group.value) admin_groups = list(self._filter_workspace_groups(candidate_group_ids)) match admin_groups: case []: From 83044e869a314946f2a1ebbf9beee004869d5cab Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 15:14:52 +0200 Subject: [PATCH 20/58] Implement ownership for the ClusterInfo inventory class. 
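A minimal sketch of the intended usage, assuming a WorkspaceClient (ws), SqlBackend (sql_backend), inventory schema name and AdministratorLocator (administrator_locator) are already in hand: the ownership object maps each crawled record to a user name, falling back to a workspace administrator when the creator is unknown.

    from databricks.labs.ucx.assessment.clusters import ClustersCrawler, ClusterOwnership

    crawler = ClustersCrawler(ws, sql_backend, inventory_schema)
    ownership = ClusterOwnership(ws, administrator_locator)
    for cluster_info in crawler.snapshot():
        # The cluster creator if recorded, otherwise a workspace administrator.
        owner = ownership.owner_of(cluster_info)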
--- .../labs/ucx/assessment/clusters.py | 15 ++++- tests/integration/assessment/test_clusters.py | 37 +++++++++++- tests/unit/assessment/test_clusters.py | 56 ++++++++++++++----- 3 files changed, 90 insertions(+), 18 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 02badb64ec..92723c7f34 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -29,6 +29,7 @@ ) from databricks.labs.ucx.assessment.init_scripts import CheckInitScriptMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -43,6 +44,7 @@ class ClusterInfo: policy_id: str | None = None cluster_name: str | None = None creator: str | None = None + """User-name of the creator of the cluster, if known.""" class CheckClusterMixin(CheckInitScriptMixin): @@ -154,17 +156,20 @@ def _assess_clusters(self, all_clusters): for cluster in all_clusters: if cluster.cluster_source == ClusterSource.JOB: continue - if not cluster.creator_user_name: + creator = cluster.creator_user_name + if not creator: logger.warning( f"Cluster {cluster.cluster_id} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) + # Normalize empty creator. + creator = None cluster_info = ClusterInfo( cluster_id=cluster.cluster_id if cluster.cluster_id else "", cluster_name=cluster.cluster_name, policy_id=cluster.policy_id, spark_version=cluster.spark_version, - creator=cluster.creator_user_name, + creator=creator, success=1, failures="[]", ) @@ -179,6 +184,12 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: yield ClusterInfo(*row) +class ClusterOwnership(Ownership[ClusterInfo]): + + def _get_owner(self, record: ClusterInfo) -> str | None: + return record.creator + + @dataclass class PolicyInfo: policy_id: str diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 01a47d1aba..6b6d2670e2 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -5,7 +5,7 @@ from databricks.sdk.retries import retried from databricks.sdk.service.compute import DataSecurityMode -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler +from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership from .test_assessment import _SPARK_CONF @@ -39,6 +39,41 @@ def test_cluster_crawler_no_isolation(ws, make_cluster, inventory_schema, sql_ba assert results[0].failures == '["No isolation shared clusters not supported in UC"]' +def _change_cluster_owner(ws, cluster_id: str, owner_user_name: str) -> None: + """Replacement for ClustersAPI.change_owner().""" + # As of SDK 0.33.0 there is a call to wait for cluster termination that fails because it doesn't pass the cluster id + body = {'cluster_id': cluster_id, 'owner_username': owner_user_name} + headers = {'Accept': 'application/json', 'Content-Type': 'application/json'} + ws.api_client.do('POST', '/api/2.1/clusters/change-owner', body=body, headers=headers) + + +def test_cluster_ownership(ws, installation_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled clusters.""" + + # Set up two clusters: one with an owner (us) and another 
without. + another_user = make_user() + cluster_with_owner = make_cluster(single_node=True, spark_conf=_SPARK_CONF) + cluster_without_owner = make_cluster(single_node=True, spark_conf=_SPARK_CONF) + ws.clusters.delete_and_wait(cluster_id=cluster_without_owner.cluster_id) + _change_cluster_owner(ws, cluster_without_owner.cluster_id, owner_user_name=another_user.user_name) + ws.users.delete(another_user.id) + + # Produce the crawled records. + crawler = ClustersCrawler(ws, sql_backend, inventory_schema) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled records for our clusters. + cluster_record_with_owner = next(record for record in records if record.cluster_id == cluster_with_owner.cluster_id) + cluster_record_without_owner = next( + record for record in records if record.cluster_id == cluster_without_owner.cluster_id + ) + + # Verify ownership is as expected. + ownership = ClusterOwnership(ws, installation_ctx.administrator_locator) + assert ownership.owner_of(cluster_record_with_owner) == ws.current_user.me().user_name + assert "@" in ownership.owner_of(cluster_record_without_owner) + + def test_cluster_crawler_mlr_no_isolation(ws, make_cluster, inventory_schema, sql_backend): created_cluster = make_cluster( data_security_mode=DataSecurityMode.NONE, spark_version='15.4.x-cpu-ml-scala2.12', num_workers=1 diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 02956c6b75..8e97f7a945 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -1,14 +1,15 @@ import json -from unittest.mock import MagicMock, create_autospec, mock_open, patch +from unittest.mock import MagicMock, PropertyMock, create_autospec, mock_open, patch import pytest -from databricks.labs.lsql import Row from databricks.labs.lsql.backends import MockBackend from databricks.sdk.errors import DatabricksError, InternalError, NotFound +from databricks.sdk.service.compute import ClusterDetails from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler +from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership, ClusterInfo from databricks.labs.ucx.framework.crawlers import SqlBackend +from databricks.labs.ucx.framework.owners import AdministratorLocator from .. 
import mock_workspace_client @@ -90,21 +91,27 @@ def test_cluster_init_script_check_dbfs(): def test_cluster_without_owner_should_have_empty_creator_name(): - ws = mock_workspace_client(cluster_ids=['simplest-autoscale']) - mockbackend = MockBackend() - ClustersCrawler(ws, mockbackend, "ucx").snapshot() - result = mockbackend.rows_written_for("hive_metastore.ucx.clusters", "overwrite") - assert result == [ - Row( + ws = mock_workspace_client() + ws.clusters.list.return_value = ( + ClusterDetails( + creator_user_name=None, cluster_id="simplest-autoscale", policy_id="single-user-with-spn", - cluster_name="Simplest Shared Autoscale", - creator=None, + cluster_name="Simplest Shard Autoscale", spark_version="13.3.x-cpu-ml-scala2.12", - success=1, - failures='[]', - ) - ] + ), + ClusterDetails( + creator_user_name="", + cluster_id="another-simple-autoscale", + policy_id="single-user-with-spn", + cluster_name="Another Simple Shard Autoscale", + spark_version="13.3.x-cpu-ml-scala2.12", + ), + ) + mockbackend = MockBackend() + ClustersCrawler(ws, mockbackend, "ucx").snapshot() + result = mockbackend.rows_written_for("hive_metastore.ucx.clusters", "overwrite") + assert [row["creator"] for row in result] == [None, None] def test_cluster_with_multiple_failures(): @@ -171,6 +178,25 @@ def test_unsupported_clusters(): assert result_set[0].failures == '["cluster type not supported : LEGACY_PASSTHROUGH"]' +def test_cluster_owner_creator(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) + + ownership = ClusterOwnership(ws, admin_locator) + owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) + + assert owner == "bob" + + +def test_cluster_owner_creator_unknown(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) + type(admin_locator).workspace_administrator = PropertyMock(return_value="an_admin") + + ownership = ClusterOwnership(ws, admin_locator) + owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) + + assert owner == "an_admin" + + def test_policy_crawler(): ws = mock_workspace_client( policy_ids=['single-user-with-spn', 'single-user-with-spn-policyid', 'single-user-with-spn-no-sparkversion'], From ed3894235c4ffe04451dfa503ea80f2b3a301bee Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 15:16:41 +0200 Subject: [PATCH 21/58] Docstring for cluster ownership. --- src/databricks/labs/ucx/assessment/clusters.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 92723c7f34..e40068ab69 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -185,6 +185,10 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: class ClusterOwnership(Ownership[ClusterInfo]): + """Determine ownership of clusters in the inventory. + + This is based on the cluster creator (if known), or otherwise an administrator. + """ def _get_owner(self, record: ClusterInfo) -> str | None: return record.creator From 5d4c9946d3d19672d27b22093dced12e616db35e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 15:20:01 +0200 Subject: [PATCH 22/58] Check some mock interactions. 
--- tests/unit/assessment/test_clusters.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 8e97f7a945..647b130a42 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -180,11 +180,13 @@ def test_unsupported_clusters(): def test_cluster_owner_creator(ws) -> None: admin_locator = create_autospec(AdministratorLocator) + type(admin_locator).workspace_administrator = PropertyMock() ownership = ClusterOwnership(ws, admin_locator) owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) assert owner == "bob" + admin_locator.workspace_administrator.assert_not_called() def test_cluster_owner_creator_unknown(ws) -> None: @@ -195,6 +197,7 @@ def test_cluster_owner_creator_unknown(ws) -> None: owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" + admin_locator.workspace_administrator.assert_called_once() def test_policy_crawler(): From 348d9b07fc541a911910b69d5cc8fcdb45562a2b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:00:32 +0200 Subject: [PATCH 23/58] Improve docstring clarity. --- src/databricks/labs/ucx/assessment/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index e40068ab69..1b74b0b99d 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -187,7 +187,7 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: class ClusterOwnership(Ownership[ClusterInfo]): """Determine ownership of clusters in the inventory. - This is based on the cluster creator (if known), or otherwise an administrator. + This is the cluster creator (if known), or otherwise an administrator. """ def _get_owner(self, record: ClusterInfo) -> str | None: From 5cf6f30b126ca18f46822f56baed8f72d5f8ff08 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:01:12 +0200 Subject: [PATCH 24/58] Fix unit test. --- tests/unit/assessment/test_clusters.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 647b130a42..c1ac0b11a3 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -191,13 +191,14 @@ def test_cluster_owner_creator(ws) -> None: def test_cluster_owner_creator_unknown(ws) -> None: admin_locator = create_autospec(AdministratorLocator) - type(admin_locator).workspace_administrator = PropertyMock(return_value="an_admin") + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator ownership = ClusterOwnership(ws, admin_locator) owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" - admin_locator.workspace_administrator.assert_called_once() + mock_workspace_administrator.assert_called_once() def test_policy_crawler(): From 8d4de1fb9407a426c9c34e4e0230df9fda167b37 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:03:01 +0200 Subject: [PATCH 25/58] Suppress pylint false positive. 
--- tests/unit/assessment/test_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index c1ac0b11a3..4caa7f6fd6 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -190,7 +190,7 @@ def test_cluster_owner_creator(ws) -> None: def test_cluster_owner_creator_unknown(ws) -> None: - admin_locator = create_autospec(AdministratorLocator) + admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator From 4a597a22d56a5f7bb24b1671e0a8511400ef52f4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:03:31 +0200 Subject: [PATCH 26/58] Implement ownership for cluster policies. --- .../labs/ucx/assessment/clusters.py | 13 ++++- tests/integration/assessment/test_clusters.py | 30 ++++++++++- tests/unit/assessment/test_clusters.py | 51 ++++++++++++++++++- 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 1b74b0b99d..789e236757 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -203,6 +203,7 @@ class PolicyInfo: spark_version: str | None = None policy_description: str | None = None creator: str | None = None + """User-name of the creator of the cluster policy, if known.""" class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin): @@ -225,7 +226,7 @@ def _assess_policies(self, all_policices) -> Iterable[PolicyInfo]: except KeyError: spark_version = None policy_name = policy.name - creator_name = policy.creator_user_name + creator_name = policy.creator_user_name or None policy_info = PolicyInfo( policy_id=policy.policy_id, @@ -244,3 +245,13 @@ def _assess_policies(self, all_policices) -> Iterable[PolicyInfo]: def _try_fetch(self) -> Iterable[PolicyInfo]: for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): yield PolicyInfo(*row) + + +class ClusterPolicyOwnership(Ownership[PolicyInfo]): + """Determine ownership of cluster policies in the inventory. + + This is the creator of the cluster policy (if known), or otherwise an administrator. 
+ """ + + def _get_owner(self, record: PolicyInfo) -> str | None: + return record.creator diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 6b6d2670e2..1487537aae 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -1,11 +1,17 @@ import json from datetime import timedelta +import pytest from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried from databricks.sdk.service.compute import DataSecurityMode -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership +from databricks.labs.ucx.assessment.clusters import ( + ClustersCrawler, + PoliciesCrawler, + ClusterOwnership, + ClusterPolicyOwnership, +) from .test_assessment import _SPARK_CONF @@ -121,3 +127,25 @@ def test_policy_crawler(ws, make_cluster_policy, inventory_schema, sql_backend, assert results[1].policy_name == policy_2 assert results[1].success == 0 assert results[1].failures == '["Uses azure service principal credentials config in policy."]' + + +# TODO: Investigate whether this is a bug or something wrong with this fixture. +@pytest.mark.xfail("Cluster policy creators always seem to be null.") +def test_cluster_policy_ownership(ws, installation_ctx, make_cluster_policy, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled cluster policies.""" + + # Set up a cluster policy. + # Note: there doesn't seem to be a way to change the owner of a cluster policy, so we can't test policies without + # an owner. + policy = make_cluster_policy() + + # Produce the crawled records. + crawler = PoliciesCrawler(ws, sql_backend, inventory_schema) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for our cluster policy. + policy_record = next(record for record in records if record.policy_id == policy.policy_id) + + # Verify ownership is as expected. + ownership = ClusterPolicyOwnership(ws, installation_ctx.administrator_locator) + assert ownership.owner_of(policy_record) == ws.current_user.me().user_name diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 4caa7f6fd6..67b7aa7b9b 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -4,10 +4,17 @@ import pytest from databricks.labs.lsql.backends import MockBackend from databricks.sdk.errors import DatabricksError, InternalError, NotFound -from databricks.sdk.service.compute import ClusterDetails +from databricks.sdk.service.compute import ClusterDetails, Policy from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership, ClusterInfo +from databricks.labs.ucx.assessment.clusters import ( + ClustersCrawler, + PoliciesCrawler, + ClusterOwnership, + ClusterInfo, + ClusterPolicyOwnership, + PolicyInfo, +) from databricks.labs.ucx.framework.crawlers import SqlBackend from databricks.labs.ucx.framework.owners import AdministratorLocator @@ -214,6 +221,22 @@ def test_policy_crawler(): assert "Uses azure service principal credentials config in policy." 
in failures +def test_policy_crawler_creator(): + ws = mock_workspace_client() + ws.cluster_policies.list.return_value = ( + Policy(policy_id="1", definition="{}", name="foo", creator_user_name=None), + Policy(policy_id="2", definition="{}", name="bar", creator_user_name=""), + Policy(policy_id="3", definition="{}", name="baz", creator_user_name="bob"), + ) + mockbackend = MockBackend() + result = PoliciesCrawler(ws, mockbackend, "ucx").snapshot() + + expected_creators = [None, None, "bob"] + crawled_creators = [record.creator for record in result] + assert len(expected_creators) == len(crawled_creators) + assert set(expected_creators) == set(crawled_creators) + + def test_policy_try_fetch(): ws = mock_workspace_client(policy_ids=['single-user-with-spn-policyid']) mock_backend = MockBackend( @@ -250,3 +273,27 @@ def test_policy_without_failure(): crawler = PoliciesCrawler(ws, MockBackend(), "ucx") result_set = list(crawler.snapshot()) assert result_set[0].failures == '[]' + + +def test_cluster_policy_owner_creator(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = ClusterPolicyOwnership(ws, admin_locator) + owner = ownership.owner_of(PolicyInfo(creator="bob", policy_id="1", policy_name="foo", success=1, failures="[]")) + + assert owner == "bob" + mock_workspace_administrator.assert_not_called() + + +def test_cluster_policy_owner_creator_unknown(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = ClusterPolicyOwnership(ws, admin_locator) + owner = ownership.owner_of(PolicyInfo(creator=None, policy_id="1", policy_name="foo", success=1, failures="[]")) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 1818d46ff7cfa442abc7471a75bf459100480ef7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:05:53 +0200 Subject: [PATCH 27/58] Fix linting suppression. 
--- tests/unit/assessment/test_clusters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 67b7aa7b9b..9c1243d950 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -197,7 +197,7 @@ def test_cluster_owner_creator(ws) -> None: def test_cluster_owner_creator_unknown(ws) -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator @@ -276,7 +276,7 @@ def test_policy_without_failure(): def test_cluster_policy_owner_creator(ws) -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator @@ -288,7 +288,7 @@ def test_cluster_policy_owner_creator(ws) -> None: def test_cluster_policy_owner_creator_unknown(ws) -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint disable=mock-no-usage + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator From c13b4eb1a6c53aba87f4bfeed402c54a2cdb0d57 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:46:01 +0200 Subject: [PATCH 28/58] Use runtime context for integration tests instead of installation context. --- tests/integration/assessment/test_clusters.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 1487537aae..83eccb724d 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -53,7 +53,7 @@ def _change_cluster_owner(ws, cluster_id: str, owner_user_name: str) -> None: ws.api_client.do('POST', '/api/2.1/clusters/change-owner', body=body, headers=headers) -def test_cluster_ownership(ws, installation_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: +def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled clusters.""" # Set up two clusters: one with an owner (us) and another without. @@ -75,7 +75,7 @@ def test_cluster_ownership(ws, installation_ctx, make_cluster, make_user, invent ) # Verify ownership is as expected. - ownership = ClusterOwnership(ws, installation_ctx.administrator_locator) + ownership = ClusterOwnership(ws, runtime_ctx.administrator_locator) assert ownership.owner_of(cluster_record_with_owner) == ws.current_user.me().user_name assert "@" in ownership.owner_of(cluster_record_without_owner) @@ -131,7 +131,7 @@ def test_policy_crawler(ws, make_cluster_policy, inventory_schema, sql_backend, # TODO: Investigate whether this is a bug or something wrong with this fixture. 
@pytest.mark.xfail("Cluster policy creators always seem to be null.") -def test_cluster_policy_ownership(ws, installation_ctx, make_cluster_policy, inventory_schema, sql_backend) -> None: +def test_cluster_policy_ownership(ws, runtime_ctx, make_cluster_policy, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled cluster policies.""" # Set up a cluster policy. @@ -147,5 +147,5 @@ def test_cluster_policy_ownership(ws, installation_ctx, make_cluster_policy, inv policy_record = next(record for record in records if record.policy_id == policy.policy_id) # Verify ownership is as expected. - ownership = ClusterPolicyOwnership(ws, installation_ctx.administrator_locator) + ownership = ClusterPolicyOwnership(ws, runtime_ctx.administrator_locator) assert ownership.owner_of(policy_record) == ws.current_user.me().user_name From 7e66e70abd97e28058553657b41d4b52a489ed27 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:46:29 +0200 Subject: [PATCH 29/58] Fix xfail marker for integration test. --- tests/integration/assessment/test_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 83eccb724d..f688e81960 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -130,7 +130,7 @@ def test_policy_crawler(ws, make_cluster_policy, inventory_schema, sql_backend, # TODO: Investigate whether this is a bug or something wrong with this fixture. -@pytest.mark.xfail("Cluster policy creators always seem to be null.") +@pytest.mark.xfail(reason="Cluster policy creators always seem to be null.") def test_cluster_policy_ownership(ws, runtime_ctx, make_cluster_policy, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled cluster policies.""" From f7942aaf8d4bee1208f5bbeb4dcd4689bb980732 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:47:02 +0200 Subject: [PATCH 30/58] Implement ownership for grants. --- .../labs/ucx/hive_metastore/grants.py | 11 ++++++++ .../integration/hive_metastore/test_grants.py | 27 +++++++++++++++++++ tests/unit/hive_metastore/test_grants.py | 17 ++++++++++-- 3 files changed, 53 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index a5dd4caeff..0ffbdd4e0f 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -31,6 +31,7 @@ StoragePermissionMapping, ) from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore.locations import ( ExternalLocations, @@ -381,6 +382,16 @@ def grants( return [] +class GrantOwnership(Ownership[Grant]): + """Determine ownership of grants in the inventory. + + At the present we can't determine a specific owner for grants: we always report an administrator. 
+ """ + + def _get_owner(self, record: Grant) -> None: + return None + + class AwsACL: def __init__( self, diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index 6ab661264d..f7771bd063 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -6,6 +6,11 @@ from databricks.sdk.retries import retried from databricks.labs.lsql.backends import StatementExecutionBackend + +from databricks.labs.ucx.framework.utils import escape_sql_identifier +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler, GrantOwnership +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from ..conftest import MockRuntimeContext logger = logging.getLogger(__name__) @@ -108,3 +113,25 @@ def test_all_grants_for_other_objects( assert {"DENIED_SELECT"} == found_any_file_grants[group_b.display_name] assert {"SELECT"} == found_anonymous_function_grants[group_c.display_name] assert {"DENIED_SELECT"} == found_anonymous_function_grants[group_d.display_name] + + +def test_grant_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled grants.""" + # This currently isn't very useful: we can't locate specific owners for grants. + + schema = runtime_ctx.make_schema() + me = ws.current_user.me() + sql_backend.execute(f"GRANT SELECT ON SCHEMA {escape_sql_identifier(schema.full_name)} TO `{me.user_name}`") + table_crawler = TablesCrawler(sql_backend, schema=inventory_schema, include_databases=[schema.name]) + udf_crawler = UdfsCrawler(sql_backend, schema=inventory_schema, include_databases=[schema.name]) + + # Produce the crawled records. + crawler = GrantsCrawler(table_crawler, udf_crawler, include_databases=[schema.name]) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for the grant we made. + grant_record = next(record for record in records if record.this_type_and_key() == ("DATABASE", schema.full_name)) + + # Verify ownership can be made. 
+ ownership = GrantOwnership(ws, runtime_ctx.administrator_locator) + assert "@" in ownership.owner_of(grant_record) diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 101f1dd602..f9be0f356f 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -1,10 +1,11 @@ import logging -from unittest.mock import create_autospec +from unittest.mock import create_autospec, PropertyMock import pytest from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, MigrateGrants +from databricks.labs.ucx.framework.owners import AdministratorLocator +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, MigrateGrants, GrantOwnership from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access.groups import GroupManager @@ -527,3 +528,15 @@ def grant_loader() -> list[Grant]: in caplog.text ) group_manager.assert_not_called() + +def test_grant_owner(ws) -> None: + """Verify that the owner of a crawled grant is an administrator.""" + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = GrantOwnership(ws, admin_locator) + owner = ownership.owner_of(Grant(principal="someone", action_type="SELECT")) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 3cb9abfd62cb81bceaee9bd62bbf67317bf1d62c Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:50:51 +0200 Subject: [PATCH 31/58] Use a longer variable name. --- tests/integration/hive_metastore/test_grants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index f7771bd063..68f9ce3369 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -120,8 +120,8 @@ def test_grant_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None # This currently isn't very useful: we can't locate specific owners for grants. schema = runtime_ctx.make_schema() - me = ws.current_user.me() - sql_backend.execute(f"GRANT SELECT ON SCHEMA {escape_sql_identifier(schema.full_name)} TO `{me.user_name}`") + this_user = ws.current_user.me() + sql_backend.execute(f"GRANT SELECT ON SCHEMA {escape_sql_identifier(schema.full_name)} TO `{this_user.user_name}`") table_crawler = TablesCrawler(sql_backend, schema=inventory_schema, include_databases=[schema.name]) udf_crawler = UdfsCrawler(sql_backend, schema=inventory_schema, include_databases=[schema.name]) From 2120322bb0f43d38e01a4b8a09260b8df4849eb8 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 16:51:13 +0200 Subject: [PATCH 32/58] Whitespace. 
--- tests/unit/hive_metastore/test_grants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index f9be0f356f..9ea3cbf002 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -529,6 +529,7 @@ def grant_loader() -> list[Grant]: ) group_manager.assert_not_called() + def test_grant_owner(ws) -> None: """Verify that the owner of a crawled grant is an administrator.""" admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage From e2189f40608f761cc354730f056777e168f39096 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 17:06:48 +0200 Subject: [PATCH 33/58] Ownership implementation for tables. --- .../labs/ucx/hive_metastore/tables.py | 11 ++++++++ .../integration/hive_metastore/test_tables.py | 21 +++++++++++++++- tests/unit/hive_metastore/test_tables.py | 25 ++++++++++++++++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 097faca778..853898d0c4 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -16,6 +16,7 @@ from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -626,3 +627,13 @@ def _create_describe_tasks(self, catalog: str, database: str, table_names: list[ for table in table_names: tasks.append(partial(self._describe, catalog, database, table)) return tasks + + +class TableOwnership(Ownership[Table]): + """Determine ownership of tables in the inventory. + + At the present we don't determine a specific owner for tables: we always report an administrator. + """ + + def _get_owner(self, record: Table) -> None: + return None diff --git a/tests/integration/hive_metastore/test_tables.py b/tests/integration/hive_metastore/test_tables.py index 2d4a372e54..47fa19b1ff 100644 --- a/tests/integration/hive_metastore/test_tables.py +++ b/tests/integration/hive_metastore/test_tables.py @@ -5,7 +5,7 @@ from databricks.sdk.retries import retried from databricks.labs.ucx.hive_metastore import TablesCrawler -from databricks.labs.ucx.hive_metastore.tables import What +from databricks.labs.ucx.hive_metastore.tables import What, TableOwnership logger = logging.getLogger(__name__) @@ -86,3 +86,22 @@ def test_partitioned_tables(ws, sql_backend, make_schema, make_table): assert all_tables[f"{schema.full_name}.non_partitioned_delta"].is_partitioned is False assert all_tables[f"{schema.full_name}.partitioned_parquet"].is_partitioned is True assert all_tables[f"{schema.full_name}.non_partitioned_parquet"].is_partitioned is False + + +def test_table_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled tables.""" + # This currently isn't very useful: we don't currently locate specific owners for tables. + + # A table for which we'll determine the owner. + table = runtime_ctx.make_table() + + # Produce the crawled records + crawler = TablesCrawler(sql_backend, schema=inventory_schema, include_databases=[table.schema_name]) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for the table we made. 
+ table_record = next(record for record in records if record.full_name == table.full_name) + + # Verify ownership can be made. + ownership = TableOwnership(ws, runtime_ctx.administrator_locator) + assert "@" in ownership.owner_of(table_record) diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index ced4be1501..09addedf58 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -1,11 +1,20 @@ import logging import sys +from unittest.mock import create_autospec, PropertyMock import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore.locations import Mount, ExternalLocations -from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler, What, HiveSerdeType, FasterTableScanCrawler +from databricks.labs.ucx.hive_metastore.tables import ( + FasterTableScanCrawler, + HiveSerdeType, + Table, + TableOwnership, + TablesCrawler, + What, +) def test_is_delta_true(): @@ -649,3 +658,17 @@ def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, s with caplog.at_level(logging.WARNING): ftsc.snapshot() assert "Test getTable warning" in caplog.text + + +def test_table_owner(ws) -> None: + """Verify that the owner of a crawled table is an administrator.""" + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = TableOwnership(ws, admin_locator) + table = Table(catalog="main", database="foo", name="bar", object_type="TABLE", table_format="DELTA") + owner = ownership.owner_of(table) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 0d8e48bf72e9c5bcdf751594c33c09ece0e05095 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 17:19:19 +0200 Subject: [PATCH 34/58] Ownership implementation of UDFs. --- .../labs/ucx/hive_metastore/udfs.py | 11 +++++++ tests/integration/hive_metastore/test_udfs.py | 21 ++++++++++++- tests/unit/hive_metastore/test_udfs.py | 30 ++++++++++++++++++- 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 6ee1eefd38..01ec95bfe5 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -8,6 +8,7 @@ from databricks.sdk.errors import Unknown, NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -135,3 +136,13 @@ def _assess_udfs(udfs: Iterable[Udf]) -> Iterable[Udf]: yield replace(udf, success=0, failures="Only SCALAR functions are supported") else: yield replace(udf, success=1) + + +class UdfOwnership(Ownership[Udf]): + """Determine ownership of UDFs in the inventory. + + At the present we don't determine a specific owner for UDFs: we always report an administrator. 
+ """ + + def _get_owner(self, record: Udf) -> None: + return None diff --git a/tests/integration/hive_metastore/test_udfs.py b/tests/integration/hive_metastore/test_udfs.py index 692d0c0675..eeaa9e0c92 100644 --- a/tests/integration/hive_metastore/test_udfs.py +++ b/tests/integration/hive_metastore/test_udfs.py @@ -4,7 +4,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler, UdfOwnership logger = logging.getLogger(__name__) @@ -24,3 +24,22 @@ def test_describe_all_udfs_in_databases(ws, sql_backend, inventory_schema, make_ assert len(udfs) == 3 assert sum(udf.success for udf in udfs) == 2 # hive_udf should fail assert [udf.failures for udf in udfs if udf.key == hive_udf.full_name] == ["Only SCALAR functions are supported"] + + +def test_udf_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled UDFs.""" + # This currently isn't very useful: we don't currently locate specific owners for UDFs. + + # A UDF for which we'll determine the owner. + udf = runtime_ctx.make_udf() + + # Produce the crawled records + crawler = UdfsCrawler(sql_backend, schema=inventory_schema, include_databases=[udf.schema_name]) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for the table we made. + udf_record = next(r for r in records if f"{r.catalog}.{r.database}.{r.name}" == udf.full_name) + + # Verify ownership can be made. + ownership = UdfOwnership(ws, runtime_ctx.administrator_locator) + assert "@" in ownership.owner_of(udf_record) diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index b3ba27a63e..58c5f24e74 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -1,6 +1,9 @@ +from unittest.mock import create_autospec, PropertyMock + from databricks.labs.lsql.backends import MockBackend -from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler +from databricks.labs.ucx.framework.owners import AdministratorLocator +from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler, UdfOwnership def test_key(): @@ -43,3 +46,28 @@ def test_tables_crawler_should_filter_by_database(): udf_crawler = UdfsCrawler(backend, "default", ["database"]) results = udf_crawler.snapshot() assert len(results) == 1 + + +def test_udf_owner(ws) -> None: + """Verify that the owner of a crawled UDF is an administrator.""" + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = UdfOwnership(ws, admin_locator) + udf = Udf( + catalog="main", + database="foo", + name="bar", + func_type="UNKNOWN", + func_input="UNKNOWN", + func_returns="UNKNOWN", + deterministic=True, + data_access="UNKNOWN", + body="UNKNOWN", + comment="UNKNOWN", + ) + owner = ownership.owner_of(udf) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 28ab56a756ec9d586b78c80a71eeafc37043e522 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 18:03:05 +0200 Subject: [PATCH 35/58] Ensure fewer unnecessary mock interactions. 
--- tests/unit/assessment/test_clusters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 9c1243d950..d97696f31c 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -228,8 +228,7 @@ def test_policy_crawler_creator(): Policy(policy_id="2", definition="{}", name="bar", creator_user_name=""), Policy(policy_id="3", definition="{}", name="baz", creator_user_name="bob"), ) - mockbackend = MockBackend() - result = PoliciesCrawler(ws, mockbackend, "ucx").snapshot() + result = PoliciesCrawler(ws, MockBackend(), "ucx").snapshot(force_refresh=True) expected_creators = [None, None, "bob"] crawled_creators = [record.creator for record in result] From cc7db1cce924dc8968cb65c135f3689a9a8ec9c4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 4 Oct 2024 18:05:15 +0200 Subject: [PATCH 36/58] Ownership implementation for pipelines. --- .../labs/ucx/assessment/pipelines.py | 19 +++++- .../integration/assessment/test_pipelines.py | 22 ++++++- tests/unit/assessment/test_pipelines.py | 61 +++++++++++++------ 3 files changed, 81 insertions(+), 21 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 8421e53084..2209ba76d9 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -8,6 +8,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -20,6 +21,7 @@ class PipelineInfo: failures: str pipeline_name: str | None = None creator_name: str | None = None + """User-name of the creator of the pipeline, if known.""" class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin): @@ -33,15 +35,18 @@ def _crawl(self) -> Iterable[PipelineInfo]: def _assess_pipelines(self, all_pipelines) -> Iterable[PipelineInfo]: for pipeline in all_pipelines: - if not pipeline.creator_user_name: + creator_name = pipeline.creator_user_name + if not creator_name: logger.warning( f"Pipeline {pipeline.name} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) + # Normalization. + creator_name = None pipeline_info = PipelineInfo( pipeline_id=pipeline.pipeline_id, pipeline_name=pipeline.name, - creator_name=pipeline.creator_user_name, + creator_name=creator_name, success=1, failures="[]", ) @@ -73,3 +78,13 @@ def _pipeline_clusters(self, clusters, failures): def _try_fetch(self) -> Iterable[PipelineInfo]: for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"): yield PipelineInfo(*row) + + +class PipelineOwnership(Ownership[PipelineInfo]): + """Determine ownership of pipelines in the inventory. + + This is the pipeline creator (if known), or otherwise an administrator. 
+ """ + + def _get_owner(self, record: PipelineInfo) -> str | None: + return record.creator_name diff --git a/tests/integration/assessment/test_pipelines.py b/tests/integration/assessment/test_pipelines.py index b416d83069..23b119cef8 100644 --- a/tests/integration/assessment/test_pipelines.py +++ b/tests/integration/assessment/test_pipelines.py @@ -3,7 +3,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler +from databricks.labs.ucx.assessment.pipelines import PipelineOwnership, PipelinesCrawler from .test_assessment import _PIPELINE_CONF, _PIPELINE_CONF_WITH_SECRET, logger @@ -42,3 +42,23 @@ def test_pipeline_with_secret_conf_crawler(ws, make_pipeline, inventory_schema, assert len(results) >= 1 assert results[0].pipeline_id == created_pipeline.pipeline_id + + +def test_pipeline_ownership(ws, runtime_ctx, make_pipeline, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled pipelines.""" + + # Set up a pipeline. + # Note: there doesn't seem to be a way to change the owner of a pipeline, so we can't test pipelines without an + # owner. + pipeline = make_pipeline() + + # Produce the crawled records. + crawler = PipelinesCrawler(ws, sql_backend, inventory_schema) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for our pipeline. + pipeline_record = next(record for record in records if record.pipeline_id == pipeline.pipeline_id) + + # Verify ownership is as expected. + ownership = PipelineOwnership(ws, runtime_ctx.administrator_locator) + assert ownership.owner_of(pipeline_record) == ws.current_user.me().user_name diff --git a/tests/unit/assessment/test_pipelines.py b/tests/unit/assessment/test_pipelines.py index b9a0acb0aa..6d3c1ec352 100644 --- a/tests/unit/assessment/test_pipelines.py +++ b/tests/unit/assessment/test_pipelines.py @@ -1,8 +1,11 @@ -from databricks.labs.lsql import Row +from unittest.mock import create_autospec, PropertyMock + from databricks.labs.lsql.backends import MockBackend +from databricks.sdk.service.pipelines import GetPipelineResponse, PipelineStateInfo from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler -from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler +from databricks.labs.ucx.assessment.pipelines import PipelineOwnership, PipelineInfo, PipelinesCrawler +from databricks.labs.ucx.framework.owners import AdministratorLocator from .. 
import mock_workspace_client @@ -44,19 +47,41 @@ def test_pipeline_list_with_no_config(): assert len(crawler) == 0 -def test_pipeline_without_owners_should_have_empty_creator_name(): - ws = mock_workspace_client(pipeline_ids=['empty-spec']) - ws.dbfs.read().data = "JXNoCmVjaG8gIj0=" - mockbackend = MockBackend() - PipelinesCrawler(ws, mockbackend, "ucx").snapshot() - result = mockbackend.rows_written_for("hive_metastore.ucx.pipelines", "overwrite") - - assert result == [ - Row( - pipeline_id="empty-spec", - pipeline_name="New DLT Pipeline", - creator_name=None, - success=1, - failures="[]", - ) - ] +def test_pipeline_crawler_creator(): + ws = mock_workspace_client() + ws.pipelines.list_pipelines.return_value = ( + PipelineStateInfo(pipeline_id="1", creator_user_name=None), + PipelineStateInfo(pipeline_id="2", creator_user_name=""), + PipelineStateInfo(pipeline_id="3", creator_user_name="bob"), + ) + ws.pipelines.get = create_autospec(GetPipelineResponse) # pylint: disable=mock-no-usage + result = PipelinesCrawler(ws, MockBackend(), "ucx").snapshot(force_refresh=True) + + expected_creators = [None, None, "bob"] + crawled_creators = [record.creator_name for record in result] + assert len(expected_creators) == len(crawled_creators) + assert set(expected_creators) == set(crawled_creators) + + +def test_pipeline_owner_creator(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = PipelineOwnership(ws, admin_locator) + owner = ownership.owner_of(PipelineInfo(creator_name="bob", pipeline_id="1", success=1, failures="[]")) + + assert owner == "bob" + mock_workspace_administrator.assert_not_called() + + +def test_pipeline_owner_creator_unknown(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = PipelineOwnership(ws, admin_locator) + owner = ownership.owner_of(PipelineInfo(creator_name=None, pipeline_id="1", success=1, failures="[]")) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 8f0265f3219f7a1b1d51fec4b4962a8e2cfb77d1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 10:46:52 +0200 Subject: [PATCH 37/58] Ownership implementation for Jobs. 
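Jobs follow the same ownership convention as clusters and policies: the
creator recorded during crawling is used as the owner when it is known, and a
workspace administrator is reported otherwise. A minimal usage sketch
(assuming a workspace client and an AdministratorLocator are already
available; at this point in the series the ownership classes still accept the
workspace client):

    ownership = JobOwnership(ws, administrator_locator)
    owned = JobInfo(job_id="1", success=1, failures="[]", creator="bob")
    orphaned = JobInfo(job_id="2", success=1, failures="[]", creator=None)
    assert ownership.owner_of(owned) == "bob"
    # Records without a known creator resolve to the workspace administrator.
    fallback_owner = ownership.owner_of(orphaned)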
--- src/databricks/labs/ucx/assessment/jobs.py | 19 +++++++- tests/integration/assessment/test_jobs.py | 21 ++++++++- tests/unit/assessment/test_jobs.py | 50 ++++++++++++++++++---- 3 files changed, 79 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index d5b77d68e0..01d66d93d1 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -25,6 +25,7 @@ from databricks.labs.ucx.assessment.clusters import CheckClusterMixin from databricks.labs.ucx.assessment.crawlers import spark_version_compatibility from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -37,6 +38,7 @@ class JobInfo: failures: str job_name: str | None = None creator: str | None = None + """User-name of the creator of the pipeline, if known.""" class JobsMixin: @@ -106,11 +108,14 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]: if not job.job_id: continue job_assessment[job.job_id] = set() - if not job.creator_user_name: + creator_user_name = job.creator_user_name + if not creator_user_name: logger.warning( f"Job {job.job_id} have Unknown creator, it means that the original creator has been deleted " f"and should be re-created" ) + # Normalization. + creator_user_name = None job_settings = job.settings if not job_settings: @@ -122,7 +127,7 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]: job_details[job.job_id] = JobInfo( job_id=str(job.job_id), job_name=job_name, - creator=job.creator_user_name, + creator=creator_user_name, success=1, failures="[]", ) @@ -140,6 +145,16 @@ def _check_jar_task(self, all_task: list[RunTask]) -> list[str]: return task_failures +class JobOwnership(Ownership[JobInfo]): + """Determine ownership of jobs (workflows) in the inventory. + + This is the pipeline creator (if known), or otherwise an administrator. + """ + + def _get_owner(self, record: JobInfo) -> str | None: + return record.creator + + @dataclass class SubmitRunInfo: run_ids: str # JSON-encoded list of run ids diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index 3a8ef8dac7..88dfc1ff42 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -7,7 +7,7 @@ from databricks.sdk.service.jobs import NotebookTask, RunTask from databricks.sdk.service.workspace import ImportFormat -from databricks.labs.ucx.assessment.jobs import JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobOwnership, JobsCrawler, SubmitRunsCrawler from .test_assessment import _SPARK_CONF @@ -63,3 +63,22 @@ def test_job_run_crawler(ws, env_or_skip, inventory_schema, sql_backend): failures = job_run.failures continue assert failures and failures == "[]" + + +def test_job_ownership(ws, runtime_ctx, make_job, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled jobs.""" + + # Set up a job. + # Note: there doesn't seem to be a way to change the owner of a job, so we can't test jobs without an owner. + job = make_job() + + # Produce the crawled records. + crawler = JobsCrawler(ws, sql_backend, inventory_schema) + records = crawler.snapshot(force_refresh=True) + + # Find the crawled record for our pipeline. 
+ pipeline_record = next(record for record in records if record.job_id == job.job_id) + + # Verify ownership is as expected. + ownership = JobOwnership(ws, runtime_ctx.administrator_locator) + assert ownership.owner_of(pipeline_record) == ws.current_user.me().user_name diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 9b7240f73a..cee0e36b19 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -1,8 +1,11 @@ +from unittest.mock import create_autospec, PropertyMock + import pytest -from databricks.labs.lsql import Row from databricks.labs.lsql.backends import MockBackend +from databricks.sdk.service.jobs import BaseJob, JobSettings -from databricks.labs.ucx.assessment.jobs import JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobInfo, JobOwnership, JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.framework.owners import AdministratorLocator from .. import mock_workspace_client @@ -59,12 +62,19 @@ def test_jobs_assessment_with_spn_cluster_no_job_tasks(): assert result_set[0].success == 1 -def test_job_crawler_with_no_owner_should_have_empty_creator_name(): - ws = mock_workspace_client(job_ids=['no-tasks']) - sql_backend = MockBackend() - JobsCrawler(ws, sql_backend, "ucx").snapshot() - result = sql_backend.rows_written_for("hive_metastore.ucx.jobs", "overwrite") - assert result == [Row(job_id='9001', success=1, failures='[]', job_name='No Tasks', creator=None)] +def test_pipeline_crawler_creator(): + ws = mock_workspace_client() + ws.jobs.list.return_value = ( + BaseJob(job_id=1, settings=JobSettings(), creator_user_name=None), + BaseJob(job_id=2, settings=JobSettings(), creator_user_name=""), + BaseJob(job_id=3, settings=JobSettings(), creator_user_name="bob"), + ) + result = JobsCrawler(ws, MockBackend(), "ucx").snapshot(force_refresh=True) + + expected_creators = [None, None, "bob"] + crawled_creators = [record.creator for record in result] + assert len(expected_creators) == len(crawled_creators) + assert set(expected_creators) == set(crawled_creators) @pytest.mark.parametrize( @@ -123,3 +133,27 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): assert len(result) == 1 assert result[0].run_ids == run_ids assert result[0].failures == failures + + +def test_pipeline_owner_creator(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = JobOwnership(ws, admin_locator) + owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) + + assert owner == "bob" + mock_workspace_administrator.assert_not_called() + + +def test_pipeline_owner_creator_unknown(ws) -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = JobOwnership(ws, admin_locator) + owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 8b944b3a6ff8223d6ae4631083726edaba938e41 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 12:32:42 +0200 Subject: [PATCH 38/58] Remove the workspace client from the ownership initializer. 
We don't currently use it, and making it available could encourage a future update to use it which would result in many REST calls: this is not what we want. --- src/databricks/labs/ucx/framework/owners.py | 12 ++++++++---- tests/integration/assessment/test_clusters.py | 4 ++-- tests/integration/assessment/test_jobs.py | 2 +- tests/integration/assessment/test_pipelines.py | 2 +- tests/integration/hive_metastore/test_grants.py | 2 +- tests/integration/hive_metastore/test_tables.py | 4 ++-- tests/integration/hive_metastore/test_udfs.py | 4 ++-- tests/unit/assessment/test_clusters.py | 16 ++++++++-------- tests/unit/assessment/test_jobs.py | 8 ++++---- tests/unit/assessment/test_pipelines.py | 8 ++++---- tests/unit/framework/test_owners.py | 16 +++++++--------- tests/unit/hive_metastore/test_grants.py | 4 ++-- tests/unit/hive_metastore/test_tables.py | 4 ++-- tests/unit/hive_metastore/test_udfs.py | 4 ++-- 14 files changed, 46 insertions(+), 44 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 95efe83c6f..ba3ddf75b5 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -172,9 +172,13 @@ def workspace_administrator(self) -> str: class Ownership(ABC, Generic[Record]): """Determine an owner for a given type of object.""" - def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator) -> None: - self._ws = ws - self._admin_locator = admin_locator + def __init__(self, administrator_locator: AdministratorLocator) -> None: + self._administrator_locator = administrator_locator + + @final + @property + def administrator_locator(self): + return self._administrator_locator @final def owner_of(self, record: Record) -> str: @@ -192,7 +196,7 @@ def owner_of(self, record: Record) -> str: Raises: RuntimeError if there are no active administrators for the current workspace. """ - return self._get_owner(record) or self._admin_locator.workspace_administrator + return self._get_owner(record) or self.administrator_locator.workspace_administrator @abstractmethod def _get_owner(self, record: Record) -> str | None: diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index f688e81960..fab5908a55 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -75,7 +75,7 @@ def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_s ) # Verify ownership is as expected. - ownership = ClusterOwnership(ws, runtime_ctx.administrator_locator) + ownership = ClusterOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(cluster_record_with_owner) == ws.current_user.me().user_name assert "@" in ownership.owner_of(cluster_record_without_owner) @@ -147,5 +147,5 @@ def test_cluster_policy_ownership(ws, runtime_ctx, make_cluster_policy, inventor policy_record = next(record for record in records if record.policy_id == policy.policy_id) # Verify ownership is as expected. 
- ownership = ClusterPolicyOwnership(ws, runtime_ctx.administrator_locator) + ownership = ClusterPolicyOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(policy_record) == ws.current_user.me().user_name diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index 88dfc1ff42..3f8dd1f0c7 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -80,5 +80,5 @@ def test_job_ownership(ws, runtime_ctx, make_job, inventory_schema, sql_backend) pipeline_record = next(record for record in records if record.job_id == job.job_id) # Verify ownership is as expected. - ownership = JobOwnership(ws, runtime_ctx.administrator_locator) + ownership = JobOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(pipeline_record) == ws.current_user.me().user_name diff --git a/tests/integration/assessment/test_pipelines.py b/tests/integration/assessment/test_pipelines.py index 23b119cef8..93f60c850f 100644 --- a/tests/integration/assessment/test_pipelines.py +++ b/tests/integration/assessment/test_pipelines.py @@ -60,5 +60,5 @@ def test_pipeline_ownership(ws, runtime_ctx, make_pipeline, inventory_schema, sq pipeline_record = next(record for record in records if record.pipeline_id == pipeline.pipeline_id) # Verify ownership is as expected. - ownership = PipelineOwnership(ws, runtime_ctx.administrator_locator) + ownership = PipelineOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(pipeline_record) == ws.current_user.me().user_name diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index 68f9ce3369..bbabc1d20e 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -133,5 +133,5 @@ def test_grant_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None grant_record = next(record for record in records if record.this_type_and_key() == ("DATABASE", schema.full_name)) # Verify ownership can be made. - ownership = GrantOwnership(ws, runtime_ctx.administrator_locator) + ownership = GrantOwnership(runtime_ctx.administrator_locator) assert "@" in ownership.owner_of(grant_record) diff --git a/tests/integration/hive_metastore/test_tables.py b/tests/integration/hive_metastore/test_tables.py index 47fa19b1ff..3e79cc00a0 100644 --- a/tests/integration/hive_metastore/test_tables.py +++ b/tests/integration/hive_metastore/test_tables.py @@ -88,7 +88,7 @@ def test_partitioned_tables(ws, sql_backend, make_schema, make_table): assert all_tables[f"{schema.full_name}.non_partitioned_parquet"].is_partitioned is False -def test_table_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: +def test_table_ownership(runtime_ctx, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled tables.""" # This currently isn't very useful: we don't currently locate specific owners for tables. @@ -103,5 +103,5 @@ def test_table_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None table_record = next(record for record in records if record.full_name == table.full_name) # Verify ownership can be made. 
- ownership = TableOwnership(ws, runtime_ctx.administrator_locator) + ownership = TableOwnership(runtime_ctx.administrator_locator) assert "@" in ownership.owner_of(table_record) diff --git a/tests/integration/hive_metastore/test_udfs.py b/tests/integration/hive_metastore/test_udfs.py index eeaa9e0c92..348e4a3c1e 100644 --- a/tests/integration/hive_metastore/test_udfs.py +++ b/tests/integration/hive_metastore/test_udfs.py @@ -26,7 +26,7 @@ def test_describe_all_udfs_in_databases(ws, sql_backend, inventory_schema, make_ assert [udf.failures for udf in udfs if udf.key == hive_udf.full_name] == ["Only SCALAR functions are supported"] -def test_udf_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: +def test_udf_ownership(runtime_ctx, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled UDFs.""" # This currently isn't very useful: we don't currently locate specific owners for UDFs. @@ -41,5 +41,5 @@ def test_udf_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: udf_record = next(r for r in records if f"{r.catalog}.{r.database}.{r.name}" == udf.full_name) # Verify ownership can be made. - ownership = UdfOwnership(ws, runtime_ctx.administrator_locator) + ownership = UdfOwnership(runtime_ctx.administrator_locator) assert "@" in ownership.owner_of(udf_record) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index d97696f31c..6363dbf45a 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -185,23 +185,23 @@ def test_unsupported_clusters(): assert result_set[0].failures == '["cluster type not supported : LEGACY_PASSTHROUGH"]' -def test_cluster_owner_creator(ws) -> None: +def test_cluster_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) type(admin_locator).workspace_administrator = PropertyMock() - ownership = ClusterOwnership(ws, admin_locator) + ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) assert owner == "bob" admin_locator.workspace_administrator.assert_not_called() -def test_cluster_owner_creator_unknown(ws) -> None: +def test_cluster_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = ClusterOwnership(ws, admin_locator) + ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" @@ -274,24 +274,24 @@ def test_policy_without_failure(): assert result_set[0].failures == '[]' -def test_cluster_policy_owner_creator(ws) -> None: +def test_cluster_policy_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = ClusterPolicyOwnership(ws, admin_locator) + ownership = ClusterPolicyOwnership(admin_locator) owner = ownership.owner_of(PolicyInfo(creator="bob", policy_id="1", policy_name="foo", success=1, failures="[]")) assert owner == "bob" mock_workspace_administrator.assert_not_called() -def test_cluster_policy_owner_creator_unknown(ws) -> None: +def 
test_cluster_policy_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = ClusterPolicyOwnership(ws, admin_locator) + ownership = ClusterPolicyOwnership(admin_locator) owner = ownership.owner_of(PolicyInfo(creator=None, policy_id="1", policy_name="foo", success=1, failures="[]")) assert owner == "an_admin" diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index cee0e36b19..d2e9089044 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -135,24 +135,24 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): assert result[0].failures == failures -def test_pipeline_owner_creator(ws) -> None: +def test_pipeline_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = JobOwnership(ws, admin_locator) + ownership = JobOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) assert owner == "bob" mock_workspace_administrator.assert_not_called() -def test_pipeline_owner_creator_unknown(ws) -> None: +def test_pipeline_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = JobOwnership(ws, admin_locator) + ownership = JobOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) assert owner == "an_admin" diff --git a/tests/unit/assessment/test_pipelines.py b/tests/unit/assessment/test_pipelines.py index 6d3c1ec352..1b93d9040f 100644 --- a/tests/unit/assessment/test_pipelines.py +++ b/tests/unit/assessment/test_pipelines.py @@ -63,24 +63,24 @@ def test_pipeline_crawler_creator(): assert set(expected_creators) == set(crawled_creators) -def test_pipeline_owner_creator(ws) -> None: +def test_pipeline_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = PipelineOwnership(ws, admin_locator) + ownership = PipelineOwnership(admin_locator) owner = ownership.owner_of(PipelineInfo(creator_name="bob", pipeline_id="1", success=1, failures="[]")) assert owner == "bob" mock_workspace_administrator.assert_not_called() -def test_pipeline_owner_creator_unknown(ws) -> None: +def test_pipeline_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = PipelineOwnership(ws, admin_locator) + ownership = PipelineOwnership(admin_locator) owner = ownership.owner_of(PipelineInfo(creator_name=None, pipeline_id="1", success=1, failures="[]")) assert owner == "an_admin" diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py 
index bd7addd37f..8739917963 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -3,7 +3,6 @@ from unittest.mock import create_autospec, Mock, PropertyMock import pytest -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service import iam @@ -20,12 +19,11 @@ class _OwnershipFixture(Ownership[Record]): def __init__( self, - ws: WorkspaceClient, *, owner_fn: Callable[[Record], str | None] = lambda _: None, ): mock_admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - super().__init__(ws, mock_admin_locator) + super().__init__(mock_admin_locator) self._owner_fn = owner_fn self.mock_admin_locator = mock_admin_locator @@ -319,18 +317,18 @@ def test_admin_locator_caches_negative_result(ws) -> None: mock_finder.find_admin_users.assert_called_once() -def test_ownership_prefers_record_owner(ws) -> None: +def test_ownership_prefers_record_owner() -> None: """Verify that if an owner for the record can be found, that is used.""" - ownership = _OwnershipFixture[str](ws, owner_fn=lambda _: "bob") + ownership = _OwnershipFixture[str](owner_fn=lambda _: "bob") owner = ownership.owner_of("school") assert owner == "bob" ownership.mock_admin_locator.workspace_administrator.assert_not_called() -def test_ownership_admin_user_fallback(ws) -> None: +def test_ownership_admin_user_fallback() -> None: """Verify that if no owner for the record can be found, an admin user is returned instead.""" - ownership = _OwnershipFixture[str](ws) + ownership = _OwnershipFixture[str]() type(ownership.mock_admin_locator).workspace_administrator = PropertyMock(return_value="jane") owner = ownership.owner_of("school") @@ -338,9 +336,9 @@ def test_ownership_admin_user_fallback(ws) -> None: assert owner == "jane" -def test_ownership_no_fallback_admin_user_error(ws) -> None: +def test_ownership_no_fallback_admin_user_error() -> None: """Verify that if no owner can be determined, an error is raised.""" - ownership = _OwnershipFixture[str](ws) + ownership = _OwnershipFixture[str]() type(ownership.mock_admin_locator).workspace_administrator = PropertyMock( side_effect=RuntimeError("Mocked admin lookup failure.") ) diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 9ea3cbf002..9d1a04438f 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -530,13 +530,13 @@ def grant_loader() -> list[Grant]: group_manager.assert_not_called() -def test_grant_owner(ws) -> None: +def test_grant_owner() -> None: """Verify that the owner of a crawled grant is an administrator.""" admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = GrantOwnership(ws, admin_locator) + ownership = GrantOwnership(admin_locator) owner = ownership.owner_of(Grant(principal="someone", action_type="SELECT")) assert owner == "an_admin" diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 09addedf58..74a3fd77be 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -660,13 +660,13 @@ def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, s assert "Test getTable warning" in caplog.text -def test_table_owner(ws) -> None: +def test_table_owner() -> 
None: """Verify that the owner of a crawled table is an administrator.""" admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = TableOwnership(ws, admin_locator) + ownership = TableOwnership(admin_locator) table = Table(catalog="main", database="foo", name="bar", object_type="TABLE", table_format="DELTA") owner = ownership.owner_of(table) diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 58c5f24e74..fdff08f259 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -48,13 +48,13 @@ def test_tables_crawler_should_filter_by_database(): assert len(results) == 1 -def test_udf_owner(ws) -> None: +def test_udf_owner() -> None: """Verify that the owner of a crawled UDF is an administrator.""" admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage mock_workspace_administrator = PropertyMock(return_value="an_admin") type(admin_locator).workspace_administrator = mock_workspace_administrator - ownership = UdfOwnership(ws, admin_locator) + ownership = UdfOwnership(admin_locator) udf = Udf( catalog="main", database="foo", From 86582ad3cff32b364dbd5ee11be048eb3165a1ed Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 12:33:25 +0200 Subject: [PATCH 39/58] Ownership implementation for the table migration status records. Integration tests are still required. --- .../hive_metastore/table_migration_status.py | 28 +++++ .../unit/hive_metastore/test_table_migrate.py | 111 +++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 283be4f717..c31f5e3e69 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -8,8 +8,10 @@ from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.tables import Table, TableOwnership logger = logging.getLogger(__name__) @@ -151,3 +153,29 @@ def _iter_schemas(self): except NotFound: logger.warning(f"Catalog {catalog.name} no longer exists. Skipping checking its migration status.") continue + + +class TableMigrationOwnership(Ownership[TableMigrationStatus]): + """Determine ownership of table migration records in the inventory. + + This is the owner of the source table, if it is present in the inventory, otherwise an administrator. 
+ """ + + def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None: + super().__init__(table_ownership.administrator_locator) + self._tables_crawler = tables_crawler + self._table_ownership = table_ownership + self._indexed_tables: dict[tuple[str, str], Table] | None = None + + def _tables_snapshot_index(self, reindex: bool = False) -> dict[tuple[str, str], Table]: + index = self._indexed_tables + if index is None or reindex: + snapshot = self._tables_crawler.snapshot() + index = {(table.database, table.name): table for table in snapshot} + self._indexed_tables = index + return index + + def _get_owner(self, record: TableMigrationStatus) -> str | None: + index = self._tables_snapshot_index() + source_table = index.get((record.src_schema, record.src_table), None) + return self._table_ownership.owner_of(source_table) if source_table is not None else None diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index f8f082c42e..3d988e635a 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1,13 +1,15 @@ import datetime import logging from itertools import cycle -from unittest.mock import create_autospec +from unittest.mock import create_autospec, PropertyMock + import pytest from databricks.labs.lsql.backends import MockBackend, SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore import Mounts from databricks.labs.ucx.hive_metastore.grants import MigrateGrants from databricks.labs.ucx.hive_metastore.locations import Mount @@ -22,11 +24,13 @@ from databricks.labs.ucx.hive_metastore.table_migration_status import ( TableMigrationStatusRefresher, TableMigrationIndex, + TableMigrationOwnership, TableMigrationStatus, TableView, ) from databricks.labs.ucx.hive_metastore.tables import ( Table, + TableOwnership, TablesCrawler, What, ) @@ -1234,3 +1238,108 @@ def test_refresh_migration_status_published_remained_tables(caplog): assert 'remained-hive-metastore-table: hive_metastore.schema1.table3' in caplog.messages assert len(tables) == 1 and tables[0].key == "hive_metastore.schema1.table3" migrate_grants.assert_not_called() + + +def test_table_migration_status_owner() -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + tables_crawler = create_autospec(TablesCrawler) + the_table = Table( + catalog="hive_metastore", + database="foo", + name="bar", + object_type="TABLE", + table_format="DELTA", + location="/some/path", + ) + tables_crawler.snapshot.return_value = [the_table] + table_ownership = create_autospec(TableOwnership) + table_ownership.administrator_locator = admin_locator + table_ownership.owner_of.return_value = "bob" + + ownership = TableMigrationOwnership(tables_crawler, table_ownership) + owner = ownership.owner_of( + TableMigrationStatus( + src_schema="foo", + src_table="bar", + dst_catalog="main", + dst_schema="foo", + dst_table="bar", + ) + ) + + assert owner == "bob" + tables_crawler.snapshot.assert_called_once() + table_ownership.owner_of.assert_called_once_with(the_table) + 
mock_workspace_administrator.assert_not_called() + + +def test_table_migration_status_owner_caches_tables_snapshot() -> None: + """Verify that the tables inventory isn't loaded until needed, and after that isn't loaded repeatedly.""" + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + tables_crawler = create_autospec(TablesCrawler) + a_table = Table( + catalog="hive_metastore", + database="foo", + name="bar", + object_type="TABLE", + table_format="DELTA", + location="/some/path", + ) + b_table = Table( + catalog="hive_metastore", + database="baz", + name="daz", + object_type="TABLE", + table_format="DELTA", + location="/some/path", + ) + tables_crawler.snapshot.return_value = [a_table, b_table] + table_ownership = create_autospec(TableOwnership) + table_ownership.administrator_locator = admin_locator + table_ownership.owner_of.return_value = "bob" + + ownership = TableMigrationOwnership(tables_crawler, table_ownership) + + # Verify the snapshot() hasn't been loaded yet: it isn't needed. + tables_crawler.snapshot.assert_not_called() + + _ = ownership.owner_of( + TableMigrationStatus(src_schema="foo", src_table="bar", dst_catalog="main", dst_schema="foo", dst_table="bar"), + ) + _ = ownership.owner_of( + TableMigrationStatus(src_schema="baz", src_table="daz", dst_catalog="main", dst_schema="foo", dst_table="bar"), + ) + + # Verify the snapshot() wasn't reloaded for the second .owner_of() call. + tables_crawler.snapshot.assert_called_once() + + +def test_table_migration_status_source_table_unknown() -> None: + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + tables_crawler = create_autospec(TablesCrawler) + tables_crawler.snapshot.return_value = [] + table_ownership = create_autospec(TableOwnership) + table_ownership.administrator_locator = admin_locator + + ownership = TableMigrationOwnership(tables_crawler, table_ownership) + + unknown_table = TableMigrationStatus( + src_schema="foo", + src_table="bar", + dst_catalog="main", + dst_schema="foo", + dst_table="bar", + ) + owner = ownership.owner_of(unknown_table) + + assert owner == "an_admin" + table_ownership.owner_of.assert_not_called() From b2e66f2d8292a58f3f6f81f899ec774b6e7ecd12 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 14:11:55 +0200 Subject: [PATCH 40/58] Integration test for table migration ownership. 
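The test verifies the delegation performed by TableMigrationOwnership: a
migration-status record is mapped back to its source table via the tables
crawler's snapshot and the owner of that table is reported, while a record
whose source table is not in the inventory falls back to the workspace
administrator. A condensed sketch of what is being exercised (assuming the
crawlers and an AdministratorLocator are already wired up):

    table_ownership = TableOwnership(administrator_locator)
    migration_ownership = TableMigrationOwnership(tables_crawler, table_ownership)
    owner = migration_ownership.owner_of(migration_record)      # owner of the source table
    fallback = migration_ownership.owner_of(synthetic_record)   # workspace administrator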
--- .../hive_metastore/test_table_migrate.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 tests/integration/hive_metastore/test_table_migrate.py diff --git a/tests/integration/hive_metastore/test_table_migrate.py b/tests/integration/hive_metastore/test_table_migrate.py new file mode 100644 index 0000000000..3e7ea28ac6 --- /dev/null +++ b/tests/integration/hive_metastore/test_table_migrate.py @@ -0,0 +1,41 @@ +import dataclasses + +from databricks.labs.ucx.hive_metastore import TablesCrawler +from databricks.labs.ucx.hive_metastore.table_migration_status import ( + TableMigrationOwnership, + TableMigrationStatus, + TableMigrationStatusRefresher, +) +from databricks.labs.ucx.hive_metastore.tables import TableOwnership + + +def test_table_migration_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None: + """Verify the ownership can be determined for crawled table-migration records.""" + + # A table for which a migration record will be produced. + table = runtime_ctx.make_table() + + # Use the crawlers to produce the migration record. + tables_crawler = TablesCrawler(sql_backend, schema=inventory_schema, include_databases=[table.schema_name]) + table_records = tables_crawler.snapshot(force_refresh=True) + migration_status_refresher = TableMigrationStatusRefresher(ws, sql_backend, table.schema_name, tables_crawler) + migration_records = migration_status_refresher.snapshot(force_refresh=True) + + # Find the crawled records for the table we made. + table_record = next(record for record in table_records if record.full_name == table.full_name) + + def is_migration_record_for_table(record: TableMigrationStatus) -> bool: + return record.src_schema == table.schema_name and record.src_table == table.name + + table_migration_record = next(record for record in migration_records if is_migration_record_for_table(record)) + # Make a synthetic record that doesn't correspond to anything in the inventory. + synthetic_record = dataclasses.replace(table_migration_record, src_table="does_not_exist") + + # Verify for the table that the table owner and the migration status are a match. + table_ownership = TableOwnership(runtime_ctx.administrator_locator) + table_migration_ownership = TableMigrationOwnership(tables_crawler, table_ownership) + assert table_migration_ownership.owner_of(table_migration_record) == table_ownership.owner_of(table_record) + + # Verify the owner of the migration record that corresponds to an unknown table. + workspace_administrator = runtime_ctx.administrator_locator.workspace_administrator + assert table_migration_ownership.owner_of(synthetic_record) == workspace_administrator From 953ff625aa4e30d03b9962cf254777b33344be1b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 15:22:13 +0200 Subject: [PATCH 41/58] Stubbed ownership implementation for direct filesystem access records. 
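Direct filesystem access records do not yet carry creator or owner
information, so the stub always defers to the administrator fallback built
into Ownership.owner_of(). A minimal sketch of the current behaviour
(assuming an AdministratorLocator is available):

    ownership = DirectFsAccessOwnership(administrator_locator)
    owner = ownership.owner_of(DirectFsAccess())
    # Always a workspace administrator until creator details are captured during crawling.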
--- .../labs/ucx/source_code/directfs_access.py | 18 +++++ .../source_code/test_directfs_access.py | 65 +++++++++++++++++++ tests/integration/source_code/test_jobs.py | 1 + .../unit/source_code/test_directfs_access.py | 17 +++++ 4 files changed, 101 insertions(+) create mode 100644 tests/integration/source_code/test_directfs_access.py diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 26acf95215..3b58bddab3 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -7,6 +7,7 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import DatabricksError +from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.base import DirectFsAccess @@ -52,3 +53,20 @@ def _try_fetch(self) -> Iterable[DirectFsAccess]: def _crawl(self) -> Iterable[DirectFsAccess]: return [] # TODO raise NotImplementedError() once CrawlerBase supports empty snapshots + + +class DirectFsAccessOwnership(Ownership[DirectFsAccess]): + """Determine ownership of records reporting direct filesystem access. + + This is intended to be: + + - For queries, the creator of the query (if known). + - For jobs, the owner of the path for the notebook or source (if known). + + At present this information is not gathered during the crawling process, so it can't be reported here. As such + an administrator is currently always reported as the owner. + """ + + def _get_owner(self, record: DirectFsAccess) -> None: + # TODO: Implement this once the creator/ownership information is exposed during crawling. + return None diff --git a/tests/integration/source_code/test_directfs_access.py b/tests/integration/source_code/test_directfs_access.py new file mode 100644 index 0000000000..a462040614 --- /dev/null +++ b/tests/integration/source_code/test_directfs_access.py @@ -0,0 +1,65 @@ +import pytest + +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessOwnership +from databricks.labs.ucx.source_code.jobs import WorkflowLinter +from databricks.labs.ucx.source_code.queries import QueryLinter + + +@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.") +def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory_schema, sql_backend) -> None: + """Verify the ownership of a direct-fs record for a query.""" + + # A dashboard with a query that contains a direct filesystem reference. + query = make_query(sql_query="SELECT * from csv.`dbfs://some_folder/some_file.csv`") + dashboard = make_dashboard(query=query) + + # Produce a DFSA record for the query. + linter = QueryLinter( + runtime_ctx.workspace_client, + TableMigrationIndex([]), + runtime_ctx.directfs_access_crawler_for_queries, + runtime_ctx.used_tables_crawler_for_queries, + include_dashboard_ids=[dashboard.id], + ) + linter.refresh_report(sql_backend, inventory_schema) + + # Find a record for the query. + records = runtime_ctx.directfs_access_crawler_for_queries.snapshot() + query_record = next(record for record in records if record.source_id == f"{dashboard.id}/{query.id}") + + # Verify ownership can be made. 
+ ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator) + assert ownership.owner_of(query_record) == runtime_ctx.workspace_client.current_user.me().user_name + + +@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.") +def test_path_dfsa_ownership( + runtime_ctx, make_notebook, make_job, make_directory, inventory_schema, sql_backend +) -> None: + """Verify the ownership of a direct-fs record for a notebook/source path associated with a job.""" + + # A job with a notebook task that contains direct filesystem access. + notebook_source = b"display(spark.read.csv('/mnt/things/e/f/g'))" + notebook = make_notebook(path=f"{make_directory()}/notebook.py", content=notebook_source) + job = make_job(notebook_path=notebook) + + # Produce a DFSA record for the job. + linter = WorkflowLinter( + runtime_ctx.workspace_client, + runtime_ctx.dependency_resolver, + runtime_ctx.path_lookup, + TableMigrationIndex([]), + runtime_ctx.directfs_access_crawler_for_paths, + runtime_ctx.used_tables_crawler_for_paths, + include_job_ids=[job.job_id], + ) + linter.refresh_report(sql_backend, inventory_schema) + + # Find a record for our job. + records = runtime_ctx.directfs_access_crawler_for_paths.snapshot() + path_record = next(record for record in records if record.source_id == str(notebook)) + + # Verify ownership can be made. + ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator) + assert ownership.owner_of(path_record) == runtime_ctx.workspace_client.current_user.me().user_name diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 12159886b0..15f4e14381 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -35,6 +35,7 @@ @retried(on=[NotFound], timeout=timedelta(minutes=5)) def test_running_real_workflow_linter_job(installation_ctx, make_notebook, make_directory, make_job): + # Broken fixture: the linter reports a problem because the notebook can't be read, not because the DFSA is detected. 
# Deprecated file system path in call to: /mnt/things/e/f/g lint_problem = b"display(spark.read.csv('/mnt/things/e/f/g'))" notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=lint_problem) diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 0c1063b820..953b16dbe2 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -1,11 +1,14 @@ from datetime import datetime +from unittest.mock import create_autospec, PropertyMock from databricks.labs.lsql.backends import MockBackend +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.source_code.base import LineageAtom from databricks.labs.ucx.source_code.directfs_access import ( DirectFsAccessCrawler, DirectFsAccess, + DirectFsAccessOwnership, ) @@ -30,3 +33,17 @@ def test_crawler_appends_dfsas(): crawler.dump_all(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 + + +def test_directfs_access_ownership() -> None: + """Verify that the owner for a direct-fs access record is an administrator.""" + admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage + mock_workspace_administrator = PropertyMock(return_value="an_admin") + type(admin_locator).workspace_administrator = mock_workspace_administrator + + ownership = DirectFsAccessOwnership(admin_locator) + dfsa = DirectFsAccess() + owner = ownership.owner_of(dfsa) + + assert owner == "an_admin" + mock_workspace_administrator.assert_called_once() From 7037b6aa6ef46622c0faa8d1ec35285c645b7198 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 15:33:42 +0200 Subject: [PATCH 42/58] Remove unintentional comment. --- tests/integration/source_code/test_jobs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/source_code/test_jobs.py b/tests/integration/source_code/test_jobs.py index 15f4e14381..12159886b0 100644 --- a/tests/integration/source_code/test_jobs.py +++ b/tests/integration/source_code/test_jobs.py @@ -35,7 +35,6 @@ @retried(on=[NotFound], timeout=timedelta(minutes=5)) def test_running_real_workflow_linter_job(installation_ctx, make_notebook, make_directory, make_job): - # Broken fixture: the linter reports a problem because the notebook can't be read, not because the DFSA is detected. # Deprecated file system path in call to: /mnt/things/e/f/g lint_problem = b"display(spark.read.csv('/mnt/things/e/f/g'))" notebook = make_notebook(path=f"{make_directory()}/notebook.ipynb", content=lint_problem) From 82820511105cbe1de1196c1723a2c0cab1d00977 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 16:08:19 +0200 Subject: [PATCH 43/58] Type hint. 
--- src/databricks/labs/ucx/framework/owners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index ba3ddf75b5..6dfd976304 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -20,7 +20,7 @@ class DataclassInstance(Protocol): class AdministratorFinder(ABC): - def __init__(self, ws: WorkspaceClient): + def __init__(self, ws: WorkspaceClient) -> None: self._ws = ws @abstractmethod From 3d769c6d56b1878b3620b27dcffb3ed5f77d8cec Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 16:16:12 +0200 Subject: [PATCH 44/58] Rename: admin_groups -> admin_group_ids The list contains identifiers, not Group instances. --- src/databricks/labs/ucx/framework/owners.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 6dfd976304..dd34562928 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -79,14 +79,14 @@ def find_admin_users(self) -> Iterable[User]: for group in user.groups: if group.display == "admins" and group.value: candidate_group_ids.add(group.value) - admin_groups = list(self._filter_workspace_groups(candidate_group_ids)) - match admin_groups: + admin_group_ids = list(self._filter_workspace_groups(candidate_group_ids)) + match admin_group_ids: case []: return () case [admin_group]: return (user for user in admin_users if self._member_of_group(user, admin_group)) case _: - msg = f"Multiple 'admins' workspace groups found; something is wrong: {admin_groups}" + msg = f"Multiple 'admins' workspace groups found; something is wrong: {admin_group_ids}" raise RuntimeError(msg) From 3c0a5b4058f9ff0eb06d7aef624fa0ed2b516c52 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 17:01:21 +0200 Subject: [PATCH 45/58] Fix failing integration test. --- tests/integration/assessment/test_jobs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index 3f8dd1f0c7..47fa6f1b81 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -77,8 +77,8 @@ def test_job_ownership(ws, runtime_ctx, make_job, inventory_schema, sql_backend) records = crawler.snapshot(force_refresh=True) # Find the crawled record for our pipeline. - pipeline_record = next(record for record in records if record.job_id == job.job_id) + job_record = next(record for record in records if record.job_id == str(job.job_id)) # Verify ownership is as expected. ownership = JobOwnership(runtime_ctx.administrator_locator) - assert ownership.owner_of(pipeline_record) == ws.current_user.me().user_name + assert ownership.owner_of(job_record) == ws.current_user.me().user_name From 9b39e30c7fd853d2152b0ecb6b99e9554d1dd871 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 7 Oct 2024 17:34:39 +0200 Subject: [PATCH 46/58] Revert a change from this PR. Moved to #2855 instead. 
--- tests/unit/workspace_access/test_tacl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/workspace_access/test_tacl.py b/tests/unit/workspace_access/test_tacl.py index fa6d4614bc..cfa1a2bdc2 100644 --- a/tests/unit/workspace_access/test_tacl.py +++ b/tests/unit/workspace_access/test_tacl.py @@ -339,7 +339,7 @@ def test_tacl_applier_not_applied(): assert not validation_res -def test_tacl_udf_applier(): +def test_tacl_udf_applier(mocker): sql_backend = MockBackend( rows={ "SELECT \\* FROM `hive_metastore`.`test`.`grants`": UCX_GRANTS[ From 8d8191d1f79b2e5cad1ff0a5e2f7be65a6aed118 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 17:00:44 +0200 Subject: [PATCH 47/58] Simplify creator normalisation. --- src/databricks/labs/ucx/assessment/clusters.py | 4 +--- src/databricks/labs/ucx/assessment/jobs.py | 4 +--- src/databricks/labs/ucx/assessment/pipelines.py | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 789e236757..abfad10c6a 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -156,14 +156,12 @@ def _assess_clusters(self, all_clusters): for cluster in all_clusters: if cluster.cluster_source == ClusterSource.JOB: continue - creator = cluster.creator_user_name + creator = cluster.creator_user_name or None if not creator: logger.warning( f"Cluster {cluster.cluster_id} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) - # Normalize empty creator. - creator = None cluster_info = ClusterInfo( cluster_id=cluster.cluster_id if cluster.cluster_id else "", cluster_name=cluster.cluster_name, diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 01d66d93d1..a4cc3cb19e 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -108,14 +108,12 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]: if not job.job_id: continue job_assessment[job.job_id] = set() - creator_user_name = job.creator_user_name + creator_user_name = job.creator_user_name or None if not creator_user_name: logger.warning( f"Job {job.job_id} have Unknown creator, it means that the original creator has been deleted " f"and should be re-created" ) - # Normalization. - creator_user_name = None job_settings = job.settings if not job_settings: diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 2209ba76d9..0507f0903d 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -35,14 +35,12 @@ def _crawl(self) -> Iterable[PipelineInfo]: def _assess_pipelines(self, all_pipelines) -> Iterable[PipelineInfo]: for pipeline in all_pipelines: - creator_name = pipeline.creator_user_name + creator_name = pipeline.creator_user_name or None if not creator_name: logger.warning( f"Pipeline {pipeline.name} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) - # Normalization. 
- creator_name = None pipeline_info = PipelineInfo( pipeline_id=pipeline.pipeline_id, pipeline_name=pipeline.name, From 94a601deed35b1f5e173a7a254cf71419c77ef73 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 17:20:18 +0200 Subject: [PATCH 48/58] Rename method: _get_owner() -> _maybe_direct_owner() --- src/databricks/labs/ucx/assessment/clusters.py | 4 ++-- src/databricks/labs/ucx/assessment/jobs.py | 2 +- src/databricks/labs/ucx/assessment/pipelines.py | 2 +- src/databricks/labs/ucx/framework/owners.py | 4 ++-- src/databricks/labs/ucx/hive_metastore/grants.py | 2 +- .../labs/ucx/hive_metastore/table_migration_status.py | 2 +- src/databricks/labs/ucx/hive_metastore/tables.py | 2 +- src/databricks/labs/ucx/hive_metastore/udfs.py | 2 +- src/databricks/labs/ucx/source_code/directfs_access.py | 2 +- tests/unit/framework/test_owners.py | 2 +- 10 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index abfad10c6a..984ae0e516 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -188,7 +188,7 @@ class ClusterOwnership(Ownership[ClusterInfo]): This is the cluster creator (if known), or otherwise an administrator. """ - def _get_owner(self, record: ClusterInfo) -> str | None: + def _maybe_direct_owner(self, record: ClusterInfo) -> str | None: return record.creator @@ -251,5 +251,5 @@ class ClusterPolicyOwnership(Ownership[PolicyInfo]): This is the creator of the cluster policy (if known), or otherwise an administrator. """ - def _get_owner(self, record: PolicyInfo) -> str | None: + def _maybe_direct_owner(self, record: PolicyInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index a4cc3cb19e..0af2e9aa7c 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -149,7 +149,7 @@ class JobOwnership(Ownership[JobInfo]): This is the pipeline creator (if known), or otherwise an administrator. """ - def _get_owner(self, record: JobInfo) -> str | None: + def _maybe_direct_owner(self, record: JobInfo) -> str | None: return record.creator diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 0507f0903d..f0151f6de3 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -84,5 +84,5 @@ class PipelineOwnership(Ownership[PipelineInfo]): This is the pipeline creator (if known), or otherwise an administrator. """ - def _get_owner(self, record: PipelineInfo) -> str | None: + def _maybe_direct_owner(self, record: PipelineInfo) -> str | None: return record.creator_name diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index dd34562928..aa03be2a1b 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -196,9 +196,9 @@ def owner_of(self, record: Record) -> str: Raises: RuntimeError if there are no active administrators for the current workspace. 
""" - return self._get_owner(record) or self.administrator_locator.workspace_administrator + return self._maybe_direct_owner(record) or self.administrator_locator.workspace_administrator @abstractmethod - def _get_owner(self, record: Record) -> str | None: + def _maybe_direct_owner(self, record: Record) -> str | None: """Obtain the record-specific user-name associated with the given result, if any.""" return None diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 0ffbdd4e0f..ae51c0e1ba 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -388,7 +388,7 @@ class GrantOwnership(Ownership[Grant]): At the present we can't determine a specific owner for grants: we always report an administrator. """ - def _get_owner(self, record: Grant) -> None: + def _maybe_direct_owner(self, record: Grant) -> None: return None diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index c31f5e3e69..a76f06c4c8 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -175,7 +175,7 @@ def _tables_snapshot_index(self, reindex: bool = False) -> dict[tuple[str, str], self._indexed_tables = index return index - def _get_owner(self, record: TableMigrationStatus) -> str | None: + def _maybe_direct_owner(self, record: TableMigrationStatus) -> str | None: index = self._tables_snapshot_index() source_table = index.get((record.src_schema, record.src_table), None) return self._table_ownership.owner_of(source_table) if source_table is not None else None diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 772c874fdd..8d74e4c7d8 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -645,5 +645,5 @@ class TableOwnership(Ownership[Table]): At the present we don't determine a specific owner for tables: we always report an administrator. """ - def _get_owner(self, record: Table) -> None: + def _maybe_direct_owner(self, record: Table) -> None: return None diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 01ec95bfe5..d5e4bd90bd 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -144,5 +144,5 @@ class UdfOwnership(Ownership[Udf]): At the present we don't determine a specific owner for UDFs: we always report an administrator. """ - def _get_owner(self, record: Udf) -> None: + def _maybe_direct_owner(self, record: Udf) -> None: return None diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 3b58bddab3..342f371d05 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -67,6 +67,6 @@ class DirectFsAccessOwnership(Ownership[DirectFsAccess]): an administrator is currently always reported as the owner. """ - def _get_owner(self, record: DirectFsAccess) -> None: + def _maybe_direct_owner(self, record: DirectFsAccess) -> None: # TODO: Implement this once the creator/ownership information is exposed during crawling. 
return None diff --git a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index 8739917963..bbb3a12001 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -27,7 +27,7 @@ def __init__( self._owner_fn = owner_fn self.mock_admin_locator = mock_admin_locator - def _get_owner(self, record: Record) -> str | None: + def _maybe_direct_owner(self, record: Record) -> str | None: return self._owner_fn(record) From b6278909dbe1f66722af8ad606d0000e334e7c98 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 17:46:39 +0200 Subject: [PATCH 49/58] Simplify the code a bit for locating members of the 'admins' workspace group. --- src/databricks/labs/ucx/framework/owners.py | 38 ++++++++------------- 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index aa03be2a1b..aff839694c 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -46,19 +46,13 @@ def _is_active_admin(self, user: User) -> bool: """Determine if a user is an active administrator.""" return bool(user.active) and self._member_of_group_named(user, "admins") - def _filter_workspace_groups(self, identifiers: Iterable[str]) -> Iterable[str]: - """Limit a set of identifiers to those that are workspace groups.""" - seen = set() - for group_id in identifiers: - if group_id in seen: - continue - seen.add(group_id) - try: - group = self._ws.groups.get(group_id) - except NotFound: - continue - if group.meta and group.meta.resource_type == "WorkspaceGroup": - yield group_id + def _is_workspace_group(self, group_id: str) -> bool: + """Determine whether a group_id corresponds to a workspace group or not.""" + try: + group = self._ws.groups.get(group_id) + except NotFound: + return False + return bool(group.meta and group.meta.resource_type == "WorkspaceGroup") def find_admin_users(self) -> Iterable[User]: """Enumerate the active workspace administrators in a given workspace. @@ -72,22 +66,18 @@ def find_admin_users(self) -> Iterable[User]: # Reference: https://learn.microsoft.com/en-us/azure/databricks/admin/users-groups/groups#account-vs-workspace-group admin_users = [user for user in all_users if user.user_name and self._is_active_admin(user)] logger.debug(f"Verifying membership of the 'admins' workspace group for users: {admin_users}") - candidate_group_ids = set() + maybe_admins_id = set() for user in admin_users: if not user.groups: continue for group in user.groups: if group.display == "admins" and group.value: - candidate_group_ids.add(group.value) - admin_group_ids = list(self._filter_workspace_groups(candidate_group_ids)) - match admin_group_ids: - case []: - return () - case [admin_group]: - return (user for user in admin_users if self._member_of_group(user, admin_group)) - case _: - msg = f"Multiple 'admins' workspace groups found; something is wrong: {admin_group_ids}" - raise RuntimeError(msg) + maybe_admins_id.add(group.value) + # There can only be a single 'admins' workspace group. 
+ for group_id in maybe_admins_id: + if self._is_workspace_group(group_id): + return (user for user in admin_users if self._member_of_group(user, group_id)) + return () class AccountAdministratorFinder(AdministratorFinder): From ae8d194e594118a12f606a277b9ff85a9a97d5ec Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 18:48:48 +0200 Subject: [PATCH 50/58] Replace a property (with expensive side-effects) with a getter method. --- src/databricks/labs/ucx/framework/owners.py | 5 ++-- tests/integration/framework/test_owners.py | 2 +- .../hive_metastore/test_table_migrate.py | 2 +- tests/unit/assessment/test_clusters.py | 26 ++++++++--------- tests/unit/assessment/test_jobs.py | 15 ++++------ tests/unit/assessment/test_pipelines.py | 15 ++++------ tests/unit/framework/test_owners.py | 28 +++++++++---------- tests/unit/hive_metastore/test_grants.py | 9 +++--- .../unit/hive_metastore/test_table_migrate.py | 15 ++++------ tests/unit/hive_metastore/test_tables.py | 9 +++--- tests/unit/hive_metastore/test_udfs.py | 9 +++--- .../unit/source_code/test_directfs_access.py | 9 +++--- 12 files changed, 61 insertions(+), 83 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index aff839694c..dc42819354 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -145,8 +145,7 @@ def _found_admin(self) -> str | None: found_admin_users: Iterable[User | None] = (first_user(finder.find_admin_users()) for finder in finders) return next((user.user_name for user in found_admin_users if user), None) - @property - def workspace_administrator(self) -> str: + def get_workspace_administrator(self) -> str: """The user-name of an admin user for the workspace. Raises: @@ -186,7 +185,7 @@ def owner_of(self, record: Record) -> str: Raises: RuntimeError if there are no active administrators for the current workspace. """ - return self._maybe_direct_owner(record) or self.administrator_locator.workspace_administrator + return self._maybe_direct_owner(record) or self.administrator_locator.get_workspace_administrator() @abstractmethod def _maybe_direct_owner(self, record: Record) -> str | None: diff --git a/tests/integration/framework/test_owners.py b/tests/integration/framework/test_owners.py index 904ffcd1e0..670d5817a2 100644 --- a/tests/integration/framework/test_owners.py +++ b/tests/integration/framework/test_owners.py @@ -3,6 +3,6 @@ def test_fallback_workspace_admin(installation_ctx: RuntimeContext) -> None: """Verify that a workspace administrator can be found for our integration environment.""" - an_admin = installation_ctx.administrator_locator.workspace_administrator + an_admin = installation_ctx.administrator_locator.get_workspace_administrator() assert "@" in an_admin diff --git a/tests/integration/hive_metastore/test_table_migrate.py b/tests/integration/hive_metastore/test_table_migrate.py index 3e7ea28ac6..e9ba362a86 100644 --- a/tests/integration/hive_metastore/test_table_migrate.py +++ b/tests/integration/hive_metastore/test_table_migrate.py @@ -37,5 +37,5 @@ def is_migration_record_for_table(record: TableMigrationStatus) -> bool: assert table_migration_ownership.owner_of(table_migration_record) == table_ownership.owner_of(table_record) # Verify the owner of the migration record that corresponds to an unknown table. 
- workspace_administrator = runtime_ctx.administrator_locator.workspace_administrator + workspace_administrator = runtime_ctx.administrator_locator.get_workspace_administrator() assert table_migration_ownership.owner_of(synthetic_record) == workspace_administrator diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 6363dbf45a..c86c3f60f0 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -1,5 +1,5 @@ import json -from unittest.mock import MagicMock, PropertyMock, create_autospec, mock_open, patch +from unittest.mock import MagicMock, create_autospec, mock_open, patch import pytest from databricks.labs.lsql.backends import MockBackend @@ -187,25 +187,23 @@ def test_unsupported_clusters(): def test_cluster_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) - type(admin_locator).workspace_administrator = PropertyMock() ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) assert owner == "bob" - admin_locator.workspace_administrator.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() def test_cluster_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() def test_policy_crawler(): @@ -275,24 +273,22 @@ def test_policy_without_failure(): def test_cluster_policy_owner_creator() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = ClusterPolicyOwnership(admin_locator) owner = ownership.owner_of(PolicyInfo(creator="bob", policy_id="1", policy_name="foo", success=1, failures="[]")) assert owner == "bob" - mock_workspace_administrator.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() def test_cluster_policy_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = ClusterPolicyOwnership(admin_locator) owner = ownership.owner_of(PolicyInfo(creator=None, policy_id="1", policy_name="foo", success=1, failures="[]")) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 
d2e9089044..8ec3e89077 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -1,4 +1,4 @@ -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend @@ -136,24 +136,21 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): def test_pipeline_owner_creator() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) ownership = JobOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) assert owner == "bob" - mock_workspace_administrator.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() def test_pipeline_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = JobOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/assessment/test_pipelines.py b/tests/unit/assessment/test_pipelines.py index 1b93d9040f..949e441f78 100644 --- a/tests/unit/assessment/test_pipelines.py +++ b/tests/unit/assessment/test_pipelines.py @@ -1,4 +1,4 @@ -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec from databricks.labs.lsql.backends import MockBackend from databricks.sdk.service.pipelines import GetPipelineResponse, PipelineStateInfo @@ -64,24 +64,21 @@ def test_pipeline_crawler_creator(): def test_pipeline_owner_creator() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) ownership = PipelineOwnership(admin_locator) owner = ownership.owner_of(PipelineInfo(creator_name="bob", pipeline_id="1", success=1, failures="[]")) assert owner == "bob" - mock_workspace_administrator.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() def test_pipeline_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = PipelineOwnership(admin_locator) owner = ownership.owner_of(PipelineInfo(creator_name=None, pipeline_id="1", success=1, failures="[]")) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git 
a/tests/unit/framework/test_owners.py b/tests/unit/framework/test_owners.py index bbb3a12001..25dd465b6f 100644 --- a/tests/unit/framework/test_owners.py +++ b/tests/unit/framework/test_owners.py @@ -1,6 +1,6 @@ import re from collections.abc import Callable, Sequence -from unittest.mock import create_autospec, Mock, PropertyMock +from unittest.mock import create_autospec, Mock import pytest from databricks.sdk.errors import NotFound @@ -217,7 +217,7 @@ def test_admin_locator_prefers_workspace_admin_over_account_admin(ws) -> None: _setup_accounts(ws, account_users=account_users, workspace_users=workspace_users, groups=[admins_group]) locator = AdministratorLocator(ws) - the_admin = locator.workspace_administrator + the_admin = locator.get_workspace_administrator() assert the_admin == "bob" # Also verify that we didn't attempt to look up account admins. @@ -236,7 +236,7 @@ def test_admin_locator_prefer_first_workspace_admin_alphabetically(ws) -> None: _setup_accounts(ws, workspace_users=workspace_users, groups=[admins_group]) locator = AdministratorLocator(ws) - the_admin = locator.workspace_administrator + the_admin = locator.get_workspace_administrator() assert the_admin == "andrew" @@ -251,7 +251,7 @@ def test_admin_locator_prefer_first_account_admin_alphabetically(ws) -> None: _setup_accounts(ws, account_users=account_users) locator = AdministratorLocator(ws) - the_admin = locator.workspace_administrator + the_admin = locator.get_workspace_administrator() assert the_admin == "andrew" @@ -265,7 +265,7 @@ def test_admin_locator_error_when_no_admin(ws) -> None: workspace_id = ws.get_workspace_id() expected_message = f"No active workspace or account administrator can be found for workspace: {workspace_id}" with pytest.raises(RuntimeError, match=re.escape(expected_message)): - _ = locator.workspace_administrator + _ = locator.get_workspace_administrator() def test_admin_locator_is_lazy(ws) -> None: @@ -279,7 +279,7 @@ def test_admin_locator_is_lazy(ws) -> None: mock_finder_factory.assert_not_called() mock_finder.assert_not_called() - _ = locator.workspace_administrator + _ = locator.get_workspace_administrator() mock_finder_factory.assert_called_once_with(ws) mock_finder.find_admin_users.assert_called_once() @@ -293,8 +293,8 @@ def test_admin_locator_caches_result(ws) -> None: mock_finder_factory.return_value = mock_finder locator = AdministratorLocator(ws, finders=[mock_finder_factory]) - _ = locator.workspace_administrator - _ = locator.workspace_administrator + _ = locator.get_workspace_administrator() + _ = locator.get_workspace_administrator() mock_finder_factory.assert_called_once_with(ws) mock_finder.find_admin_users.assert_called_once() @@ -309,9 +309,9 @@ def test_admin_locator_caches_negative_result(ws) -> None: locator = AdministratorLocator(ws, finders=[mock_finder_factory]) with pytest.raises(RuntimeError): - _ = locator.workspace_administrator + _ = locator.get_workspace_administrator() with pytest.raises(RuntimeError): - _ = locator.workspace_administrator + _ = locator.get_workspace_administrator() mock_finder_factory.assert_called_once_with(ws) mock_finder.find_admin_users.assert_called_once() @@ -323,13 +323,13 @@ def test_ownership_prefers_record_owner() -> None: owner = ownership.owner_of("school") assert owner == "bob" - ownership.mock_admin_locator.workspace_administrator.assert_not_called() + ownership.mock_admin_locator.get_workspace_administrator.assert_not_called() def test_ownership_admin_user_fallback() -> None: """Verify that if no owner for the record can be 
found, an admin user is returned instead.""" ownership = _OwnershipFixture[str]() - type(ownership.mock_admin_locator).workspace_administrator = PropertyMock(return_value="jane") + ownership.mock_admin_locator.get_workspace_administrator.return_value = "jane" owner = ownership.owner_of("school") @@ -339,9 +339,7 @@ def test_ownership_admin_user_fallback() -> None: def test_ownership_no_fallback_admin_user_error() -> None: """Verify that if no owner can be determined, an error is raised.""" ownership = _OwnershipFixture[str]() - type(ownership.mock_admin_locator).workspace_administrator = PropertyMock( - side_effect=RuntimeError("Mocked admin lookup failure.") - ) + ownership.mock_admin_locator.get_workspace_administrator.side_effect = RuntimeError("Mocked admin lookup failure.") with pytest.raises(RuntimeError, match="Mocked admin lookup failure."): _ = ownership.owner_of("school") diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 9d1a04438f..7f31824e02 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -1,5 +1,5 @@ import logging -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend @@ -532,12 +532,11 @@ def grant_loader() -> list[Grant]: def test_grant_owner() -> None: """Verify that the owner of a crawled grant is an administrator.""" - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = GrantOwnership(admin_locator) owner = ownership.owner_of(Grant(principal="someone", action_type="SELECT")) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index a0f5cee48f..3518a0888a 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1,7 +1,7 @@ import datetime import logging from itertools import cycle -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend, SqlBackend @@ -1241,9 +1241,7 @@ def test_refresh_migration_status_published_remained_tables(caplog): def test_table_migration_status_owner() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) tables_crawler = create_autospec(TablesCrawler) the_table = Table( @@ -1273,14 +1271,12 @@ def test_table_migration_status_owner() -> None: assert owner == "bob" tables_crawler.snapshot.assert_called_once() table_ownership.owner_of.assert_called_once_with(the_table) - mock_workspace_administrator.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() def test_table_migration_status_owner_caches_tables_snapshot() -> None: """Verify that the tables inventory isn't loaded until 
needed, and after that isn't loaded repeatedly.""" admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator tables_crawler = create_autospec(TablesCrawler) a_table = Table( @@ -1321,9 +1317,8 @@ def test_table_migration_status_owner_caches_tables_snapshot() -> None: def test_table_migration_status_source_table_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" tables_crawler = create_autospec(TablesCrawler) tables_crawler.snapshot.return_value = [] diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 12cc4158f0..440bdcc597 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -1,6 +1,6 @@ import logging import sys -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend @@ -663,13 +663,12 @@ def test_fast_table_scan_crawler_crawl_test_warnings_get_table(caplog, mocker, s def test_table_owner() -> None: """Verify that the owner of a crawled table is an administrator.""" - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = TableOwnership(admin_locator) table = Table(catalog="main", database="foo", name="bar", object_type="TABLE", table_format="DELTA") owner = ownership.owner_of(table) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index fdff08f259..d1c87d66ad 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -1,4 +1,4 @@ -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec from databricks.labs.lsql.backends import MockBackend @@ -50,9 +50,8 @@ def test_tables_crawler_should_filter_by_database(): def test_udf_owner() -> None: """Verify that the owner of a crawled UDF is an administrator.""" - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = UdfOwnership(admin_locator) udf = Udf( @@ -70,4 +69,4 @@ def test_udf_owner() -> None: owner = ownership.owner_of(udf) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/source_code/test_directfs_access.py 
b/tests/unit/source_code/test_directfs_access.py index 953b16dbe2..c02ad07315 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -1,5 +1,5 @@ from datetime import datetime -from unittest.mock import create_autospec, PropertyMock +from unittest.mock import create_autospec from databricks.labs.lsql.backends import MockBackend @@ -37,13 +37,12 @@ def test_crawler_appends_dfsas(): def test_directfs_access_ownership() -> None: """Verify that the owner for a direct-fs access record is an administrator.""" - admin_locator = create_autospec(AdministratorLocator) # pylint: disable=mock-no-usage - mock_workspace_administrator = PropertyMock(return_value="an_admin") - type(admin_locator).workspace_administrator = mock_workspace_administrator + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = DirectFsAccessOwnership(admin_locator) dfsa = DirectFsAccess() owner = ownership.owner_of(dfsa) assert owner == "an_admin" - mock_workspace_administrator.assert_called_once() + admin_locator.get_workspace_administrator.assert_called_once() From a6b5da0b3539ac370a2387bf0895a997e7928428 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 19:01:17 +0200 Subject: [PATCH 51/58] Avoid exposing the admin-finder on the ownership interface. --- src/databricks/labs/ucx/framework/owners.py | 2 +- .../labs/ucx/hive_metastore/table_migration_status.py | 2 +- tests/unit/hive_metastore/test_table_migrate.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index dc42819354..79424197cf 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -185,7 +185,7 @@ def owner_of(self, record: Record) -> str: Raises: RuntimeError if there are no active administrators for the current workspace. 
""" - return self._maybe_direct_owner(record) or self.administrator_locator.get_workspace_administrator() + return self._maybe_direct_owner(record) or self._administrator_locator.get_workspace_administrator() @abstractmethod def _maybe_direct_owner(self, record: Record) -> str | None: diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index a76f06c4c8..767bb7d7fe 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -162,7 +162,7 @@ class TableMigrationOwnership(Ownership[TableMigrationStatus]): """ def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None: - super().__init__(table_ownership.administrator_locator) + super().__init__(table_ownership._administrator_locator) self._tables_crawler = tables_crawler self._table_ownership = table_ownership self._indexed_tables: dict[tuple[str, str], Table] | None = None diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 3518a0888a..b9378875dc 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1254,7 +1254,7 @@ def test_table_migration_status_owner() -> None: ) tables_crawler.snapshot.return_value = [the_table] table_ownership = create_autospec(TableOwnership) - table_ownership.administrator_locator = admin_locator + table_ownership._administrator_locator = admin_locator # pylint: disable=protected-access table_ownership.owner_of.return_value = "bob" ownership = TableMigrationOwnership(tables_crawler, table_ownership) @@ -1297,7 +1297,7 @@ def test_table_migration_status_owner_caches_tables_snapshot() -> None: ) tables_crawler.snapshot.return_value = [a_table, b_table] table_ownership = create_autospec(TableOwnership) - table_ownership.administrator_locator = admin_locator + table_ownership._administrator_locator = admin_locator # pylint: disable=protected-access table_ownership.owner_of.return_value = "bob" ownership = TableMigrationOwnership(tables_crawler, table_ownership) @@ -1323,7 +1323,7 @@ def test_table_migration_status_source_table_unknown() -> None: tables_crawler = create_autospec(TablesCrawler) tables_crawler.snapshot.return_value = [] table_ownership = create_autospec(TableOwnership) - table_ownership.administrator_locator = admin_locator + table_ownership._administrator_locator = admin_locator # pylint: disable=protected-access ownership = TableMigrationOwnership(tables_crawler, table_ownership) From 33d9c138963d7ad733a3521e1eacfbd133d683bf Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 8 Oct 2024 19:25:02 +0200 Subject: [PATCH 52/58] Refactor a sequence of generator comprehensions into a for-loop for readability. 
--- src/databricks/labs/ucx/framework/owners.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 79424197cf..2ce77aca59 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -1,4 +1,3 @@ -import functools import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence @@ -138,12 +137,21 @@ def _workspace_id(self) -> int: @cached_property def _found_admin(self) -> str | None: + + # Ordering helper: User.user_name is typed as optional but we can't sort by None. + # (The finders already filter out users without a user-name.) + def _by_username(user: User) -> str: + assert user.user_name + return user.user_name + # Lazily instantiate and query the finders in an attempt to locate an admin user. - finders = (finder(self._ws) for finder in self._finders) - # If a finder returns multiple admin users, use the first (alphabetically by user-name). - first_user = functools.partial(min, default=None, key=lambda user: user.user_name) - found_admin_users: Iterable[User | None] = (first_user(finder.find_admin_users()) for finder in finders) - return next((user.user_name for user in found_admin_users if user), None) + for factory in self._finders: + finder = factory(self._ws) + # First alphabetically by name. + admin_user = min(finder.find_admin_users(), default=None, key=_by_username) + if admin_user: + return admin_user.user_name + return None def get_workspace_administrator(self) -> str: """The user-name of an admin user for the workspace. From c6de10908f53a6115f552c52f615511f994a73d2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 11:20:41 +0200 Subject: [PATCH 53/58] Remove documentation that the ownership classes report an admin user if a user directly associated with the resource cannot be located. --- src/databricks/labs/ucx/assessment/clusters.py | 4 ++-- src/databricks/labs/ucx/assessment/jobs.py | 2 +- src/databricks/labs/ucx/assessment/pipelines.py | 2 +- src/databricks/labs/ucx/hive_metastore/grants.py | 2 +- .../labs/ucx/hive_metastore/table_migration_status.py | 2 +- src/databricks/labs/ucx/hive_metastore/tables.py | 2 +- src/databricks/labs/ucx/hive_metastore/udfs.py | 2 +- src/databricks/labs/ucx/source_code/directfs_access.py | 3 +-- 8 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 984ae0e516..0e0624d3c2 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -185,7 +185,7 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: class ClusterOwnership(Ownership[ClusterInfo]): """Determine ownership of clusters in the inventory. - This is the cluster creator (if known), or otherwise an administrator. + This is the cluster creator (if known). """ def _maybe_direct_owner(self, record: ClusterInfo) -> str | None: @@ -248,7 +248,7 @@ def _try_fetch(self) -> Iterable[PolicyInfo]: class ClusterPolicyOwnership(Ownership[PolicyInfo]): """Determine ownership of cluster policies in the inventory. - This is the creator of the cluster policy (if known), or otherwise an administrator. + This is the creator of the cluster policy (if known). 
""" def _maybe_direct_owner(self, record: PolicyInfo) -> str | None: diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 0af2e9aa7c..3c6a4afa84 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -146,7 +146,7 @@ def _check_jar_task(self, all_task: list[RunTask]) -> list[str]: class JobOwnership(Ownership[JobInfo]): """Determine ownership of jobs (workflows) in the inventory. - This is the pipeline creator (if known), or otherwise an administrator. + This is the job creator (if known). """ def _maybe_direct_owner(self, record: JobInfo) -> str | None: diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index f0151f6de3..19bc8c558b 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -81,7 +81,7 @@ def _try_fetch(self) -> Iterable[PipelineInfo]: class PipelineOwnership(Ownership[PipelineInfo]): """Determine ownership of pipelines in the inventory. - This is the pipeline creator (if known), or otherwise an administrator. + This is the pipeline creator (if known). """ def _maybe_direct_owner(self, record: PipelineInfo) -> str | None: diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 428b5dab42..22b99fa992 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -386,7 +386,7 @@ def grants( class GrantOwnership(Ownership[Grant]): """Determine ownership of grants in the inventory. - At the present we can't determine a specific owner for grants: we always report an administrator. + At the present we can't determine a specific owner for grants. """ def _maybe_direct_owner(self, record: Grant) -> None: diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 767bb7d7fe..bd96652962 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -158,7 +158,7 @@ def _iter_schemas(self): class TableMigrationOwnership(Ownership[TableMigrationStatus]): """Determine ownership of table migration records in the inventory. - This is the owner of the source table, if it is present in the inventory, otherwise an administrator. + This is the owner of the source table, if (and only if) the source table is present in the inventory. """ def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None: diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index d8a6e48d09..31643604e8 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -642,7 +642,7 @@ def _create_describe_tasks(self, catalog: str, database: str, table_names: list[ class TableOwnership(Ownership[Table]): """Determine ownership of tables in the inventory. - At the present we don't determine a specific owner for tables: we always report an administrator. + At the present we don't determine a specific owner for tables. 
""" def _maybe_direct_owner(self, record: Table) -> None: diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index d5e4bd90bd..74196c543c 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -141,7 +141,7 @@ def _assess_udfs(udfs: Iterable[Udf]) -> Iterable[Udf]: class UdfOwnership(Ownership[Udf]): """Determine ownership of UDFs in the inventory. - At the present we don't determine a specific owner for UDFs: we always report an administrator. + At the present we don't determine a specific owner for UDFs. """ def _maybe_direct_owner(self, record: Udf) -> None: diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 342f371d05..f9d02bfb7d 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -63,8 +63,7 @@ class DirectFsAccessOwnership(Ownership[DirectFsAccess]): - For queries, the creator of the query (if known). - For jobs, the owner of the path for the notebook or source (if known). - At present this information is not gathered during the crawling process, so it can't be reported here. As such - an administrator is currently always reported as the owner. + At present this information is not gathered during the crawling process, so it can't be reported here. """ def _maybe_direct_owner(self, record: DirectFsAccess) -> None: From 4deaf93899768dc8d4e7ce680126dc048a40b3c9 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 11:21:55 +0200 Subject: [PATCH 54/58] Docstring improvements. --- src/databricks/labs/ucx/framework/owners.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 2ce77aca59..150b0be8d8 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -183,13 +183,13 @@ def owner_of(self, record: Record) -> str: This is intended to be a point of contact, and is either: - - The user that originally created the resource associated with the result; or + - A user directly associated with the resource, such as the original creator; or - An active administrator for the current workspace. Args: record (Record): The record for which an associated user-name is sought. Returns: - A string containing the user-name attribute of the user considered to own the resource. + A string containing the user-name attribute of a user considered to be responsible for the resource. Raises: RuntimeError if there are no active administrators for the current workspace. 
""" From 47d5343230f2ff1b5a3b652badfb94e0b5622d51 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 11:22:10 +0200 Subject: [PATCH 55/58] Fix incorrect term: result -> record --- src/databricks/labs/ucx/framework/owners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 150b0be8d8..692fa04ce9 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -197,5 +197,5 @@ def owner_of(self, record: Record) -> str: @abstractmethod def _maybe_direct_owner(self, record: Record) -> str | None: - """Obtain the record-specific user-name associated with the given result, if any.""" + """Obtain the record-specific user-name associated with the given record, if any.""" return None From d74c2411c703986ed3508337d5125c4251d00e51 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 11:22:27 +0200 Subject: [PATCH 56/58] Remove property. --- src/databricks/labs/ucx/framework/owners.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/databricks/labs/ucx/framework/owners.py b/src/databricks/labs/ucx/framework/owners.py index 692fa04ce9..4edef7a5e8 100644 --- a/src/databricks/labs/ucx/framework/owners.py +++ b/src/databricks/labs/ucx/framework/owners.py @@ -172,11 +172,6 @@ class Ownership(ABC, Generic[Record]): def __init__(self, administrator_locator: AdministratorLocator) -> None: self._administrator_locator = administrator_locator - @final - @property - def administrator_locator(self): - return self._administrator_locator - @final def owner_of(self, record: Record) -> str: """Obtain the user-name of a user that is responsible for the given record. From 409bdf929f3b28ed88ae5cfc7158f965b4809202 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 12:12:00 +0200 Subject: [PATCH 57/58] Update some ownership integration tests to verify the complete admin username. --- tests/integration/hive_metastore/test_grants.py | 2 +- tests/integration/hive_metastore/test_tables.py | 2 +- tests/integration/hive_metastore/test_udfs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/hive_metastore/test_grants.py b/tests/integration/hive_metastore/test_grants.py index bbabc1d20e..a89c0b94e1 100644 --- a/tests/integration/hive_metastore/test_grants.py +++ b/tests/integration/hive_metastore/test_grants.py @@ -134,4 +134,4 @@ def test_grant_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None # Verify ownership can be made. ownership = GrantOwnership(runtime_ctx.administrator_locator) - assert "@" in ownership.owner_of(grant_record) + assert ownership.owner_of(grant_record) == runtime_ctx.administrator_locator.get_workspace_administrator() diff --git a/tests/integration/hive_metastore/test_tables.py b/tests/integration/hive_metastore/test_tables.py index 3e79cc00a0..efd554591a 100644 --- a/tests/integration/hive_metastore/test_tables.py +++ b/tests/integration/hive_metastore/test_tables.py @@ -104,4 +104,4 @@ def test_table_ownership(runtime_ctx, inventory_schema, sql_backend) -> None: # Verify ownership can be made. 
ownership = TableOwnership(runtime_ctx.administrator_locator) - assert "@" in ownership.owner_of(table_record) + assert ownership.owner_of(table_record) == runtime_ctx.administrator_locator.get_workspace_administrator() diff --git a/tests/integration/hive_metastore/test_udfs.py b/tests/integration/hive_metastore/test_udfs.py index 348e4a3c1e..2107267f9d 100644 --- a/tests/integration/hive_metastore/test_udfs.py +++ b/tests/integration/hive_metastore/test_udfs.py @@ -42,4 +42,4 @@ def test_udf_ownership(runtime_ctx, inventory_schema, sql_backend) -> None: # Verify ownership can be made. ownership = UdfOwnership(runtime_ctx.administrator_locator) - assert "@" in ownership.owner_of(udf_record) + assert ownership.owner_of(udf_record) == runtime_ctx.administrator_locator.get_workspace_administrator() From 215a1be06dbbd46565058a7ee04af5ae38ea2593 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 9 Oct 2024 12:56:46 +0200 Subject: [PATCH 58/58] Update integration test for cluster ownership. For some reason deleting the owner of a cluster doesn't clear the creator field, even though the documentation says it should. Instead we just check that the creator is actually returned. --- tests/integration/assessment/test_clusters.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index fab5908a55..8cf0622220 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -56,28 +56,29 @@ def _change_cluster_owner(ws, cluster_id: str, owner_user_name: str) -> None: def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled clusters.""" - # Set up two clusters: one with an owner (us) and another without. + # Set up two clusters: one with us as owner and one for a different user. + # TODO: Figure out how to clear the creator for a cluster. + # (Contrary to the documentation for the creator field, deleting the user doesn't clear it immediately and waiting + # for 10 min doesn't help: the UI reports no creator, but the REST API continues to report the deleted user.) another_user = make_user() - cluster_with_owner = make_cluster(single_node=True, spark_conf=_SPARK_CONF) - cluster_without_owner = make_cluster(single_node=True, spark_conf=_SPARK_CONF) - ws.clusters.delete_and_wait(cluster_id=cluster_without_owner.cluster_id) - _change_cluster_owner(ws, cluster_without_owner.cluster_id, owner_user_name=another_user.user_name) - ws.users.delete(another_user.id) + my_cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF) + their_cluster = make_cluster(single_node=True, spark_conf=_SPARK_CONF) + ws.clusters.delete_and_wait(cluster_id=their_cluster.cluster_id) + _change_cluster_owner(ws, their_cluster.cluster_id, owner_user_name=another_user.user_name) # Produce the crawled records. crawler = ClustersCrawler(ws, sql_backend, inventory_schema) records = crawler.snapshot(force_refresh=True) # Find the crawled records for our clusters. 
- cluster_record_with_owner = next(record for record in records if record.cluster_id == cluster_with_owner.cluster_id) - cluster_record_without_owner = next( - record for record in records if record.cluster_id == cluster_without_owner.cluster_id - ) + my_cluster_record = next(record for record in records if record.cluster_id == my_cluster.cluster_id) + their_cluster_record = next(record for record in records if record.cluster_id == their_cluster.cluster_id) # Verify ownership is as expected. - ownership = ClusterOwnership(runtime_ctx.administrator_locator) - assert ownership.owner_of(cluster_record_with_owner) == ws.current_user.me().user_name - assert "@" in ownership.owner_of(cluster_record_without_owner) + administrator_locator = runtime_ctx.administrator_locator + ownership = ClusterOwnership(administrator_locator) + assert ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name + assert ownership.owner_of(their_cluster_record) == another_user.user_name def test_cluster_crawler_mlr_no_isolation(ws, make_cluster, inventory_schema, sql_backend):