Crawler: support for object ownership #2774


Merged: 66 commits, Oct 9, 2024
Changes from 1 commit
573fca2
Crawler support for object ownership.
asnare Oct 1, 2024
b8f7e69
Move the ownership code into its own module, and stub unit tests.
asnare Oct 3, 2024
07fa875
Skip users that don't have a user-name.
asnare Oct 3, 2024
28daa7e
Sort by the user-name attribute, not name.
asnare Oct 3, 2024
f4e247e
Materialize list earlier, to aid debugging.
asnare Oct 3, 2024
d0c22db
Documentation references for how administrators are marked for worksp…
asnare Oct 3, 2024
467f912
Ensure that unit tests reset the (class-level) cache before they start.
asnare Oct 3, 2024
33cb841
Fix mock workspace identifier to have the correct type.
asnare Oct 3, 2024
3a1868c
Trivial integration test for locating an administrator.
asnare Oct 3, 2024
ec23bb0
Start implementing unit tests for the Ownership component.
asnare Oct 3, 2024
b9dd2a3
Refactor fixture code for mocking accounts and groups.
asnare Oct 3, 2024
a6b46c1
Merge branch 'main' into inventory-object-owners
asnare Oct 3, 2024
57bf8c3
Revert plumbing the workspace client into CrawlerBase
asnare Oct 3, 2024
7db7aa0
More reverting.
asnare Oct 3, 2024
7676f7c
Whitespace.
asnare Oct 3, 2024
7010237
Implement more unit tests.
asnare Oct 3, 2024
d1e24eb
Refactor workspace/account admin lookup into separate components.
asnare Oct 3, 2024
9155e19
Update integration test for locating a workspace admin to test the lo…
asnare Oct 4, 2024
9980c20
Refactor unit tests for the ownership-related classes.
asnare Oct 4, 2024
fded489
Merge branch 'main' into inventory-object-owners
asnare Oct 4, 2024
53da23d
Deal with some comprehension issues.
asnare Oct 4, 2024
83044e8
Implement ownership for the ClusterInfo inventory class.
asnare Oct 4, 2024
ed38942
Docstring for cluster ownership.
asnare Oct 4, 2024
5d4c994
Check some mock interactions.
asnare Oct 4, 2024
348d9b0
Improve docstring clarity.
asnare Oct 4, 2024
5cf6f30
Fix unit test.
asnare Oct 4, 2024
8d4de1f
Suppress pylint false positive.
asnare Oct 4, 2024
4a597a2
Implement ownership for cluster policies.
asnare Oct 4, 2024
1818d46
Fix linting suppression.
asnare Oct 4, 2024
c13b4eb
Use runtime context for integration tests instead of installation con…
asnare Oct 4, 2024
7e66e70
Fix xfail marker for integration test.
asnare Oct 4, 2024
f7942aa
Implement ownership for grants.
asnare Oct 4, 2024
3cb9abf
Use a longer variable name.
asnare Oct 4, 2024
2120322
Whitespace.
asnare Oct 4, 2024
e2189f4
Ownership implementation for tables.
asnare Oct 4, 2024
0d8e48b
Ownership implementation of UDFs.
asnare Oct 4, 2024
28ab56a
Ensure fewer unnecessary mock interactions.
asnare Oct 4, 2024
cc7db1c
Ownership implementation for pipelines.
asnare Oct 4, 2024
0dd3f11
Merge branch 'main' into inventory-object-owners
asnare Oct 4, 2024
64048e1
Merge branch 'main' into inventory-object-owners
asnare Oct 7, 2024
8f0265f
Ownership implementation for Jobs.
asnare Oct 7, 2024
8b944b3
Remove the workspace client from the ownership initializer.
asnare Oct 7, 2024
86582ad
Ownership implementation for the table migration status records.
asnare Oct 7, 2024
b2e66f2
Integration test for table migration ownership.
asnare Oct 7, 2024
953ff62
Stubbed ownership implementation for direct filesystem access records.
asnare Oct 7, 2024
7037b6a
Remove unintentional comment.
asnare Oct 7, 2024
8282051
Type hint.
asnare Oct 7, 2024
3d769c6
Rename: admin_groups -> admin_group_ids
asnare Oct 7, 2024
3c0a5b4
Fix failing integration test.
asnare Oct 7, 2024
9b39e30
Revert a change from this PR.
asnare Oct 7, 2024
7029bf9
Merge branch 'main' into inventory-object-owners
asnare Oct 8, 2024
8d8191d
Simplify creator normalisation.
asnare Oct 8, 2024
94a601d
Rename method: _get_owner() -> _maybe_direct_owner()
asnare Oct 8, 2024
b627890
Simplify the code a bit for locating members of the 'admins' workspac…
asnare Oct 8, 2024
ae8d194
Replace a property (with expensive side-effects) with a getter method.
asnare Oct 8, 2024
a6b5da0
Avoid exposing the admin-finder on the ownership interface.
asnare Oct 8, 2024
33d9c13
Refactor a sequence of generator comprehensions into a for-loop for r…
asnare Oct 8, 2024
d677179
Merge branch 'main' into inventory-object-owners
asnare Oct 8, 2024
b68a6c4
Merge branch 'main' into inventory-object-owners
asnare Oct 9, 2024
c6de109
Remove documentation that the ownership classes report an admin user …
asnare Oct 9, 2024
4deaf93
Docstring improvements.
asnare Oct 9, 2024
47d5343
Fix incorrect term: result -> record
asnare Oct 9, 2024
d74c241
Remove property.
asnare Oct 9, 2024
736ecfb
Merge branch 'main' into inventory-object-owners
asnare Oct 9, 2024
409bdf9
Update some ownership integration tests to verify the complete admin …
asnare Oct 9, 2024
215a1be
Update integration test for cluster ownership.
asnare Oct 9, 2024
3 changes: 1 addition & 2 deletions src/databricks/labs/ucx/assessment/azure.py
@@ -42,8 +42,7 @@ class ServicePrincipalClusterMapping:

class AzureServicePrincipalCrawler(CrawlerBase[AzureServicePrincipalInfo], JobsMixin, SecretsMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "azure_service_principals", AzureServicePrincipalInfo)

def _try_fetch(self) -> Iterable[AzureServicePrincipalInfo]:
for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
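The same two-line change recurs across all the crawler modules in this PR: each subclass stops assigning `self._ws` itself and instead forwards the workspace client to the base initializer. A minimal sketch of the pattern, using stand-in classes rather than the real Databricks SDK types:

```python
# Hypothetical, simplified versions of the classes in this PR; the real
# CrawlerBase also takes a SqlBackend and a record dataclass.

class WorkspaceClient:  # stand-in for databricks.sdk.WorkspaceClient
    pass

class CrawlerBase:
    def __init__(self, ws: WorkspaceClient, catalog: str, schema: str, table: str):
        self._ws = ws  # previously each subclass assigned self._ws on its own
        self._catalog = catalog
        self._schema = schema
        self._table = table

class ClustersCrawler(CrawlerBase):
    def __init__(self, ws: WorkspaceClient, schema: str):
        # After the refactor the subclass simply forwards `ws` to the base class.
        super().__init__(ws, "hive_metastore", schema, "clusters")

crawler = ClustersCrawler(WorkspaceClient(), "inventory")
```

Centralising the client in the base class is what lets `CrawlerBase` offer workspace-level services (such as the admin lookup below) to every crawler.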
6 changes: 2 additions & 4 deletions src/databricks/labs/ucx/assessment/clusters.py
@@ -143,8 +143,7 @@ def _check_cluster_failures(self, cluster: ClusterDetails, source: str) -> list[

class ClustersCrawler(CrawlerBase[ClusterInfo], CheckClusterMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str):
super().__init__(sbe, "hive_metastore", schema, "clusters", ClusterInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "clusters", ClusterInfo)

def _crawl(self) -> Iterable[ClusterInfo]:
all_clusters = list(self._ws.clusters.list())
@@ -192,8 +191,7 @@ class PolicyInfo:

class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "policies", PolicyInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "policies", PolicyInfo)

def _crawl(self) -> Iterable[PolicyInfo]:
all_policices = list(self._ws.cluster_policies.list())
3 changes: 1 addition & 2 deletions src/databricks/labs/ucx/assessment/init_scripts.py
@@ -42,8 +42,7 @@ def check_init_script(self, init_script_data: str | None, source: str) -> list[s

class GlobalInitScriptCrawler(CrawlerBase[GlobalInitScriptInfo], CheckInitScriptMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "global_init_scripts", GlobalInitScriptInfo)

def _crawl(self) -> Iterable[GlobalInitScriptInfo]:
all_global_init_scripts = list(self._ws.global_init_scripts.list())
6 changes: 2 additions & 4 deletions src/databricks/labs/ucx/assessment/jobs.py
@@ -72,8 +72,7 @@ def _job_clusters(job):

class JobsCrawler(CrawlerBase[JobInfo], JobsMixin, CheckClusterMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "jobs", JobInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "jobs", JobInfo)

def _crawl(self) -> Iterable[JobInfo]:
all_jobs = list(self._ws.jobs.list(expand_tasks=True))
@@ -159,8 +158,7 @@ class SubmitRunsCrawler(CrawlerBase[SubmitRunInfo], JobsMixin, CheckClusterMixin
]

def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str, num_days_history: int):
super().__init__(sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "submit_runs", SubmitRunInfo)
self._num_days_history = num_days_history

@staticmethod
3 changes: 1 addition & 2 deletions src/databricks/labs/ucx/assessment/pipelines.py
@@ -24,8 +24,7 @@ class PipelineInfo:

class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin):
def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema):
super().__init__(sbe, "hive_metastore", schema, "pipelines", PipelineInfo)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "pipelines", PipelineInfo)

def _crawl(self) -> Iterable[PipelineInfo]:
all_pipelines = list(self._ws.pipelines.list_pipelines())
20 changes: 16 additions & 4 deletions src/databricks/labs/ucx/contexts/application.py
@@ -201,6 +201,7 @@ def legacy_table_acl_support(self):
@cached_property
def permission_manager(self):
return PermissionManager(
self.workspace_client,
self.sql_backend,
self.inventory_database,
[
@@ -232,11 +233,21 @@ def grants_crawler(self):

@cached_property
def udfs_crawler(self):
return UdfsCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
return UdfsCrawler(
self.workspace_client,
self.sql_backend,
self.inventory_database,
self.config.include_databases,
)

@cached_property
def tables_crawler(self):
return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
return TablesCrawler(
self.workspace_client,
self.sql_backend,
self.inventory_database,
self.config.include_databases,
)

@cached_property
def tables_migrator(self):
@@ -443,11 +454,11 @@ def query_linter(self):

@cached_property
def directfs_access_crawler_for_paths(self):
return DirectFsAccessCrawler.for_paths(self.sql_backend, self.inventory_database)
return DirectFsAccessCrawler.for_paths(self.workspace_client, self.sql_backend, self.inventory_database)

@cached_property
def directfs_access_crawler_for_queries(self):
return DirectFsAccessCrawler.for_queries(self.sql_backend, self.inventory_database)
return DirectFsAccessCrawler.for_queries(self.workspace_client, self.sql_backend, self.inventory_database)

@cached_property
def redash(self):
@@ -476,6 +487,7 @@ def data_comparator(self):
@cached_property
def migration_recon(self):
return MigrationRecon(
self.workspace_client,
self.sql_backend,
self.inventory_database,
self.migration_status_refresher,
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/contexts/workflow_task.py
@@ -72,7 +72,7 @@ def pipelines_crawler(self):

@cached_property
def table_size_crawler(self):
return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
return TableSizeCrawler(self.tables_crawler)

@cached_property
def policies_crawler(self):
@@ -84,7 +84,7 @@ def global_init_scripts_crawler(self):

@cached_property
def tables_crawler(self):
return FasterTableScanCrawler(self.sql_backend, self.inventory_database)
return FasterTableScanCrawler(self.workspace_client, self.sql_backend, self.inventory_database)

@cached_property
def tables_in_mounts(self):
66 changes: 63 additions & 3 deletions src/databricks/labs/ucx/framework/crawlers.py
@@ -1,12 +1,14 @@
import logging
from abc import ABC, abstractmethod
from collections.abc import Callable, Iterable, Sequence
from typing import ClassVar, Generic, Literal, Protocol, TypeVar
from functools import cached_property
from typing import ClassVar, Generic, Literal, Protocol, TypeVar, final

from databricks.labs.lsql.backends import SqlBackend
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import NotFound

from databricks.labs.ucx.framework.utils import escape_sql_identifier
from databricks.labs.ucx.framework.utils import escape_sql_identifier, find_an_admin

logger = logging.getLogger(__name__)

@@ -21,17 +23,25 @@ class DataclassInstance(Protocol):


class CrawlerBase(ABC, Generic[Result]):
def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]):

_cached_workspace_admins: dict[int, str | RuntimeError] = {}
"""Cached user names of workspace administrators, keyed by workspace id."""

def __init__(
self, ws: WorkspaceClient, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]
):
"""
Initializes a CrawlerBase instance.

Args:
ws (WorkspaceClient): A client for the current workspace.
backend (SqlBackend): The backend that executes SQL queries:
Statement Execution API or Databricks Runtime.
catalog (str): The catalog name for the inventory persistence.
schema: The schema name for the inventory persistence.
table: The table name for the inventory persistence.
"""
self._ws = ws
self._catalog = self._valid(catalog)
self._schema = self._valid(schema)
self._table = self._valid(table)
@@ -107,6 +117,56 @@ def snapshot(self, *, force_refresh: bool = False) -> Iterable[Result]:
"""
return self._snapshot(self._try_fetch, self._crawl, force_refresh=force_refresh)

@final
def owner_of(self, result: Result) -> str:
"""Obtain the user-name of a user that is responsible for the given record.

This is intended to be a point of contact, and is either:

- The user that originally created the resource associated with the result; or
- An active administrator for the current workspace.

Args:
result (Result): The record for which an associated user-name is sought.
Returns:
A string containing the user-name attribute of the user considered to own the resource.
Raises:
RuntimeError if there are no active administrators for the current workspace.
"""
return self._result_owner(result) or self._workspace_admin

@cached_property
def _workspace_admin(self) -> str:
# Avoid repeatedly hitting the shared cache.
return self._find_administrator_for(self._ws)

@classmethod
@final
def _find_administrator_for(cls, ws: WorkspaceClient) -> str:
# Finding an administrator is quite expensive, so we ensure that for a given workspace we only
# do it once.
workspace_id = ws.get_workspace_id()
found_admin_or_error = cls._cached_workspace_admins.get(workspace_id, None)
if isinstance(found_admin_or_error, str):
return found_admin_or_error
if isinstance(found_admin_or_error, RuntimeError):
raise found_admin_or_error

found_admin = find_an_admin(ws)
if found_admin is None or not found_admin.user_name:
msg = f"No active workspace or account administrator can be found for workspace: {workspace_id}"
error = RuntimeError(msg)
cls._cached_workspace_admins[workspace_id] = error
raise error
user_name = found_admin.user_name
cls._cached_workspace_admins[workspace_id] = user_name
return user_name

@classmethod
def _result_owner(cls, result: Result) -> str | None: # pylint: disable=unused-argument
"""Obtain the record-specific user-name associated with the given result, if any."""
return None

@abstractmethod
def _try_fetch(self) -> Iterable[Result]:
"""Fetch existing data that has (previously) been crawled by this crawler.
55 changes: 55 additions & 0 deletions src/databricks/labs/ucx/framework/utils.py
@@ -1,5 +1,11 @@
import functools
import logging
import subprocess
from collections.abc import Iterable

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.iam import User


logger = logging.getLogger(__name__)

@@ -22,6 +28,55 @@ def escape_sql_identifier(path: str, *, maxsplit: int = 2) -> str:
return ".".join(escaped)


def _has_role(user: User, role: str) -> bool:
return user.roles is not None and any(r.value == role for r in user.roles)


def find_workspace_admins(ws: WorkspaceClient) -> Iterable[User]:
"""Enumerate the active workspace administrators in a given workspace.

Arguments:
ws (WorkspaceClient): The client for the workspace whose administrators should be enumerated.
Returns:
Iterable[User]: The active workspace administrators, if any.
"""
all_users = ws.users.list(attributes="id,active,userName,roles")
return (user for user in all_users if user.active and _has_role(user, "workspace_admin"))
Collaborator: do we have that role and it's no longer the admins group membership?

Author (Contributor): No, this was just a guess while I figured out how it's supposed to be done. (And indeed, it now uses membership of the admins workspace group.)


def find_account_admins(ws: WorkspaceClient) -> Iterable[User]:
"""Enumerate the active account administrators associated with a given workspace.

Arguments:
ws (WorkspaceClient): The client for the workspace whose account administrators should be enumerated.
Returns:
Iterable[User]: The active account administrators, if any.
"""
response = ws.api_client.do(
"GET", "/api/2.0/account/scim/v2/Users", query={"attributes": "id,active,userName,roles"}
)
assert isinstance(response, dict)
all_users = (User.from_dict(resource) for resource in response.get("Resources", []))
return (user for user in all_users if user.active and _has_role(user, "account_admin"))


def find_an_admin(ws: WorkspaceClient) -> User | None:
"""Locate an active administrator for the current workspace.

If an active workspace administrator can be located, this is returned. When there are multiple, they are sorted
alphabetically by user-name and the first is returned. If there are no workspace administrators then an active
account administrator is sought, again returning the first alphabetically by user-name if there is more than one.

Arguments:
ws (WorkspaceClient): The client for the workspace for which an administrator should be located.
Returns:
the first (alphabetically by user-name) active workspace or account administrator, or `None` if neither can be
found.
"""
first_user = functools.partial(min, default=None, key=lambda user: user.name)
return first_user(find_workspace_admins(ws)) or first_user(find_account_admins(ws))
Collaborator: don't forget to sort, because each invocation the first user will be different.

Author (Contributor): This implementation is already sorting: head(sort(…)) ≡ min(…).¹

¹ It was using the wrong attribute; it should be user.user_name and that's now fixed.

def run_command(command: str | list[str]) -> tuple[int, str, str]:
args = command.split() if isinstance(command, str) else command
logger.info(f"Invoking command: {args!r}")
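The review thread above turns on `find_an_admin` being deterministic: `min` with a `key` picks the same element as sorting and taking the head, and `default=None` lets an empty workspace-admin set fall through to the account admins via `or`. A sketch of that selection logic, with plain dicts standing in for the SDK's `User` records:

```python
import functools

# Stand-in user records; the real code uses databricks.sdk.service.iam.User
# and keys on user.user_name.
workspace_admins = [{"user_name": "zoe@example.com"}, {"user_name": "amy@example.com"}]
account_admins = [{"user_name": "bob@example.com"}]

# min(..., default=None) returns None for an empty iterable instead of raising,
# so the workspace lookup can fall through to the account lookup with `or`.
first_user = functools.partial(min, default=None, key=lambda u: u["user_name"])

def find_an_admin():
    return first_user(workspace_admins) or first_user(account_admins)

picked = find_an_admin()
# head(sort(xs)) and min(xs) agree when they use the same key:
assert picked == sorted(workspace_admins, key=lambda u: u["user_name"])[0]
```

Because `min` scans the iterable once without materialising a sorted copy, it is both deterministic and cheaper than `sorted(...)[0]` for the same result.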
3 changes: 2 additions & 1 deletion src/databricks/labs/ucx/hive_metastore/grants.py
@@ -199,10 +199,11 @@ class GrantsCrawler(CrawlerBase[Grant]):
"""Crawler that captures access controls that relate to data and other securable objects."""

def __init__(self, tc: TablesCrawler, udf: UdfsCrawler, include_databases: list[str] | None = None):
assert tc._ws == udf._ws
assert tc._backend == udf._backend
assert tc._catalog == udf._catalog
assert tc._schema == udf._schema
super().__init__(tc._backend, tc._catalog, tc._schema, "grants", Grant)
super().__init__(tc._ws, tc._backend, tc._catalog, tc._schema, "grants", Grant)
self._tc = tc
self._udf = udf
self._include_databases = include_databases
11 changes: 4 additions & 7 deletions src/databricks/labs/ucx/hive_metastore/locations.py
@@ -117,8 +117,7 @@ class ExternalLocations(CrawlerBase[ExternalLocation]):
_prefix_size: ClassVar[list[int]] = [1, 12]

def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema: str):
super().__init__(sbe, "hive_metastore", schema, "external_locations", ExternalLocation)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "external_locations", ExternalLocation)

def _external_locations(self, tables: list[Row], mounts) -> Iterable[ExternalLocation]:
min_slash = 2
@@ -301,8 +300,7 @@ def save_as_terraform_definitions_on_workspace(self, installation: Installation)

class Mounts(CrawlerBase[Mount]):
def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str):
super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount)
self._dbutils = ws.dbutils
super().__init__(ws, backend, "hive_metastore", inventory_database, "mounts", Mount)

@staticmethod
def _deduplicate_mounts(mounts: list) -> list:
@@ -320,7 +318,7 @@ def _deduplicate_mounts(mounts: list) -> list:

def _crawl(self) -> Iterable[Mount]:
mounts = []
for mount_point, source, _ in self._dbutils.fs.mounts():
for mount_point, source, _ in self._ws.dbutils.fs.mounts():
mounts.append(Mount(mount_point, source))
return self._deduplicate_mounts(mounts)

@@ -356,11 +354,10 @@ def __init__(
exclude_paths_in_mount: list[str] | None = None,
include_paths_in_mount: list[str] | None = None,
):
super().__init__(backend, "hive_metastore", inventory_database, "tables", Table)
super().__init__(ws, backend, "hive_metastore", inventory_database, "tables", Table)
self._dbutils = ws.dbutils
self._mounts_crawler = mc
self._include_mounts = include_mounts
self._ws = ws
self._include_paths_in_mount = include_paths_in_mount

irrelevant_patterns = {'_SUCCESS', '_committed_', '_started_'}
@@ -76,8 +76,7 @@ class TableMigrationStatusRefresher(CrawlerBase[TableMigrationStatus]):
"""

def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema, table_crawler: TablesCrawler):
super().__init__(sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus)
self._ws = ws
super().__init__(ws, sbe, "hive_metastore", schema, "migration_status", TableMigrationStatus)
self._table_crawler = table_crawler

def index(self, *, force_refresh: bool = False) -> TableMigrationIndex: