Skip to content

Commit 35d04aa

Browse files
authored
Crawler: support for object ownership (#2774)
## Changes As part of #2761 we need to have a way for determining the user responsible for some of our inventory types. This PR updates the crawler framework so that: - There is a way to identify an owner of a resource referred to by inventory records. - When the owner cannot be identified, a workspace or account administrator is used instead. ### Linked issues Progresses #2761. ### Functionality - A component for locating an administrator user. - Ownership information for the following inventory types: - [X] `ClusterInfo` - [x] `DirectFsAccess` (stubbed) - [X] `Grant` - [x] `JobInfo` - [x] `PipelineInfo` - [X] `PolicyInfo` - [x] `Table` - [x] `TableMigrationStatus` - [x] `UDF` ### Tests - [x] added unit tests - [x] added integration tests
1 parent 223d570 commit 35d04aa

33 files changed

+1341
-78
lines changed

src/databricks/labs/ucx/assessment/clusters.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
)
3030
from databricks.labs.ucx.assessment.init_scripts import CheckInitScriptMixin
3131
from databricks.labs.ucx.framework.crawlers import CrawlerBase
32+
from databricks.labs.ucx.framework.owners import Ownership
3233
from databricks.labs.ucx.framework.utils import escape_sql_identifier
3334

3435
logger = logging.getLogger(__name__)
@@ -43,6 +44,7 @@ class ClusterInfo:
4344
policy_id: str | None = None
4445
cluster_name: str | None = None
4546
creator: str | None = None
47+
"""User-name of the creator of the cluster, if known."""
4648

4749

4850
class CheckClusterMixin(CheckInitScriptMixin):
@@ -154,7 +156,8 @@ def _assess_clusters(self, all_clusters):
154156
for cluster in all_clusters:
155157
if cluster.cluster_source == ClusterSource.JOB:
156158
continue
157-
if not cluster.creator_user_name:
159+
creator = cluster.creator_user_name or None
160+
if not creator:
158161
logger.warning(
159162
f"Cluster {cluster.cluster_id} have Unknown creator, it means that the original creator "
160163
f"has been deleted and should be re-created"
@@ -164,7 +167,7 @@ def _assess_clusters(self, all_clusters):
164167
cluster_name=cluster.cluster_name,
165168
policy_id=cluster.policy_id,
166169
spark_version=cluster.spark_version,
167-
creator=cluster.creator_user_name,
170+
creator=creator,
168171
success=1,
169172
failures="[]",
170173
)
@@ -179,6 +182,16 @@ def _try_fetch(self) -> Iterable[ClusterInfo]:
179182
yield ClusterInfo(*row)
180183

181184

185+
class ClusterOwnership(Ownership[ClusterInfo]):
186+
"""Determine ownership of clusters in the inventory.
187+
188+
This is the cluster creator (if known).
189+
"""
190+
191+
def _maybe_direct_owner(self, record: ClusterInfo) -> str | None:
192+
return record.creator
193+
194+
182195
@dataclass
183196
class PolicyInfo:
184197
policy_id: str
@@ -188,6 +201,7 @@ class PolicyInfo:
188201
spark_version: str | None = None
189202
policy_description: str | None = None
190203
creator: str | None = None
204+
"""User-name of the creator of the cluster policy, if known."""
191205

192206

193207
class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin):
@@ -210,7 +224,7 @@ def _assess_policies(self, all_policices) -> Iterable[PolicyInfo]:
210224
except KeyError:
211225
spark_version = None
212226
policy_name = policy.name
213-
creator_name = policy.creator_user_name
227+
creator_name = policy.creator_user_name or None
214228

215229
policy_info = PolicyInfo(
216230
policy_id=policy.policy_id,
@@ -229,3 +243,13 @@ def _assess_policies(self, all_policices) -> Iterable[PolicyInfo]:
229243
def _try_fetch(self) -> Iterable[PolicyInfo]:
230244
for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
231245
yield PolicyInfo(*row)
246+
247+
248+
class ClusterPolicyOwnership(Ownership[PolicyInfo]):
249+
"""Determine ownership of cluster policies in the inventory.
250+
251+
This is the creator of the cluster policy (if known).
252+
"""
253+
254+
def _maybe_direct_owner(self, record: PolicyInfo) -> str | None:
255+
return record.creator

src/databricks/labs/ucx/assessment/jobs.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from databricks.labs.ucx.assessment.clusters import CheckClusterMixin
2626
from databricks.labs.ucx.assessment.crawlers import spark_version_compatibility
2727
from databricks.labs.ucx.framework.crawlers import CrawlerBase
28+
from databricks.labs.ucx.framework.owners import Ownership
2829
from databricks.labs.ucx.framework.utils import escape_sql_identifier
2930

3031
logger = logging.getLogger(__name__)
@@ -37,6 +38,7 @@ class JobInfo:
3738
failures: str
3839
job_name: str | None = None
3940
creator: str | None = None
41+
"""User-name of the creator of the pipeline, if known."""
4042

4143

4244
class JobsMixin:
@@ -106,7 +108,8 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]:
106108
if not job.job_id:
107109
continue
108110
job_assessment[job.job_id] = set()
109-
if not job.creator_user_name:
111+
creator_user_name = job.creator_user_name or None
112+
if not creator_user_name:
110113
logger.warning(
111114
f"Job {job.job_id} have Unknown creator, it means that the original creator has been deleted "
112115
f"and should be re-created"
@@ -122,7 +125,7 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]:
122125
job_details[job.job_id] = JobInfo(
123126
job_id=str(job.job_id),
124127
job_name=job_name,
125-
creator=job.creator_user_name,
128+
creator=creator_user_name,
126129
success=1,
127130
failures="[]",
128131
)
@@ -140,6 +143,16 @@ def _check_jar_task(self, all_task: list[RunTask]) -> list[str]:
140143
return task_failures
141144

142145

146+
class JobOwnership(Ownership[JobInfo]):
147+
"""Determine ownership of jobs (workflows) in the inventory.
148+
149+
This is the job creator (if known).
150+
"""
151+
152+
def _maybe_direct_owner(self, record: JobInfo) -> str | None:
153+
return record.creator
154+
155+
143156
@dataclass
144157
class SubmitRunInfo:
145158
run_ids: str # JSON-encoded list of run ids

src/databricks/labs/ucx/assessment/pipelines.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from databricks.labs.ucx.assessment.clusters import CheckClusterMixin
1010
from databricks.labs.ucx.framework.crawlers import CrawlerBase
11+
from databricks.labs.ucx.framework.owners import Ownership
1112
from databricks.labs.ucx.framework.utils import escape_sql_identifier
1213

1314
logger = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ class PipelineInfo:
2021
failures: str
2122
pipeline_name: str | None = None
2223
creator_name: str | None = None
24+
"""User-name of the creator of the pipeline, if known."""
2325

2426

2527
class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin):
@@ -33,15 +35,16 @@ def _crawl(self) -> Iterable[PipelineInfo]:
3335

3436
def _assess_pipelines(self, all_pipelines) -> Iterable[PipelineInfo]:
3537
for pipeline in all_pipelines:
36-
if not pipeline.creator_user_name:
38+
creator_name = pipeline.creator_user_name or None
39+
if not creator_name:
3740
logger.warning(
3841
f"Pipeline {pipeline.name} have Unknown creator, it means that the original creator "
3942
f"has been deleted and should be re-created"
4043
)
4144
pipeline_info = PipelineInfo(
4245
pipeline_id=pipeline.pipeline_id,
4346
pipeline_name=pipeline.name,
44-
creator_name=pipeline.creator_user_name,
47+
creator_name=creator_name,
4548
success=1,
4649
failures="[]",
4750
)
@@ -73,3 +76,13 @@ def _pipeline_clusters(self, clusters, failures):
7376
def _try_fetch(self) -> Iterable[PipelineInfo]:
7477
for row in self._fetch(f"SELECT * FROM {escape_sql_identifier(self.full_name)}"):
7578
yield PipelineInfo(*row)
79+
80+
81+
class PipelineOwnership(Ownership[PipelineInfo]):
82+
"""Determine ownership of pipelines in the inventory.
83+
84+
This is the pipeline creator (if known).
85+
"""
86+
87+
def _maybe_direct_owner(self, record: PipelineInfo) -> str | None:
88+
return record.creator_name

src/databricks/labs/ucx/contexts/application.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from databricks.labs.ucx.assessment.export import AssessmentExporter
2929
from databricks.labs.ucx.aws.credentials import CredentialManager
3030
from databricks.labs.ucx.config import WorkspaceConfig
31+
from databricks.labs.ucx.framework.owners import AdministratorLocator
3132
from databricks.labs.ucx.hive_metastore import ExternalLocations, Mounts, TablesCrawler
3233
from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema
3334
from databricks.labs.ucx.hive_metastore.grants import (
@@ -514,6 +515,10 @@ def migration_recon(self) -> MigrationRecon:
514515
self.config.recon_tolerance_percent,
515516
)
516517

518+
@cached_property
519+
def administrator_locator(self) -> AdministratorLocator:
520+
return AdministratorLocator(self.workspace_client)
521+
517522

518523
class CliContext(GlobalContext, abc.ABC):
519524
@cached_property

src/databricks/labs/ucx/contexts/workflow_task.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def pipelines_crawler(self) -> PipelinesCrawler:
7373

7474
@cached_property
7575
def table_size_crawler(self) -> TableSizeCrawler:
76-
return TableSizeCrawler(self.sql_backend, self.inventory_database, self.config.include_databases)
76+
return TableSizeCrawler(self.tables_crawler)
7777

7878
@cached_property
7979
def policies_crawler(self) -> PoliciesCrawler:

src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class DataclassInstance(Protocol):
2121

2222

2323
class CrawlerBase(ABC, Generic[Result]):
24-
def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]):
24+
def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]) -> None:
2525
"""
2626
Initializes a CrawlerBase instance.
2727

0 commit comments

Comments
 (0)