Skip to content

Commit 21cafaa

Browse files
authored
Determine ownership of tables based on grants and source code (#3066)
Determine ownership of tables in the inventory based on the following rules: - If a table is owned by a principal in the grants table, then that principal is the owner. - If a table is written to by a query, then the owner of that query is the owner of the table. - If a table is written to by a notebook or file, then the owner of the path is the owner of the table.
1 parent 123c89a commit 21cafaa

File tree

12 files changed

+200
-65
lines changed

12 files changed

+200
-65
lines changed

src/databricks/labs/ucx/contexts/application.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from databricks.labs.ucx.assessment.export import AssessmentExporter
2929
from databricks.labs.ucx.aws.credentials import CredentialManager
3030
from databricks.labs.ucx.config import WorkspaceConfig
31-
from databricks.labs.ucx.framework.owners import AdministratorLocator, WorkspacePathOwnership
31+
from databricks.labs.ucx.framework.owners import AdministratorLocator, WorkspacePathOwnership, LegacyQueryOwnership
3232
from databricks.labs.ucx.hive_metastore import ExternalLocations, MountsCrawler, TablesCrawler
3333
from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema
3434
from databricks.labs.ucx.hive_metastore.grants import (
@@ -43,13 +43,13 @@
4343
PrincipalACL,
4444
)
4545
from databricks.labs.ucx.hive_metastore.mapping import TableMapping
46-
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationOwnership
46+
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
47+
from databricks.labs.ucx.hive_metastore.ownership import TableMigrationOwnership, TableOwnership
4748
from databricks.labs.ucx.hive_metastore.table_migrate import (
4849
TableMigrationStatusRefresher,
4950
TablesMigrator,
5051
)
5152
from databricks.labs.ucx.hive_metastore.table_move import TableMove
52-
from databricks.labs.ucx.hive_metastore.tables import TableOwnership
5353
from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler, UdfOwnership
5454
from databricks.labs.ucx.hive_metastore.verification import VerifyHasCatalog, VerifyHasMetastore
5555
from databricks.labs.ucx.installer.workflows import DeployedWorkflows
@@ -263,17 +263,29 @@ def tables_crawler(self) -> TablesCrawler:
263263

264264
@cached_property
265265
def table_ownership(self) -> TableOwnership:
266-
return TableOwnership(self.administrator_locator)
266+
return TableOwnership(
267+
self.administrator_locator,
268+
self.grants_crawler,
269+
self.used_tables_crawler_for_paths,
270+
self.used_tables_crawler_for_queries,
271+
self.legacy_query_ownership,
272+
self.workspace_path_ownership,
273+
)
267274

268275
@cached_property
269276
def workspace_path_ownership(self) -> WorkspacePathOwnership:
270277
return WorkspacePathOwnership(self.administrator_locator, self.workspace_client)
271278

279+
@cached_property
280+
def legacy_query_ownership(self) -> LegacyQueryOwnership:
281+
return LegacyQueryOwnership(self.administrator_locator, self.workspace_client)
282+
272283
@cached_property
273284
def directfs_access_ownership(self) -> DirectFsAccessOwnership:
274285
return DirectFsAccessOwnership(
275286
self.administrator_locator,
276287
self.workspace_path_ownership,
288+
self.legacy_query_ownership,
277289
self.workspace_client,
278290
)
279291

src/databricks/labs/ucx/framework/owners.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,9 @@ def __init__(self, administrator_locator: AdministratorLocator, ws: WorkspaceCli
201201
super().__init__(administrator_locator)
202202
self._ws = ws
203203

204+
def owner_of_path(self, path: str) -> str:
205+
return self.owner_of(WorkspacePath(self._ws, path))
206+
204207
@retried(on=[InternalError], timeout=timedelta(minutes=1))
205208
def _maybe_direct_owner(self, record: WorkspacePath) -> str | None:
206209
maybe_type_and_id = self._maybe_type_and_id(record)
@@ -237,3 +240,18 @@ def _infer_from_first_can_manage(object_permissions):
237240
return acl.group_name
238241
return acl.service_principal_name
239242
return None
243+
244+
245+
class LegacyQueryOwnership(Ownership[str]):
246+
def __init__(self, administrator_locator: AdministratorLocator, workspace_client: WorkspaceClient) -> None:
247+
super().__init__(administrator_locator)
248+
self._workspace_client = workspace_client
249+
250+
def _maybe_direct_owner(self, record: str) -> str | None:
251+
try:
252+
legacy_query = self._workspace_client.queries.get(record)
253+
return legacy_query.owner_user_name
254+
except NotFound:
255+
return None
256+
except InternalError: # redash is very naughty and throws 500s instead of proper 404s
257+
return None
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import logging
2+
from functools import cached_property
3+
4+
from databricks.labs.ucx.framework.owners import (
5+
Ownership,
6+
AdministratorLocator,
7+
LegacyQueryOwnership,
8+
WorkspacePathOwnership,
9+
)
10+
from databricks.labs.ucx.hive_metastore import TablesCrawler
11+
from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler
12+
from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationStatus
13+
from databricks.labs.ucx.hive_metastore.tables import Table
14+
from databricks.labs.ucx.source_code.base import UsedTable
15+
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
16+
17+
logger = logging.getLogger(__name__)
18+
19+
20+
class TableOwnership(Ownership[Table]):
21+
"""Determine ownership of tables in the inventory based on the following rules:
22+
- If a table is owned by a principal in the grants table, then that principal is the owner.
23+
- If a table is written to by a query, then the owner of that query is the owner of the table.
24+
- If a table is written to by a notebook or file, then the owner of the path is the owner of the table.
25+
"""
26+
27+
def __init__(
28+
self,
29+
administrator_locator: AdministratorLocator,
30+
grants_crawler: GrantsCrawler,
31+
used_tables_in_paths: UsedTablesCrawler,
32+
used_tables_in_queries: UsedTablesCrawler,
33+
legacy_query_ownership: LegacyQueryOwnership,
34+
workspace_path_ownership: WorkspacePathOwnership,
35+
) -> None:
36+
super().__init__(administrator_locator)
37+
self._grants_crawler = grants_crawler
38+
self._used_tables_in_paths = used_tables_in_paths
39+
self._used_tables_in_queries = used_tables_in_queries
40+
self._legacy_query_ownership = legacy_query_ownership
41+
self._workspace_path_ownership = workspace_path_ownership
42+
43+
def _maybe_direct_owner(self, record: Table) -> str | None:
44+
owner = self._maybe_from_grants(record)
45+
if owner:
46+
return owner
47+
return self._maybe_from_sources(record)
48+
49+
def _maybe_from_sources(self, record: Table) -> str | None:
50+
used_table = self._used_tables_snapshot.get((record.catalog, record.database, record.name))
51+
if not used_table:
52+
return None
53+
# If something writes to a table, then it's an owner of it
54+
if not used_table.is_write:
55+
return None
56+
if used_table.source_type == 'QUERY' and used_table.query_id:
57+
return self._legacy_query_ownership.owner_of(used_table.query_id)
58+
if used_table.source_type in {'NOTEBOOK', 'FILE'}:
59+
return self._workspace_path_ownership.owner_of_path(used_table.source_id)
60+
logger.warning(f"Unknown source type {used_table.source_type} for {used_table.source_id}")
61+
return None
62+
63+
@cached_property
64+
def _used_tables_snapshot(self) -> dict[tuple[str, str, str], UsedTable]:
65+
index = {}
66+
for collection in (self._used_tables_in_paths.snapshot(), self._used_tables_in_queries.snapshot()):
67+
for used_table in collection:
68+
key = used_table.catalog_name, used_table.schema_name, used_table.table_name
69+
index[key] = used_table
70+
return index
71+
72+
def _maybe_from_grants(self, record: Table) -> str | None:
73+
for grant in self._grants_snapshot:
74+
if not grant.action_type == 'OWN':
75+
continue
76+
object_type, full_name = grant.this_type_and_key()
77+
if object_type == 'TABLE' and full_name == record.key:
78+
return grant.principal
79+
if object_type in {'DATABASE', 'SCHEMA'} and full_name == f"{record.catalog}.{record.database}":
80+
return grant.principal
81+
return None
82+
83+
@cached_property
84+
def _grants_snapshot(self):
85+
return self._grants_crawler.snapshot()
86+
87+
88+
class TableMigrationOwnership(Ownership[TableMigrationStatus]):
89+
"""Determine ownership of table migration records in the inventory.
90+
91+
This is the owner of the source table, if (and only if) the source table is present in the inventory.
92+
"""
93+
94+
def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None:
95+
super().__init__(table_ownership._administrator_locator) # TODO: Fix this
96+
self._tables_crawler = tables_crawler
97+
self._table_ownership = table_ownership
98+
self._indexed_tables: dict[tuple[str, str], Table] | None = None
99+
100+
def _tables_snapshot_index(self, reindex: bool = False) -> dict[tuple[str, str], Table]:
101+
index = self._indexed_tables
102+
if index is None or reindex:
103+
snapshot = self._tables_crawler.snapshot()
104+
index = {(table.database, table.name): table for table in snapshot}
105+
self._indexed_tables = index
106+
return index
107+
108+
def _maybe_direct_owner(self, record: TableMigrationStatus) -> str | None:
109+
index = self._tables_snapshot_index()
110+
source_table = index.get((record.src_schema, record.src_table), None)
111+
return self._table_ownership.owner_of(source_table) if source_table is not None else None

src/databricks/labs/ucx/hive_metastore/table_migration_status.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,8 @@
99
from databricks.sdk.errors import NotFound
1010

1111
from databricks.labs.ucx.framework.crawlers import CrawlerBase
12-
from databricks.labs.ucx.framework.owners import Ownership
1312
from databricks.labs.ucx.framework.utils import escape_sql_identifier
1413
from databricks.labs.ucx.hive_metastore import TablesCrawler
15-
from databricks.labs.ucx.hive_metastore.tables import Table, TableOwnership
1614

1715
logger = logging.getLogger(__name__)
1816

@@ -162,29 +160,3 @@ def _iter_schemas(self):
162160
except NotFound:
163161
logger.warning(f"Catalog {catalog.name} no longer exists. Skipping checking its migration status.")
164162
continue
165-
166-
167-
class TableMigrationOwnership(Ownership[TableMigrationStatus]):
168-
"""Determine ownership of table migration records in the inventory.
169-
170-
This is the owner of the source table, if (and only if) the source table is present in the inventory.
171-
"""
172-
173-
def __init__(self, tables_crawler: TablesCrawler, table_ownership: TableOwnership) -> None:
174-
super().__init__(table_ownership._administrator_locator)
175-
self._tables_crawler = tables_crawler
176-
self._table_ownership = table_ownership
177-
self._indexed_tables: dict[tuple[str, str], Table] | None = None
178-
179-
def _tables_snapshot_index(self, reindex: bool = False) -> dict[tuple[str, str], Table]:
180-
index = self._indexed_tables
181-
if index is None or reindex:
182-
snapshot = self._tables_crawler.snapshot()
183-
index = {(table.database, table.name): table for table in snapshot}
184-
self._indexed_tables = index
185-
return index
186-
187-
def _maybe_direct_owner(self, record: TableMigrationStatus) -> str | None:
188-
index = self._tables_snapshot_index()
189-
source_table = index.get((record.src_schema, record.src_table), None)
190-
return self._table_ownership.owner_of(source_table) if source_table is not None else None

src/databricks/labs/ucx/hive_metastore/tables.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from databricks.sdk.errors import NotFound
1717

1818
from databricks.labs.ucx.framework.crawlers import CrawlerBase
19-
from databricks.labs.ucx.framework.owners import Ownership
2019
from databricks.labs.ucx.framework.utils import escape_sql_identifier
2120

2221
logger = logging.getLogger(__name__)
@@ -660,13 +659,3 @@ def _create_describe_tasks(self, catalog: str, database: str, table_names: list[
660659
for table in table_names:
661660
tasks.append(partial(self._describe, catalog, database, table))
662661
return tasks
663-
664-
665-
class TableOwnership(Ownership[Table]):
666-
"""Determine ownership of tables in the inventory.
667-
668-
At the present we don't determine a specific owner for tables.
669-
"""
670-
671-
def _maybe_direct_owner(self, record: Table) -> None:
672-
return None

src/databricks/labs/ucx/source_code/base.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,16 @@ def source_type(self) -> str | None:
231231
last = self.source_lineage[-1]
232232
return last.object_type
233233

234+
@property
235+
def query_id(self) -> str | None:
236+
if self.source_type != 'QUERY':
237+
return None
238+
last = self.source_lineage[-1]
239+
parts = last.object_id.split('/')
240+
if len(parts) < 2:
241+
return None
242+
return parts[1]
243+
234244

235245
@dataclass
236246
class UsedTable(SourceInfo):

src/databricks/labs/ucx/source_code/directfs_access.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,12 @@
1010
from databricks.labs.lsql.backends import SqlBackend
1111
from databricks.sdk.errors import DatabricksError, NotFound
1212

13-
from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator, WorkspacePathOwnership
13+
from databricks.labs.ucx.framework.owners import (
14+
Ownership,
15+
AdministratorLocator,
16+
WorkspacePathOwnership,
17+
LegacyQueryOwnership,
18+
)
1419
from databricks.labs.ucx.framework.utils import escape_sql_identifier
1520
from databricks.labs.ucx.source_code.base import DirectFsAccess
1621

@@ -73,15 +78,17 @@ def __init__(
7378
self,
7479
administrator_locator: AdministratorLocator,
7580
workspace_path_ownership: WorkspacePathOwnership,
81+
legacy_query_ownership: LegacyQueryOwnership,
7682
workspace_client: WorkspaceClient,
7783
) -> None:
7884
super().__init__(administrator_locator)
7985
self._workspace_path_ownership = workspace_path_ownership
86+
self._legacy_query_ownership = legacy_query_ownership
8087
self._workspace_client = workspace_client
8188

8289
def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None:
83-
if record.source_type == 'QUERY':
84-
return self._query_owner(record)
90+
if record.source_type == 'QUERY' and record.query_id:
91+
return self._legacy_query_ownership.owner_of(record.query_id)
8592
if record.source_type in {'NOTEBOOK', 'FILE'}:
8693
return self._notebook_owner(record)
8794
logger.warning(f"Unknown source type {record.source_type} for {record.source_id}")
@@ -94,8 +101,3 @@ def _notebook_owner(self, record):
94101
return owner
95102
except NotFound:
96103
return None
97-
98-
def _query_owner(self, record):
99-
query_id = record.source_lineage[-1].object_id.split('/')[1]
100-
legacy_query = self._workspace_client.queries.get(query_id)
101-
return legacy_query.owner_user_name

tests/integration/hive_metastore/test_table_migrate.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,10 @@
22

33
from databricks.labs.ucx.hive_metastore import TablesCrawler
44
from databricks.labs.ucx.hive_metastore.table_migration_status import (
5-
TableMigrationOwnership,
65
TableMigrationStatus,
76
TableMigrationStatusRefresher,
87
)
9-
from databricks.labs.ucx.hive_metastore.tables import TableOwnership
8+
from databricks.labs.ucx.hive_metastore.ownership import TableMigrationOwnership
109

1110

1211
def test_table_migration_ownership(ws, runtime_ctx, inventory_schema, sql_backend) -> None:
@@ -32,7 +31,7 @@ def is_migration_record_for_table(record: TableMigrationStatus) -> bool:
3231
synthetic_record = dataclasses.replace(table_migration_record, src_table="does_not_exist")
3332

3433
# Verify for the table that the table owner and the migration status are a match.
35-
table_ownership = TableOwnership(runtime_ctx.administrator_locator)
34+
table_ownership = runtime_ctx.table_ownership
3635
table_migration_ownership = TableMigrationOwnership(tables_crawler, table_ownership)
3736
assert table_migration_ownership.owner_of(table_migration_record) == table_ownership.owner_of(table_record)
3837

tests/integration/hive_metastore/test_tables.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from databricks.sdk.retries import retried
66

77
from databricks.labs.ucx.hive_metastore import TablesCrawler
8-
from databricks.labs.ucx.hive_metastore.tables import What, TableOwnership
8+
from databricks.labs.ucx.hive_metastore.tables import What
99

1010
logger = logging.getLogger(__name__)
1111

@@ -90,7 +90,6 @@ def test_partitioned_tables(ws, sql_backend, make_schema, make_table):
9090

9191
def test_table_ownership(runtime_ctx, inventory_schema, sql_backend) -> None:
9292
"""Verify the ownership can be determined for crawled tables."""
93-
# This currently isn't very useful: we don't currently locate specific owners for tables.
9493

9594
# A table for which we'll determine the owner.
9695
table = runtime_ctx.make_table()
@@ -103,5 +102,6 @@ def test_table_ownership(runtime_ctx, inventory_schema, sql_backend) -> None:
103102
table_record = next(record for record in records if record.full_name == table.full_name)
104103

105104
# Verify ownership can be made.
106-
ownership = TableOwnership(runtime_ctx.administrator_locator)
107-
assert ownership.owner_of(table_record) == runtime_ctx.administrator_locator.get_workspace_administrator()
105+
my_user = runtime_ctx.workspace_client.current_user.me()
106+
owner = runtime_ctx.table_ownership.owner_of(table_record)
107+
assert owner == my_user.user_name

tests/unit/hive_metastore/test_table_migrate.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,12 @@
2828
from databricks.labs.ucx.hive_metastore.table_migration_status import (
2929
TableMigrationStatusRefresher,
3030
TableMigrationIndex,
31-
TableMigrationOwnership,
3231
TableMigrationStatus,
3332
TableView,
3433
)
34+
from databricks.labs.ucx.hive_metastore.ownership import TableMigrationOwnership, TableOwnership
3535
from databricks.labs.ucx.hive_metastore.tables import (
3636
Table,
37-
TableOwnership,
3837
TablesCrawler,
3938
What,
4039
)

0 commit comments

Comments
 (0)