From 15dd48dae56740312e5a6fda29a22672246262c8 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 11:16:13 +0200 Subject: [PATCH 01/72] Introduce an optional history log, where crawler snapshots are journalled. Work in progress. --- src/databricks/labs/ucx/framework/crawlers.py | 25 ++- src/databricks/labs/ucx/framework/history.py | 150 ++++++++++++++++++ .../labs/ucx/source_code/directfs_access.py | 5 +- src/databricks/labs/ucx/source_code/jobs.py | 3 +- .../labs/ucx/source_code/queries.py | 2 +- .../unit/source_code/test_directfs_access.py | 11 +- 6 files changed, 184 insertions(+), 12 deletions(-) create mode 100644 src/databricks/labs/ucx/framework/history.py diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index 48d774d403..9493fcba96 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -1,3 +1,5 @@ +from __future__ import annotations +import datetime as dt import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence @@ -6,6 +8,7 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import NotFound +from databricks.labs.ucx.framework.history import HistoryLog from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -21,13 +24,22 @@ class DataclassInstance(Protocol): class CrawlerBase(ABC, Generic[Result]): - def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]): + def __init__( + self, + backend: SqlBackend, + catalog: str, + schema: str, + table: str, + klass: type[Result], + history_log: HistoryLog | None = None, + ): """ Initializes a CrawlerBase instance. Args: backend (SqlBackend): The backend that executes SQL queries: Statement Execution API or Databricks Runtime. + history_log: The (optional) history log where (new) snapshots should be saved. catalog (str): The catalog name for the inventory persistence. schema: The schema name for the inventory persistence. table: The table name for the inventory persistence. 
@@ -36,6 +48,7 @@ def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, k self._schema = self._valid(schema) self._table = self._valid(table) self._backend = backend + self._history_log = history_log self._fetch = backend.fetch self._exec = backend.execute self._klass = klass @@ -155,10 +168,16 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn, *, force_refresh: bool) except NotFound as e: logger.debug("Inventory table not found", exc_info=e) logger.debug(f"[{self.full_name}] crawling new set of snapshot data for {self._table}") + crawl_start_time = dt.datetime.now(tz=dt.timezone.utc) loaded_records = list(loader()) - self._update_snapshot(loaded_records, mode="overwrite") + self._update_snapshot(loaded_records, crawl_start_time=crawl_start_time, mode="overwrite") return loaded_records - def _update_snapshot(self, items: Sequence[Result], mode: Literal["append", "overwrite"] = "append") -> None: + def _update_snapshot( + self, items: Sequence[Result], *, crawl_start_time: dt.datetime, mode: Literal["append", "overwrite"] + ) -> None: logger.debug(f"[{self.full_name}] found {len(items)} new records for {self._table}") self._backend.save_table(self.full_name, items, self._klass, mode=mode) + if self._history_log: + appender = self._history_log.appender(self._klass) + appender.append_snapshot(items, run_start_time=crawl_start_time) diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py new file mode 100644 index 0000000000..06854689b8 --- /dev/null +++ b/src/databricks/labs/ucx/framework/history.py @@ -0,0 +1,150 @@ +from __future__ import annotations +import dataclasses +import datetime as dt +import json +import logging +import uuid +from collections.abc import Callable, Sequence +from dataclasses import dataclass +from functools import cached_property +from typing import ClassVar, Protocol, TypeVar + +from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient + + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True, kw_only=True) +class HistoricalRecord: + workspace_id: int + """The identifier of the workspace where this record was generated.""" + + run_id: str + """An identifier of the workflow run that generated this record.""" + + snapshot_id: str + """An identifier that is unique to the records produced for a given snapshot.""" + + run_start_time: dt.datetime + """When this record was generated.""" + + object_type: str + """The inventory table for which this record was generated.""" + + object_type_version: int + """Versioning of inventory table, for forward compatibility.""" + + object_id: str + """The type-specific identifier for this inventory record.""" + + object_data: str + """Type-specific JSON-encoded data of the inventory record.""" + + failures: list[str] + """The list of problems associated with the object that this inventory record covers.""" + + owner: str + """The identity of the account that created this inventory record.""" + + +class DataclassInstance(Protocol): + __dataclass_fields__: ClassVar[dict] + + +Record = TypeVar("Record", bound=DataclassInstance) + + +class HistoryLog: + __slots__ = ("_ws", "_backend", "_run_id", "_catalog", "_schema", "_table") + + def __init__( + self, + ws: WorkspaceClient, + backend: SqlBackend, + run_id: str, + catalog: str, + schema: str, + table: str, + ) -> None: + self._ws = ws + self._backend = backend + self._run_id = run_id + self._catalog = catalog + self._schema = schema + self._table = table + + 
@property
+    def full_name(self) -> str:
+        return f"{self._catalog}.{self._schema}.{self._table}"
+
+    def _append_history_snapshot(self, object_type: str, snapshot: list[HistoricalRecord]) -> None:
+        logger.debug(f"[{self.full_name}] appending {len(snapshot)} new records for {object_type}")
+        # Concurrent writes do not need to be handled here; appends cannot conflict.
+        # TODO: Although documented as conflict-free, verify that this truly is the case.
+        self._backend.save_table(self.full_name, snapshot, HistoricalRecord, mode="append")
+
+    class Appender:
+        __slots__ = ("_ws", "_object_type", "_object_type_version", "_key_from", "_run_id", "_persist")
+
+        def __init__(
+            self,
+            ws: WorkspaceClient,
+            run_id: str,
+            klass: type[Record],
+            key_from: Callable[[Record], str],
+            persist: Callable[[str, list[HistoricalRecord]], None],
+        ) -> None:
+            self._ws = ws
+            self._run_id = run_id
+            self._object_type = klass.__name__
+            # Versioning support: if the dataclass has a _ucx_version class attribute, that is the current version.
+            self._object_type_version = getattr(klass, "_ucx_version") if hasattr(klass, "_ucx_version") else 0
+            self._key_from = key_from
+            self._persist = persist
+
+        @cached_property
+        def _workspace_id(self) -> int:
+            return self._ws.get_workspace_id()
+
+        @cached_property
+        def _owner(self) -> str:
+            current_user = self._ws.current_user.me()
+            owner = current_user.user_name or current_user.id
+            assert owner
+            return owner
+
+        def append_snapshot(self, records: Sequence[Record], *, run_start_time: dt.datetime) -> None:
+            snapshot_id = uuid.uuid4()
+            historical_records = [
+                self._inventory_record_to_historical(record, snapshot_id=snapshot_id, run_start_time=run_start_time)
+                for record in records
+            ]
+            self._persist(self._object_type, historical_records)
+
+        def _inventory_record_to_historical(
+            self, record: Record, *, snapshot_id: uuid.UUID, run_start_time: dt.datetime
+        ) -> HistoricalRecord:
+            object_id = self._key_from(record)
+            object_as_dict = dataclasses.asdict(record)
+            object_as_json = json.dumps(object_as_dict)
+            # TODO: Get failures.
+            failures: list[str] = []
+            return HistoricalRecord(
+                workspace_id=self._workspace_id,
+                run_id=self._run_id,
+                snapshot_id=str(snapshot_id),
+                run_start_time=run_start_time,
+                object_type=self._object_type,
+                object_type_version=self._object_type_version,
+                object_id=object_id,
+                object_data=object_as_json,
+                failures=failures,
+                owner=self._owner,
+            )
+
+    def appender(self, klass: type[Record]) -> Appender:
+        # TODO: Make a part of the protocol so the type-checker can enforce this.
+ key_from = getattr(klass, "key_fields") + return self.Appender(self._ws, self._run_id, klass, key_from, self._append_history_snapshot) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 9ad65d5d0b..de1e712cec 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -1,6 +1,7 @@ from __future__ import annotations import dataclasses +import datetime as dt import logging import sys from collections.abc import Sequence, Iterable @@ -97,14 +98,14 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): """ super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess) - def dump_all(self, dfsas: Sequence[DirectFsAccess]): + def dump_all(self, dfsas: Sequence[DirectFsAccess], crawl_start_time: dt.datetime): """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one It's not **bad** because all records are pushed at once. Providing a multi-entity crawler is out-of-scope of this PR """ try: # TODO until we historize data, we append all DFSAs - self._update_snapshot(dfsas, mode="append") + self._update_snapshot(dfsas, crawl_start_time=crawl_start_time, mode="append") except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index b5a4e47991..e25bb1f710 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -352,6 +352,7 @@ def __init__( self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): + crawl_start_time = datetime.now(tz=timezone.utc) tasks = [] all_jobs = list(self._ws.jobs.list()) logger.info(f"Preparing {len(all_jobs)} linting tasks...") @@ -374,7 +375,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): JobProblem, mode='overwrite', ) - self._directfs_crawler.dump_all(job_dfsas) + self._directfs_crawler.dump_all(job_dfsas, crawl_start_time=crawl_start_time) if len(errors) > 0: raise ManyError(errors) diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index 79b40caa0a..d862455fe8 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -84,7 +84,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): ) for dfsa in all_dfsas ] - self._directfs_crawler.dump_all(all_dfsas) + self._directfs_crawler.dump_all(all_dfsas, crawl_start_time=assessment_start) def _dashboard_ids_in_scope(self) -> list[str]: if self._include_dashboard_ids is not None: # an empty list is accepted diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 75961390ae..24fe01763e 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -1,4 +1,4 @@ -from datetime import datetime +import datetime as dt from databricks.labs.lsql.backends import MockBackend @@ -12,19 +12,20 @@ def test_crawler_appends_dfsas(): backend = MockBackend() crawler = DirectFsAccessCrawler.for_paths(backend, "schema") + now = dt.datetime.now(tz=dt.timezone.utc) dfsas = list( DirectFsAccess( path=path, is_read=False, is_write=False, source_id="ID", - 
source_timestamp=datetime.now(), + source_timestamp=now, source_lineage=[LineageAtom(object_type="LINEAGE", object_id="ID")], - assessment_start_timestamp=datetime.now(), - assessment_end_timestamp=datetime.now(), + assessment_start_timestamp=now, + assessment_end_timestamp=now, ) for path in ("a", "b", "c") ) - crawler.dump_all(dfsas) + crawler.dump_all(dfsas, now) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 From 622792e3ca2566e15ce990cad5604db2a5e0bbad Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 14:16:22 +0200 Subject: [PATCH 02/72] Switch to integer identifiers for run_id and snapshot_id. --- src/databricks/labs/ucx/framework/history.py | 23 ++++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py index 06854689b8..a3a4775a82 100644 --- a/src/databricks/labs/ucx/framework/history.py +++ b/src/databricks/labs/ucx/framework/history.py @@ -3,7 +3,7 @@ import datetime as dt import json import logging -import uuid +import os from collections.abc import Callable, Sequence from dataclasses import dataclass from functools import cached_property @@ -21,10 +21,10 @@ class HistoricalRecord: workspace_id: int """The identifier of the workspace where this record was generated.""" - run_id: str - """An identifier of the workflow run that generated this record.""" + run_id: int + """The identifier of the workflow run that generated this record.""" - snapshot_id: str + snapshot_id: int """An identifier that is unique to the records produced for a given snapshot.""" run_start_time: dt.datetime @@ -63,7 +63,7 @@ def __init__( self, ws: WorkspaceClient, backend: SqlBackend, - run_id: str, + run_id: int, catalog: str, schema: str, table: str, @@ -91,7 +91,7 @@ class Appender: def __init__( self, ws: WorkspaceClient, - run_id: str, + run_id: int, klass: type[Record], key_from: Callable[[Record], str], persist: Callable[[str, list[HistoricalRecord]], None], @@ -116,7 +116,8 @@ def _owner(self) -> str: return owner def append_snapshot(self, records: Sequence[Record], *, run_start_time: dt.datetime) -> None: - snapshot_id = uuid.uuid4() + # Equivalent entropy to a type-4 UUID. + snapshot_id = int.from_bytes(os.urandom(16), byteorder="big") historical_records = [ self._inventory_record_to_historical(record, snapshot_id=snapshot_id, run_start_time=run_start_time) for record in records @@ -124,7 +125,11 @@ def append_snapshot(self, records: Sequence[Record], *, run_start_time: dt.datet self._persist(self._object_type, historical_records) def _inventory_record_to_historical( - self, record: Record, *, snapshot_id: uuid.UUID, run_start_time: dt.datetime + self, + record: Record, + *, + snapshot_id: int, + run_start_time: dt.datetime, ) -> HistoricalRecord: object_id = self._key_from(record) object_as_dict = dataclasses.asdict(record) @@ -134,7 +139,7 @@ def _inventory_record_to_historical( return HistoricalRecord( workspace_id=self._workspace_id, run_id=self._run_id, - snapshot_id=str(snapshot_id), + snapshot_id=snapshot_id, run_start_time=run_start_time, object_type=self._object_type, object_type_version=self._object_type_version, From 4ebf9b681f282d3cf544f3bae9c9434d487b24b4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 16:13:15 +0200 Subject: [PATCH 03/72] Update to store object data with first-level attributes exposed as a map. 
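To make the new representation concrete: instead of serialising the whole record as a single
JSON document, each first-level attribute is JSON-encoded on its own. A minimal, self-contained
sketch of the idea (the ExampleRecord type is hypothetical, purely for illustration):

    import dataclasses
    import json

    @dataclasses.dataclass
    class ExampleRecord:  # hypothetical inventory record, for illustration only
        name: str
        count: int
        tags: list[str]

    def flatten(record: ExampleRecord) -> dict[str, str]:
        # Keys are the top-level attributes; values are their JSON-encoded values.
        return {k: json.dumps(v) for k, v in dataclasses.asdict(record).items()}

    assert flatten(ExampleRecord("a", 1, ["x"])) == {"name": '"a"', "count": "1", "tags": '["x"]'}

Exposing the attributes as a map means a single attribute can be addressed directly
(e.g. object_data['count']) without first parsing a nested JSON document.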
---
 src/databricks/labs/ucx/framework/history.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py
index a3a4775a82..2666dd92b2 100644
--- a/src/databricks/labs/ucx/framework/history.py
+++ b/src/databricks/labs/ucx/framework/history.py
@@ -39,8 +39,8 @@ class HistoricalRecord:
     object_id: str
     """The type-specific identifier for this inventory record."""
 
-    object_data: str
-    """Type-specific JSON-encoded data of the inventory record."""
+    object_data: dict[str,str]
+    """Type-specific data of the inventory record. Keys are top-level attributes, values are their JSON-encoded values."""
 
     failures: list[str]
     """The list of problems associated with the object that this inventory record covers."""
@@ -133,7 +133,7 @@ def _inventory_record_to_historical(
         ) -> HistoricalRecord:
             object_id = self._key_from(record)
             object_as_dict = dataclasses.asdict(record)
-            object_as_json = json.dumps(object_as_dict)
+            flattened_object_data = {k: json.dumps(v) for k, v in object_as_dict.items()}
             # TODO: Get failures.
             failures: list[str] = []
             return HistoricalRecord(

From d82e06bc9999f78089981d3b387e55ceb50e638d Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 25 Sep 2024 16:14:19 +0200
Subject: [PATCH 04/72] Use a 56-bit random number for the snapshot_id.

This field is encoded as a Spark SQL LONG, which has a (signed) range of 64 bits.
---
 src/databricks/labs/ucx/framework/history.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py
index 2666dd92b2..5670470bc0 100644
--- a/src/databricks/labs/ucx/framework/history.py
+++ b/src/databricks/labs/ucx/framework/history.py
@@ -116,8 +116,7 @@ def _owner(self) -> str:
             return owner
 
         def append_snapshot(self, records: Sequence[Record], *, run_start_time: dt.datetime) -> None:
-            # Equivalent entropy to a type-4 UUID.
-            snapshot_id = int.from_bytes(os.urandom(16), byteorder="big")
+            snapshot_id = int.from_bytes(os.urandom(7), byteorder="big")
             historical_records = [
                 self._inventory_record_to_historical(record, snapshot_id=snapshot_id, run_start_time=run_start_time)
                 for record in records

From 4d676bb8c1a729b053c42aff75e57c3938dc89b9 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 25 Sep 2024 16:17:52 +0200
Subject: [PATCH 05/72] Switch to composite object identifier.

---
 src/databricks/labs/ucx/framework/history.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py
index 5670470bc0..0fe5ff868f 100644
--- a/src/databricks/labs/ucx/framework/history.py
+++ b/src/databricks/labs/ucx/framework/history.py
@@ -36,7 +36,7 @@ class HistoricalRecord:
     object_type_version: int
     """Versioning of inventory table, for forward compatibility."""
 
-    object_id: str
+    object_id: list[str]
     """The type-specific identifier for this inventory record."""
 
     object_data: dict[str,str]
@@ -51,6 +51,8 @@ class HistoricalRecord:
 
 class DataclassInstance(Protocol):
     __dataclass_fields__: ClassVar[dict]
+    # TODO: Enable this once all record types provide this property.
+ # key_fields: ClassVar[Sequence[str]] Record = TypeVar("Record", bound=DataclassInstance) @@ -93,7 +95,7 @@ def __init__( ws: WorkspaceClient, run_id: int, klass: type[Record], - key_from: Callable[[Record], str], + key_from: Callable[[Record], list[str]], persist: Callable[[str, list[HistoricalRecord]], None], ) -> None: self._ws = ws @@ -143,12 +145,15 @@ def _inventory_record_to_historical( object_type=self._object_type, object_type_version=self._object_type_version, object_id=object_id, - object_data=object_as_json, + object_data=flattened_object_data, failures=failures, owner=self._owner, ) def appender(self, klass: type[Record]) -> Appender: - # TODO: Make a part of the protocol so the type-checker can enforce this. - key_from = getattr(klass, "key_fields") + key_fields = getattr(klass, "key_fields", ()) + + def key_from(record: Record) -> list[str]: + return [getattr(record, field) for field in key_fields] + return self.Appender(self._ws, self._run_id, klass, key_from, self._append_history_snapshot) From cf48771fd83e645851cd9ea535cae1c30fa0b0bc Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 17:29:54 +0200 Subject: [PATCH 06/72] Formatting. --- src/databricks/labs/ucx/framework/history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py index 0fe5ff868f..a9d209b82d 100644 --- a/src/databricks/labs/ucx/framework/history.py +++ b/src/databricks/labs/ucx/framework/history.py @@ -39,7 +39,7 @@ class HistoricalRecord: object_id: list[str] """The type-specific identifier for this inventory record.""" - object_data: dict[str,str] + object_data: dict[str, str] """Type-specific data of the inventory record. Keys are top-level attributes, values are their JSON-encoded values.""" failures: list[str] From e877532b89cc3fd40bc9194fe7d4b6d0962c82fb Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 17:30:06 +0200 Subject: [PATCH 07/72] Modify TODO to not trigger linter. --- src/databricks/labs/ucx/framework/history.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py index a9d209b82d..0e422aafa1 100644 --- a/src/databricks/labs/ucx/framework/history.py +++ b/src/databricks/labs/ucx/framework/history.py @@ -51,8 +51,7 @@ class HistoricalRecord: class DataclassInstance(Protocol): __dataclass_fields__: ClassVar[dict] - # TODO: Enable this once all record types provide this property. - # key_fields: ClassVar[Sequence[str]] + # TODO: Once all record types provide the property: key_fields: ClassVar[Sequence[str]] Record = TypeVar("Record", bound=DataclassInstance) From b32190329e78591185fca95c276491c033b07c88 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 25 Sep 2024 17:31:08 +0200 Subject: [PATCH 08/72] Unit tests for crawler appending new snapshots to the history. 
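A note on the mocking pattern these tests rely on: a mock produced by create_autospec
returns the same child mock on every call, regardless of arguments, so the appender the
crawler obtains in its constructor is the very object the test later asserts against via
mock_history.appender(Baz). A self-contained sketch of that behaviour (the Log class here
is illustrative only, a stand-in for HistoryLog):

    from unittest.mock import create_autospec

    class Log:  # stand-in for HistoryLog, for illustration
        def appender(self, klass: type):
            raise NotImplementedError

    mock_log = create_autospec(Log)
    first = mock_log.appender(int)   # what the code under test would receive
    second = mock_log.appender(str)  # what a test would assert against
    assert first is second           # same child mock, regardless of arguments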
--- src/databricks/labs/ucx/framework/crawlers.py | 7 ++--- tests/unit/framework/test_crawlers.py | 31 ++++++++++++++++--- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index 9493fcba96..06afd6e129 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -48,7 +48,7 @@ def __init__( self._schema = self._valid(schema) self._table = self._valid(table) self._backend = backend - self._history_log = history_log + self._history_appender = history_log.appender(klass) if history_log is not None else None self._fetch = backend.fetch self._exec = backend.execute self._klass = klass @@ -178,6 +178,5 @@ def _update_snapshot( ) -> None: logger.debug(f"[{self.full_name}] found {len(items)} new records for {self._table}") self._backend.save_table(self.full_name, items, self._klass, mode=mode) - if self._history_log: - appender = self._history_log.appender(self._klass) - appender.append_snapshot(items, run_start_time=crawl_start_time) + if self._history_appender: + self._history_appender.append_snapshot(items, run_start_time=crawl_start_time) diff --git a/tests/unit/framework/test_crawlers.py b/tests/unit/framework/test_crawlers.py index 2fa5c9bfc9..e8ae03a411 100644 --- a/tests/unit/framework/test_crawlers.py +++ b/tests/unit/framework/test_crawlers.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import dataclass -from unittest.mock import Mock +from unittest.mock import ANY, Mock, create_autospec import pytest from databricks.labs.lsql import Row @@ -8,6 +8,7 @@ from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result, ResultFn +from databricks.labs.ucx.framework.history import HistoryLog @dataclass @@ -37,11 +38,12 @@ def __init__( schema: str, table: str, klass: type[Result], + history_log: HistoryLog | None = None, *, fetcher: ResultFn = lambda: [], loader: ResultFn = lambda: [], ): - super().__init__(backend, catalog, schema, table, klass) + super().__init__(backend, catalog, schema, table, klass, history_log) self._fetcher = fetcher self._loader = loader @@ -62,6 +64,14 @@ def test_full_name(): assert cb.full_name == "a.b.c" +def test_history_appender_setup() -> None: + """Verify that initializing the crawler also initializes the appender.""" + mock_history = create_autospec(HistoryLog) + _ = _CrawlerFixture(MockBackend(), "a", "b", "c", Baz, history_log=mock_history) + + mock_history.appender.assert_called_once_with(Baz) + + def test_snapshot_crawls_when_no_prior_crawl() -> None: """Check that the crawler is invoked when the fetcher reports that the inventory doesn't exist.""" mock_backend = MockBackend() @@ -132,7 +142,7 @@ def test_snapshot_force_refresh_replaces_prior_data() -> None: assert [Row(first="second", second=None)] == mock_backend.rows_written_for("a.b.c", mode="overwrite") -def test_snapshot_updates_existing_table() -> None: +def test_snapshot_updates_existing_inventory_table() -> None: mock_backend = MockBackend() cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) @@ -142,7 +152,7 @@ def test_snapshot_updates_existing_table() -> None: assert [Row(first="first", second=None)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_updates_new_table() -> None: +def test_snapshot_updates_new_inventory_table() -> None: mock_backend = MockBackend() def fetcher(): @@ -159,6 
+169,19 @@ def fetcher():
     assert [Row(first="first", second=True)] == mock_backend.rows_written_for("a.b.c", "overwrite")
 
 
+def test_snapshot_appends_to_history() -> None:
+    """Verify that when a snapshot is saved, it's also persisted in the history."""
+    mock_history = create_autospec(HistoryLog)
+    cb = _CrawlerFixture[Baz](
+        MockBackend(), "a", "b", "c", Baz, loader=lambda: [Baz(first="first")], history_log=mock_history
+    )
+
+    result = cb.snapshot()
+
+    assert [Baz(first="first")] == result
+    mock_history.appender(Baz).append_snapshot.assert_called_once_with([Baz(first="first")], run_start_time=ANY)
+
+
 def test_snapshot_wrong_error() -> None:
     sql_backend = MockBackend()
 
From 6170a638f00ddaf8fdd6a7d865e8248aa4954bcd Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 30 Sep 2024 16:34:25 +0200
Subject: [PATCH 09/72] Ensure call-by-keyword, and indicate the return type.

---
 src/databricks/labs/ucx/source_code/directfs_access.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py
index de1e712cec..d39d5b6109 100644
--- a/src/databricks/labs/ucx/source_code/directfs_access.py
+++ b/src/databricks/labs/ucx/source_code/directfs_access.py
@@ -98,7 +98,7 @@ def __init__(self, backend: SqlBackend, schema: str, table: str):
         """
         super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess)
 
-    def dump_all(self, dfsas: Sequence[DirectFsAccess], crawl_start_time: dt.datetime):
+    def dump_all(self, dfsas: Sequence[DirectFsAccess], *, crawl_start_time: dt.datetime) -> None:
         """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one
         It's not **bad** because all records are pushed at once.
         Providing a multi-entity crawler is out-of-scope of this PR
         """
         try:
             # TODO until we historize data, we append all DFSAs

From 4b55625dc4e7e3d92d04b6a9eb1ebba936bce208 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 30 Sep 2024 16:46:24 +0200
Subject: [PATCH 10/72] Fix unit test.

---
 tests/unit/source_code/test_directfs_access.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py
index 24fe01763e..3d332b5093 100644
--- a/tests/unit/source_code/test_directfs_access.py
+++ b/tests/unit/source_code/test_directfs_access.py
@@ -26,6 +26,6 @@ def test_crawler_appends_dfsas():
         )
         for path in ("a", "b", "c")
     )
-    crawler.dump_all(dfsas, now)
+    crawler.dump_all(dfsas, crawl_start_time=now)
     rows = backend.rows_written_for(crawler.full_name, "append")
     assert len(rows) == 3

From 260becf8bae6bdc59d211fa93e95d7cae85b560b Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 9 Oct 2024 18:43:03 +0200
Subject: [PATCH 11/72] Back out changes to the crawler that relate to the history.

The history will be maintained adjacent to the crawler framework.
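In other words, callers are expected to combine the two pieces explicitly rather than have
the crawler write history as a side effect — roughly the following wiring (a sketch only;
append_inventory_snapshot is the entry point introduced in a later commit):

    records = crawler.snapshot(force_refresh=True)   # the crawler no longer knows about the history log
    history_log.append_inventory_snapshot(records)   # the history is maintained alongside it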
--- src/databricks/labs/ucx/framework/crawlers.py | 24 ++------------ .../labs/ucx/source_code/directfs_access.py | 5 ++- src/databricks/labs/ucx/source_code/jobs.py | 5 ++- .../labs/ucx/source_code/queries.py | 4 +-- .../labs/ucx/source_code/used_table.py | 5 ++- .../integration/assessment/test_dashboards.py | 18 +++++------ tests/unit/framework/test_crawlers.py | 31 +++---------------- .../unit/source_code/test_directfs_access.py | 2 +- tests/unit/source_code/test_used_table.py | 2 +- 9 files changed, 25 insertions(+), 71 deletions(-) diff --git a/src/databricks/labs/ucx/framework/crawlers.py b/src/databricks/labs/ucx/framework/crawlers.py index 95d6b5419d..034e738df1 100644 --- a/src/databricks/labs/ucx/framework/crawlers.py +++ b/src/databricks/labs/ucx/framework/crawlers.py @@ -1,5 +1,3 @@ -from __future__ import annotations -import datetime as dt import logging from abc import ABC, abstractmethod from collections.abc import Callable, Iterable, Sequence @@ -8,7 +6,6 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk.errors import NotFound -from databricks.labs.ucx.framework.history import HistoryLog from databricks.labs.ucx.framework.utils import escape_sql_identifier logger = logging.getLogger(__name__) @@ -24,22 +21,13 @@ class DataclassInstance(Protocol): class CrawlerBase(ABC, Generic[Result]): - def __init__( - self, - backend: SqlBackend, - catalog: str, - schema: str, - table: str, - klass: type[Result], - history_log: HistoryLog | None = None, - ) -> None: + def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]) -> None: """ Initializes a CrawlerBase instance. Args: backend (SqlBackend): The backend that executes SQL queries: Statement Execution API or Databricks Runtime. - history_log: The (optional) history log where (new) snapshots should be saved. catalog (str): The catalog name for the inventory persistence. schema: The schema name for the inventory persistence. table: The table name for the inventory persistence. 
@@ -48,7 +36,6 @@ def __init__( self._schema = self._valid(schema) self._table = self._valid(table) self._backend = backend - self._history_appender = history_log.appender(klass) if history_log is not None else None self._fetch = backend.fetch self._exec = backend.execute self._klass = klass @@ -168,15 +155,10 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn, *, force_refresh: bool) except NotFound as e: logger.debug("Inventory table not found", exc_info=e) logger.debug(f"[{self.full_name}] crawling new set of snapshot data for {self._table}") - crawl_start_time = dt.datetime.now(tz=dt.timezone.utc) loaded_records = list(loader()) - self._update_snapshot(loaded_records, crawl_start_time=crawl_start_time, mode="overwrite") + self._update_snapshot(loaded_records, mode="overwrite") return loaded_records - def _update_snapshot( - self, items: Sequence[Result], *, crawl_start_time: dt.datetime, mode: Literal["append", "overwrite"] - ) -> None: + def _update_snapshot(self, items: Sequence[Result], *, mode: Literal["append", "overwrite"]) -> None: logger.debug(f"[{self.full_name}] found {len(items)} new records for {self._table}") self._backend.save_table(self.full_name, items, self._klass, mode=mode) - if self._history_appender: - self._history_appender.append_snapshot(items, run_start_time=crawl_start_time) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index 64b6be1f62..9360b721de 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -1,6 +1,5 @@ from __future__ import annotations -import datetime as dt import logging from collections.abc import Sequence, Iterable @@ -35,14 +34,14 @@ def __init__(self, backend: SqlBackend, schema: str, table: str): """ super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess) - def dump_all(self, dfsas: Sequence[DirectFsAccess], *, crawl_start_time: dt.datetime) -> None: + def dump_all(self, dfsas: Sequence[DirectFsAccess]) -> None: """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one It's not **bad** because all records are pushed at once. 
Providing a multi-entity crawler is out-of-scope of this PR """ try: # TODO until we historize data, we append all DFSAs - self._update_snapshot(dfsas, crawl_start_time=crawl_start_time, mode="append") + self._update_snapshot(dfsas, mode="append") except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 4ca25577b7..7f511f5f52 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -365,7 +365,6 @@ def __init__( self._include_job_ids = include_job_ids def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): - crawl_start_time = datetime.now(tz=timezone.utc) tasks = [] all_jobs = list(self._ws.jobs.list()) logger.info(f"Preparing {len(all_jobs)} linting tasks...") @@ -390,8 +389,8 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str): JobProblem, mode='overwrite', ) - self._directfs_crawler.dump_all(job_dfsas, crawl_start_time=crawl_start_time) - self._used_tables_crawler.dump_all(job_tables, crawl_start_time=crawl_start_time) + self._directfs_crawler.dump_all(job_dfsas) + self._used_tables_crawler.dump_all(job_tables) if len(errors) > 0: raise ManyError(errors) diff --git a/src/databricks/labs/ucx/source_code/queries.py b/src/databricks/labs/ucx/source_code/queries.py index f694e4875c..3b71f4bea4 100644 --- a/src/databricks/labs/ucx/source_code/queries.py +++ b/src/databricks/labs/ucx/source_code/queries.py @@ -85,7 +85,7 @@ def _dump_dfsas(self, context: _ReportingContext, assessment_start: datetime, as assessment_end_timestamp=assessment_end, ) processed_dfsas.append(dfsa) - self._directfs_crawler.dump_all(processed_dfsas, crawl_start_time=assessment_start) + self._directfs_crawler.dump_all(processed_dfsas) def _dump_used_tables(self, context: _ReportingContext, assessment_start: datetime, assessment_end: datetime): processed_tables = [] @@ -96,7 +96,7 @@ def _dump_used_tables(self, context: _ReportingContext, assessment_start: dateti assessment_end_timestamp=assessment_end, ) processed_tables.append(table) - self._used_tables_crawler.dump_all(processed_tables, crawl_start_time=assessment_start) + self._used_tables_crawler.dump_all(processed_tables) def _lint_dashboards(self, context: _ReportingContext): dashboard_ids = self._dashboard_ids_in_scope() diff --git a/src/databricks/labs/ucx/source_code/used_table.py b/src/databricks/labs/ucx/source_code/used_table.py index 7251a5096c..5b45a96864 100644 --- a/src/databricks/labs/ucx/source_code/used_table.py +++ b/src/databricks/labs/ucx/source_code/used_table.py @@ -1,6 +1,5 @@ from __future__ import annotations -import datetime as dt import logging from collections.abc import Sequence, Iterable @@ -34,14 +33,14 @@ def for_paths(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: def for_queries(cls, backend: SqlBackend, schema: str) -> UsedTablesCrawler: return UsedTablesCrawler(backend, schema, "used_tables_in_queries") - def dump_all(self, tables: Sequence[UsedTable], *, crawl_start_time: dt.datetime) -> None: + def dump_all(self, tables: Sequence[UsedTable]) -> None: """This crawler doesn't follow the pull model because the fetcher fetches data for 3 crawlers, not just one It's not **bad** because all records are pushed at once. 
Providing a multi-entity crawler is out-of-scope of this PR """ try: # TODO until we historize data, we append all TableInfos - self._update_snapshot(tables, mode="append", crawl_start_time=crawl_start_time) + self._update_snapshot(tables, mode="append") except DatabricksError as e: logger.error("Failed to store DFSAs", exc_info=e) diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index 91353766fe..ba84425eb3 100644 --- a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -55,7 +55,6 @@ def _populate_dashboard_problems(installation_ctx): def _populate_directfs_problems(installation_ctx): - assessment_start_timestamp = datetime.now(timezone.utc) - timedelta(minutes=5.0) dfsas = [ DirectFsAccess( path="some_path", @@ -69,11 +68,11 @@ def _populate_directfs_problems(installation_ctx): LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), LineageAtom(object_type="FILE", object_id="my file_path"), ], - assessment_start_timestamp=assessment_start_timestamp, + assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] - installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas, crawl_start_time=assessment_start_timestamp) + installation_ctx.directfs_access_crawler_for_paths.dump_all(dfsas) dfsas = [ DirectFsAccess( path="some_path", @@ -85,15 +84,14 @@ def _populate_directfs_problems(installation_ctx): LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}), ], - assessment_start_timestamp=assessment_start_timestamp, + assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] - installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas, crawl_start_time=assessment_start_timestamp) + installation_ctx.directfs_access_crawler_for_queries.dump_all(dfsas) def _populate_used_tables(installation_ctx): - assessment_start_timestamp = datetime.now(timezone.utc) - timedelta(minutes=5.0) tables = [ UsedTable( catalog_name="hive_metastore", @@ -109,11 +107,11 @@ def _populate_used_tables(installation_ctx): LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), LineageAtom(object_type="FILE", object_id="my file_path"), ], - assessment_start_timestamp=assessment_start_timestamp, + assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] - installation_ctx.used_tables_crawler_for_paths.dump_all(tables, crawl_start_time=assessment_start_timestamp) + installation_ctx.used_tables_crawler_for_paths.dump_all(tables) tables = [ UsedTable( catalog_name="hive_metastore", @@ -127,11 +125,11 @@ def _populate_used_tables(installation_ctx): LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}), ], - assessment_start_timestamp=assessment_start_timestamp, + assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] - 
installation_ctx.used_tables_crawler_for_queries.dump_all(tables, crawl_start_time=assessment_start_timestamp) + installation_ctx.used_tables_crawler_for_queries.dump_all(tables) @pytest.mark.skip("Development tool") diff --git a/tests/unit/framework/test_crawlers.py b/tests/unit/framework/test_crawlers.py index e8ae03a411..2fa5c9bfc9 100644 --- a/tests/unit/framework/test_crawlers.py +++ b/tests/unit/framework/test_crawlers.py @@ -1,6 +1,6 @@ from collections.abc import Iterable from dataclasses import dataclass -from unittest.mock import ANY, Mock, create_autospec +from unittest.mock import Mock import pytest from databricks.labs.lsql import Row @@ -8,7 +8,6 @@ from databricks.sdk.errors import NotFound from databricks.labs.ucx.framework.crawlers import CrawlerBase, Result, ResultFn -from databricks.labs.ucx.framework.history import HistoryLog @dataclass @@ -38,12 +37,11 @@ def __init__( schema: str, table: str, klass: type[Result], - history_log: HistoryLog | None = None, *, fetcher: ResultFn = lambda: [], loader: ResultFn = lambda: [], ): - super().__init__(backend, catalog, schema, table, klass, history_log) + super().__init__(backend, catalog, schema, table, klass) self._fetcher = fetcher self._loader = loader @@ -64,14 +62,6 @@ def test_full_name(): assert cb.full_name == "a.b.c" -def test_history_appender_setup() -> None: - """Verify that initializing the crawler also initializes the appender.""" - mock_history = create_autospec(HistoryLog) - _ = _CrawlerFixture(MockBackend(), "a", "b", "c", Baz, history_log=mock_history) - - mock_history.appender.assert_called_once_with(Baz) - - def test_snapshot_crawls_when_no_prior_crawl() -> None: """Check that the crawler is invoked when the fetcher reports that the inventory doesn't exist.""" mock_backend = MockBackend() @@ -142,7 +132,7 @@ def test_snapshot_force_refresh_replaces_prior_data() -> None: assert [Row(first="second", second=None)] == mock_backend.rows_written_for("a.b.c", mode="overwrite") -def test_snapshot_updates_existing_inventory_table() -> None: +def test_snapshot_updates_existing_table() -> None: mock_backend = MockBackend() cb = _CrawlerFixture[Baz](mock_backend, "a", "b", "c", Baz, loader=lambda: [Baz(first="first")]) @@ -152,7 +142,7 @@ def test_snapshot_updates_existing_inventory_table() -> None: assert [Row(first="first", second=None)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_updates_new_inventory_table() -> None: +def test_snapshot_updates_new_table() -> None: mock_backend = MockBackend() def fetcher(): @@ -169,19 +159,6 @@ def fetcher(): assert [Row(first="first", second=True)] == mock_backend.rows_written_for("a.b.c", "overwrite") -def test_snapshot_appends_to_history() -> None: - """Verify that when a snapshot is saved, it's also persisted in the history.""" - mock_history = create_autospec(HistoryLog) - cb = _CrawlerFixture[Baz]( - MockBackend(), "a", "b", "c", Baz, loader=lambda: [Baz(first="first")], history_log=mock_history - ) - - result = cb.snapshot() - - assert [Baz(first="first")] == result - mock_history.appender(Baz).append_snapshot.assert_called_once_with([Baz(first="first")], run_start_time=ANY) - - def test_snapshot_wrong_error() -> None: sql_backend = MockBackend() diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index efd595971e..3bd3e35945 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -31,7 +31,7 @@ def test_crawler_appends_dfsas(): ) 
for path in ("a", "b", "c") ) - crawler.dump_all(dfsas, crawl_start_time=now) + crawler.dump_all(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 diff --git a/tests/unit/source_code/test_used_table.py b/tests/unit/source_code/test_used_table.py index 822681a219..0b090c05bb 100644 --- a/tests/unit/source_code/test_used_table.py +++ b/tests/unit/source_code/test_used_table.py @@ -24,6 +24,6 @@ def test_crawler_appends_tables(): ) for name in ("a", "b", "c") ) - crawler.dump_all(dfsas, crawl_start_time=now) + crawler.dump_all(dfsas) rows = backend.rows_written_for(crawler.full_name, "append") assert len(rows) == 3 From 4a5321cc961eaacb4bca701b1f3631108b8ba5e5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 10:43:32 +0200 Subject: [PATCH 12/72] Replace initial history record conversion and logging with a new version, updated to use the Historical record type. --- src/databricks/labs/ucx/framework/history.py | 162 ------------ src/databricks/labs/ucx/progress/history.py | 165 ++++++++++++ tests/unit/progress/test_history.py | 252 +++++++++++++++++++ 3 files changed, 417 insertions(+), 162 deletions(-) delete mode 100644 src/databricks/labs/ucx/framework/history.py create mode 100644 src/databricks/labs/ucx/progress/history.py create mode 100644 tests/unit/progress/test_history.py diff --git a/src/databricks/labs/ucx/framework/history.py b/src/databricks/labs/ucx/framework/history.py deleted file mode 100644 index 25655fb111..0000000000 --- a/src/databricks/labs/ucx/framework/history.py +++ /dev/null @@ -1,162 +0,0 @@ -from __future__ import annotations -import dataclasses -import datetime as dt -import json -import logging -import os -from collections.abc import Callable, Iterable -from dataclasses import dataclass -from functools import cached_property -from typing import ClassVar, Protocol, TypeVar - -from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient - - -logger = logging.getLogger(__name__) - - -@dataclass(frozen=True, kw_only=True) -class HistoricalRecord: - workspace_id: int - """The identifier of the workspace where this record was generated.""" - - run_id: int - """The identifier of the workflow run that generated this record.""" - - snapshot_id: int - """An identifier that is unique to the records produced for a given snapshot.""" - - run_start_time: dt.datetime - """When this record was generated.""" - - object_type: str - """The inventory table for which this record was generated.""" - - object_type_version: int - """Versioning of inventory table, for forward compatibility.""" - - object_id: list[str] - """The type-specific identifier for this inventory record.""" - - object_data: dict[str, str] - """Type-specific data of the inventory record. 
Keys are top-level attributes, values are their JSON-encoded values."""
-
-    failures: list[str]
-    """The list of problems associated with the object that this inventory record covers."""
-
-    owner: str
-    """The identity of the account that created this inventory record."""
-
-
-class DataclassInstance(Protocol):
-    __dataclass_fields__: ClassVar[dict]
-    # TODO: Once all record types provide the property: key_fields: ClassVar[Sequence[str]]
-
-
-Record = TypeVar("Record", bound=DataclassInstance)
-
-
-class HistoryLog:
-    __slots__ = ("_ws", "_backend", "_run_id", "_catalog", "_schema", "_table")
-
-    def __init__(
-        self,
-        ws: WorkspaceClient,
-        backend: SqlBackend,
-        run_id: int,
-        catalog: str,
-        schema: str,
-        table: str,
-    ) -> None:
-        self._ws = ws
-        self._backend = backend
-        self._run_id = run_id
-        self._catalog = catalog
-        self._schema = schema
-        self._table = table
-
-    @property
-    def full_name(self) -> str:
-        return f"{self._catalog}.{self._schema}.{self._table}"
-
-    def _append_history_snapshot(self, object_type: str, snapshot: list[HistoricalRecord]) -> None:
-        logger.debug(f"[{self.full_name}] appending {len(snapshot)} new records for {object_type}")
-        # Concurrent writes do not need to be handled here; appends cannot conflict.
-        # TODO: Although documented as conflict-free, verify that this truly is the case.
-        self._backend.save_table(self.full_name, snapshot, HistoricalRecord, mode="append")
-
-    class Appender:
-        __slots__ = ("_ws", "_object_type", "_object_type_version", "_key_from", "_run_id", "_persist")
-
-        def __init__(
-            self,
-            ws: WorkspaceClient,
-            run_id: int,
-            klass: type[Record],
-            key_from: Callable[[Record], list[str]],
-            persist: Callable[[str, list[HistoricalRecord]], None],
-        ) -> None:
-            self._ws = ws
-            self._run_id = run_id
-            self._object_type = klass.__name__
-            # Versioning support: if the dataclass has a _ucx_version class attribute, that is the current version.
-            self._object_type_version = getattr(klass, "_ucx_version") if hasattr(klass, "_ucx_version") else 0
-            self._key_from = key_from
-            self._persist = persist
-
-        @cached_property
-        def _workspace_id(self) -> int:
-            return self._ws.get_workspace_id()
-
-        @cached_property
-        def _owner(self) -> str:
-            current_user = self._ws.current_user.me()
-            owner = current_user.user_name or current_user.id
-            assert owner
-            return owner
-
-        def append_snapshot(self, records: Iterable[Record], *, run_start_time: dt.datetime) -> None:
-            snapshot_id: int = int.from_bytes(os.urandom(7), byteorder="big")
-            historical_records = list(
-                self._inventory_records_to_historical(
-                    records,
-                    snapshot_id=snapshot_id,
-                    run_start_time=run_start_time,
-                )
-            )
-            self._persist(self._object_type, historical_records)
-
-        def _inventory_records_to_historical(
-            self,
-            records: Iterable[Record],
-            *,
-            snapshot_id: int,
-            run_start_time: dt.datetime,
-        ) -> Iterable[HistoricalRecord]:
-            for record in records:
-                object_id = self._key_from(record)
-                object_as_dict = dataclasses.asdict(record)
-                flattened_object_data = {k: json.dumps(v) for k, v in object_as_dict.items()}
-                # TODO: Get failures.
- failures: list[str] = [] - yield HistoricalRecord( - workspace_id=self._workspace_id, - run_id=self._run_id, - snapshot_id=snapshot_id, - run_start_time=run_start_time, - object_type=self._object_type, - object_type_version=self._object_type_version, - object_id=object_id, - object_data=flattened_object_data, - failures=failures, - owner=self._owner, - ) - - def appender(self, klass: type[Record]) -> Appender: - key_fields = getattr(klass, "key_fields", ()) - - def key_from(record: Record) -> list[str]: - return [getattr(record, field) for field in key_fields] - - return self.Appender(self._ws, self._run_id, klass, key_from, self._append_history_snapshot) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py new file mode 100644 index 0000000000..0b78d1b647 --- /dev/null +++ b/src/databricks/labs/ucx/progress/history.py @@ -0,0 +1,165 @@ +from __future__ import annotations +import dataclasses +import datetime as dt +import json +import logging +from collections.abc import Callable, Iterable, Sequence +from typing import ClassVar, Protocol, TypeVar, Generic, Any + +from databricks.labs.lsql.backends import SqlBackend +from databricks.sdk import WorkspaceClient + +from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.utils import escape_sql_identifier +from databricks.labs.ucx.progress.install import Historical + +logger = logging.getLogger(__name__) + + +class DataclassInstance(Protocol): + __dataclass_fields__: ClassVar[dict[str, Any]] + + __id_fields__: ClassVar[Sequence[str]] + + +Record = TypeVar("Record", bound=DataclassInstance) + + +class HistoricalEncoder(Generic[Record]): + def __init__(self, job_run_id: int, workspace_id: int, ownership: Ownership[Record], klass: type[Record]) -> None: + self._job_run_id = job_run_id + self._workspace_id = workspace_id + self._ownership = ownership + self._object_type = self._get_object_type(klass) + self._field_types, self._has_failures = self._get_field_types(klass) + self._id_field_names = self._get_id_fieldnames(klass) + + @classmethod + def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], bool]: + field_types = {field.name: field.type for field in dataclasses.fields(klass)} + failures_type = field_types.pop("failures", None) + has_failures = failures_type is not None + if has_failures and failures_type != list[str]: + msg = f"Historical record class has invalid failures type: {failures_type}" + raise TypeError(msg) + return field_types, has_failures + + def _get_id_fieldnames(self, klazz: type[Record]) -> Sequence[str]: + id_field_names = tuple(klazz.__id_fields__) + all_fields = self._field_types + for field in id_field_names: + id_field_type = all_fields.get(field, None) + if id_field_type is None: + raise AttributeError(name=field, obj=klazz) + if id_field_type != str: + msg = f"Historical record class id field is not a string: {field}" + raise TypeError(msg) + return id_field_names + + @classmethod + def _get_object_type(cls, klass: type[Record]) -> str: + return klass.__name__ + + @classmethod + def _as_dict(cls, record: Record) -> dict[str, Any]: + return dataclasses.asdict(record) + + def _object_id(self, record: Record) -> list[str]: + return [getattr(record, field) for field in self._id_field_names] + + @classmethod + def _encode_non_serializable(cls, name: str, value: Any) -> Any: + if isinstance(value, dt.datetime): + # Only allow tz-aware timestamps. 
+ if value.tzinfo is None: + msg = f"Timestamp without timezone not supported for field {name}: {value}" + raise ValueError(msg) + # Always store with 'Z'. + ts_utc = value.astimezone(dt.timezone.utc) + return ts_utc.isoformat().replace("+00:00", "Z") + + msg = f"Cannot encode value for {name}: {value!r}" + raise TypeError(msg) + + def _encode_field_value(self, name: str, value: Any | None) -> str | None: + if value is None: + return None + value_type = self._field_types[name] + # TODO: Update to check this covers str | None + if value_type == str: + return value + encoded_value = json.dumps( + value, + allow_nan=False, + separators=(",", ":"), + default=lambda o: self._encode_non_serializable(name, o), + ) + # Handle encoding substituted values that encode as just a string (eg. timestamps); we just return the string. + return json.loads(encoded_value) if encoded_value.startswith('"') else encoded_value + + def _object_data_and_failures(self, record: Record) -> tuple[dict[str, str], list[str]]: + record_values = self._as_dict(record) + encoded_fields = {field: self._encode_field_value(field, record_values[field]) for field in self._field_types} + # We must return a value: strings are mandatory (not optional) as the type. As such, optional fields need to be + # omitted from the data map if the value is None. + data = {k: v for k, v in encoded_fields.items() if v is not None} + failures = record_values["failures"] if self._has_failures else [] + + return data, failures + + def to_historical(self, record: Record) -> Historical: + data, failures = self._object_data_and_failures(record) + return Historical( + workspace_id=self._workspace_id, + job_run_id=self._job_run_id, + object_type=self._object_type, + object_id=self._object_id(record), + data=data, + failures=failures, + owner=self._ownership.owner_of(record), + ) + + +class HistoryLog(Generic[Record]): + def __init__( # pylint: disable=too-many-arguments + self, + ws: WorkspaceClient, + backend: SqlBackend, + ownership: Ownership[Record], + klass: type[Record], + run_id: int, + workspace_id: int, + catalog: str, + schema: str = "ucx", + table: str = "history", + ) -> None: + self._ws = ws + self._backend = backend + self._klass = klass + self._catalog = catalog + self._schema = schema + self._table = table + encoder = HistoricalEncoder(job_run_id=run_id, workspace_id=workspace_id, ownership=ownership, klass=klass) + self._encoder = encoder + + @staticmethod + def _record_identifier(record_type: type[Record]) -> Callable[[Record], list[str]]: + key_field_names = getattr(record_type, "key_fields", ()) + + def identify_record(record: Record) -> list[str]: + return [getattr(record, key_field) for key_field in key_field_names] + + return identify_record + + @property + def full_name(self) -> str: + return f"{self._catalog}.{self._schema}.{self._table}" + + def append_inventory_snapshot(self, snapshot: Sequence[Record]) -> None: + history_records = list(self._inventory_records_to_historical(snapshot)) + # This is the only writer, and the mode is 'append'. This is documented as conflict-free. 
+ self._backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append") + + def _inventory_records_to_historical(self, records: Sequence[Record]) -> Iterable[Historical]: + for record in records: + yield self._encoder.to_historical(record) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py new file mode 100644 index 0000000000..a584e50344 --- /dev/null +++ b/tests/unit/progress/test_history.py @@ -0,0 +1,252 @@ +import datetime as dt +from dataclasses import dataclass, field +from typing import ClassVar +from unittest.mock import create_autospec + +import pytest + +from databricks.labs.ucx.__about__ import __version__ as ucx_version +from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.progress.history import HistoricalEncoder +from databricks.labs.ucx.progress.install import Historical + + +@dataclass +class _TestRecord: + a_field: str + b_field: int + failures: list[str] + + __id_fields__: ClassVar[tuple[str]] = ("a_field",) + + +@pytest.fixture +def ownership() -> Ownership: + mock_ownership = create_autospec(Ownership) + mock_ownership.owner_of.return_value = "mickey" + return mock_ownership + + +def test_historical_encoder_basic(ownership) -> None: + """Verify basic encoding of a test record into a historical record.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + assert ownership.owner_of(_TestRecord(a_field="fu", b_field=2, failures=["doh", "ray"])) == "mickey" + + record = encoder.to_historical(_TestRecord(a_field="fu", b_field=2, failures=["doh", "ray"])) + + expected_record = Historical( + workspace_id=2, + job_run_id=1, + object_type="_TestRecord", + object_id=["fu"], + data={ + "a_field": "fu", + "b_field": "2", + }, + failures=["doh", "ray"], + owner="mickey", + ucx_version=ucx_version, + ) + + assert record == expected_record + + +def test_historical_encoder_workspace_id(ownership) -> None: + """Verify the encoder produces records using the supplied workspace identifier.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=52, ownership=ownership, klass=_TestRecord) + + record = _TestRecord(a_field="whatever", b_field=2, failures=[]) + historical = encoder.to_historical(record) + assert historical.workspace_id == 52 + + +def test_historical_encoder_run_id(ownership) -> None: + """Verify the encoder produces records using the supplied job-run identifier.""" + encoder = HistoricalEncoder(job_run_id=42, workspace_id=2, ownership=ownership, klass=_TestRecord) + + record = _TestRecord(a_field="whatever", b_field=2, failures=[]) + historical = encoder.to_historical(record) + assert historical.job_run_id == 42 + + +def test_historical_encoder_ucx_version(ownership) -> None: + """Verify the encoder produces records containing the current UCX version.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + record = _TestRecord(a_field="whatever", b_field=2, failures=[]) + historical = encoder.to_historical(record) + assert historical.ucx_version == ucx_version + + +def test_historical_encoder_ownership(ownership) -> None: + """Verify the encoder produces records with the owner determined by the supplied ownership instance.""" + expected_owners = ("bob", "jane", "tarzan") + ownership.owner_of.side_effect = expected_owners + + records = [_TestRecord(a_field="whatever", b_field=x, failures=[]) for x in range(len(expected_owners))] + encoder = 
HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + encoded_records = [encoder.to_historical(record) for record in records] + owners = tuple(encoded_record.owner for encoded_record in encoded_records) + + assert owners == expected_owners + assert ownership.owner_of.call_count == 3 + + +def test_historical_encoder_object_type(ownership) -> None: + """Verify the encoder uses the name of the record type as the object type for records.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + record = _TestRecord(a_field="whatever", b_field=2, failures=[]) + historical = encoder.to_historical(record) + assert historical.object_type == "_TestRecord" + + +def test_historical_encoder_object_id(ownership) -> None: + """Verify the encoder uses the configured object-id fields from the record type in the encoded records.""" + encoder1 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + historical1 = encoder1.to_historical(_TestRecord(a_field="used_for_key", b_field=2, failures=[])) + assert historical1.object_id == ["used_for_key"] + + @dataclass + class _CompoundKey: + a_field: str = "field-a" + b_field: str = "field-b" + c_field: str = "field-c" + + __id_fields__: ClassVar = ("a_field", "c_field", "b_field") + + encoder2 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_CompoundKey) + historical2 = encoder2.to_historical(_CompoundKey()) + + # Note: order matters + assert historical2.object_id == ["field-a", "field-c", "field-b"] + + +def test_historical_encoder_object_id_verification(ownership) -> None: + """Check the __id_fields__ class property is verified during init: it must be present and refer only to strings.""" + + @dataclass + class _NoIdFields: + pass + + @dataclass + class _WrongTypeIdFields: + ok: str + not_ok: int + + __id_fields__: ClassVar = ["ok", "not_ok"] + + with pytest.raises(AttributeError) as excinfo: + HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NoIdFields) + + assert excinfo.value.obj == _NoIdFields + assert excinfo.value.name == "__id_fields__" + + with pytest.raises(TypeError, match="Historical record class id field is not a string: not_ok"): + HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_WrongTypeIdFields) + + +def test_historical_encoder_object_data(ownership) -> None: + """Verify the encoder includes all dataclass fields in the object data.""" + encoder1 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) + + historical1 = encoder1.to_historical(_TestRecord(a_field="used_for_key", b_field=2, failures=[])) + assert set(historical1.data.keys()) == {"a_field", "b_field"} + + @dataclass + class _AnotherClass: + field_1: str = "foo" + field_2: str = "bar" + field_3: str = "baz" + field_4: str = "daz" + + __id_fields__: ClassVar = ("field_1",) + + encoder2 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AnotherClass) + historical2 = encoder2.to_historical(_AnotherClass()) + assert set(historical2.data.keys()) == {"field_1", "field_2", "field_3", "field_4"} + + +def test_historical_encoder_object_data_values_strings_as_is(ownership) -> None: + """Verify that string fields are encoded as-is in the object_data""" + + @dataclass + class _AClass: + a_field: str = "value" + existing_json_field: str = "[1, 2, 3]" + + __id_fields__: ClassVar = ("a_field",) + + encoder = 
HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass) + historical = encoder.to_historical(_AClass()) + assert historical.data == {"a_field": "value", "existing_json_field": "[1, 2, 3]"} + + +def test_historical_encoder_object_data_missing_optional_values(ownership) -> None: + """Verify the encoding of missing (optional) field values.""" + + @dataclass + class _InnerClass: + optional_field: str | None = None + + @dataclass + class _AClass: + a_field: str = "value" + optional_field: str | None = None + nested: _InnerClass = _InnerClass() + + __id_fields__: ClassVar = ("a_field",) + + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass) + historical = encoder.to_historical(_AClass()) + assert "optional_field" not in historical.data, "First-level optional fields should be elided if None" + assert historical.data["nested"] == '{"optional_field":null}', "Nested optional fields should be encoded as nulls" + + +def test_historical_encoder_object_data_values_non_strings_as_json(ownership) -> None: + """Verify that non-string fields are encoded as JSON in the object_data""" + + @dataclass + class _InnerClass: + counter: int + boolean: bool = True + a_field: str = "bar" + optional: str | None = None + + # TODO: Expand to cover all lsql-supported types. + @dataclass + class _AClass: + str_field: str = "foo" + int_field: int = 23 + bool_field: bool = True + ts_field: dt.datetime = field( + default_factory=lambda: dt.datetime( + year=2024, month=10, day=15, hour=12, minute=44, second=16, tzinfo=dt.timezone.utc + ) + ) + array_field: list[str] = field(default_factory=lambda: ["foo", "bar", "baz"]) + nested_dataclass: list = field(default_factory=lambda: [_InnerClass(x) for x in range(2)]) + + __id_fields__: ClassVar = ("str_field",) + + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass) + historical = encoder.to_historical(_AClass()) + assert historical.data == { + "str_field": "foo", + "int_field": "23", + "bool_field": "true", + "ts_field": "2024-10-15T12:44:16Z", + "array_field": '["foo","bar","baz"]', + "nested_dataclass": '[{"counter":0,"boolean":true,"a_field":"bar","optional":null},{"counter":1,"boolean":true,"a_field":"bar","optional":null}]', + } + + +def test_historical_encoder_failures(ownership) -> None: + """Verify that encoder places failures on the top-level field instead of within the object data.""" + + +def test_historical_encoder_failures_verification(ownership) -> None: + """Verify that the encoder checks the failures field type during initialization.""" From e321589602b5b09d8ac30bce6a3e3d9331575061 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 11:41:30 +0200 Subject: [PATCH 13/72] Mark inner dataclasses in tests as immutable, so they are safe to use as default values. 
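Python's dataclasses machinery rejects mutable defaults: a plain
@dataclass generates __eq__ and therefore sets __hash__ to None, and
newer Python versions refuse such unhashable instances as field
defaults. Freezing the inner classes makes them hashable and immutable,
so sharing a single instance as a default is safe. A minimal sketch of
the pattern (illustrative only, not part of this change):

    import dataclasses
    from dataclasses import dataclass

    @dataclass(frozen=True)
    class _Inner:
        optional_field: str | None = None

    @dataclass
    class _Outer:
        # Allowed because _Inner is frozen (hashable and immutable).
        nested: _Inner = _Inner()

    print(dataclasses.asdict(_Outer()))  # {'nested': {'optional_field': None}}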
--- tests/unit/progress/test_history.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index a584e50344..181b57f272 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -188,7 +188,7 @@ class _AClass: def test_historical_encoder_object_data_missing_optional_values(ownership) -> None: """Verify the encoding of missing (optional) field values.""" - @dataclass + @dataclass(frozen=True) class _InnerClass: optional_field: str | None = None @@ -209,7 +209,7 @@ class _AClass: def test_historical_encoder_object_data_values_non_strings_as_json(ownership) -> None: """Verify that non-string fields are encoded as JSON in the object_data""" - @dataclass + @dataclass(frozen=True) class _InnerClass: counter: int boolean: bool = True From 4e3e3c816fd590ded4686e7c828d5099241caee7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:01:18 +0200 Subject: [PATCH 14/72] Fix type hint. --- tests/unit/progress/test_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 181b57f272..1118391de3 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -228,7 +228,7 @@ class _AClass: ) ) array_field: list[str] = field(default_factory=lambda: ["foo", "bar", "baz"]) - nested_dataclass: list = field(default_factory=lambda: [_InnerClass(x) for x in range(2)]) + nested_dataclass: list[_InnerClass] = field(default_factory=lambda: [_InnerClass(x) for x in range(2)]) __id_fields__: ClassVar = ("str_field",) From 616b0f1ec5d4cde3c08f1b71a4f1742e64b2af6e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:02:20 +0200 Subject: [PATCH 15/72] Mark test class as immutable. --- tests/unit/progress/test_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 1118391de3..a61b4fd49d 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -11,7 +11,7 @@ from databricks.labs.ucx.progress.install import Historical -@dataclass +@dataclass(frozen=True, kw_only=True) class _TestRecord: a_field: str b_field: int From a52720d8e33d6321e0a23f42198add0cf745b6a2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:02:36 +0200 Subject: [PATCH 16/72] Unit tests for the failures[] mechanism. 
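The mechanism under test: the encoder strips the `failures` field out
of the JSON-encoded object data and surfaces it as the top-level
`failures` column of the historical record. A sketch of the expected
behaviour, using the _TestRecord class and ownership fixture from this
test module:

    encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord)
    historical = encoder.to_historical(_TestRecord(a_field="foo", b_field=10, failures=["boom"]))
    assert historical.failures == ["boom"]     # lifted to the top-level column
    assert "failures" not in historical.data   # omitted from the object data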
---
 tests/unit/progress/test_history.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py
index a61b4fd49d..8be3473452 100644
--- a/tests/unit/progress/test_history.py
+++ b/tests/unit/progress/test_history.py
@@ -1,4 +1,5 @@
 import datetime as dt
+import re
 from dataclasses import dataclass, field
 from typing import ClassVar
 from unittest.mock import create_autospec
@@ -244,9 +245,26 @@ class _AClass:
     }


-def test_historical_encoder_failures(ownership) -> None:
-    """Verify that encoder places failures on the top-level field instead of within the object data."""
+@pytest.mark.parametrize("failures", (["failures-1", "failures-2"], []))
+def test_historical_encoder_failures(ownership, failures) -> None:
+    """Verify an encoder places failures on the top-level field instead of within the object data."""
+    encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord)
+
+    historical = encoder.to_historical(_TestRecord(a_field="foo", b_field=10, failures=list(failures)))
+
+    assert historical.failures == failures
+    assert "failures" not in historical.data


 def test_historical_encoder_failures_verification(ownership) -> None:
-    """Verify that the encoder checks the failures field type during initialization."""
+    """Verify that encoders check the failures field type during initialization."""
+
+    @dataclass
+    class _BrokenFailures:
+        a_field: str = "a_field"
+        failures: list[int] = field(default_factory=list)
+
+        __id_fields__: ClassVar = ("a_field",)
+
+    with pytest.raises(TypeError, match=re.escape("Historical record class has invalid failures type: list[int]")):
+        _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_BrokenFailures)

From 6f381926147d4af1c899a7ffd42716b496f2a2a5 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 16 Oct 2024 14:10:27 +0200
Subject: [PATCH 17/72] When encoding a string field, also handle it being optional.
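Strings bypass JSON encoding because dumping a bare string adds quotes,
which would corrupt values stored verbatim in the data map. An optional
string annotation is a union type, so it needs its own comparison; None
values are already filtered out earlier in _encode_field_value, so any
remaining value can be returned as-is. A standalone sketch of both
observations:

    import json

    assert json.dumps("value") == '"value"'   # the quoting we want to avoid
    value_type = str | None                   # annotations like this are union types
    assert value_type == (str | None) and value_type != str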
--- src/databricks/labs/ucx/progress/history.py | 3 +-- tests/unit/progress/test_history.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 0b78d1b647..abb0cc30d5 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -85,8 +85,7 @@ def _encode_field_value(self, name: str, value: Any | None) -> str | None: if value is None: return None value_type = self._field_types[name] - # TODO: Update to check this covers str | None - if value_type == str: + if value_type == str or value_type == (str | None): return value encoded_value = json.dumps( value, diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 8be3473452..191200b435 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -178,12 +178,13 @@ def test_historical_encoder_object_data_values_strings_as_is(ownership) -> None: class _AClass: a_field: str = "value" existing_json_field: str = "[1, 2, 3]" + optional_string_field: str | None = "value" __id_fields__: ClassVar = ("a_field",) encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass) historical = encoder.to_historical(_AClass()) - assert historical.data == {"a_field": "value", "existing_json_field": "[1, 2, 3]"} + assert historical.data == {"a_field": "value", "existing_json_field": "[1, 2, 3]", "optional_string_field": "value"} def test_historical_encoder_object_data_missing_optional_values(ownership) -> None: From af1fff24c1e2074f317d3ed2e8f2df38ddf5935d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:38:14 +0200 Subject: [PATCH 18/72] Fix comparison. --- src/databricks/labs/ucx/progress/history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index abb0cc30d5..add5b1aebe 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -85,7 +85,7 @@ def _encode_field_value(self, name: str, value: Any | None) -> str | None: if value is None: return None value_type = self._field_types[name] - if value_type == str or value_type == (str | None): + if value_type in (str, (str | None)): return value encoded_value = json.dumps( value, From 724abb265943eb395218467c2d2e30756eb01c13 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:38:33 +0200 Subject: [PATCH 19/72] Unit tests for the history log. 
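For orientation, the API surface exercised by these tests (signature as
of this patch; the mock backend and ownership fixtures come from the
test module, and names like "the_catalog" are arbitrary test values):

    history_log = HistoryLog(
        ws, mock_backend, ownership, _TestRecord, run_id=1, workspace_id=2, catalog="the_catalog"
    )
    history_log.append_inventory_snapshot(records)  # records: a sequence of _TestRecord instances
    # Each record is encoded via HistoricalEncoder and appended to
    # `the_catalog`.`ucx`.`history` (or an explicitly supplied schema/table).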
--- tests/unit/progress/test_history.py | 73 ++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 191200b435..c5ea438d33 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -5,10 +5,11 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.lsql.core import Row from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import Ownership -from databricks.labs.ucx.progress.history import HistoricalEncoder +from databricks.labs.ucx.progress.history import HistoricalEncoder, HistoryLog from databricks.labs.ucx.progress.install import Historical @@ -269,3 +270,73 @@ class _BrokenFailures: with pytest.raises(TypeError, match=re.escape("Historical record class has invalid failures type: list[int]")): _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_BrokenFailures) + + +def test_history_log_appends_historical_records(ws, mock_backend, ownership) -> None: + """Verify that we can journal a snapshot of records to the historical log.""" + ownership.owner_of.side_effect = lambda o: f"owner-{o.a_field}" + + records = ( + _TestRecord(a_field="first_record", b_field=1, failures=[]), + _TestRecord(a_field="second_record", b_field=2, failures=["a_failure"]), + _TestRecord(a_field="third_record", b_field=3, failures=["another_failure", "yet_another_failure"]), + ) + expected_historical_entries = ( + Row( + workspace_id=2, + job_run_id=1, + object_type="_TestRecord", + object_id=["first_record"], + data={"a_field": "first_record", "b_field": "1"}, + failures=[], + owner="owner-first_record", + ucx_version=ucx_version, + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="_TestRecord", + object_id=["second_record"], + data={"a_field": "second_record", "b_field": "2"}, + failures=["a_failure"], + owner="owner-second_record", + ucx_version=ucx_version, + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="_TestRecord", + object_id=["third_record"], + data={"a_field": "third_record", "b_field": "3"}, + failures=["another_failure", "yet_another_failure"], + owner="owner-third_record", + ucx_version=ucx_version, + ), + ) + + history_log = HistoryLog( + ws, + mock_backend, + ownership, + _TestRecord, + run_id=1, + workspace_id=2, + catalog="the_catalog", + schema="the_schema", + table="the_table", + ) + history_log.append_inventory_snapshot(records) + + rows_appended = mock_backend.rows_written_for("`the_catalog`.`the_schema`.`the_table`", mode="append") + assert rows_appended == list(expected_historical_entries) + + +def test_history_log_default_location(ws, mock_backend, ownership) -> None: + """Verify that the history log defaults to the ucx.history in the configured catalog.""" + + record = _TestRecord(a_field="foo", b_field=1, failures=[]) + history_log = HistoryLog(ws, mock_backend, ownership, _TestRecord, run_id=1, workspace_id=2, catalog="the_catalog") + history_log.append_inventory_snapshot([record]) + + assert history_log.full_name == "the_catalog.ucx.history" + assert mock_backend.has_rows_written_for("`the_catalog`.`ucx`.`history`") From dd73486c50097370017d9fdcb76ff87fe05a5ca1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 14:41:58 +0200 Subject: [PATCH 20/72] Remove dead code. 
--- src/databricks/labs/ucx/progress/history.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index add5b1aebe..34e1b83f04 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -3,7 +3,7 @@ import datetime as dt import json import logging -from collections.abc import Callable, Iterable, Sequence +from collections.abc import Iterable, Sequence from typing import ClassVar, Protocol, TypeVar, Generic, Any from databricks.labs.lsql.backends import SqlBackend @@ -141,15 +141,6 @@ def __init__( # pylint: disable=too-many-arguments encoder = HistoricalEncoder(job_run_id=run_id, workspace_id=workspace_id, ownership=ownership, klass=klass) self._encoder = encoder - @staticmethod - def _record_identifier(record_type: type[Record]) -> Callable[[Record], list[str]]: - key_field_names = getattr(record_type, "key_fields", ()) - - def identify_record(record: Record) -> list[str]: - return [getattr(record, key_field) for key_field in key_field_names] - - return identify_record - @property def full_name(self) -> str: return f"{self._catalog}.{self._schema}.{self._table}" From 00d5d854ce53384be2c34e383db4153d8584beeb Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 16:21:22 +0200 Subject: [PATCH 21/72] Type hint. --- tests/unit/progress/test_history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index c5ea438d33..13e6520d01 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -248,7 +248,7 @@ class _AClass: @pytest.mark.parametrize("failures", (["failures-1", "failures-2"], [])) -def test_historical_encoder_failures(ownership, failures) -> None: +def test_historical_encoder_failures(ownership, failures: list[str]) -> None: """Verify an encoder places failures on the top-level field instead of within the object data.""" encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) From 57f63ba05b92e46d79b8ae3bbc8b3527aa280f73 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 16:22:02 +0200 Subject: [PATCH 22/72] Test detection of naive timestamps during encoding. --- src/databricks/labs/ucx/progress/history.py | 3 +- tests/unit/progress/test_history.py | 31 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 34e1b83f04..ce851745d5 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -72,7 +72,8 @@ def _encode_non_serializable(cls, name: str, value: Any) -> Any: if isinstance(value, dt.datetime): # Only allow tz-aware timestamps. if value.tzinfo is None: - msg = f"Timestamp without timezone not supported for field {name}: {value}" + # Name refers to the outermost field, not necessarily a field on a (nested) dataclass. + msg = f"Timestamp without timezone not supported in or within field {name}: {value}" raise ValueError(msg) # Always store with 'Z'. 
ts_utc = value.astimezone(dt.timezone.utc) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 13e6520d01..d8b39db5a2 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -247,6 +247,37 @@ class _AClass: } +@dataclass(frozen=True, kw_only=True) +class _InnerClassWithTimestamp: + b_field: dt.datetime + + +@dataclass(frozen=True, kw_only=True) +class _OuterclassWithTimestamps: + object_id: str = "not used" + a_field: dt.datetime | None = None + inner: _InnerClassWithTimestamp | None = None + + __id_fields__: ClassVar = ("object_id",) + + +@pytest.mark.parametrize( + "field,record", + ( + ("a_field", _OuterclassWithTimestamps(a_field=dt.datetime.now())), + ("inner", _OuterclassWithTimestamps(inner=_InnerClassWithTimestamp(b_field=dt.datetime.now()))), + ), +) +def test_historical_encoder_naive_timestamps_banned(ownership, field: str, record: _OuterclassWithTimestamps) -> None: + """Verify that encoding detects and disallows naive timestamps.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_OuterclassWithTimestamps) + + with pytest.raises( + ValueError, match=re.escape(f"Timestamp without timezone not supported in or within field {field}") + ): + _ = encoder.to_historical(record) + + @pytest.mark.parametrize("failures", (["failures-1", "failures-2"], [])) def test_historical_encoder_failures(ownership, failures: list[str]) -> None: """Verify an encoder places failures on the top-level field instead of within the object data.""" From 3aa2431ac153e99000f873352a3c3dad0c91fbe6 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 16:36:06 +0200 Subject: [PATCH 23/72] Rename test argument to avoid shadowing a global. --- tests/unit/progress/test_history.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index d8b39db5a2..266419f618 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -262,19 +262,18 @@ class _OuterclassWithTimestamps: @pytest.mark.parametrize( - "field,record", + "field_name,record", ( ("a_field", _OuterclassWithTimestamps(a_field=dt.datetime.now())), ("inner", _OuterclassWithTimestamps(inner=_InnerClassWithTimestamp(b_field=dt.datetime.now()))), ), ) -def test_historical_encoder_naive_timestamps_banned(ownership, field: str, record: _OuterclassWithTimestamps) -> None: +def test_historical_encoder_naive_timestamps_banned(ownership, field_name, record: _OuterclassWithTimestamps) -> None: """Verify that encoding detects and disallows naive timestamps.""" encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_OuterclassWithTimestamps) - with pytest.raises( - ValueError, match=re.escape(f"Timestamp without timezone not supported in or within field {field}") - ): + expected_msg = f"Timestamp without timezone not supported in or within field {field_name}" + with pytest.raises(ValueError, match=f"^{re.escape(expected_msg)}"): _ = encoder.to_historical(record) From 4c86ba64502e86c8b8e615e8c8eaf2b416c7cfae Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Wed, 16 Oct 2024 16:37:15 +0200 Subject: [PATCH 24/72] Unit test for handling unserializable values. 
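Background: json.dumps() only invokes its `default=` hook for values it
cannot serialize natively, and raising from that hook is the supported
way to reject them with context. A standalone sketch of the behaviour
the encoder relies on:

    import json

    def reject(value):
        raise TypeError(f"Cannot encode value: {value!r}")

    json.dumps({"ok": [1, 2]}, default=reject)  # fine: hook never invoked
    try:
        json.dumps({"bad": object()}, default=reject)
    except TypeError as error:
        print(error)  # Cannot encode value: <object object at 0x...>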
--- src/databricks/labs/ucx/progress/history.py | 2 +- tests/unit/progress/test_history.py | 30 +++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index ce851745d5..94e930d881 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -79,7 +79,7 @@ def _encode_non_serializable(cls, name: str, value: Any) -> Any: ts_utc = value.astimezone(dt.timezone.utc) return ts_utc.isoformat().replace("+00:00", "Z") - msg = f"Cannot encode value for {name}: {value!r}" + msg = f"Cannot encode value in or within field {name}: {value!r}" raise TypeError(msg) def _encode_field_value(self, name: str, value: Any | None) -> str | None: diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 266419f618..254e0b7eff 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -277,6 +277,36 @@ def test_historical_encoder_naive_timestamps_banned(ownership, field_name, recor _ = encoder.to_historical(record) +@dataclass(frozen=True, kw_only=True) +class _InnerClassWithUnserializable: + b_field: object + + +@dataclass(frozen=True, kw_only=True) +class _OuterclassWithUnserializable: + object_id: str = "not used" + a_field: object | None = None + inner: _InnerClassWithUnserializable | None = None + + __id_fields__: ClassVar = ("object_id",) + + +@pytest.mark.parametrize( + "field_name,record", + ( + ("a_field", _OuterclassWithUnserializable(a_field=object())), + ("inner", _OuterclassWithUnserializable(inner=_InnerClassWithUnserializable(b_field=object()))), + ), +) +def test_historical_encoder_unserializable_values(ownership, field_name, record: _OuterclassWithUnserializable) -> None: + """Verify that encoding catches and handles unserializable values.""" + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_OuterclassWithUnserializable) + + expected_msg = f"Cannot encode value in or within field {field_name}: " + with pytest.raises(TypeError, match=f"^{re.escape(expected_msg)}"): + _ = encoder.to_historical(record) + + @pytest.mark.parametrize("failures", (["failures-1", "failures-2"], [])) def test_historical_encoder_failures(ownership, failures: list[str]) -> None: """Verify an encoder places failures on the top-level field instead of within the object data.""" From 39f1105394d3e41ecefc9a6786931a5accbdf040 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 17 Oct 2024 14:12:41 +0200 Subject: [PATCH 25/72] Inline trivial method. 
--- src/databricks/labs/ucx/progress/history.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 94e930d881..0cb370a1f9 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -3,7 +3,7 @@ import datetime as dt import json import logging -from collections.abc import Iterable, Sequence +from collections.abc import Sequence from typing import ClassVar, Protocol, TypeVar, Generic, Any from databricks.labs.lsql.backends import SqlBackend @@ -147,10 +147,7 @@ def full_name(self) -> str: return f"{self._catalog}.{self._schema}.{self._table}" def append_inventory_snapshot(self, snapshot: Sequence[Record]) -> None: - history_records = list(self._inventory_records_to_historical(snapshot)) + history_records = [self._encoder.to_historical(record) for record in snapshot] + logger.debug(f"Appending {len(history_records)} {self._klass} record(s) to history.") # This is the only writer, and the mode is 'append'. This is documented as conflict-free. self._backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append") - - def _inventory_records_to_historical(self, records: Sequence[Record]) -> Iterable[Historical]: - for record in records: - yield self._encoder.to_historical(record) From 1d50ce19a4c4c103dfcb625dcf5aef95344f7fc1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 17 Oct 2024 14:13:12 +0200 Subject: [PATCH 26/72] Update error message on unserializable value to provide more context. --- src/databricks/labs/ucx/progress/history.py | 2 +- tests/unit/progress/test_history.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 0cb370a1f9..325ef4de68 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -79,7 +79,7 @@ def _encode_non_serializable(cls, name: str, value: Any) -> Any: ts_utc = value.astimezone(dt.timezone.utc) return ts_utc.isoformat().replace("+00:00", "Z") - msg = f"Cannot encode value in or within field {name}: {value!r}" + msg = f"Cannot encode {type(value)} value in or within field {name}: {value!r}" raise TypeError(msg) def _encode_field_value(self, name: str, value: Any | None) -> str | None: diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 254e0b7eff..abd30f1c54 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -302,8 +302,8 @@ def test_historical_encoder_unserializable_values(ownership, field_name, record: """Verify that encoding catches and handles unserializable values.""" encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_OuterclassWithUnserializable) - expected_msg = f"Cannot encode value in or within field {field_name}: " - with pytest.raises(TypeError, match=f"^{re.escape(expected_msg)}"): + expected_msg = f"^Cannot encode .* value in or within field {re.escape(field_name)}: " + with pytest.raises(TypeError, match=expected_msg): _ = encoder.to_historical(record) From 4ce4ce4ecd4beb8cb6ea6c062a578dfccab98858 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Thu, 17 Oct 2024 16:14:17 +0200 Subject: [PATCH 27/72] Rename for consistency. 
--- src/databricks/labs/ucx/progress/history.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 325ef4de68..cd3f4b5fcf 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -124,7 +124,7 @@ class HistoryLog(Generic[Record]): def __init__( # pylint: disable=too-many-arguments self, ws: WorkspaceClient, - backend: SqlBackend, + sql_backend: SqlBackend, ownership: Ownership[Record], klass: type[Record], run_id: int, @@ -134,7 +134,7 @@ def __init__( # pylint: disable=too-many-arguments table: str = "history", ) -> None: self._ws = ws - self._backend = backend + self._sql_backend = sql_backend self._klass = klass self._catalog = catalog self._schema = schema @@ -150,4 +150,4 @@ def append_inventory_snapshot(self, snapshot: Sequence[Record]) -> None: history_records = [self._encoder.to_historical(record) for record in snapshot] logger.debug(f"Appending {len(history_records)} {self._klass} record(s) to history.") # This is the only writer, and the mode is 'append'. This is documented as conflict-free. - self._backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append") + self._sql_backend.save_table(escape_sql_identifier(self.full_name), history_records, Historical, mode="append") From 7441077f38dcedeec039c6a3a3cd2d5441d21208 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 13:49:16 +0200 Subject: [PATCH 28/72] Update HistoryLog initializer: it doesn't need a workspace client. --- src/databricks/labs/ucx/progress/history.py | 5 +---- tests/unit/progress/test_history.py | 7 +++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index cd3f4b5fcf..3f2ac8afe8 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -7,7 +7,6 @@ from typing import ClassVar, Protocol, TypeVar, Generic, Any from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient from databricks.labs.ucx.framework.owners import Ownership from databricks.labs.ucx.framework.utils import escape_sql_identifier @@ -121,9 +120,8 @@ def to_historical(self, record: Record) -> Historical: class HistoryLog(Generic[Record]): - def __init__( # pylint: disable=too-many-arguments + def __init__( self, - ws: WorkspaceClient, sql_backend: SqlBackend, ownership: Ownership[Record], klass: type[Record], @@ -133,7 +131,6 @@ def __init__( # pylint: disable=too-many-arguments schema: str = "ucx", table: str = "history", ) -> None: - self._ws = ws self._sql_backend = sql_backend self._klass = klass self._catalog = catalog diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index abd30f1c54..aa4ae9cc37 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -332,7 +332,7 @@ class _BrokenFailures: _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_BrokenFailures) -def test_history_log_appends_historical_records(ws, mock_backend, ownership) -> None: +def test_history_log_appends_historical_records(mock_backend, ownership) -> None: """Verify that we can journal a snapshot of records to the historical log.""" ownership.owner_of.side_effect = lambda o: f"owner-{o.a_field}" @@ -375,7 +375,6 @@ def 
test_history_log_appends_historical_records(ws, mock_backend, ownership) ->
     )
 
     history_log = HistoryLog(
-        ws,
         mock_backend,
         ownership,
         _TestRecord,
@@ -391,11 +390,11 @@ def test_history_log_appends_historical_records(ws, mock_backend, ownership) ->
     assert rows_appended == list(expected_historical_entries)


-def test_history_log_default_location(ws, mock_backend, ownership) -> None:
+def test_history_log_default_location(mock_backend, ownership) -> None:
     """Verify that the history log defaults to the ucx.history in the configured catalog."""
 
     record = _TestRecord(a_field="foo", b_field=1, failures=[])
-    history_log = HistoryLog(ws, mock_backend, ownership, _TestRecord, run_id=1, workspace_id=2, catalog="the_catalog")
+    history_log = HistoryLog(mock_backend, ownership, _TestRecord, run_id=1, workspace_id=2, catalog="the_catalog")
     history_log.append_inventory_snapshot([record])
 
     assert history_log.full_name == "the_catalog.ucx.history"

From 07bc711801071cddf5eecdbaf1a48fdd61ea98c7 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 18 Oct 2024 13:53:00 +0200
Subject: [PATCH 29/72] Update object_id support to allow properties as well as fields.
---
 src/databricks/labs/ucx/progress/history.py   |  83 +++++++++--
 .../integration/assessment/test_dashboards.py |   8 +-
 tests/unit/progress/test_history.py           | 137 +++++++++++++++---
 3 files changed, 185 insertions(+), 43 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 3f2ac8afe8..24a2f67f36 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -4,7 +4,7 @@
 import json
 import logging
 from collections.abc import Sequence
-from typing import ClassVar, Protocol, TypeVar, Generic, Any
+from typing import ClassVar, Protocol, TypeVar, Generic, Any, get_type_hints
 
 from databricks.labs.lsql.backends import SqlBackend
 
@@ -15,45 +15,96 @@
 logger = logging.getLogger(__name__)


-class DataclassInstance(Protocol):
+class DataclassWithIdAttributes(Protocol):
     __dataclass_fields__: ClassVar[dict[str, Any]]
 
-    __id_fields__: ClassVar[Sequence[str]]
+    __id_attributes__: ClassVar[Sequence[str]]
+    """The names of attributes (can be dataclass fields or ordinary properties) that make up the object identifier.
+
+    All attributes must be (non-optional) strings.
+    """
 
-Record = TypeVar("Record", bound=DataclassInstance)
+
+Record = TypeVar("Record", bound=DataclassWithIdAttributes)


 class HistoricalEncoder(Generic[Record]):
+    _job_run_id: int
+    """The identifier of the current job run, with which these records are associated."""
+
+    _workspace_id: int
+    """The identifier of the current workspace, for identifying where these records came from."""
+
+    _ownership: Ownership[Record]
+    """Used to determine the owner for each record."""
+
+    _object_type: str
+    """The name of the record class being encoded by this instance."""
+
+    _field_types: dict[str, type]
+    """A map of the fields on instances of the record class (and their types); these will appear in the object data."""
+
+    _has_failures: bool
+    """Whether this record has a failures attribute that should be included in historical records."""
+
+    _id_attribute_names: Sequence[str]
+    """The names of the record attributes that are used to produce the identifier for each record.
+
+    Each attribute can be either a dataclass field or an ordinary property; in either case the type must be a (non-optional) string.
+ """ + def __init__(self, job_run_id: int, workspace_id: int, ownership: Ownership[Record], klass: type[Record]) -> None: self._job_run_id = job_run_id self._workspace_id = workspace_id self._ownership = ownership self._object_type = self._get_object_type(klass) self._field_types, self._has_failures = self._get_field_types(klass) - self._id_field_names = self._get_id_fieldnames(klass) + self._id_attribute_names = self._get_id_attribute_names(klass) @classmethod def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], bool]: + """Return the dataclass-defined fields that the record type declares, and their associated types. + + If the record has a "failures" attribute this is treated specially: it is removed but we signal that it was + present. + """ field_types = {field.name: field.type for field in dataclasses.fields(klass)} failures_type = field_types.pop("failures", None) has_failures = failures_type is not None if has_failures and failures_type != list[str]: - msg = f"Historical record class has invalid failures type: {failures_type}" + msg = f"Historical record {klass} has invalid 'failures' attribute of type: {failures_type}" raise TypeError(msg) return field_types, has_failures - def _get_id_fieldnames(self, klazz: type[Record]) -> Sequence[str]: - id_field_names = tuple(klazz.__id_fields__) + def _get_id_attribute_names(self, klazz: type[Record]) -> Sequence[str]: + id_attribute_names = tuple(klazz.__id_attributes__) all_fields = self._field_types - for field in id_field_names: - id_field_type = all_fields.get(field, None) - if id_field_type is None: - raise AttributeError(name=field, obj=klazz) - if id_field_type != str: - msg = f"Historical record class id field is not a string: {field}" + for name in id_attribute_names: + id_attribute_type = all_fields.get(name, None) or self._detect_property_type(klazz, name) + if id_attribute_type is None: + raise AttributeError(name=name, obj=klazz) + if id_attribute_type != str: + msg = f"Historical record {klazz} has a non-string id attribute: {name} (type={id_attribute_type})" raise TypeError(msg) - return id_field_names + return id_attribute_names + + def _detect_property_type(self, klazz: type[Record], name: str) -> str | None: + maybe_property = getattr(klazz, name, None) + if maybe_property is None: + return None + if not isinstance(maybe_property, property): + msg = f"Historical record {klazz} declares an id attribute that is not a field or property: {name} (type={maybe_property})" + raise TypeError(msg) + property_getter = maybe_property.fget + if not property_getter: + msg = f"Historical record {klazz} has a non-readable property as an id attribute: {name}" + raise TypeError(msg) + type_hints = get_type_hints(property_getter) or {} + try: + return type_hints["return"] + except KeyError as e: + msg = f"Historical record {klazz} has a property with no type as an id attribute: {name}" + raise TypeError(msg) from e @classmethod def _get_object_type(cls, klass: type[Record]) -> str: @@ -64,7 +115,7 @@ def _as_dict(cls, record: Record) -> dict[str, Any]: return dataclasses.asdict(record) def _object_id(self, record: Record) -> list[str]: - return [getattr(record, field) for field in self._id_field_names] + return [getattr(record, field) for field in self._id_attribute_names] @classmethod def _encode_non_serializable(cls, name: str, value: Any) -> Any: diff --git a/tests/integration/assessment/test_dashboards.py b/tests/integration/assessment/test_dashboards.py index ba84425eb3..c94ef21a50 100644 --- 
a/tests/integration/assessment/test_dashboards.py +++ b/tests/integration/assessment/test_dashboards.py @@ -68,7 +68,7 @@ def _populate_directfs_problems(installation_ctx): LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), LineageAtom(object_type="FILE", object_id="my file_path"), ], - assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] @@ -84,7 +84,7 @@ def _populate_directfs_problems(installation_ctx): LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}), ], - assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] @@ -107,7 +107,7 @@ def _populate_used_tables(installation_ctx): LineageAtom(object_type="NOTEBOOK", object_id="my_notebook_path"), LineageAtom(object_type="FILE", object_id="my file_path"), ], - assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] @@ -125,7 +125,7 @@ def _populate_used_tables(installation_ctx): LineageAtom(object_type="DASHBOARD", object_id="my_dashboard_id", other={"name": "my_dashboard"}), LineageAtom(object_type="QUERY", object_id="my_dashboard_id/my_query_id", other={"name": "my_query"}), ], - assessment_start_timestamp=(datetime.now(timezone.utc) - timedelta(minutes=5.0)), + assessment_start_timestamp=datetime.now(timezone.utc) - timedelta(minutes=5.0), assessment_end_timestamp=datetime.now(timezone.utc) - timedelta(minutes=2.0), ) ] diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index aa4ae9cc37..18f9fd26fe 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import Ownership -from databricks.labs.ucx.progress.history import HistoricalEncoder, HistoryLog +from databricks.labs.ucx.progress.history import HistoricalEncoder, HistoryLog, Record from databricks.labs.ucx.progress.install import Historical @@ -19,7 +19,7 @@ class _TestRecord: b_field: int failures: list[str] - __id_fields__: ClassVar[tuple[str]] = ("a_field",) + __id_attributes__: ClassVar[tuple[str]] = ("a_field",) @pytest.fixture @@ -118,37 +118,127 @@ class _CompoundKey: b_field: str = "field-b" c_field: str = "field-c" - __id_fields__: ClassVar = ("a_field", "c_field", "b_field") + @property + def d_property(self) -> str: + return "property-d" + + __id_attributes__: ClassVar = ("a_field", "c_field", "b_field", "d_property") encoder2 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_CompoundKey) historical2 = encoder2.to_historical(_CompoundKey()) # Note: order matters - assert historical2.object_id == ["field-a", "field-c", "field-b"] + assert historical2.object_id == ["field-a", "field-c", "field-b", "property-d"] -def test_historical_encoder_object_id_verification(ownership) -> None: - """Check the __id_fields__ class property is 
verified during init: it must be present and refer only to strings."""
+def test_historical_encoder_object_id_verification_no_id(ownership) -> None:
+    """Check that during initialization we fail if there is no __id_attributes__ defined."""
 
     @dataclass
-    class _NoIdFields:
+    class _NoId:
         pass
 
+    with pytest.raises(AttributeError) as excinfo:
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NoId)
+
+    assert excinfo.value.obj == _NoId
+    assert excinfo.value.name == "__id_attributes__"
+
+
+@dataclass
+class _WrongTypeIdFields:
+    ok: str
+    not_ok: int
+
+    __id_attributes__: ClassVar = ["ok", "not_ok"]
+
+
+@dataclass
+class _WrongTypeIdProperty:
+    ok: str
+
+    @property
+    def not_ok(self) -> int:
+        return 0
+
+    __id_attributes__: ClassVar = ["ok", "not_ok"]
+
+
+@pytest.mark.parametrize("wrong_id_type_class", (_WrongTypeIdFields, _WrongTypeIdProperty))
+def test_historical_encoder_object_id_verification_wrong_type(ownership, wrong_id_type_class: type[Record]) -> None:
+    """Check that during initialization we fail if the id attributes are declared but are not strings."""
+
+    expected_msg = r"^Historical record .* has a non-string id attribute: not_ok \(type=<class 'int'>\)$"
+    with pytest.raises(TypeError, match=expected_msg):
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=wrong_id_type_class)
+
+
+def test_historical_encoder_object_id_verification_no_property_type(ownership) -> None:
+    """Check that during initialization we fail if an id attribute is a property without a declared return type."""
+
+    @dataclass
+    class _NoTypeIdProperty:
+        ok: str
+
+        @property
+        def not_ok(self):
+            return 0
+
+        __id_attributes__: ClassVar = ["ok", "not_ok"]
+
+    expected_msg = "^Historical record .* has a property with no type as an id attribute: not_ok$"
+    with pytest.raises(TypeError, match=expected_msg):
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NoTypeIdProperty)
+
+
+def test_historical_encoder_object_id_verification_non_readable_property(ownership) -> None:
+    """Check that during initialization we fail if an id attribute refers to a non-readable property."""
+
+    @dataclass
+    class _NonReadableIdProperty:
+        ok: str
+
+        __id_attributes__: ClassVar = ["ok", "not_ok"]
+
+    # Has to be injected after class declaration to avoid being treated as a field.
+    _NonReadableIdProperty.not_ok = property(doc="A non-readable-property")  # type: ignore[attr-defined]
+
+    expected_msg = r"^Historical record .* has a non-readable property as an id attribute: not_ok$"
+    with pytest.raises(TypeError, match=expected_msg):
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NonReadableIdProperty)
+
+
+def test_historical_encoder_object_id_verification_missing_attribute(ownership) -> None:
+    """Check that during initialization we fail if an id attribute refers to an attribute that does not exist."""
 
     @dataclass
-    class _WrongTypeIdFields:
+    class _MissingAttribute:
         ok: str
-        not_ok: int
 
-        __id_fields__: ClassVar = ["ok", "not_ok"]
+        __id_attributes__: ClassVar = ["ok", "not_ok"]
 
     with pytest.raises(AttributeError) as excinfo:
-        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NoIdFields)
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_MissingAttribute)
+
+    assert excinfo.value.obj == _MissingAttribute
+    assert excinfo.value.name == "not_ok"
 
-    assert excinfo.value.obj == _NoIdFields
-    assert excinfo.value.name == "__id_fields__"
 
-    with pytest.raises(TypeError, match="Historical record class id field is not a string: not_ok"):
-        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_WrongTypeIdFields)
+def test_historical_encoder_object_id_verification_not_field_or_property(ownership) -> None:
+    """Check that during initialization we fail if an id attribute refers to an attribute that isn't a field or property."""
+
+    @dataclass
+    class _NotFieldOrProperty:
+        ok: str
+
+        def not_ok(self) -> str:
+            return ""
+
+        __id_attributes__: ClassVar = ["ok", "not_ok"]
+
+    expected_msg = r"^Historical record .* declares an id attribute that is not a field or property: not_ok \(type=<.*>\)$"
+    with pytest.raises(TypeError, match=expected_msg):
+        HistoricalEncoder(job_run_id=1, workspace_id=1, ownership=ownership, klass=_NotFieldOrProperty)


 def test_historical_encoder_object_data(ownership) -> None:
@@ -165,7 +255,7 @@ class _AnotherClass:
         field_3: str = "baz"
         field_4: str = "daz"
 
-        __id_fields__: ClassVar = ("field_1",)
+        __id_attributes__: ClassVar = ("field_1",)
 
     encoder2 = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AnotherClass)
     historical2 = encoder2.to_historical(_AnotherClass())
@@ -181,7 +271,7 @@ class _AClass:
         existing_json_field: str = "[1, 2, 3]"
         optional_string_field: str | None = "value"
 
-        __id_fields__: ClassVar = ("a_field",)
+        __id_attributes__: ClassVar = ("a_field",)
 
     encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass)
     historical = encoder.to_historical(_AClass())
@@ -201,7 +291,7 @@ class _AClass:
         optional_field: str | None = None
         nested: _InnerClass = _InnerClass()
 
-        __id_fields__: ClassVar = ("a_field",)
+        __id_attributes__: ClassVar = ("a_field",)
 
     encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass)
     historical = encoder.to_historical(_AClass())
@@ -233,7 +323,7 @@ class _AClass:
         array_field: list[str] = field(default_factory=lambda: ["foo", "bar", "baz"])
         nested_dataclass: list[_InnerClass] = field(default_factory=lambda: [_InnerClass(x) for x in range(2)])
 
-        __id_fields__: ClassVar = ("str_field",)
+        __id_attributes__: ClassVar = ("str_field",)
 
     encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass)
     historical = encoder.to_historical(_AClass())
@@ -258,7 +348,7 @@ class _OuterclassWithTimestamps:
     a_field: dt.datetime | None = None
     inner: _InnerClassWithTimestamp | None = None
 
-    __id_fields__: ClassVar = ("object_id",)
+    __id_attributes__: ClassVar = ("object_id",)


 @pytest.mark.parametrize(
@@ -288,7 +378,7 @@ class _OuterclassWithUnserializable:
     a_field: object | None = None
     inner: _InnerClassWithUnserializable | None = None
 
-    __id_fields__: ClassVar = ("object_id",)
+    __id_attributes__: ClassVar = ("object_id",)


 @pytest.mark.parametrize(
@@ -326,9 +416,10 @@ class _BrokenFailures:
         a_field: str = "a_field"
         failures: list[int] = field(default_factory=list)
 
-        __id_fields__: ClassVar = ("a_field",)
+        __id_attributes__: ClassVar = ("a_field",)
 
-    with pytest.raises(TypeError, match=re.escape("Historical record class has invalid failures type: list[int]")):
+    expected_msg = r"^Historical record .* has invalid 'failures' attribute of type: list\[int\]$"
+    with pytest.raises(TypeError, match=expected_msg):
         _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_BrokenFailures)

From 3b3e5bdaca9a38c8a5edb52ae55f85de4b36eaf6 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 18 Oct 2024 13:54:17 +0200
Subject: [PATCH 30/72] Tweak type signature: the snapshot to append can be any iterable type.
---
 src/databricks/labs/ucx/progress/history.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 24a2f67f36..642e722ee5 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -3,7 +3,7 @@
 import datetime as dt
 import json
 import logging
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from typing import ClassVar, Protocol, TypeVar, Generic, Any, get_type_hints
 
 from databricks.labs.lsql.backends import SqlBackend
 
@@ -194,7 +194,7 @@ def __init__(
     def full_name(self) -> str:
         return f"{self._catalog}.{self._schema}.{self._table}"
 
-    def append_inventory_snapshot(self, snapshot: Sequence[Record]) -> None:
+    def append_inventory_snapshot(self, snapshot: Iterable[Record]) -> None:
         history_records = [self._encoder.to_historical(record) for record in snapshot]
         logger.debug(f"Appending {len(history_records)} {self._klass} record(s) to history.")
         # This is the only writer, and the mode is 'append'. This is documented as conflict-free.

From a9dd77f12f63844a7684a3296193c264b283f5d3 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 18 Oct 2024 13:55:48 +0200
Subject: [PATCH 31/72] Unit tests for logging Table records into the history log.
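With __id_attributes__ = ("catalog", "database", "name"), the encoder
derives each history row's object_id directly from those fields. A
sketch of the expectation, with illustrative values:

    table = Table(catalog="hive_metastore", database="foo", name="bar",
                  object_type="TABLE", table_format="DELTA")
    # encoder.to_historical(table).object_id == ["hive_metastore", "foo", "bar"]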
--- .../labs/ucx/hive_metastore/tables.py | 4 +- tests/unit/hive_metastore/test_tables.py | 83 +++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index de8457503a..1a352beb48 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -1,7 +1,7 @@ import logging import re import typing -from collections.abc import Iterable, Iterator, Collection +from collections.abc import Collection, Iterable, Iterator, Sequence from dataclasses import dataclass from enum import Enum, auto from functools import cached_property, partial @@ -64,6 +64,8 @@ class Table: # pylint: disable=too-many-public-methods storage_properties: str | None = None is_partitioned: bool = False + __id_attributes__: typing.ClassVar[Sequence[str]] = ("catalog", "database", "name") + DBFS_ROOT_PREFIXES: typing.ClassVar[list[str]] = [ "/dbfs/", "dbfs:/", diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 9cad5a003d..0b45376434 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -1,11 +1,15 @@ import logging import sys +from collections.abc import Sequence from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row +from databricks.labs.ucx.progress.history import HistoryLog from databricks.sdk import WorkspaceClient +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore.locations import Mount, ExternalLocations, MountsCrawler from databricks.labs.ucx.hive_metastore.tables import ( @@ -679,3 +683,82 @@ def test_table_owner() -> None: assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "table_record,history_record", + ( + ( + Table( + catalog="hive_metastore", + database="foo", + name="bar", + object_type="TABLE", + table_format="DELTA", + location="/foo", + storage_properties="[foo=fu,bar=baz]", + is_partitioned=True, + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="Table", + object_id=["hive_metastore", "foo", "bar"], + data={ + "catalog": "hive_metastore", + "database": "foo", + "name": "bar", + "object_type": "TABLE", + "table_format": "DELTA", + "location": "/foo", + "storage_properties": "[foo=fu,bar=baz]", + "is_partitioned": "true", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Table( + catalog="hive_metastore", + database="foo", + name="baz", + object_type="VIEW", + table_format="UNKNOWN", + view_text="select 1", + upgraded_to="main.foo.baz", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="Table", + object_id=["hive_metastore", "foo", "baz"], + data={ + "catalog": "hive_metastore", + "database": "foo", + "name": "baz", + "object_type": "VIEW", + "table_format": "UNKNOWN", + "view_text": "select 1", + "upgraded_to": "main.foo.baz", + "is_partitioned": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_table_supports_history(mock_backend, table_record: Sequence[Table], history_record: Sequence[Row]) -> None: + """Verify that Table records are written as expected to the history log.""" + mock_ownership = create_autospec(TableOwnership) + 
mock_ownership.owner_of.return_value = "the_admin" + history_log = HistoryLog(mock_backend, mock_ownership, Table, run_id=1, workspace_id=2, catalog="a_catalog") + + history_log.append_inventory_snapshot([table_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] From 0f9a4cd60dfada5bb4122b77d88330ba9a49e3f7 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 13:56:45 +0200 Subject: [PATCH 32/72] Update Grants to support logging into the history log. Includes unit tests. --- .../labs/ucx/hive_metastore/grants.py | 11 +- tests/unit/hive_metastore/test_grants.py | 200 ++++++++++++++++++ 2 files changed, 209 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 03cf53dcee..41635c52b1 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -1,9 +1,9 @@ import logging from collections import defaultdict -from collections.abc import Callable, Iterable +from collections.abc import Callable, Iterable, Sequence from dataclasses import dataclass, replace from functools import partial, cached_property -from typing import Protocol +from typing import ClassVar, Protocol from databricks.labs.blueprint.installation import Installation from databricks.labs.blueprint.parallel import ManyError, Threads @@ -66,6 +66,8 @@ class Grant: any_file: bool = False anonymous_function: bool = False + __id_attributes__: ClassVar[Sequence[str]] = ("object_type", "object_key", "action_type", "principal") + @staticmethod def type_and_key( *, @@ -105,6 +107,11 @@ def type_and_key( ) raise ValueError(msg) + @property + def object_type(self) -> str: + this_type, _ = self.this_type_and_key() + return this_type + @property def object_key(self) -> str: _, key = self.this_type_and_key() diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index f4c568630c..e4208af702 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -1,15 +1,19 @@ import dataclasses import logging +from collections.abc import Sequence from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore.catalog_schema import Catalog, Schema from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, MigrateGrants, GrantOwnership from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler +from databricks.labs.ucx.progress.history import HistoryLog from databricks.labs.ucx.workspace_access.groups import GroupManager @@ -654,3 +658,199 @@ def test_grant_owner() -> None: assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "grant_record,history_record", + ( + ( + Grant(principal="user@domain", action_type="SELECT", catalog="main", database="foo", table="bar"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["TABLE", "main.foo.bar", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "catalog": "main", + "database": "foo", + 
"table": "bar", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="SELECT", catalog="main", database="foo", view="bar"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["VIEW", "main.foo.bar", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "catalog": "main", + "database": "foo", + "view": "bar", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="SELECT", catalog="main", database="foo", udf="bar"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["FUNCTION", "main.foo.bar", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "catalog": "main", + "database": "foo", + "udf": "bar", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="SELECT", catalog="main", database="foo", udf="bar"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["FUNCTION", "main.foo.bar", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "catalog": "main", + "database": "foo", + "udf": "bar", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="ALL_PRIVILEGES", catalog="main", database="foo"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["DATABASE", "main.foo", "ALL_PRIVILEGES", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "ALL_PRIVILEGES", + "catalog": "main", + "database": "foo", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="ALL_PRIVILEGES", catalog="main"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["CATALOG", "main", "ALL_PRIVILEGES", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "ALL_PRIVILEGES", + "catalog": "main", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="SELECT", any_file=True), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["ANY FILE", "", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "any_file": "true", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="SELECT", anonymous_function=True), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["ANONYMOUS FUNCTION", "", "SELECT", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "SELECT", + "any_file": "false", + "anonymous_function": "true", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Grant(principal="user@domain", action_type="ALL_PRIVILEGES", database="foo"), + Row( + workspace_id=2, + job_run_id=1, + object_type="Grant", + object_id=["DATABASE", "hive_metastore.foo", 
"ALL_PRIVILEGES", "user@domain"], + data={ + "principal": "user@domain", + "action_type": "ALL_PRIVILEGES", + "database": "foo", + "any_file": "false", + "anonymous_function": "false", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_grant_supports_history(mock_backend, grant_record: Sequence[Grant], history_record: Sequence[Row]) -> None: + """Verify that Grant records are written to the history log as expected.""" + mock_ownership = create_autospec(GrantOwnership) + mock_ownership.owner_of.return_value = "the_admin" + history_log = HistoryLog(mock_backend, mock_ownership, Grant, run_id=1, workspace_id=2, catalog="a_catalog") + + history_log.append_inventory_snapshot([grant_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] From ad482b8ada76175dadcb3805088663be726b4a79 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 13:58:39 +0200 Subject: [PATCH 33/72] Update the migration progress workflow to log tables and grants to the history table. --- .../labs/ucx/contexts/workflow_task.py | 34 ++++++++++++++- src/databricks/labs/ucx/progress/workflows.py | 8 +++- tests/unit/progress/test_workflows.py | 41 ++++++++++++++----- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index ea1aa07e3b..399a4e70c1 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -3,6 +3,7 @@ from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import RuntimeBackend, SqlBackend +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantOwnership from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ @@ -14,8 +15,9 @@ from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.hive_metastore import TablesInMounts from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler -from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler +from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler, Table, TableOwnership from databricks.labs.ucx.installer.logs import TaskRunWarningRecorder +from databricks.labs.ucx.progress.history import HistoryLog from databricks.labs.ucx.progress.workflow_runs import WorkflowRunRecorder @@ -116,10 +118,38 @@ def workflow_run_recorder(self) -> WorkflowRunRecorder: return WorkflowRunRecorder( self.sql_backend, self.config.ucx_catalog, - workspace_id=self.workspace_client.get_workspace_id(), + workspace_id=self.workspace_id, workflow_name=self.named_parameters["workflow"], workflow_id=int(self.named_parameters["job_id"]), workflow_run_id=int(self.named_parameters["parent_run_id"]), workflow_run_attempt=int(self.named_parameters.get("attempt", 0)), workflow_start_time=self.named_parameters["start_time"], ) + + @cached_property + def workspace_id(self) -> int: + return self.workspace_client.get_workspace_id() + + @cached_property + def historical_tables_log(self) -> HistoryLog[Table]: + table_owner = TableOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + table_owner, + Table, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + + @cached_property + def historical_grants_log(self) -> HistoryLog[Grant]: + grant_owner = 
GrantOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + grant_owner, + Grant, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index b9fb219a3d..30d7a374b4 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -28,7 +28,9 @@ def crawl_tables(self, ctx: RuntimeContext) -> None: as _database name_, _table name_, _table type_, _table location_, etc., in the table named `$inventory_database.tables`. The metadata stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that cannot easily be migrated to Unity Catalog.""" - ctx.tables_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_tables_log + tables_snapshot = ctx.tables_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(tables_snapshot) @job_task def crawl_udfs(self, ctx: RuntimeContext) -> None: @@ -50,7 +52,9 @@ def crawl_grants(self, ctx: RuntimeContext) -> None: Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table ACLs enabled and available for retrieval.""" - ctx.grants_crawler.snapshot(force_refresh=True) + grants_log = ctx.historical_grants_log + grants_snapshot = ctx.grants_crawler.snapshot(force_refresh=True) + grants_log.append_inventory_snapshot(grants_snapshot) @job_task def assess_jobs(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 2a86392a18..6a5d1ee97b 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -1,6 +1,7 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.ucx.progress.history import HistoryLog from databricks.sdk import WorkspaceClient from databricks.sdk.service.catalog import CatalogInfo, MetastoreAssignment from databricks.sdk.service.jobs import BaseRun, RunResultState, RunState @@ -17,27 +18,47 @@ @pytest.mark.parametrize( - "task, crawler, crawler_class", + "task, crawler, crawler_class, history_log", [ - (MigrationProgress.crawl_tables, RuntimeContext.tables_crawler, TablesCrawler), - (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, UdfsCrawler), - (MigrationProgress.crawl_grants, RuntimeContext.grants_crawler, GrantsCrawler), - (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler), - (MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, ClustersCrawler), - (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, PipelinesCrawler), - (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, PoliciesCrawler), + ( + MigrationProgress.crawl_tables, + RuntimeContext.tables_crawler, + TablesCrawler, + RuntimeContext.historical_tables_log, + ), + (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, UdfsCrawler, None), + ( + MigrationProgress.crawl_grants, + RuntimeContext.grants_crawler, + GrantsCrawler, + RuntimeContext.historical_grants_log, + ), + (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler, None), + (MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, ClustersCrawler, None), + (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, PipelinesCrawler, None), + (MigrationProgress.crawl_cluster_policies, 
RuntimeContext.policies_crawler, PoliciesCrawler, None),
         (
             MigrationProgress.refresh_table_migration_status,
             RuntimeContext.migration_status_refresher,
             TableMigrationStatusRefresher,
+            None,
         ),
     ],
 )
-def test_migration_progress_runtime_refresh(run_workflow, task, crawler, crawler_class) -> None:
+@pytest.mark.xfail(raises=AttributeError, reason="Work in progress.")
+def test_migration_progress_runtime_refresh(run_workflow, task, crawler, crawler_class, history_log) -> None:
     mock_crawler = create_autospec(crawler_class)
+    mock_history_log = create_autospec(HistoryLog)
     crawler_name = crawler.attrname
-    run_workflow(task, **{crawler_name: mock_crawler})
+    history_log_name = history_log.attrname
+    context_replacements = {
+        crawler_name: mock_crawler,
+        history_log_name: mock_history_log,
+        "named_parameters": {"parent_run_id": 53},
+    }
+    run_workflow(task, **context_replacements)
     mock_crawler.snapshot.assert_called_once_with(force_refresh=True)
+    mock_history_log.append_inventory_snapshot.assert_called_once()


 def test_migration_progress_with_valid_prerequisites(run_workflow) -> None:

From fd2b3acfdafecdc157af55a51db63939478041d8 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 18 Oct 2024 13:59:30 +0200
Subject: [PATCH 34/72] Unit tests for a migration-progress task that wasn't covered yet.

---
 tests/unit/progress/test_workflows.py | 36 +++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py
index 6a5d1ee97b..a1997a04ba 100644
--- a/tests/unit/progress/test_workflows.py
+++ b/tests/unit/progress/test_workflows.py
@@ -1,3 +1,4 @@
+import datetime as dt
 from unittest.mock import create_autospec

 import pytest
@@ -82,3 +83,38 @@ def test_migration_progress_with_invalid_prerequisites(run_workflow) -> None:
     task = MigrationProgress.verify_prerequisites
     with pytest.raises(RuntimeWarning, match="Metastore not attached to workspace."):
         run_workflow(task, workspace_client=ws)
+
+
+def test_migration_progress_record_workflow_run(run_workflow, mock_backend) -> None:
+    """Verify that we log the workflow run."""
+    task = MigrationProgress.record_workflow_run
+    start_time = dt.datetime.now(dt.timezone.utc).replace(microsecond=0)
+    context_replacements = {
+        "sql_backend": mock_backend,
+        "named_parameters": {
+            "workflow": "test",
+            "job_id": "123456",
+            "parent_run_id": "456",
+            "attempt": "0",
+            "start_time": start_time.isoformat(),
+        },
+    }
+
+    run_workflow(task, **context_replacements)
+
+    rows = mock_backend.rows_written_for("ucx.multiworkspace.workflow_runs", "append")
+
+    rows_as_dict = [{k: v for k, v in row.as_dict().items() if k != 'finished_at'} for row in rows]
+    assert rows_as_dict == [
+        {
+            "started_at": start_time,
+            # finished_at: checked below.
+            "workspace_id": 123,
+            "workflow_name": "test",
+            "workflow_id": 123456,
+            "workflow_run_id": 456,
+            "workflow_run_attempt": 0,
+        }
+    ]
+    # The finish time must be the same as or later than the start time.
+    assert all(row["started_at"] <= row["finished_at"] for row in rows)

From 3703848c8302cda142dcf3e6dbac2f3d93b393d0 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Fri, 18 Oct 2024 14:42:36 +0200
Subject: [PATCH 35/72] Support classes whose failures attribute is a string (containing JSON) instead of a list.
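Some record types (for example Udf and JobInfo) persist their failures as a JSON-encoded
string rather than as a list[str]. The encoder now accepts both forms. A minimal
standalone sketch of the decoding rule, keyed here on the runtime value rather than the
declared field type that the real HistoricalEncoder inspects:

    import json

    def decode_failures(raw: str | list[str] | None) -> list[str]:
        """Normalize a record's failures attribute (sketch, not the real code)."""
        if isinstance(raw, list):
            return raw  # already a list of failure messages
        if isinstance(raw, str):
            # JSON-encoded list; an empty string also means "no failures"
            return json.loads(raw) if raw else []
        return []  # record type has no failures attribute

    assert decode_failures('["a", "b"]') == ["a", "b"]
    assert decode_failures("") == []
    assert decode_failures(["x"]) == ["x"]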
--- src/databricks/labs/ucx/progress/history.py | 26 +++++++++++++++------ tests/unit/progress/test_history.py | 25 ++++++++++++++++++-- 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 642e722ee5..0f2504961f 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -44,8 +44,8 @@ class HistoricalEncoder(Generic[Record]): _field_types: dict[str, type] """A map of the fields on instances of the record class (and their types); these will appear in the object data.""" - _has_failures: bool - """Whether this record has a failures attribute that should be included in historical records.""" + _has_failures: type[str | list[str]] | None + """The type of the failures attribute for this record, if present.""" _id_attribute_names: Sequence[str] """The names of the record attributes that are used to produce the identifier for each record. @@ -62,19 +62,25 @@ def __init__(self, job_run_id: int, workspace_id: int, ownership: Ownership[Reco self._id_attribute_names = self._get_id_attribute_names(klass) @classmethod - def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], bool]: + def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], type[str | list[str]] | None]: """Return the dataclass-defined fields that the record type declares, and their associated types. If the record has a "failures" attribute this is treated specially: it is removed but we signal that it was present. + + Arguments: + klass: The record type. + Returns: + A tuple containing: + - A dictionary of fields to include in the object data, and their type. + - The type of the failures field, if present. """ field_types = {field.name: field.type for field in dataclasses.fields(klass)} failures_type = field_types.pop("failures", None) - has_failures = failures_type is not None - if has_failures and failures_type != list[str]: + if failures_type not in (None, str, list[str]): msg = f"Historical record {klass} has invalid 'failures' attribute of type: {failures_type}" raise TypeError(msg) - return field_types, has_failures + return field_types, failures_type def _get_id_attribute_names(self, klazz: type[Record]) -> Sequence[str]: id_attribute_names = tuple(klazz.__id_attributes__) @@ -153,7 +159,13 @@ def _object_data_and_failures(self, record: Record) -> tuple[dict[str, str], lis # We must return a value: strings are mandatory (not optional) as the type. As such, optional fields need to be # omitted from the data map if the value is None. 
data = {k: v for k, v in encoded_fields.items() if v is not None} - failures = record_values["failures"] if self._has_failures else [] + if self._has_failures == list[str]: + failures = record_values["failures"] + elif self._has_failures == str: + encoded_failures = record_values["failures"] + failures = json.loads(encoded_failures) if encoded_failures else [] + else: + failures = [] return data, failures diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 18f9fd26fe..025e27c34d 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -1,4 +1,5 @@ import datetime as dt +import json import re from dataclasses import dataclass, field from typing import ClassVar @@ -398,8 +399,8 @@ def test_historical_encoder_unserializable_values(ownership, field_name, record: @pytest.mark.parametrize("failures", (["failures-1", "failures-2"], [])) -def test_historical_encoder_failures(ownership, failures: list[str]) -> None: - """Verify an encoder places failures on the top-level field instead of within the object data.""" +def test_historical_encoder_failures_list(ownership, failures: list[str]) -> None: + """Verify an encoder places a failures list on the top-level field instead of within the object data.""" encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_TestRecord) historical = encoder.to_historical(_TestRecord(a_field="foo", b_field=10, failures=list(failures))) @@ -408,6 +409,26 @@ def test_historical_encoder_failures(ownership, failures: list[str]) -> None: assert "failures" not in historical.data +@pytest.mark.parametrize("failures", ('["failures-1", "failures-2"]', '[]', '')) +def test_historical_encoder_json_encoded_failures_list(ownership, failures: str) -> None: + """Verify an encoder places a pre-encoded JSON list of failures on the top-level field instead of within the object data.""" + + @dataclass + class _FailuresEncodedJson: + failures: str + an_id: str = "the_id" + + __id_attributes__: ClassVar = ("an_id",) + + encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_FailuresEncodedJson) + + historical = encoder.to_historical(_FailuresEncodedJson(failures=failures)) + + expected_failures = json.loads(failures) if failures else [] + assert historical.failures == expected_failures + assert "failures" not in historical.data + + def test_historical_encoder_failures_verification(ownership) -> None: """Verify that encoders checks the failures field type during initialization.""" From 62390b59a6e2295a807d3bf7ed86d3de8beab1b4 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 14:52:20 +0200 Subject: [PATCH 36/72] Ensure updated UDF snapshots are logged to the history table. 
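Udf now declares __id_attributes__, following the convention already used by Table and
Grant. A hedged sketch of how those attributes are assumed to map onto the history
object_id; the real mapping lives in HistoricalEncoder, and the attribute ordering here
is an assumption borne out by the expected object_id values in the new tests:

    from dataclasses import dataclass
    from typing import ClassVar

    @dataclass
    class Example:  # stand-in for Udf, not the real class
        catalog: str
        database: str
        name: str
        __id_attributes__: ClassVar = ("catalog", "database", "name")

    record = Example("hive_metastore", "foo", "bar")
    object_id = [getattr(record, attr) for attr in record.__id_attributes__]
    assert object_id == ["hive_metastore", "foo", "bar"]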
--- .../labs/ucx/contexts/workflow_task.py | 21 +++- .../labs/ucx/hive_metastore/udfs.py | 5 +- src/databricks/labs/ucx/progress/workflows.py | 4 +- tests/unit/hive_metastore/test_udfs.py | 97 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 2 +- 5 files changed, 122 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 399a4e70c1..32d2b5d419 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -4,6 +4,7 @@ from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import RuntimeBackend, SqlBackend from databricks.labs.ucx.hive_metastore.grants import Grant, GrantOwnership +from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfOwnership from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ @@ -130,6 +131,18 @@ def workflow_run_recorder(self) -> WorkflowRunRecorder: def workspace_id(self) -> int: return self.workspace_client.get_workspace_id() + @cached_property + def historical_grants_log(self) -> HistoryLog[Grant]: + grant_owner = GrantOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + grant_owner, + Grant, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_tables_log(self) -> HistoryLog[Table]: table_owner = TableOwnership(self.administrator_locator) @@ -143,12 +156,12 @@ def historical_tables_log(self) -> HistoryLog[Table]: ) @cached_property - def historical_grants_log(self) -> HistoryLog[Grant]: - grant_owner = GrantOwnership(self.administrator_locator) + def historical_udfs_log(self) -> HistoryLog[Udf]: + udf_owner = UdfOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - grant_owner, - Grant, + udf_owner, + Udf, int(self.named_parameters["parent_run_id"]), self.workspace_id, self.config.ucx_catalog, diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index 74196c543c..e97b08aff0 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -1,7 +1,8 @@ import logging -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass, replace from functools import partial +from typing import ClassVar from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend @@ -29,6 +30,8 @@ class Udf: # pylint: disable=too-many-instance-attributes success: int = 1 failures: str = "" + __id_attributes__: ClassVar[Sequence[str]] = ("catalog", "database", "name") + @property def key(self) -> str: return f"{self.catalog}.{self.database}.{self.name}".lower() diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 30d7a374b4..714b69809a 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -37,7 +37,9 @@ def crawl_udfs(self, ctx: RuntimeContext) -> None: """Iterates over all UDFs in the Hive Metastore of the current workspace and persists their metadata in the table named `$inventory_database.udfs`. 
This inventory is currently used when scanning securable objects for issues with grants that cannot be migrated to Unit Catalog.""" - ctx.udfs_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_udfs_log + udfs_snapshot = ctx.udfs_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(udfs_snapshot) @job_task(job_cluster="tacl") def setup_tacl(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index d1c87d66ad..39ed9c7616 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -1,9 +1,14 @@ +from collections.abc import Sequence from unittest.mock import create_autospec +import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfsCrawler, UdfOwnership +from databricks.labs.ucx.progress.history import HistoryLog def test_key(): @@ -70,3 +75,95 @@ def test_udf_owner() -> None: assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "udf_record,history_record", + ( + ( + Udf( + catalog="hive_metastore", + database="foo", + name="bar", + func_type="UNKNOWN-1", + func_input="UNKNOWN-2", + func_returns="UNKNOWN-3", + deterministic=True, + data_access="UNKNOWN-4", + body="UNKNOWN-5", + comment="UNKNOWN-6", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="Udf", + object_id=["hive_metastore", "foo", "bar"], + data={ + "catalog": "hive_metastore", + "database": "foo", + "name": "bar", + "func_type": "UNKNOWN-1", + "func_input": "UNKNOWN-2", + "func_returns": "UNKNOWN-3", + "deterministic": "true", + "data_access": "UNKNOWN-4", + "body": "UNKNOWN-5", + "comment": "UNKNOWN-6", + "success": "1", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + Udf( + catalog="hive_metastore", + database="foo", + name="bar", + func_type="UNKNOWN-1", + func_input="UNKNOWN-2", + func_returns="UNKNOWN-3", + deterministic=True, + data_access="UNKNOWN-4", + body="UNKNOWN-5", + comment="UNKNOWN-6", + success=0, + failures='["a_failure", "another_failures"]', + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="Udf", + object_id=["hive_metastore", "foo", "bar"], + data={ + "catalog": "hive_metastore", + "database": "foo", + "name": "bar", + "func_type": "UNKNOWN-1", + "func_input": "UNKNOWN-2", + "func_returns": "UNKNOWN-3", + "deterministic": "true", + "data_access": "UNKNOWN-4", + "body": "UNKNOWN-5", + "comment": "UNKNOWN-6", + "success": "0", + }, + failures=["a_failure", "another_failures"], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_udf_supports_history(mock_backend, udf_record, history_record: Sequence[Row]) -> None: + """Verify that Table records are written as expected to the history log.""" + mock_ownership = create_autospec(UdfOwnership) + mock_ownership.owner_of.return_value = "the_admin" + history_log = HistoryLog(mock_backend, mock_ownership, Udf, run_id=1, workspace_id=2, catalog="a_catalog") + + history_log.append_inventory_snapshot([udf_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py 
index a1997a04ba..efb850c225 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -27,7 +27,7 @@ TablesCrawler, RuntimeContext.historical_tables_log, ), - (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, UdfsCrawler, None), + (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, UdfsCrawler, RuntimeContext.historical_udfs_log), ( MigrationProgress.crawl_grants, RuntimeContext.grants_crawler, From c1fea83c44ee38e1887a7280a76d394166e2220d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 14:53:16 +0200 Subject: [PATCH 37/72] Naming consistency. --- src/databricks/labs/ucx/progress/workflows.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 714b69809a..ef097c6c65 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -54,9 +54,9 @@ def crawl_grants(self, ctx: RuntimeContext) -> None: Note: This job runs on a separate cluster (named `tacl`) as it requires the proper configuration to have the Table ACLs enabled and available for retrieval.""" - grants_log = ctx.historical_grants_log + history_log = ctx.historical_grants_log grants_snapshot = ctx.grants_crawler.snapshot(force_refresh=True) - grants_log.append_inventory_snapshot(grants_snapshot) + history_log.append_inventory_snapshot(grants_snapshot) @job_task def assess_jobs(self, ctx: RuntimeContext) -> None: From 8c705418f4a245d05371b2a66e4bb1175f7fec27 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 15:04:45 +0200 Subject: [PATCH 38/72] Fix copypasta. --- tests/unit/hive_metastore/test_udfs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 39ed9c7616..057966ba8c 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -157,7 +157,7 @@ def test_udf_owner() -> None: ), ) def test_udf_supports_history(mock_backend, udf_record, history_record: Sequence[Row]) -> None: - """Verify that Table records are written as expected to the history log.""" + """Verify that Udf records are written as expected to the history log.""" mock_ownership = create_autospec(UdfOwnership) mock_ownership.owner_of.return_value = "the_admin" history_log = HistoryLog(mock_backend, mock_ownership, Udf, run_id=1, workspace_id=2, catalog="a_catalog") From 6f41fc02083e9d69bd5b9b7d3c41c6161922985d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 15:14:26 +0200 Subject: [PATCH 39/72] Ensure updated JobInfo snapshots are appended to the history log. 
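JobInfo stores its failures as a JSON-encoded string, so this relies on the
string-decoding support added earlier in this series. A sketch of the observable
behaviour, using a record taken from the new unit test:

    from databricks.labs.ucx.assessment.jobs import JobInfo

    record = JobInfo(job_id="1234", success=0, failures='["first-failure", "second-failure"]')
    # The history row for this record carries object_id=["1234"] (derived from
    # __id_attributes__) and failures=["first-failure", "second-failure"]; the
    # JSON-encoded "failures" field itself is omitted from the data map.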
--- src/databricks/labs/ucx/assessment/jobs.py | 5 +- .../labs/ucx/contexts/workflow_task.py | 21 +++++- src/databricks/labs/ucx/progress/workflows.py | 4 +- tests/unit/assessment/test_jobs.py | 67 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 2 +- 5 files changed, 93 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 667647d967..4091d277f0 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -1,9 +1,10 @@ import json import logging -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass from datetime import datetime, timedelta, timezone from hashlib import sha256 +from typing import ClassVar from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -40,6 +41,8 @@ class JobInfo: creator: str | None = None """User-name of the creator of the pipeline, if known.""" + __id_attributes__: ClassVar[Sequence[str]] = ("job_id",) + class JobsMixin: @classmethod diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 32d2b5d419..3c83f39040 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -3,24 +3,27 @@ from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import RuntimeBackend, SqlBackend -from databricks.labs.ucx.hive_metastore.grants import Grant, GrantOwnership -from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfOwnership from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler -from databricks.labs.ucx.assessment.jobs import JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.hive_metastore import TablesInMounts +from databricks.labs.ucx.hive_metastore.grants import Grant, GrantOwnership from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler, Table, TableOwnership +from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfOwnership from databricks.labs.ucx.installer.logs import TaskRunWarningRecorder from databricks.labs.ucx.progress.history import HistoryLog from databricks.labs.ucx.progress.workflow_runs import WorkflowRunRecorder +# As with GlobalContext, service factories unavoidably have a lot of public methods. 
+# pylint: disable=too-many-public-methods + class RuntimeContext(GlobalContext): @cached_property @@ -143,6 +146,18 @@ def historical_grants_log(self) -> HistoryLog[Grant]: self.config.ucx_catalog, ) + @cached_property + def historical_jobs_log(self) -> HistoryLog[JobInfo]: + job_owner = JobOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + job_owner, + JobInfo, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_tables_log(self) -> HistoryLog[Table]: table_owner = TableOwnership(self.administrator_locator) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index ef097c6c65..22dd30b9ec 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -69,7 +69,9 @@ def assess_jobs(self, ctx: RuntimeContext) -> None: - Clusters with incompatible Spark config tags - Clusters referencing DBFS locations in one or more config options """ - ctx.jobs_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_jobs_log + jobs_snapshot = ctx.jobs_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(jobs_snapshot) @job_task def assess_clusters(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 8ec3e89077..6ad33ddab6 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -1,9 +1,13 @@ +from collections.abc import Sequence from unittest.mock import create_autospec import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row +from databricks.labs.ucx.progress.history import HistoryLog from databricks.sdk.service.jobs import BaseJob, JobSettings +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.assessment.jobs import JobInfo, JobOwnership, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.framework.owners import AdministratorLocator @@ -154,3 +158,66 @@ def test_pipeline_owner_creator_unknown() -> None: assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "job_info_record,history_record", + ( + ( + JobInfo( + job_id="1234", + success=1, + failures="[]", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="JobInfo", + object_id=["1234"], + data={ + "job_id": "1234", + "success": "1", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + JobInfo( + job_id="1234", + job_name="the_job", + creator="user@domain", + success=0, + failures='["first-failure", "second-failure"]', + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="JobInfo", + object_id=["1234"], + data={ + "job_id": "1234", + "job_name": "the_job", + "creator": "user@domain", + "success": "0", + }, + failures=["first-failure", "second-failure"], + owner="user@domain", + ucx_version=ucx_version, + ), + ), + ), +) +def test_pipeline_supports_history(mock_backend, job_info_record, history_record: Sequence[Row]) -> None: + """Verify that JobInfo records are written as expected to the history log.""" + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "the_admin" + job_ownership = JobOwnership(admin_locator) + history_log = HistoryLog(mock_backend, job_ownership, JobInfo, run_id=1, workspace_id=2, catalog="a_catalog") + + 
history_log.append_inventory_snapshot([job_info_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index efb850c225..0670a154d1 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -34,7 +34,7 @@ GrantsCrawler, RuntimeContext.historical_grants_log, ), - (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler, None), + (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler, RuntimeContext.historical_jobs_log), (MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, ClustersCrawler, None), (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, PipelinesCrawler, None), (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, PoliciesCrawler, None), From 927392a8037609e5f2b08c8b9bb2b0adba886412 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 15:40:01 +0200 Subject: [PATCH 40/72] Fix type hints. --- tests/unit/assessment/test_jobs.py | 12 +++++++++--- tests/unit/hive_metastore/test_grants.py | 5 ++--- tests/unit/hive_metastore/test_tables.py | 5 ++--- tests/unit/hive_metastore/test_udfs.py | 5 ++--- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 6ad33ddab6..79ef7d0dd6 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -1,4 +1,3 @@ -from collections.abc import Sequence from unittest.mock import create_autospec import pytest @@ -209,12 +208,19 @@ def test_pipeline_owner_creator_unknown() -> None: ), ), ) -def test_pipeline_supports_history(mock_backend, job_info_record, history_record: Sequence[Row]) -> None: +def test_pipeline_supports_history(mock_backend, job_info_record: JobInfo, history_record: Row) -> None: """Verify that JobInfo records are written as expected to the history log.""" admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "the_admin" job_ownership = JobOwnership(admin_locator) - history_log = HistoryLog(mock_backend, job_ownership, JobInfo, run_id=1, workspace_id=2, catalog="a_catalog") + history_log = HistoryLog[JobInfo]( + mock_backend, + job_ownership, + JobInfo, + run_id=1, + workspace_id=2, + catalog="a_catalog", + ) history_log.append_inventory_snapshot([job_info_record]) diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index e4208af702..03b8a7a79f 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -1,6 +1,5 @@ import dataclasses import logging -from collections.abc import Sequence from unittest.mock import create_autospec import pytest @@ -843,11 +842,11 @@ def test_grant_owner() -> None: ), ), ) -def test_grant_supports_history(mock_backend, grant_record: Sequence[Grant], history_record: Sequence[Row]) -> None: +def test_grant_supports_history(mock_backend, grant_record: Grant, history_record: Row) -> None: """Verify that Grant records are written to the history log as expected.""" mock_ownership = create_autospec(GrantOwnership) mock_ownership.owner_of.return_value = "the_admin" - history_log = HistoryLog(mock_backend, mock_ownership, Grant, run_id=1, workspace_id=2, catalog="a_catalog") + history_log = HistoryLog[Grant](mock_backend, mock_ownership, Grant, 
run_id=1, workspace_id=2, catalog="a_catalog") history_log.append_inventory_snapshot([grant_record]) diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index 0b45376434..cd39f10155 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -1,6 +1,5 @@ import logging import sys -from collections.abc import Sequence from unittest.mock import create_autospec import pytest @@ -751,11 +750,11 @@ def test_table_owner() -> None: ), ), ) -def test_table_supports_history(mock_backend, table_record: Sequence[Table], history_record: Sequence[Row]) -> None: +def test_table_supports_history(mock_backend, table_record: Table, history_record: Row) -> None: """Verify that Table records are written as expected to the history log.""" mock_ownership = create_autospec(TableOwnership) mock_ownership.owner_of.return_value = "the_admin" - history_log = HistoryLog(mock_backend, mock_ownership, Table, run_id=1, workspace_id=2, catalog="a_catalog") + history_log = HistoryLog[Table](mock_backend, mock_ownership, Table, run_id=1, workspace_id=2, catalog="a_catalog") history_log.append_inventory_snapshot([table_record]) diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 057966ba8c..6e92371d62 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -1,4 +1,3 @@ -from collections.abc import Sequence from unittest.mock import create_autospec import pytest @@ -156,11 +155,11 @@ def test_udf_owner() -> None: ), ), ) -def test_udf_supports_history(mock_backend, udf_record, history_record: Sequence[Row]) -> None: +def test_udf_supports_history(mock_backend, udf_record: Udf, history_record: Row) -> None: """Verify that Udf records are written as expected to the history log.""" mock_ownership = create_autospec(UdfOwnership) mock_ownership.owner_of.return_value = "the_admin" - history_log = HistoryLog(mock_backend, mock_ownership, Udf, run_id=1, workspace_id=2, catalog="a_catalog") + history_log = HistoryLog[Udf](mock_backend, mock_ownership, Udf, run_id=1, workspace_id=2, catalog="a_catalog") history_log.append_inventory_snapshot([udf_record]) From 78f592df983bae85cdf91099bd348b746cca894a Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 15:46:22 +0200 Subject: [PATCH 41/72] Ensure updated ClusterInfo snapshots are stored in the history table. 
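As the new unit tests pin down, a ClusterInfo record with a known creator is owned by
that creator, and ownership otherwise falls back to a workspace administrator. A sketch
of that rule (the exact logic lives in ClusterOwnership; this only restates the
behaviour the tests assert):

    def cluster_owner(record: ClusterInfo, admin_locator: AdministratorLocator) -> str:
        # Prefer the recorded creator; fall back to a workspace administrator.
        return record.creator or admin_locator.get_workspace_administrator()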
--- .../labs/ucx/assessment/clusters.py | 5 +- .../labs/ucx/contexts/workflow_task.py | 14 +++- src/databricks/labs/ucx/progress/workflows.py | 4 +- tests/unit/assessment/test_clusters.py | 73 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 7 +- 5 files changed, 99 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 0e0624d3c2..6ef0d46ea0 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -1,8 +1,9 @@ import base64 import json import logging -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass +from typing import ClassVar from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -46,6 +47,8 @@ class ClusterInfo: creator: str | None = None """User-name of the creator of the cluster, if known.""" + __id_attributes__: ClassVar[Sequence[str]] = ("cluster_id",) + class CheckClusterMixin(CheckInitScriptMixin): _ws: WorkspaceClient diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 3c83f39040..f461db2a09 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -6,7 +6,7 @@ from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler +from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership, ClusterInfo from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler @@ -134,6 +134,18 @@ def workflow_run_recorder(self) -> WorkflowRunRecorder: def workspace_id(self) -> int: return self.workspace_client.get_workspace_id() + @cached_property + def historical_clusters_log(self) -> HistoryLog[ClusterInfo]: + cluster_owner = ClusterOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + cluster_owner, + ClusterInfo, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_grants_log(self) -> HistoryLog[Grant]: grant_owner = GrantOwnership(self.administrator_locator) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 22dd30b9ec..9881a0911b 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -84,7 +84,9 @@ def assess_clusters(self, ctx: RuntimeContext) -> None: - Clusters with incompatible spark config tags - Clusters referencing DBFS locations in one or more config options """ - ctx.clusters_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_clusters_log + clusters_snapshot = ctx.clusters_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(clusters_snapshot) @job_task def assess_pipelines(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index c86c3f60f0..49dd898334 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -3,9 +3,11 @@ import pytest from 
databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row from databricks.sdk.errors import DatabricksError, InternalError, NotFound from databricks.sdk.service.compute import ClusterDetails, Policy +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, @@ -17,6 +19,7 @@ ) from databricks.labs.ucx.framework.crawlers import SqlBackend from databricks.labs.ucx.framework.owners import AdministratorLocator +from databricks.labs.ucx.progress.history import HistoryLog from .. import mock_workspace_client @@ -206,6 +209,76 @@ def test_cluster_owner_creator_unknown() -> None: admin_locator.get_workspace_administrator.assert_called_once() +@pytest.mark.parametrize( + "cluster_info_record,history_record", + ( + ( + ClusterInfo( + cluster_id="1234", + success=1, + failures="[]", + spark_version="3.5.3", + policy_id="4567", + cluster_name="the_cluster", + creator="user@domain", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="ClusterInfo", + object_id=["1234"], + data={ + "cluster_id": "1234", + "success": "1", + "spark_version": "3.5.3", + "policy_id": "4567", + "cluster_name": "the_cluster", + "creator": "user@domain", + }, + failures=[], + owner="user@domain", + ucx_version=ucx_version, + ), + ), + ( + ClusterInfo(cluster_id="1234", success=0, failures='["a-failure", "another-failure"]'), + Row( + workspace_id=2, + job_run_id=1, + object_type="ClusterInfo", + object_id=["1234"], + data={ + "cluster_id": "1234", + "success": "0", + }, + failures=["a-failure", "another-failure"], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_cluster_info_supports_history(mock_backend, cluster_info_record: ClusterInfo, history_record: Row) -> None: + """Verify that ClusterInfo records are written as expected to the history log.""" + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "the_admin" + cluster_ownership = ClusterOwnership(admin_locator) + history_log = HistoryLog[ClusterInfo]( + mock_backend, + cluster_ownership, + ClusterInfo, + run_id=1, + workspace_id=2, + catalog="a_catalog", + ) + + history_log.append_inventory_snapshot([cluster_info_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] + + def test_policy_crawler(): ws = mock_workspace_client( policy_ids=['single-user-with-spn', 'single-user-with-spn-policyid', 'single-user-with-spn-no-sparkversion'], diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 0670a154d1..1e0e8bac6e 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -35,7 +35,12 @@ RuntimeContext.historical_grants_log, ), (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler, RuntimeContext.historical_jobs_log), - (MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, ClustersCrawler, None), + ( + MigrationProgress.assess_clusters, + RuntimeContext.clusters_crawler, + ClustersCrawler, + RuntimeContext.historical_clusters_log, + ), (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, PipelinesCrawler, None), (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, PoliciesCrawler, None), ( From 6dfed4cebf063982c3dbe5bf2a4a57d0c6195579 Mon Sep 17 00:00:00 2001 
From: Andrew Snare Date: Fri, 18 Oct 2024 15:49:39 +0200 Subject: [PATCH 42/72] Fix some test names. --- tests/unit/assessment/test_jobs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 79ef7d0dd6..4c18e141d4 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -65,7 +65,7 @@ def test_jobs_assessment_with_spn_cluster_no_job_tasks(): assert result_set[0].success == 1 -def test_pipeline_crawler_creator(): +def test_job_crawler_creator(): ws = mock_workspace_client() ws.jobs.list.return_value = ( BaseJob(job_id=1, settings=JobSettings(), creator_user_name=None), @@ -138,7 +138,7 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): assert result[0].failures == failures -def test_pipeline_owner_creator() -> None: +def test_job_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) ownership = JobOwnership(admin_locator) @@ -148,7 +148,7 @@ def test_pipeline_owner_creator() -> None: admin_locator.get_workspace_administrator.assert_not_called() -def test_pipeline_owner_creator_unknown() -> None: +def test_job_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" @@ -208,7 +208,7 @@ def test_pipeline_owner_creator_unknown() -> None: ), ), ) -def test_pipeline_supports_history(mock_backend, job_info_record: JobInfo, history_record: Row) -> None: +def test_job_supports_history(mock_backend, job_info_record: JobInfo, history_record: Row) -> None: """Verify that JobInfo records are written as expected to the history log.""" admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "the_admin" From 34b1e7244e6e3a9aa7efda9cea28932bac8f9dbe Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 16:00:16 +0200 Subject: [PATCH 43/72] Ensure updated PipelineInfo snapshots are appended to the history table. 
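The new unit test follows the same shape as the tests for the other record types in
this series; sketched here for reviewers, with the names used in the test itself:

    history_log = HistoryLog[PipelineInfo](
        mock_backend, pipeline_ownership, PipelineInfo,
        run_id=1, workspace_id=2, catalog="a_catalog",
    )
    history_log.append_inventory_snapshot([pipeline_info_record])
    rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append")
    assert rows == [history_record]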
--- .../labs/ucx/assessment/pipelines.py | 5 +- .../labs/ucx/contexts/workflow_task.py | 14 +++- src/databricks/labs/ucx/progress/workflows.py | 4 +- tests/unit/assessment/test_pipelines.py | 70 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 7 +- 5 files changed, 96 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 19bc8c558b..02d28bb438 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -1,7 +1,8 @@ import json import logging -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from dataclasses import dataclass +from typing import ClassVar from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -23,6 +24,8 @@ class PipelineInfo: creator_name: str | None = None """User-name of the creator of the pipeline, if known.""" + __id_attributes__: ClassVar[Sequence[str]] = ("pipeline_id",) + class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index f461db2a09..2f2e338dbf 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership, ClusterInfo from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo, JobsCrawler, SubmitRunsCrawler -from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler +from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler, PipelineInfo, PipelineOwnership from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.hive_metastore import TablesInMounts @@ -170,6 +170,18 @@ def historical_jobs_log(self) -> HistoryLog[JobInfo]: self.config.ucx_catalog, ) + @cached_property + def historical_pipelines_log(self) -> HistoryLog[PipelineInfo]: + pipeline_owner = PipelineOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + pipeline_owner, + PipelineInfo, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_tables_log(self) -> HistoryLog[Table]: table_owner = TableOwnership(self.administrator_locator) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 9881a0911b..da7079c8a7 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -99,7 +99,9 @@ def assess_pipelines(self, ctx: RuntimeContext) -> None: Subsequently, a list of all the pipelines with matching configurations are stored in the `$inventory.pipelines` table.""" - ctx.pipelines_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_pipelines_log + pipelines_snapshot = ctx.pipelines_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(pipelines_snapshot) @job_task def crawl_cluster_policies(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/assessment/test_pipelines.py b/tests/unit/assessment/test_pipelines.py index 949e441f78..0052b9ff29 100644 
--- a/tests/unit/assessment/test_pipelines.py +++ b/tests/unit/assessment/test_pipelines.py @@ -1,11 +1,15 @@ from unittest.mock import create_autospec +import pytest from databricks.labs.lsql.backends import MockBackend +from databricks.labs.lsql.core import Row from databricks.sdk.service.pipelines import GetPipelineResponse, PipelineStateInfo +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler from databricks.labs.ucx.assessment.pipelines import PipelineOwnership, PipelineInfo, PipelinesCrawler from databricks.labs.ucx.framework.owners import AdministratorLocator +from databricks.labs.ucx.progress.history import HistoryLog from .. import mock_workspace_client @@ -82,3 +86,69 @@ def test_pipeline_owner_creator_unknown() -> None: assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "pipeline_info_record,history_record", + ( + ( + PipelineInfo( + pipeline_id="1234", + success=1, + failures="[]", + pipeline_name="a_pipeline", + creator_name="user@domain", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="PipelineInfo", + object_id=["1234"], + data={ + "pipeline_id": "1234", + "success": "1", + "pipeline_name": "a_pipeline", + "creator_name": "user@domain", + }, + failures=[], + owner="user@domain", + ucx_version=ucx_version, + ), + ), + ( + PipelineInfo(pipeline_id="1234", success=0, failures='["a-failure", "b-failure"]'), + Row( + workspace_id=2, + job_run_id=1, + object_type="PipelineInfo", + object_id=["1234"], + data={ + "pipeline_id": "1234", + "success": "0", + }, + failures=["a-failure", "b-failure"], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_pipeline_info_supports_history(mock_backend, pipeline_info_record: PipelineInfo, history_record: Row) -> None: + """Verify that PipelineInfo records are written as expected to the history log.""" + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "the_admin" + pipeline_ownership = PipelineOwnership(admin_locator) + history_log = HistoryLog[PipelineInfo]( + mock_backend, + pipeline_ownership, + PipelineInfo, + run_id=1, + workspace_id=2, + catalog="a_catalog", + ) + + history_log.append_inventory_snapshot([pipeline_info_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 1e0e8bac6e..b967cc4213 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -41,7 +41,12 @@ ClustersCrawler, RuntimeContext.historical_clusters_log, ), - (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, PipelinesCrawler, None), + ( + MigrationProgress.assess_pipelines, + RuntimeContext.pipelines_crawler, + PipelinesCrawler, + RuntimeContext.historical_pipelines_log, + ), (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, PoliciesCrawler, None), ( MigrationProgress.refresh_table_migration_status, From 77a06e1d38544724c1000432209691674e8e636d Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 16:17:38 +0200 Subject: [PATCH 44/72] Ensure updated Cluster Policy snapshots are logged to the history table. 
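The expected rows in these tests encode every value in the history data map as a
string: booleans become "true"/"false", integers are stringified, and None-valued
fields are omitted. A hedged sketch of that convention; how the real encoder
serializes non-string scalars and nested values is an assumption here:

    import json

    def encode_field(value) -> str | None:
        """Encode one record field for the history data map (sketch)."""
        if value is None:
            return None  # the caller omits None fields from the data map
        if isinstance(value, bool):
            return "true" if value else "false"  # matches the expected test rows
        if isinstance(value, str):
            return value
        return json.dumps(value)  # assumption for ints and nested values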
--- .../labs/ucx/assessment/clusters.py | 2 + .../labs/ucx/contexts/workflow_task.py | 21 ++++- src/databricks/labs/ucx/progress/workflows.py | 4 +- tests/unit/assessment/test_clusters.py | 76 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 7 +- 5 files changed, 107 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 6ef0d46ea0..63f1061364 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -206,6 +206,8 @@ class PolicyInfo: creator: str | None = None """User-name of the creator of the cluster policy, if known.""" + __id_attributes__: ClassVar[Sequence[str]] = ("policy_id",) + class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin): def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 2f2e338dbf..4b68642396 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -6,7 +6,14 @@ from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ -from databricks.labs.ucx.assessment.clusters import ClustersCrawler, PoliciesCrawler, ClusterOwnership, ClusterInfo +from databricks.labs.ucx.assessment.clusters import ( + ClustersCrawler, + PoliciesCrawler, + ClusterOwnership, + ClusterInfo, + ClusterPolicyOwnership, + PolicyInfo, +) from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler, PipelineInfo, PipelineOwnership @@ -146,6 +153,18 @@ def historical_clusters_log(self) -> HistoryLog[ClusterInfo]: self.config.ucx_catalog, ) + @cached_property + def historical_cluster_policies_log(self) -> HistoryLog[PolicyInfo]: + cluster_policy_owner = ClusterPolicyOwnership(self.administrator_locator) + return HistoryLog( + self.sql_backend, + cluster_policy_owner, + PolicyInfo, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_grants_log(self) -> HistoryLog[Grant]: grant_owner = GrantOwnership(self.administrator_locator) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index da7079c8a7..3cdd67d47a 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -112,7 +112,9 @@ def crawl_cluster_policies(self, ctx: RuntimeContext) -> None: Subsequently, a list of all the policies with matching configurations are stored in the `$inventory.policies` table.""" - ctx.policies_crawler.snapshot(force_refresh=True) + history_log = ctx.historical_cluster_policies_log + cluster_policies_snapshot = ctx.policies_crawler.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(cluster_policies_snapshot) @job_task(job_cluster="table_migration") def setup_table_migration(self, ctx: RuntimeContext) -> None: diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 49dd898334..13e269eab4 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -365,3 +365,79 @@ def test_cluster_policy_owner_creator_unknown() -> None: assert owner == "an_admin" 
admin_locator.get_workspace_administrator.assert_called_once() + + +@pytest.mark.parametrize( + "policy_info_record,history_record", + ( + ( + PolicyInfo( + policy_id="1234", + policy_name="the_policy", + success=1, + failures="[]", + spark_version="3.5.3", + policy_description="a description of the policy", + creator="user@domain", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="PolicyInfo", + object_id=["1234"], + data={ + "policy_id": "1234", + "policy_name": "the_policy", + "success": "1", + "spark_version": "3.5.3", + "policy_description": "a description of the policy", + "creator": "user@domain", + }, + failures=[], + owner="user@domain", + ucx_version=ucx_version, + ), + ), + ( + PolicyInfo( + policy_id="1234", + policy_name="the_policy", + success=0, + failures='["a_failure", "another_failure"]', + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="PolicyInfo", + object_id=["1234"], + data={ + "policy_id": "1234", + "policy_name": "the_policy", + "success": "0", + }, + failures=["a_failure", "another_failure"], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_cluster_policy_supports_history(mock_backend, policy_info_record: PolicyInfo, history_record: Row) -> None: + """Verify that ClusterInfo records are written as expected to the history log.""" + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "the_admin" + cluster_policy_ownership = ClusterPolicyOwnership(admin_locator) + history_log = HistoryLog[PolicyInfo]( + mock_backend, + cluster_policy_ownership, + PolicyInfo, + run_id=1, + workspace_id=2, + catalog="a_catalog", + ) + + history_log.append_inventory_snapshot([policy_info_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index b967cc4213..73bf86ccd5 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -47,7 +47,12 @@ PipelinesCrawler, RuntimeContext.historical_pipelines_log, ), - (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, PoliciesCrawler, None), + ( + MigrationProgress.crawl_cluster_policies, + RuntimeContext.policies_crawler, + PoliciesCrawler, + RuntimeContext.historical_cluster_policies_log, + ), ( MigrationProgress.refresh_table_migration_status, RuntimeContext.migration_status_refresher, From ac606758b20c7a9888113bba191f41666e91939e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 16:24:02 +0200 Subject: [PATCH 45/72] Formatting. 
--- tests/unit/progress/test_workflows.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 73bf86ccd5..c82d96a452 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -27,14 +27,24 @@ TablesCrawler, RuntimeContext.historical_tables_log, ), - (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, UdfsCrawler, RuntimeContext.historical_udfs_log), + ( + MigrationProgress.crawl_udfs, + RuntimeContext.udfs_crawler, + UdfsCrawler, + RuntimeContext.historical_udfs_log, + ), ( MigrationProgress.crawl_grants, RuntimeContext.grants_crawler, GrantsCrawler, RuntimeContext.historical_grants_log, ), - (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, JobsCrawler, RuntimeContext.historical_jobs_log), + ( + MigrationProgress.assess_jobs, + RuntimeContext.jobs_crawler, + JobsCrawler, + RuntimeContext.historical_jobs_log, + ), ( MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, From f00a06e32ac8d767d46f798e94a6c6c1ba792e00 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 16:30:49 +0200 Subject: [PATCH 46/72] Fix docstring. --- tests/unit/assessment/test_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 13e269eab4..20317ff4c0 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -423,7 +423,7 @@ def test_cluster_policy_owner_creator_unknown() -> None: ), ) def test_cluster_policy_supports_history(mock_backend, policy_info_record: PolicyInfo, history_record: Row) -> None: - """Verify that ClusterInfo records are written as expected to the history log.""" + """Verify that PolicyInfo records are written as expected to the history log.""" admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "the_admin" cluster_policy_ownership = ClusterPolicyOwnership(admin_locator) From 4d6989c3920c893a12b3a5f4ddb1fe45d6ffced2 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Fri, 18 Oct 2024 16:46:23 +0200 Subject: [PATCH 47/72] Ensure that updated TableMigrationStatus snapshots are appended to the history table. 
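
Unlike the inventories handled so far, TableMigrationStatus is identified by a pair of attributes rather than a single one. The history encoder derives each row's object_id from __id_attributes__, so a composite key simply produces a multi-element list. A sketch of the assumed behaviour, matching the expected rows in the tests below:

    # Hypothetical illustration of how the encoder builds the identifier:
    object_id = [getattr(record, attr) for attr in TableMigrationStatus.__id_attributes__]
    # TableMigrationStatus(src_schema="foo", src_table="bar") yields object_id == ["foo", "bar"]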
--- .../labs/ucx/contexts/workflow_task.py | 20 ++++- .../hive_metastore/table_migration_status.py | 5 +- src/databricks/labs/ucx/progress/workflows.py | 4 +- .../unit/hive_metastore/test_table_migrate.py | 78 +++++++++++++++++++ tests/unit/progress/test_workflows.py | 3 +- 5 files changed, 104 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 4b68642396..372dceef78 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -3,6 +3,7 @@ from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import RuntimeBackend, SqlBackend +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationOwnership, TableMigrationStatus from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ @@ -100,6 +101,10 @@ def global_init_scripts_crawler(self) -> GlobalInitScriptCrawler: def tables_crawler(self) -> FasterTableScanCrawler: return FasterTableScanCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + @cached_property + def tables_ownership(self) -> TableOwnership: + return TableOwnership(self.administrator_locator) + @cached_property def tables_in_mounts(self) -> TablesInMounts: return TablesInMounts( @@ -203,16 +208,27 @@ def historical_pipelines_log(self) -> HistoryLog[PipelineInfo]: @cached_property def historical_tables_log(self) -> HistoryLog[Table]: - table_owner = TableOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - table_owner, + self.tables_ownership, Table, int(self.named_parameters["parent_run_id"]), self.workspace_id, self.config.ucx_catalog, ) + @cached_property + def historical_table_migration_log(self) -> HistoryLog[TableMigrationStatus]: + table_migration_owner = TableMigrationOwnership(self.tables_crawler, self.tables_ownership) + return HistoryLog( + self.sql_backend, + table_migration_owner, + TableMigrationStatus, + int(self.named_parameters["parent_run_id"]), + self.workspace_id, + self.config.ucx_catalog, + ) + @cached_property def historical_udfs_log(self) -> HistoryLog[Udf]: udf_owner = UdfOwnership(self.administrator_locator) diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 26e6a5cb93..75b97cc61a 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -1,7 +1,8 @@ import datetime import logging from dataclasses import dataclass, replace -from collections.abc import Iterable, KeysView +from collections.abc import Iterable, KeysView, Sequence +from typing import ClassVar from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient @@ -25,6 +26,8 @@ class TableMigrationStatus: dst_table: str | None = None update_ts: str | None = None + __id_attributes__: ClassVar[Sequence[str]] = ("src_schema", "src_table") + def destination(self): return f"{self.dst_catalog}.{self.dst_schema}.{self.dst_table}".lower() diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 3cdd67d47a..33174bff74 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -134,7 +134,9 @@ def refresh_table_migration_status(self, ctx: RuntimeContext) -> 
None: The results of the scan are stored in the `$inventory.migration_status` inventory table. """ - ctx.migration_status_refresher.snapshot(force_refresh=True) + history_log = ctx.historical_table_migration_log + migration_status_snapshot = ctx.migration_status_refresher.snapshot(force_refresh=True) + history_log.append_inventory_snapshot(migration_status_snapshot) @job_task( depends_on=[ diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 79fc692390..3423f2d31e 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -7,10 +7,12 @@ import pytest from databricks.labs.lsql.backends import MockBackend, SqlBackend +from databricks.labs.lsql.core import Row from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo +from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore.grants import MigrateGrants @@ -37,6 +39,7 @@ What, ) from databricks.labs.ucx.hive_metastore.view_migrate import ViewToMigrate +from databricks.labs.ucx.progress.history import HistoryLog from .. import mock_table_mapping, mock_workspace_client @@ -1539,6 +1542,81 @@ def test_table_migration_status_source_table_unknown() -> None: table_ownership.owner_of.assert_not_called() +@pytest.mark.parametrize( + "table_migration_status_record,history_record", + ( + ( + TableMigrationStatus( + src_schema="foo", + src_table="bar", + dst_catalog="main", + dst_schema="fu", + dst_table="baz", + update_ts="2024-10-18T16:34:00Z", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="TableMigrationStatus", + object_id=["foo", "bar"], + data={ + "src_schema": "foo", + "src_table": "bar", + "dst_catalog": "main", + "dst_schema": "fu", + "dst_table": "baz", + "update_ts": "2024-10-18T16:34:00Z", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ( + TableMigrationStatus( + src_schema="foo", + src_table="bar", + ), + Row( + workspace_id=2, + job_run_id=1, + object_type="TableMigrationStatus", + object_id=["foo", "bar"], + data={ + "src_schema": "foo", + "src_table": "bar", + }, + failures=[], + owner="the_admin", + ucx_version=ucx_version, + ), + ), + ), +) +def test_table_migration_status_supports_history( + mock_backend, + table_migration_status_record: TableMigrationStatus, + history_record: Row, +) -> None: + """Verify that TableMigrationStatus records are written as expected to the history log.""" + table_migration_ownership = create_autospec(TableMigrationOwnership) + table_migration_ownership.owner_of.return_value = "the_admin" + history_log = HistoryLog[TableMigrationStatus]( + mock_backend, + table_migration_ownership, + TableMigrationStatus, + run_id=1, + workspace_id=2, + catalog="a_catalog", + ) + + history_log.append_inventory_snapshot([table_migration_status_record]) + + rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + + assert rows == [history_record] + + class MockBackendWithGeneralException(MockBackend): """Mock backend that allows raising a general exception. 
diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py
index c82d96a452..39c16676e1 100644
--- a/tests/unit/progress/test_workflows.py
+++ b/tests/unit/progress/test_workflows.py
@@ -67,11 +67,10 @@
             MigrationProgress.refresh_table_migration_status,
             RuntimeContext.migration_status_refresher,
             TableMigrationStatusRefresher,
-            None,
+            RuntimeContext.historical_table_migration_log,
         ),
     ],
 )
-@pytest.mark.xfail(raises=AttributeError, reason="Work in progress.")
 def test_migration_progress_runtime_refresh(run_workflow, task, crawler, crawler_class, history_log) -> None:
     mock_crawler = create_autospec(crawler_class)
     mock_history_log = create_autospec(HistoryLog)

From 17100d515f52f6a8630c41b3be20761c25ac9543 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 21 Oct 2024 11:38:48 +0200
Subject: [PATCH 48/72] Update query to return at most a single record.

---
 tests/integration/progress/test_workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/progress/test_workflows.py b/tests/integration/progress/test_workflows.py
index 39ed7c1407..2711a8e9c7 100644
--- a/tests/integration/progress/test_workflows.py
+++ b/tests/integration/progress/test_workflows.py
@@ -32,5 +32,5 @@ def test_running_real_migration_progress_job(installation_ctx: MockInstallationC
     assert installation_ctx.deployed_workflows.validate_step(workflow), f"Workflow failed: {workflow}"

     # Ensure that the migration-progress workflow populated the `workflow_runs` table.
-    query = f"SELECT 1 FROM {installation_ctx.ucx_catalog}.multiworkspace.workflow_runs"
+    query = f"SELECT 1 FROM {installation_ctx.ucx_catalog}.multiworkspace.workflow_runs LIMIT 1"
     assert any(installation_ctx.sql_backend.fetch(query)), f"No workflow run captured: {query}"

From 78a9a13fac16b28c233b92616d7a78bc931c3c2b Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 21 Oct 2024 11:39:10 +0200
Subject: [PATCH 49/72] Update integration test to verify that the history log is written to.

---
 tests/integration/progress/test_workflows.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration/progress/test_workflows.py b/tests/integration/progress/test_workflows.py
index 2711a8e9c7..c9e8ee0afa 100644
--- a/tests/integration/progress/test_workflows.py
+++ b/tests/integration/progress/test_workflows.py
@@ -34,3 +34,7 @@ def test_running_real_migration_progress_job(installation_ctx: MockInstallationC
     # Ensure that the migration-progress workflow populated the `workflow_runs` table.
     query = f"SELECT 1 FROM {installation_ctx.ucx_catalog}.multiworkspace.workflow_runs LIMIT 1"
     assert any(installation_ctx.sql_backend.fetch(query)), f"No workflow run captured: {query}"
+
+    # Ensure that the history log has records written to it.
+    query = f"SELECT 1 FROM {installation_ctx.ucx_catalog}.multiworkspace.historical LIMIT 1"
+    assert any(installation_ctx.sql_backend.fetch(query)), f"No snapshots captured to the history log: {query}"

From 21ad44d5d605a835c9a1149fc7b220a351934c3e Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 21 Oct 2024 12:37:01 +0200
Subject: [PATCH 50/72] Update history log to write to multiworkspace.historical instead of ucx.history.
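
Only the default schema and table names change; the catalog remains caller-supplied. With the new defaults, a history log constructed the way the unit tests do resolves as follows (a sketch; SomeRecord, sql_backend and ownership are placeholders for any inventory dataclass and its collaborators):

    history_log = HistoryLog(sql_backend, ownership, SomeRecord, run_id=1, workspace_id=2, catalog="the_catalog")
    assert history_log.full_name == "the_catalog.multiworkspace.historical"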
--- src/databricks/labs/ucx/progress/history.py | 4 ++-- tests/unit/assessment/test_clusters.py | 4 ++-- tests/unit/assessment/test_jobs.py | 2 +- tests/unit/assessment/test_pipelines.py | 2 +- tests/unit/hive_metastore/test_grants.py | 2 +- tests/unit/hive_metastore/test_table_migrate.py | 2 +- tests/unit/hive_metastore/test_tables.py | 2 +- tests/unit/hive_metastore/test_udfs.py | 2 +- tests/unit/progress/test_history.py | 4 ++-- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 0f2504961f..e64b3f8010 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -191,8 +191,8 @@ def __init__( run_id: int, workspace_id: int, catalog: str, - schema: str = "ucx", - table: str = "history", + schema: str = "multiworkspace", + table: str = "historical", ) -> None: self._sql_backend = sql_backend self._klass = klass diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 20317ff4c0..6a35d649dc 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -274,7 +274,7 @@ def test_cluster_info_supports_history(mock_backend, cluster_info_record: Cluste history_log.append_inventory_snapshot([cluster_info_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] @@ -438,6 +438,6 @@ def test_cluster_policy_supports_history(mock_backend, policy_info_record: Polic history_log.append_inventory_snapshot([policy_info_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 4c18e141d4..9844e8c51d 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -224,6 +224,6 @@ def test_job_supports_history(mock_backend, job_info_record: JobInfo, history_re history_log.append_inventory_snapshot([job_info_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/assessment/test_pipelines.py b/tests/unit/assessment/test_pipelines.py index 0052b9ff29..f5ee7d3a50 100644 --- a/tests/unit/assessment/test_pipelines.py +++ b/tests/unit/assessment/test_pipelines.py @@ -149,6 +149,6 @@ def test_pipeline_info_supports_history(mock_backend, pipeline_info_record: Pipe history_log.append_inventory_snapshot([pipeline_info_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index 03b8a7a79f..5321606504 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -850,6 +850,6 @@ def test_grant_supports_history(mock_backend, grant_record: Grant, history_recor history_log.append_inventory_snapshot([grant_record]) - rows = 
mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 3423f2d31e..f0d7360530 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1612,7 +1612,7 @@ def test_table_migration_status_supports_history( history_log.append_inventory_snapshot([table_migration_status_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/hive_metastore/test_tables.py b/tests/unit/hive_metastore/test_tables.py index cd39f10155..03d6686d48 100644 --- a/tests/unit/hive_metastore/test_tables.py +++ b/tests/unit/hive_metastore/test_tables.py @@ -758,6 +758,6 @@ def test_table_supports_history(mock_backend, table_record: Table, history_recor history_log.append_inventory_snapshot([table_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 6e92371d62..45c96a975f 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -163,6 +163,6 @@ def test_udf_supports_history(mock_backend, udf_record: Udf, history_record: Row history_log.append_inventory_snapshot([udf_record]) - rows = mock_backend.rows_written_for("`a_catalog`.`ucx`.`history`", mode="append") + rows = mock_backend.rows_written_for("`a_catalog`.`multiworkspace`.`historical`", mode="append") assert rows == [history_record] diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index 025e27c34d..d279bf1e87 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -509,5 +509,5 @@ def test_history_log_default_location(mock_backend, ownership) -> None: history_log = HistoryLog(mock_backend, ownership, _TestRecord, run_id=1, workspace_id=2, catalog="the_catalog") history_log.append_inventory_snapshot([record]) - assert history_log.full_name == "the_catalog.ucx.history" - assert mock_backend.has_rows_written_for("`the_catalog`.`ucx`.`history`") + assert history_log.full_name == "the_catalog.multiworkspace.historical" + assert mock_backend.has_rows_written_for("`the_catalog`.`multiworkspace`.`historical`") From febc9ce41704690bdfcca9df44a9727a85c413d3 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 12:40:38 +0200 Subject: [PATCH 51/72] Ensure all tasks run on a UC-enabled cluster. 
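
Appending to the history log requires Unity Catalog access, so the refresh tasks can no longer run on the default or `tacl` job clusters. Every task is now pinned to the `table_migration` cluster and gated on the prerequisites check; the recurring shape of the change is:

    # Before: default job cluster, no ordering constraint.
    @job_task
    def assess_jobs(self, ctx: RuntimeContext) -> None: ...

    # After: UC-enabled cluster, and only after the prerequisites have been verified.
    @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration")
    def assess_jobs(self, ctx: RuntimeContext) -> None: ...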
--- src/databricks/labs/ucx/progress/workflows.py | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 33174bff74..e4f70a107a 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -22,7 +22,15 @@ class MigrationProgress(Workflow): def __init__(self) -> None: super().__init__('migration-progress-experimental') - @job_task + @job_task(job_cluster="table_migration") + def verify_prerequisites(self, ctx: RuntimeContext) -> None: + """Verify the prerequisites for running this job on the table migration cluster are fulfilled. + + We will wait up to 1 hour for the assessment run to finish if it is running or pending. + """ + ctx.verify_progress_tracking.verify(timeout=dt.timedelta(hours=1)) + + @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") def crawl_tables(self, ctx: RuntimeContext) -> None: """Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such as _database name_, _table name_, _table type_, _table location_, etc., in the table named @@ -32,7 +40,7 @@ def crawl_tables(self, ctx: RuntimeContext) -> None: tables_snapshot = ctx.tables_crawler.snapshot(force_refresh=True) history_log.append_inventory_snapshot(tables_snapshot) - @job_task + @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") def crawl_udfs(self, ctx: RuntimeContext) -> None: """Iterates over all UDFs in the Hive Metastore of the current workspace and persists their metadata in the table named `$inventory_database.udfs`. This inventory is currently used when scanning securable objects for @@ -41,11 +49,7 @@ def crawl_udfs(self, ctx: RuntimeContext) -> None: udfs_snapshot = ctx.udfs_crawler.snapshot(force_refresh=True) history_log.append_inventory_snapshot(udfs_snapshot) - @job_task(job_cluster="tacl") - def setup_tacl(self, ctx: RuntimeContext) -> None: - """(Optimization) Starts `tacl` job cluster in parallel to crawling tables.""" - - @job_task(depends_on=[crawl_tables, crawl_udfs], job_cluster="tacl") + @job_task(depends_on=[verify_prerequisites, crawl_tables, crawl_udfs], job_cluster="table_migration") def crawl_grants(self, ctx: RuntimeContext) -> None: """Scans all securable objects for permissions that have been assigned: this include database-level permissions, as well permissions directly configured on objects in the (already gathered) table and UDF inventories. The @@ -58,7 +62,7 @@ def crawl_grants(self, ctx: RuntimeContext) -> None: grants_snapshot = ctx.grants_crawler.snapshot(force_refresh=True) history_log.append_inventory_snapshot(grants_snapshot) - @job_task + @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") def assess_jobs(self, ctx: RuntimeContext) -> None: """Scans through all the jobs and identifies those that are not compatible with UC. The list of all the jobs is stored in the `$inventory.jobs` table. @@ -73,7 +77,7 @@ def assess_jobs(self, ctx: RuntimeContext) -> None: jobs_snapshot = ctx.jobs_crawler.snapshot(force_refresh=True) history_log.append_inventory_snapshot(jobs_snapshot) - @job_task + @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") def assess_clusters(self, ctx: RuntimeContext) -> None: """Scan through all the clusters and identifies those that are not compatible with UC. The list of all the clusters is stored in the`$inventory.clusters` table. 
@@ -88,7 +92,7 @@ def assess_clusters(self, ctx: RuntimeContext) -> None:
         clusters_snapshot = ctx.clusters_crawler.snapshot(force_refresh=True)
         history_log.append_inventory_snapshot(clusters_snapshot)

-    @job_task
+    @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration")
     def assess_pipelines(self, ctx: RuntimeContext) -> None:
         """This module scans through all the Pipelines and identifies those pipelines which has Azure Service Principals
         embedded (who has been given access to the Azure storage accounts via spark configurations) in the pipeline
@@ -103,7 +107,7 @@ def assess_pipelines(self, ctx: RuntimeContext) -> None:
         pipelines_snapshot = ctx.pipelines_crawler.snapshot(force_refresh=True)
         history_log.append_inventory_snapshot(pipelines_snapshot)

-    @job_task
+    @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration")
     def crawl_cluster_policies(self, ctx: RuntimeContext) -> None:
         """This module scans through all the Cluster Policies and get the necessary information

@@ -116,19 +120,7 @@ def crawl_cluster_policies(self, ctx: RuntimeContext) -> None:
         cluster_policies_snapshot = ctx.policies_crawler.snapshot(force_refresh=True)
         history_log.append_inventory_snapshot(cluster_policies_snapshot)

-    @job_task(job_cluster="table_migration")
-    def setup_table_migration(self, ctx: RuntimeContext) -> None:
-        """(Optimization) Starts `table_migration` job cluster in parallel to crawling tables."""
-
-    @job_task(depends_on=[setup_table_migration], job_cluster="table_migration")
-    def verify_prerequisites(self, ctx: RuntimeContext) -> None:
-        """Verify the prerequisites for running this job on the table migration cluster are fulfilled.
-
-        We will wait up to 1 hour for the assessment run to finish if it is running or pending.
-        """
-        ctx.verify_progress_tracking.verify(timeout=dt.timedelta(hours=1))
-
-    @job_task(depends_on=[crawl_tables, verify_prerequisites], job_cluster="table_migration")
+    @job_task(depends_on=[verify_prerequisites, crawl_tables], job_cluster="table_migration")
     def refresh_table_migration_status(self, ctx: RuntimeContext) -> None:
         """Scan the tables (and views) in the inventory and record whether each has been migrated or not.

@@ -140,6 +132,7 @@ def refresh_table_migration_status(self, ctx: RuntimeContext) -> None:
     @job_task(
         depends_on=[
+            verify_prerequisites,
             crawl_grants,
             assess_jobs,
             assess_clusters,

From e223612c1cd523bfd2ea5a1e916f9939bb1604fc Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Mon, 21 Oct 2024 14:15:22 +0200
Subject: [PATCH 52/72] Formatting.
--- tests/unit/progress/test_workflows.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 19c58960e2..586cca14f8 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -21,7 +21,11 @@ (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, RuntimeContext.historical_jobs_log), (MigrationProgress.assess_clusters, RuntimeContext.clusters_crawler, RuntimeContext.historical_clusters_log), (MigrationProgress.assess_pipelines, RuntimeContext.pipelines_crawler, RuntimeContext.historical_pipelines_log), - (MigrationProgress.crawl_cluster_policies, RuntimeContext.policies_crawler, RuntimeContext.historical_cluster_policies_log), + ( + MigrationProgress.crawl_cluster_policies, + RuntimeContext.policies_crawler, + RuntimeContext.historical_cluster_policies_log, + ), ( MigrationProgress.refresh_table_migration_status, RuntimeContext.migration_status_refresher, From c938dd846bb88b280f3a943321ecb4400be1960e Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 14:28:48 +0200 Subject: [PATCH 53/72] Split the crawling and history-log update across 2 tasks for the tables crawler. We don't currently have a cluster available where both can be performed. --- src/databricks/labs/ucx/progress/workflows.py | 19 +++++++++++++-- tests/unit/progress/test_workflows.py | 24 ++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 21a6994e7f..16f208ee6a 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -31,14 +31,29 @@ def verify_prerequisites(self, ctx: RuntimeContext) -> None: """ ctx.verify_progress_tracking.verify(timeout=dt.timedelta(hours=1)) - @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") + @job_task(job_cluster="tacl") + def setup_tacl(self, ctx: RuntimeContext): + """(Optimization) Allow the TACL job cluster to be started while we're verifying the prerequisites for + refreshing everything.""" + + @job_task(depends_on=[verify_prerequisites, setup_tacl], job_cluster="tacl") def crawl_tables(self, ctx: RuntimeContext) -> None: """Iterates over all tables in the Hive Metastore of the current workspace and persists their metadata, such as _database name_, _table name_, _table type_, _table location_, etc., in the table named `$inventory_database.tables`. The metadata stored is then used in the subsequent tasks and workflows to, for example, find all Hive Metastore tables that cannot easily be migrated to Unity Catalog.""" + # The TACL cluster is not UC-enabled, so the snapshot cannot be written immediately to the history log. + # Step 1 of 2: Just refresh the inventory. + ctx.tables_crawler.snapshot(force_refresh=True) + + @job_task(depends_on=[verify_prerequisites, crawl_tables], job_cluster="table_migration") + def update_tables_history_log(self, ctx: RuntimeContext) -> None: + """Update the history log with the latest tables inventory snapshot.""" + # The table migration cluster is not legacy-ACL enabled, so we can't crawl from here. + # Step 2 of 2: Assuming the inventory was refreshed, capture into the history log. + # WARNING: this will fail if the inventory is empty, because it will then try to perform a crawl. 
history_log = ctx.historical_tables_log - tables_snapshot = ctx.tables_crawler.snapshot(force_refresh=True) + tables_snapshot = ctx.tables_crawler.snapshot() history_log.append_inventory_snapshot(tables_snapshot) @job_task(depends_on=[verify_prerequisites], job_cluster="table_migration") diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py index 586cca14f8..28d23911b1 100644 --- a/tests/unit/progress/test_workflows.py +++ b/tests/unit/progress/test_workflows.py @@ -3,6 +3,7 @@ from unittest.mock import create_autospec import pytest +from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.progress.history import HistoryLog from databricks.sdk import WorkspaceClient from databricks.sdk.service.catalog import CatalogInfo, MetastoreAssignment @@ -15,7 +16,6 @@ @pytest.mark.parametrize( "task, crawler, history_log", ( - (MigrationProgress.crawl_tables, RuntimeContext.tables_crawler, RuntimeContext.historical_tables_log), (MigrationProgress.crawl_udfs, RuntimeContext.udfs_crawler, RuntimeContext.historical_udfs_log), (MigrationProgress.crawl_grants, RuntimeContext.grants_crawler, RuntimeContext.historical_grants_log), (MigrationProgress.assess_jobs, RuntimeContext.jobs_crawler, RuntimeContext.historical_jobs_log), @@ -49,6 +49,28 @@ def test_migration_progress_runtime_refresh(run_workflow, task, crawler, history mock_history_log.append_inventory_snapshot.assert_called_once() +def test_migration_progress_runtime_tables_refresh(run_workflow) -> None: + """Ensure that the split crawl and update-history-log tasks perform their part of the refresh process.""" + mock_tables_crawler = create_autospec(TablesCrawler) + mock_history_log = create_autospec(HistoryLog) + context_replacements = { + "tables_crawler": mock_tables_crawler, + "historical_tables_log": mock_history_log, + "named_parameters": {"parent_run_id": 53}, + } + + # The first part of a 2-step update: the crawl without updating the history log. + run_workflow(MigrationProgress.crawl_tables, **context_replacements) + mock_tables_crawler.snapshot.assert_called_once_with(force_refresh=True) + mock_history_log.append_inventory_snapshot.assert_not_called() + + mock_tables_crawler.snapshot.reset_mock() + # The second part of the 2-step update: updating the history log (without a forced crawl). + run_workflow(MigrationProgress.update_tables_history_log, **context_replacements) + mock_tables_crawler.snapshot.assert_called_once_with() + mock_history_log.append_inventory_snapshot.assert_called_once() + + @pytest.mark.parametrize( "task, linter", ( From 8a378dc69196dcac6f02613c14ac8ba98a0e098c Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 14:33:35 +0200 Subject: [PATCH 54/72] Note a limitation of the fast-scan table crawler. In particular, not all runtime contexts support it. --- src/databricks/labs/ucx/contexts/workflow_task.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 2e80fb0688..3b41877ebe 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -99,6 +99,8 @@ def global_init_scripts_crawler(self) -> GlobalInitScriptCrawler: @cached_property def tables_crawler(self) -> TablesCrawler: + # Warning: Not all runtime contexts support the fast-scan implementation; it requires the JVM bridge to Spark + # and that's not always available. 
return FasterTableScanCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) @cached_property From 2e5869d80811ce8a7449b9dc095b7a9423822ee1 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 14:57:58 +0200 Subject: [PATCH 55/72] Factor out the ownership components so they can be used elsewhere. --- .../labs/ucx/contexts/application.py | 22 +++++++- .../labs/ucx/contexts/workflow_task.py | 51 ++++++++++--------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 88640988de..02d74cddbb 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -40,15 +40,17 @@ GrantsCrawler, MigrateGrants, PrincipalACL, + GrantOwnership, ) from databricks.labs.ucx.hive_metastore.mapping import TableMapping -from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationOwnership from databricks.labs.ucx.hive_metastore.table_migrate import ( TableMigrationStatusRefresher, TablesMigrator, ) from databricks.labs.ucx.hive_metastore.table_move import TableMove -from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler +from databricks.labs.ucx.hive_metastore.tables import TableOwnership +from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler, UdfOwnership from databricks.labs.ucx.hive_metastore.verification import VerifyHasCatalog, VerifyHasMetastore from databricks.labs.ucx.installer.workflows import DeployedWorkflows from databricks.labs.ucx.progress.install import VerifyProgressTracking @@ -243,14 +245,26 @@ def group_manager(self) -> GroupManager: def grants_crawler(self) -> GrantsCrawler: return GrantsCrawler(self.tables_crawler, self.udfs_crawler, self.config.include_databases) + @cached_property + def grant_ownership(self) -> GrantOwnership: + return GrantOwnership(self.administrator_locator) + @cached_property def udfs_crawler(self) -> UdfsCrawler: return UdfsCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + @cached_property + def udf_ownership(self) -> UdfOwnership: + return UdfOwnership(self.administrator_locator) + @cached_property def tables_crawler(self) -> TablesCrawler: return TablesCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) + @cached_property + def table_ownership(self) -> TableOwnership: + return TableOwnership(self.administrator_locator) + @cached_property def tables_migrator(self) -> TablesMigrator: return TablesMigrator( @@ -363,6 +377,10 @@ def migration_status_refresher(self) -> TableMigrationStatusRefresher: self.tables_crawler, ) + @cached_property + def table_migration_ownership(self) -> TableMigrationOwnership: + return TableMigrationOwnership(self.tables_crawler, self.table_ownership) + @cached_property def iam_credential_manager(self) -> CredentialManager: return CredentialManager(self.workspace_client) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index 3b41877ebe..81a6ec7497 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -3,7 +3,7 @@ from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import RuntimeBackend, SqlBackend -from 
databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationOwnership, TableMigrationStatus +from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationStatus from databricks.sdk import WorkspaceClient, core from databricks.labs.ucx.__about__ import __version__ @@ -21,10 +21,10 @@ from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.hive_metastore import TablesInMounts, TablesCrawler -from databricks.labs.ucx.hive_metastore.grants import Grant, GrantOwnership +from databricks.labs.ucx.hive_metastore.grants import Grant from databricks.labs.ucx.hive_metastore.table_size import TableSizeCrawler -from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler, Table, TableOwnership -from databricks.labs.ucx.hive_metastore.udfs import Udf, UdfOwnership +from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler, Table +from databricks.labs.ucx.hive_metastore.udfs import Udf from databricks.labs.ucx.installer.logs import TaskRunWarningRecorder from databricks.labs.ucx.progress.history import HistoryLog from databricks.labs.ucx.progress.workflow_runs import WorkflowRunRecorder @@ -68,6 +68,10 @@ def installation(self) -> Installation: def jobs_crawler(self) -> JobsCrawler: return JobsCrawler(self.workspace_client, self.sql_backend, self.inventory_database) + @cached_property + def job_ownership(self) -> JobOwnership: + return JobOwnership(self.administrator_locator) + @cached_property def submit_runs_crawler(self) -> SubmitRunsCrawler: return SubmitRunsCrawler( @@ -81,10 +85,18 @@ def submit_runs_crawler(self) -> SubmitRunsCrawler: def clusters_crawler(self) -> ClustersCrawler: return ClustersCrawler(self.workspace_client, self.sql_backend, self.inventory_database) + @cached_property + def cluster_ownership(self) -> ClusterOwnership: + return ClusterOwnership(self.administrator_locator) + @cached_property def pipelines_crawler(self) -> PipelinesCrawler: return PipelinesCrawler(self.workspace_client, self.sql_backend, self.inventory_database) + @cached_property + def pipeline_ownership(self) -> PipelineOwnership: + return PipelineOwnership(self.administrator_locator) + @cached_property def table_size_crawler(self) -> TableSizeCrawler: return TableSizeCrawler(self.tables_crawler) @@ -93,6 +105,10 @@ def table_size_crawler(self) -> TableSizeCrawler: def policies_crawler(self) -> PoliciesCrawler: return PoliciesCrawler(self.workspace_client, self.sql_backend, self.inventory_database) + @cached_property + def cluster_policy_ownership(self) -> ClusterPolicyOwnership: + return ClusterPolicyOwnership(self.administrator_locator) + @cached_property def global_init_scripts_crawler(self) -> GlobalInitScriptCrawler: return GlobalInitScriptCrawler(self.workspace_client, self.sql_backend, self.inventory_database) @@ -103,10 +119,6 @@ def tables_crawler(self) -> TablesCrawler: # and that's not always available. 
return FasterTableScanCrawler(self.sql_backend, self.inventory_database, self.config.include_databases) - @cached_property - def tables_ownership(self) -> TableOwnership: - return TableOwnership(self.administrator_locator) - @cached_property def tables_in_mounts(self) -> TablesInMounts: return TablesInMounts( @@ -150,10 +162,9 @@ def workspace_id(self) -> int: @cached_property def historical_clusters_log(self) -> HistoryLog[ClusterInfo]: - cluster_owner = ClusterOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - cluster_owner, + self.cluster_ownership, ClusterInfo, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -162,10 +173,9 @@ def historical_clusters_log(self) -> HistoryLog[ClusterInfo]: @cached_property def historical_cluster_policies_log(self) -> HistoryLog[PolicyInfo]: - cluster_policy_owner = ClusterPolicyOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - cluster_policy_owner, + self.cluster_policy_ownership, PolicyInfo, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -174,10 +184,9 @@ def historical_cluster_policies_log(self) -> HistoryLog[PolicyInfo]: @cached_property def historical_grants_log(self) -> HistoryLog[Grant]: - grant_owner = GrantOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - grant_owner, + self.grant_ownership, Grant, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -186,10 +195,9 @@ def historical_grants_log(self) -> HistoryLog[Grant]: @cached_property def historical_jobs_log(self) -> HistoryLog[JobInfo]: - job_owner = JobOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - job_owner, + self.job_ownership, JobInfo, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -198,10 +206,9 @@ def historical_jobs_log(self) -> HistoryLog[JobInfo]: @cached_property def historical_pipelines_log(self) -> HistoryLog[PipelineInfo]: - pipeline_owner = PipelineOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - pipeline_owner, + self.pipeline_ownership, PipelineInfo, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -212,7 +219,7 @@ def historical_pipelines_log(self) -> HistoryLog[PipelineInfo]: def historical_tables_log(self) -> HistoryLog[Table]: return HistoryLog( self.sql_backend, - self.tables_ownership, + self.table_ownership, Table, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -221,10 +228,9 @@ def historical_tables_log(self) -> HistoryLog[Table]: @cached_property def historical_table_migration_log(self) -> HistoryLog[TableMigrationStatus]: - table_migration_owner = TableMigrationOwnership(self.tables_crawler, self.tables_ownership) return HistoryLog( self.sql_backend, - table_migration_owner, + self.table_migration_ownership, TableMigrationStatus, int(self.named_parameters["parent_run_id"]), self.workspace_id, @@ -233,10 +239,9 @@ def historical_table_migration_log(self) -> HistoryLog[TableMigrationStatus]: @cached_property def historical_udfs_log(self) -> HistoryLog[Udf]: - udf_owner = UdfOwnership(self.administrator_locator) return HistoryLog( self.sql_backend, - udf_owner, + self.udf_ownership, Udf, int(self.named_parameters["parent_run_id"]), self.workspace_id, From bf542a4fe0a358bbf3ae6ff0a00a716ba9e44749 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 16:24:47 +0200 Subject: [PATCH 56/72] Mark the __id_attributes__ sequence as immutable. 
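
ClassVar[Sequence[str]] also admits mutable containers such as lists; annotating the identifier attributes as a tuple documents, and lets type checkers enforce, that this class-level constant is immutable. Using PipelineInfo as the example:

    # Before: a list would also satisfy the annotation, inviting accidental mutation.
    __id_attributes__: ClassVar[Sequence[str]] = ("pipeline_id",)

    # After: the type itself guarantees an immutable tuple.
    __id_attributes__: ClassVar[tuple[str, ...]] = ("pipeline_id",)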
--- src/databricks/labs/ucx/assessment/clusters.py | 6 +++--- src/databricks/labs/ucx/assessment/jobs.py | 4 ++-- src/databricks/labs/ucx/assessment/pipelines.py | 4 ++-- src/databricks/labs/ucx/hive_metastore/grants.py | 4 ++-- .../labs/ucx/hive_metastore/table_migration_status.py | 4 ++-- src/databricks/labs/ucx/hive_metastore/tables.py | 4 ++-- src/databricks/labs/ucx/hive_metastore/udfs.py | 4 ++-- src/databricks/labs/ucx/progress/history.py | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 63f1061364..e31284a1bb 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -1,7 +1,7 @@ import base64 import json import logging -from collections.abc import Iterable, Sequence +from collections.abc import Iterable from dataclasses import dataclass from typing import ClassVar @@ -47,7 +47,7 @@ class ClusterInfo: creator: str | None = None """User-name of the creator of the cluster, if known.""" - __id_attributes__: ClassVar[Sequence[str]] = ("cluster_id",) + __id_attributes__: ClassVar[tuple[str, ...]] = ("cluster_id",) class CheckClusterMixin(CheckInitScriptMixin): @@ -206,7 +206,7 @@ class PolicyInfo: creator: str | None = None """User-name of the creator of the cluster policy, if known.""" - __id_attributes__: ClassVar[Sequence[str]] = ("policy_id",) + __id_attributes__: ClassVar[tuple[str, ...]] = ("policy_id",) class PoliciesCrawler(CrawlerBase[PolicyInfo], CheckClusterMixin): diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 4091d277f0..924b33bd9f 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -1,6 +1,6 @@ import json import logging -from collections.abc import Iterable, Sequence +from collections.abc import Iterable from dataclasses import dataclass from datetime import datetime, timedelta, timezone from hashlib import sha256 @@ -41,7 +41,7 @@ class JobInfo: creator: str | None = None """User-name of the creator of the pipeline, if known.""" - __id_attributes__: ClassVar[Sequence[str]] = ("job_id",) + __id_attributes__: ClassVar[tuple[str, ...]] = ("job_id",) class JobsMixin: diff --git a/src/databricks/labs/ucx/assessment/pipelines.py b/src/databricks/labs/ucx/assessment/pipelines.py index 02d28bb438..2398bbdfcd 100644 --- a/src/databricks/labs/ucx/assessment/pipelines.py +++ b/src/databricks/labs/ucx/assessment/pipelines.py @@ -1,6 +1,6 @@ import json import logging -from collections.abc import Iterable, Sequence +from collections.abc import Iterable from dataclasses import dataclass from typing import ClassVar @@ -24,7 +24,7 @@ class PipelineInfo: creator_name: str | None = None """User-name of the creator of the pipeline, if known.""" - __id_attributes__: ClassVar[Sequence[str]] = ("pipeline_id",) + __id_attributes__: ClassVar[tuple[str, ...]] = ("pipeline_id",) class PipelinesCrawler(CrawlerBase[PipelineInfo], CheckClusterMixin): diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index 41635c52b1..de9106ce40 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from collections.abc import Callable, Iterable, Sequence +from collections.abc import Callable, Iterable from dataclasses import dataclass, 
replace from functools import partial, cached_property from typing import ClassVar, Protocol @@ -66,7 +66,7 @@ class Grant: any_file: bool = False anonymous_function: bool = False - __id_attributes__: ClassVar[Sequence[str]] = ("object_type", "object_key", "action_type", "principal") + __id_attributes__: ClassVar[tuple[str, ...]] = ("object_type", "object_key", "action_type", "principal") @staticmethod def type_and_key( diff --git a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py index 75b97cc61a..1a15be28f0 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migration_status.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migration_status.py @@ -1,7 +1,7 @@ import datetime import logging from dataclasses import dataclass, replace -from collections.abc import Iterable, KeysView, Sequence +from collections.abc import Iterable, KeysView from typing import ClassVar from databricks.labs.lsql.backends import SqlBackend @@ -26,7 +26,7 @@ class TableMigrationStatus: dst_table: str | None = None update_ts: str | None = None - __id_attributes__: ClassVar[Sequence[str]] = ("src_schema", "src_table") + __id_attributes__: ClassVar[tuple[str, ...]] = ("src_schema", "src_table") def destination(self): return f"{self.dst_catalog}.{self.dst_schema}.{self.dst_table}".lower() diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index 1a352beb48..8f4adacde5 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -1,7 +1,7 @@ import logging import re import typing -from collections.abc import Collection, Iterable, Iterator, Sequence +from collections.abc import Collection, Iterable, Iterator from dataclasses import dataclass from enum import Enum, auto from functools import cached_property, partial @@ -64,7 +64,7 @@ class Table: # pylint: disable=too-many-public-methods storage_properties: str | None = None is_partitioned: bool = False - __id_attributes__: typing.ClassVar[Sequence[str]] = ("catalog", "database", "name") + __id_attributes__: typing.ClassVar[tuple[str, ...]] = ("catalog", "database", "name") DBFS_ROOT_PREFIXES: typing.ClassVar[list[str]] = [ "/dbfs/", diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index e97b08aff0..a18d344e75 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -1,5 +1,5 @@ import logging -from collections.abc import Iterable, Sequence +from collections.abc import Iterable from dataclasses import dataclass, replace from functools import partial from typing import ClassVar @@ -30,7 +30,7 @@ class Udf: # pylint: disable=too-many-instance-attributes success: int = 1 failures: str = "" - __id_attributes__: ClassVar[Sequence[str]] = ("catalog", "database", "name") + __id_attributes__: ClassVar[tuple[str, ...]] = ("catalog", "database", "name") @property def key(self) -> str: diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index e64b3f8010..f2703e6943 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -18,7 +18,7 @@ class DataclassWithIdAttributes(Protocol): __dataclass_fields__: ClassVar[dict[str, Any]] - __id_attributes__: ClassVar[Sequence[str]] + __id_attributes__: ClassVar[tuple[str, ...]] """The names of attributes (can be 
dataclass fields or ordinary properties) that make up the object identifier. All attributes must be (non-optional) strings. From 3aad8bf2949f1a5d482a7cb6280bd8a2f4d4ed7b Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 16:25:26 +0200 Subject: [PATCH 57/72] Mark linters as still needing to be done. --- src/databricks/labs/ucx/progress/workflows.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py index 16f208ee6a..1da789479f 100644 --- a/src/databricks/labs/ucx/progress/workflows.py +++ b/src/databricks/labs/ucx/progress/workflows.py @@ -150,12 +150,14 @@ def refresh_table_migration_status(self, ctx: RuntimeContext) -> None: def assess_dashboards(self, ctx: RuntimeContext): """Scans all dashboards for migration issues in SQL code of embedded widgets. Also stores direct filesystem accesses for display in the migration dashboard.""" + # TODO: Ensure these are captured in the history log. ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) @job_task def assess_workflows(self, ctx: RuntimeContext): """Scans all jobs for migration issues in notebooks. Also stores direct filesystem accesses for display in the migration dashboard.""" + # TODO: Ensure these are captured in the history log. ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database) @job_task( From 2381d183291868f12af254368d975598c0029445 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Mon, 21 Oct 2024 18:57:31 +0200 Subject: [PATCH 58/72] Handle UDF failures, which aren't JSON-encoded as with other classes. --- src/databricks/labs/ucx/hive_metastore/udfs.py | 4 ++++ src/databricks/labs/ucx/progress/history.py | 7 +++++-- tests/unit/hive_metastore/test_udfs.py | 5 +++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/hive_metastore/udfs.py b/src/databricks/labs/ucx/hive_metastore/udfs.py index a18d344e75..81ed350838 100644 --- a/src/databricks/labs/ucx/hive_metastore/udfs.py +++ b/src/databricks/labs/ucx/hive_metastore/udfs.py @@ -29,6 +29,10 @@ class Udf: # pylint: disable=too-many-instance-attributes comment: str success: int = 1 failures: str = "" + """A string that represents a problem associated with this UDF. + + Warning: unlike other inventory classes, this is not a JSON-encoded array but instead just a simple string. 
+ """ __id_attributes__: ClassVar[tuple[str, ...]] = ("catalog", "database", "name") diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index f2703e6943..70d4fbfcf5 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -162,8 +162,11 @@ def _object_data_and_failures(self, record: Record) -> tuple[dict[str, str], lis if self._has_failures == list[str]: failures = record_values["failures"] elif self._has_failures == str: - encoded_failures = record_values["failures"] - failures = json.loads(encoded_failures) if encoded_failures else [] + raw_failures = record_values["failures"] + try: + failures = json.loads(raw_failures) if raw_failures else [] + except json.decoder.JSONDecodeError: + failures = [raw_failures] else: failures = [] diff --git a/tests/unit/hive_metastore/test_udfs.py b/tests/unit/hive_metastore/test_udfs.py index 45c96a975f..db6cd97e1a 100644 --- a/tests/unit/hive_metastore/test_udfs.py +++ b/tests/unit/hive_metastore/test_udfs.py @@ -128,7 +128,8 @@ def test_udf_owner() -> None: body="UNKNOWN-5", comment="UNKNOWN-6", success=0, - failures='["a_failure", "another_failures"]', + # Note: NOT json-encoded as is the convention elsewhere. + failures="something_is_wrong_with_this_udf", ), Row( workspace_id=2, @@ -148,7 +149,7 @@ def test_udf_owner() -> None: "comment": "UNKNOWN-6", "success": "0", }, - failures=["a_failure", "another_failures"], + failures=["something_is_wrong_with_this_udf"], owner="the_admin", ucx_version=ucx_version, ), From 23e772062bd16b481160d8cf8cfd91d35a1736a9 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 22 Oct 2024 09:43:14 +0200 Subject: [PATCH 59/72] Sort imports. --- src/databricks/labs/ucx/contexts/application.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 02d74cddbb..b21bdff4fe 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -38,9 +38,9 @@ ComputeLocations, Grant, GrantsCrawler, + GrantOwnership, MigrateGrants, PrincipalACL, - GrantOwnership, ) from databricks.labs.ucx.hive_metastore.mapping import TableMapping from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationOwnership From 567d809a8b2f3906eb3a9143e376fb57584bd477 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 22 Oct 2024 10:06:25 +0200 Subject: [PATCH 60/72] Test case (and fix) for when __id_attributes__ is annotated as None. --- src/databricks/labs/ucx/progress/history.py | 11 ++++--- tests/unit/progress/test_history.py | 34 +++++++++++++++------ 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index 70d4fbfcf5..83adaabfbc 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -76,10 +76,13 @@ def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], type[st - The type of the failures field, if present. 
""" field_types = {field.name: field.type for field in dataclasses.fields(klass)} - failures_type = field_types.pop("failures", None) - if failures_type not in (None, str, list[str]): - msg = f"Historical record {klass} has invalid 'failures' attribute of type: {failures_type}" - raise TypeError(msg) + if "failures" not in field_types: + failures_type = None + else: + failures_type = field_types.pop("failures") + if failures_type not in (str, list[str]): + msg = f"Historical record {klass} has invalid 'failures' attribute of type: {failures_type}" + raise TypeError(msg) return field_types, failures_type def _get_id_attribute_names(self, klazz: type[Record]) -> Sequence[str]: diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py index d279bf1e87..0ecbed5ed1 100644 --- a/tests/unit/progress/test_history.py +++ b/tests/unit/progress/test_history.py @@ -10,7 +10,7 @@ from databricks.labs.ucx.__about__ import __version__ as ucx_version from databricks.labs.ucx.framework.owners import Ownership -from databricks.labs.ucx.progress.history import HistoricalEncoder, HistoryLog, Record +from databricks.labs.ucx.progress.history import HistoricalEncoder, HistoryLog, Record, DataclassWithIdAttributes from databricks.labs.ucx.progress.install import Historical @@ -429,19 +429,33 @@ class _FailuresEncodedJson: assert "failures" not in historical.data -def test_historical_encoder_failures_verification(ownership) -> None: - """Verify that encoders checks the failures field type during initialization.""" +@dataclass +class _BrokenFailures1: + a_field: str = "a_field" + failures: list[int] = field(default_factory=list) - @dataclass - class _BrokenFailures: - a_field: str = "a_field" - failures: list[int] = field(default_factory=list) + __id_attributes__: ClassVar = ("a_field",) - __id_attributes__: ClassVar = ("a_field",) - expected_msg = r"^Historical record has invalid 'failures' attribute of type: list\[int\]$" +@dataclass +class _BrokenFailures2: + a_field: str = "a_field" + failures: None = None + + __id_attributes__: ClassVar = ("a_field",) + + +@pytest.mark.parametrize("klass,broken_type", ((_BrokenFailures1, list[int]), (_BrokenFailures2, None))) +def test_historical_encoder_failures_verification( + ownership, + klass: type[DataclassWithIdAttributes], + broken_type: type, +) -> None: + """Verify that encoders checks the failures field type during initialization.""" + + expected_msg = f"^Historical record {re.escape(str(klass))} has invalid 'failures' attribute of type: {re.escape(str(broken_type))}$" with pytest.raises(TypeError, match=expected_msg): - _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_BrokenFailures) + _ = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=klass) def test_history_log_appends_historical_records(mock_backend, ownership) -> None: From f0bd9631d7b99d57dfb2987f89fdf499b3666326 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 22 Oct 2024 10:36:17 +0200 Subject: [PATCH 61/72] Docstring explaining HistoricalEncoder design and intent. 
---
 src/databricks/labs/ucx/progress/history.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 83adaabfbc..50d56d7637 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -29,6 +29,27 @@
 
 
 class HistoricalEncoder(Generic[Record]):
+    """An encoder for dataclasses that will be stored in our history log.
+
+    Our history records are designed with several goals in mind:
+    - Records can be of different types, meaning we have to store heterogeneous types using a homogeneous schema.
+    - We partially shred the records to allow for easier SQL-based querying.
+    - Flexibility from the start, because schema changes in the future will be very difficult.
+    - Records will be shared (and queried) across workspaces with different versions of UCX installed.
+
+    With this in mind:
+    - We have an 'object-type' (discriminator) field for holding the type of the record we're storing. This allows for
+      type-specific logic when querying.
+    - We have a (type-specific) field for storing the business key (identifier) of each record.
+    - We have special handling of the 'failures' attribute that is normally present in the types we are storing.
+    - The object-data is an unconstrained key-value map, allowing arbitrary attributes to be stored and (if top-level)
+      queried directly. Complex values are JSON-encoded.
+    - The ownership mechanism is used to associate each record with a user.
+    - To help with forward and backward compatibility the UCX version used to encode a record is included in each
+      record.
+    - The associated workspace and job run are also included in each record.
+    """
+
     _job_run_id: int
     """The identifier of the current job run, with which these records are associated."""
 

From 5b2ab77eea287aa01e7815e89ce9637026643367 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 10:42:32 +0200
Subject: [PATCH 62/72] Rename some things to align more closely with what
 they are.
---
 src/databricks/labs/ucx/progress/history.py | 22 +++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 50d56d7637..d6774c7b1f 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -62,7 +62,7 @@ class HistoricalEncoder(Generic[Record]):
     _object_type: str
     """The name of the record class being encoded by this instance."""
 
-    _field_types: dict[str, type]
+    _field_names_with_types: dict[str, type]
     """A map of the fields on instances of the record class (and their types); these will appear in the object data."""
 
     _has_failures: type[str | list[str]] | None
@@ -79,11 +79,11 @@
         self._workspace_id = workspace_id
         self._ownership = ownership
         self._object_type = self._get_object_type(klass)
-        self._field_types, self._has_failures = self._get_field_types(klass)
+        self._field_names_with_types, self._has_failures = self._get_field_names_with_types(klass)
         self._id_attribute_names = self._get_id_attribute_names(klass)
 
     @classmethod
-    def _get_field_types(cls, klass: type[Record]) -> tuple[dict[str, type], type[str | list[str]] | None]:
+    def _get_field_names_with_types(cls, klass: type[Record]) -> tuple[dict[str, type], type[str | list[str]] | None]:
         """Return the dataclass-defined fields that the record type declares, and their associated types.
 
         If the record has a "failures" attribute this is treated specially: it is removed but we signal that it was
@@ -96,19 +96,19 @@
         - A dictionary of fields to include in the object data, and their type.
         - The type of the failures field, if present.
""" - field_types = {field.name: field.type for field in dataclasses.fields(klass)} - if "failures" not in field_types: + field_names_with_types = {field.name: field.type for field in dataclasses.fields(klass)} + if "failures" not in field_names_with_types: failures_type = None else: - failures_type = field_types.pop("failures") + failures_type = field_names_with_types.pop("failures") if failures_type not in (str, list[str]): msg = f"Historical record {klass} has invalid 'failures' attribute of type: {failures_type}" raise TypeError(msg) - return field_types, failures_type + return field_names_with_types, failures_type def _get_id_attribute_names(self, klazz: type[Record]) -> Sequence[str]: id_attribute_names = tuple(klazz.__id_attributes__) - all_fields = self._field_types + all_fields = self._field_names_with_types for name in id_attribute_names: id_attribute_type = all_fields.get(name, None) or self._detect_property_type(klazz, name) if id_attribute_type is None: @@ -165,7 +165,7 @@ def _encode_non_serializable(cls, name: str, value: Any) -> Any: def _encode_field_value(self, name: str, value: Any | None) -> str | None: if value is None: return None - value_type = self._field_types[name] + value_type = self._field_names_with_types[name] if value_type in (str, (str | None)): return value encoded_value = json.dumps( @@ -179,7 +179,9 @@ def _encode_field_value(self, name: str, value: Any | None) -> str | None: def _object_data_and_failures(self, record: Record) -> tuple[dict[str, str], list[str]]: record_values = self._as_dict(record) - encoded_fields = {field: self._encode_field_value(field, record_values[field]) for field in self._field_types} + encoded_fields = { + field: self._encode_field_value(field, record_values[field]) for field in self._field_names_with_types + } # We must return a value: strings are mandatory (not optional) as the type. As such, optional fields need to be # omitted from the data map if the value is None. data = {k: v for k, v in encoded_fields.items() if v is not None} From 0ff2e950b331416cdfbeb404e63d0aab9ad40c99 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 22 Oct 2024 11:01:23 +0200 Subject: [PATCH 63/72] Remove redundant type alternative. None is a subtype of Any. --- src/databricks/labs/ucx/progress/history.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py index d6774c7b1f..3253a552c8 100644 --- a/src/databricks/labs/ucx/progress/history.py +++ b/src/databricks/labs/ucx/progress/history.py @@ -162,7 +162,7 @@ def _encode_non_serializable(cls, name: str, value: Any) -> Any: msg = f"Cannot encode {type(value)} value in or within field {name}: {value!r}" raise TypeError(msg) - def _encode_field_value(self, name: str, value: Any | None) -> str | None: + def _encode_field_value(self, name: str, value: Any) -> str | None: if value is None: return None value_type = self._field_names_with_types[name] From 971cb4c7d11339c978e2f06047200eafb04284d5 Mon Sep 17 00:00:00 2001 From: Andrew Snare Date: Tue, 22 Oct 2024 11:10:34 +0200 Subject: [PATCH 64/72] Docstring wording and formatting updates. 
---
 src/databricks/labs/ucx/progress/history.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 3253a552c8..05b67f50f3 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -38,13 +38,13 @@ class HistoricalEncoder(Generic[Record]):
     - Records will be shared (and queried) across workspaces with different versions of UCX installed.
 
     With this in mind:
-    - We have an 'object-type' (discriminator) field for holding the type of the record we're storing. This allows for
+    - We have an `object-type` (discriminator) field for holding the type of the record we're storing. This allows for
       type-specific logic when querying.
     - We have a (type-specific) field for storing the business key (identifier) of each record.
-    - We have special handling of the 'failures' attribute that is normally present in the types we are storing.
-    - The object-data is an unconstrained key-value map, allowing arbitrary attributes to be stored and (if top-level)
+    - We have special handling for the `failures` attribute that is often present on the types we are storing.
+    - The `object-data` is an unconstrained key-value map, allowing arbitrary attributes to be stored and (if top-level)
       queried directly. Complex values are JSON-encoded.
-    - The ownership mechanism is used to associate each record with a user.
+    - The :py:class:`Ownership` mechanism is used to associate each record with a user.
     - To help with forward and backward compatibility the UCX version used to encode a record is included in each
       record.
     - The associated workspace and job run are also included in each record.

From be16738626fd334d9cfc94b8386cdd2f3f6b2269 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 11:22:59 +0200
Subject: [PATCH 65/72] Mention use-case for these records.

---
 src/databricks/labs/ucx/progress/history.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index 05b67f50f3..a2db9b1d00 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -48,6 +48,8 @@
     - To help with forward and backward compatibility the UCX version used to encode a record is included in each
       record.
     - The associated workspace and job run are also included in each record.
+    - These records are currently stored in a (UC-based) table that is shared across workspaces attached to the same
+      metastore. They are used to help report on migration progress.

From 2b9b47668512fd872f833e2daed57004b3558795 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 11:28:10 +0200
Subject: [PATCH 66/72] Clarify reason for assumption.

---
 src/databricks/labs/ucx/progress/workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py
index 1da789479f..19ce44c6b2 100644
--- a/src/databricks/labs/ucx/progress/workflows.py
+++ b/src/databricks/labs/ucx/progress/workflows.py
@@ -50,7 +50,7 @@ def crawl_tables(self, ctx: RuntimeContext) -> None:
     def update_tables_history_log(self, ctx: RuntimeContext) -> None:
         """Update the history log with the latest tables inventory snapshot."""
         # The table migration cluster is not legacy-ACL enabled, so we can't crawl from here.
-        # Step 2 of 2: Assuming the inventory was refreshed, capture into the history log.
+        # Step 2 of 2: Assuming (due to depends-on) the inventory was refreshed, capture into the history log.
         # WARNING: this will fail if the inventory is empty, because it will then try to perform a crawl.
         history_log = ctx.historical_tables_log
         tables_snapshot = ctx.tables_crawler.snapshot()

From f17dc74ffd0a6feedf0fedfe8e1e4f5bdfd3c88b Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 11:36:54 +0200
Subject: [PATCH 67/72] Document reason for non-default JSON separators.

---
 src/databricks/labs/ucx/progress/history.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index a2db9b1d00..f0f5219f1f 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -173,7 +173,7 @@ def _encode_field_value(self, name: str, value: Any) -> str | None:
         encoded_value = json.dumps(
             value,
             allow_nan=False,
-            separators=(",", ":"),
+            separators=(",", ":"),  # More compact than the default, which includes a space after the comma (,) and the colon (:).
             default=lambda o: self._encode_non_serializable(name, o),
         )
         # Handle encoding substituted values that encode as just a string (eg. timestamps); we just return the string.

From 823c88696dfd2b5690da1e25eb910e5acf98cf89 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 12:10:58 +0200
Subject: [PATCH 68/72] Detect and handle non-string values being passed in
 string-hinted fields.

---
 src/databricks/labs/ucx/progress/history.py |  5 ++++-
 tests/unit/progress/test_history.py         | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index f0f5219f1f..b512f0b645 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -169,7 +169,10 @@ def _encode_field_value(self, name: str, value: Any) -> str | None:
         if value is None:
             return None
         value_type = self._field_names_with_types[name]
         if value_type in (str, (str | None)):
-            return value
+            if isinstance(value, str):
+                return value
+            msg = f"Invalid value for field {name}, not a string: {value!r}"
+            raise ValueError(msg)
         encoded_value = json.dumps(
diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py
index 0ecbed5ed1..5300cf8dd5 100644
--- a/tests/unit/progress/test_history.py
+++ b/tests/unit/progress/test_history.py
@@ -338,6 +338,22 @@ class _AClass:
     }
 
 
+def test_historical_encoder_object_data_imposter_string_values(ownership) -> None:
+    """Verify that string fields containing non-string values are handled as an error."""
+
+    @dataclass
+    class _AClass:
+        a_field: str = "value"
+        the_string_field: str | None = None
+
+        __id_attributes__: ClassVar = ("a_field",)
+
+    encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass)
+    record_with_imposter = _AClass(the_string_field=2)  # type: ignore[arg-type]
+    with pytest.raises(ValueError, match=r"^Invalid value for field the_string_field, not a string: 2$"):
+        _ = encoder.to_historical(record_with_imposter)
+
+
 @dataclass(frozen=True, kw_only=True)
 class _InnerClassWithTimestamp:
     b_field: dt.datetime

From fdb03b2e0dcbac75b845fdb7486cc80de7acfe05 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 14:29:55 +0200
Subject: [PATCH 69/72] Handle the remaining lsql-supported field types.
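
As a sketch of the behaviour added here (the enum below is hypothetical; the
encodings follow the helper methods introduced in this commit):

    import datetime as dt
    from enum import Enum

    class Suit(Enum):
        HEARTS = 1

    # Enums encode as their name:     Suit.HEARTS           -> "HEARTS"
    # Dates encode in ISO format:     dt.date(2024, 10, 15) -> "2024-10-15"
    # Sets encode as a JSON array; element order is not guaranteed:
    #                                 {"a", "b"}            -> ["a", "b"] or ["b", "a"]
    # Timestamps (unchanged) must be tz-aware and encode with a 'Z' suffix:
    #   dt.datetime(2024, 10, 15, 12, 44, 16, tzinfo=dt.timezone.utc)
    #                                                       -> "2024-10-15T12:44:16Z"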
---
 src/databricks/labs/ucx/progress/history.py | 44 +++++++++++++++++----
 tests/unit/progress/test_history.py         | 22 ++++++++++-
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/history.py b/src/databricks/labs/ucx/progress/history.py
index b512f0b645..f7c4909556 100644
--- a/src/databricks/labs/ucx/progress/history.py
+++ b/src/databricks/labs/ucx/progress/history.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 import dataclasses
 import datetime as dt
+from enum import Enum, EnumMeta
 import json
 import logging
 from collections.abc import Iterable, Sequence
@@ -26,6 +27,7 @@ class DataclassWithIdAttributes(Protocol):
 
 
 Record = TypeVar("Record", bound=DataclassWithIdAttributes)
+T = TypeVar("T")
 
 
 class HistoricalEncoder(Generic[Record]):
@@ -151,19 +153,45 @@ def _object_id(self, record: Record) -> list[str]:
 
     @classmethod
     def _encode_non_serializable(cls, name: str, value: Any) -> Any:
+        if isinstance(type(value), EnumMeta):
+            return cls._encode_enum(value)
         if isinstance(value, dt.datetime):
-            # Only allow tz-aware timestamps.
-            if value.tzinfo is None:
-                # Name refers to the outermost field, not necessarily a field on a (nested) dataclass.
-                msg = f"Timestamp without timezone not supported in or within field {name}: {value}"
-                raise ValueError(msg)
-            # Always store with 'Z'.
-            ts_utc = value.astimezone(dt.timezone.utc)
-            return ts_utc.isoformat().replace("+00:00", "Z")
+            return cls._encode_datetime(name, value)
+        if isinstance(value, dt.date):
+            return cls._encode_date(value)
+        if isinstance(value, set):
+            return cls._encode_set(value)
         msg = f"Cannot encode {type(value)} value in or within field {name}: {value!r}"
         raise TypeError(msg)
 
+    @staticmethod
+    def _encode_enum(value: Enum) -> str:
+        """Enums are encoded as a string containing their name."""
+        return value.name
+
+    @staticmethod
+    def _encode_datetime(name: str, value: dt.datetime) -> str:
+        """Timestamps are encoded in ISO format, with a 'Z' timezone specifier. Naive timestamps aren't allowed."""
+        # Only allow tz-aware timestamps.
+        if value.tzinfo is None:
+            # Name refers to the outermost field, not necessarily a field on a (nested) dataclass.
+            msg = f"Timestamp without timezone not supported in or within field {name}: {value}"
+            raise ValueError(msg)
+        # Always store with 'Z'.
+        ts_utc = value.astimezone(dt.timezone.utc)
+        return ts_utc.isoformat().replace("+00:00", "Z")
+
+    @staticmethod
+    def _encode_date(value: dt.date) -> str:
+        """Dates are encoded in ISO format."""
+        return value.isoformat()
+
+    @staticmethod
+    def _encode_set(value: set[T]) -> list[T]:
+        """Sets are encoded as an array of the elements."""
+        return list(value)
+
     def _encode_field_value(self, name: str, value: Any) -> str | None:
         if value is None:
             return None
diff --git a/tests/unit/progress/test_history.py b/tests/unit/progress/test_history.py
index 5300cf8dd5..afa2e406d6 100644
--- a/tests/unit/progress/test_history.py
+++ b/tests/unit/progress/test_history.py
@@ -2,6 +2,7 @@
 import json
 import re
 from dataclasses import dataclass, field
+from enum import Enum
 from typing import ClassVar
 from unittest.mock import create_autospec
 
@@ -310,32 +311,51 @@ class _InnerClass:
         a_field: str = "bar"
         optional: str | None = None
 
-    # TODO: Expand to cover all lsql-supported types.
+    class _Suit(Enum):
+        HEARTS = 1
+        DIAMONDS = 2
+        CLUBS = 3
+        SPADES = 4
+
     @dataclass
     class _AClass:
         str_field: str = "foo"
         int_field: int = 23
         bool_field: bool = True
+        float_field: float = 2.3
+        enum_field: _Suit = _Suit.HEARTS
+        date_field: dt.date = field(default_factory=lambda: dt.date(year=2024, month=10, day=15))
         ts_field: dt.datetime = field(
             default_factory=lambda: dt.datetime(
                 year=2024, month=10, day=15, hour=12, minute=44, second=16, tzinfo=dt.timezone.utc
             )
         )
         array_field: list[str] = field(default_factory=lambda: ["foo", "bar", "baz"])
+        set_field: set[str] = field(default_factory=lambda: {"fu", "baa", "boz"})
+        dict_field: dict[int, str] = field(default_factory=lambda: {1000: "M", 100: "C"})
         nested_dataclass: list[_InnerClass] = field(default_factory=lambda: [_InnerClass(x) for x in range(2)])
 
         __id_attributes__: ClassVar = ("str_field",)
 
     encoder = HistoricalEncoder(job_run_id=1, workspace_id=2, ownership=ownership, klass=_AClass)
     historical = encoder.to_historical(_AClass())
 
+    # Python set iteration doesn't preserve order, so we need to check set_field separately.
+    set_field = historical.data.pop("set_field")
     assert historical.data == {
         "str_field": "foo",
         "int_field": "23",
         "bool_field": "true",
+        "float_field": "2.3",
+        "enum_field": "HEARTS",
+        "date_field": "2024-10-15",
         "ts_field": "2024-10-15T12:44:16Z",
         "array_field": '["foo","bar","baz"]',
+        "dict_field": '{"1000":"M","100":"C"}',
         "nested_dataclass": '[{"counter":0,"boolean":true,"a_field":"bar","optional":null},{"counter":1,"boolean":true,"a_field":"bar","optional":null}]',
     }
+    decoded_set_field = json.loads(set_field)
+    assert isinstance(decoded_set_field, list) and len(decoded_set_field) == 3
+    assert set(decoded_set_field) == {"fu", "baa", "boz"}

From b7af858aa0b17f7cb89c5fc05f1f019667a74440 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Tue, 22 Oct 2024 14:40:07 +0200
Subject: [PATCH 70/72] All tasks in the workflow are supposed to depend on the
 assessment having already completed.

---
 src/databricks/labs/ucx/progress/workflows.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py
index 19ce44c6b2..012676351b 100644
--- a/src/databricks/labs/ucx/progress/workflows.py
+++ b/src/databricks/labs/ucx/progress/workflows.py
@@ -146,14 +146,14 @@ def refresh_table_migration_status(self, ctx: RuntimeContext) -> None:
         migration_status_snapshot = ctx.migration_status_refresher.snapshot(force_refresh=True)
         history_log.append_inventory_snapshot(migration_status_snapshot)
 
-    @job_task
+    @job_task(depends_on=[verify_prerequisites])
     def assess_dashboards(self, ctx: RuntimeContext):
         """Scans all dashboards for migration issues in SQL code of embedded widgets.
         Also stores direct filesystem accesses for display in the migration dashboard."""
         # TODO: Ensure these are captured in the history log.
         ctx.query_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
 
-    @job_task
+    @job_task(depends_on=[verify_prerequisites])
     def assess_workflows(self, ctx: RuntimeContext):
         """Scans all jobs for migration issues in notebooks.
         Also stores direct filesystem accesses for display in the migration dashboard."""
         # TODO: Ensure these are captured in the history log.
         ctx.workflow_linter.refresh_report(ctx.sql_backend, ctx.inventory_database)
 
     @job_task(

From 9b489b96f092a730aedd3729ebc67cef82f82278 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 23 Oct 2024 10:53:09 +0200
Subject: [PATCH 71/72] Explicitly declare the dependency of the
 record_workflow_run on the tables and UDF refresh.

Previously it was implicit: there was a transitive dependency via the grants
refresh.

---
 src/databricks/labs/ucx/progress/workflows.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/databricks/labs/ucx/progress/workflows.py b/src/databricks/labs/ucx/progress/workflows.py
index 012676351b..e256af2fa3 100644
--- a/src/databricks/labs/ucx/progress/workflows.py
+++ b/src/databricks/labs/ucx/progress/workflows.py
@@ -164,11 +164,13 @@ def assess_workflows(self, ctx: RuntimeContext):
         depends_on=[
             verify_prerequisites,
             crawl_grants,
+            crawl_udfs,
             assess_jobs,
             assess_clusters,
             assess_pipelines,
             crawl_cluster_policies,
             refresh_table_migration_status,
+            update_tables_history_log,
         ],
         job_cluster="table_migration",
     )

From 0a2f4f2c8480a843ae8c31b86893b8e03805c2f5 Mon Sep 17 00:00:00 2001
From: Andrew Snare
Date: Wed, 23 Oct 2024 12:29:28 +0200
Subject: [PATCH 72/72] Use correct method for converting rows to dictionaries.

---
 tests/unit/progress/test_workflows.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/progress/test_workflows.py b/tests/unit/progress/test_workflows.py
index 28d23911b1..555eb49a04 100644
--- a/tests/unit/progress/test_workflows.py
+++ b/tests/unit/progress/test_workflows.py
@@ -128,7 +128,7 @@ def test_migration_progress_record_workflow_run(run_workflow, mock_backend) -> N
 
     rows = mock_backend.rows_written_for("ucx.multiworkspace.workflow_runs", "append")
 
-    rows_as_dict = [{k: v for k, v in rows.as_dict().items() if k != 'finished_at'} for rows in rows]
+    rows_as_dict = [{k: v for k, v in row.asDict().items() if k != 'finished_at'} for row in rows]
     assert rows_as_dict == [
         {
            "started_at": start_time,
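
For context on the final fix above: the rows returned by lsql's
MockBackend.rows_written_for() are pyspark-style Row objects, which expose a
camelCase asDict() method; there is no snake_case as_dict(). A minimal sketch,
assuming lsql's Row factory accepts keyword arguments as PySpark's does (the
column names here are illustrative):

    from databricks.labs.lsql.core import Row

    # Build a row and convert it to a plain dictionary for comparison.
    row = Row(started_at="2024-10-23T12:00:00Z", finished_at=None)
    assert row.asDict() == {"started_at": "2024-10-23T12:00:00Z", "finished_at": None}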