
Commit 15dd48d

Introduce an optional history log, where crawler snapshots are journalled.
Work in progress.
1 parent b16098c commit 15dd48d

6 files changed: +184 -12 lines changed

src/databricks/labs/ucx/framework/crawlers.py

Lines changed: 22 additions & 3 deletions
@@ -1,3 +1,5 @@
+from __future__ import annotations
+import datetime as dt
 import logging
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Iterable, Sequence
@@ -6,6 +8,7 @@
 from databricks.labs.lsql.backends import SqlBackend
 from databricks.sdk.errors import NotFound

+from databricks.labs.ucx.framework.history import HistoryLog
 from databricks.labs.ucx.framework.utils import escape_sql_identifier

 logger = logging.getLogger(__name__)
@@ -21,13 +24,22 @@ class DataclassInstance(Protocol):


 class CrawlerBase(ABC, Generic[Result]):
-    def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, klass: type[Result]):
+    def __init__(
+        self,
+        backend: SqlBackend,
+        catalog: str,
+        schema: str,
+        table: str,
+        klass: type[Result],
+        history_log: HistoryLog | None = None,
+    ):
         """
         Initializes a CrawlerBase instance.

         Args:
             backend (SqlBackend): The backend that executes SQL queries:
                 Statement Execution API or Databricks Runtime.
+            history_log: The (optional) history log where (new) snapshots should be saved.
             catalog (str): The catalog name for the inventory persistence.
             schema: The schema name for the inventory persistence.
             table: The table name for the inventory persistence.
@@ -36,6 +48,7 @@ def __init__(self, backend: SqlBackend, catalog: str, schema: str, table: str, k
         self._schema = self._valid(schema)
         self._table = self._valid(table)
         self._backend = backend
+        self._history_log = history_log
         self._fetch = backend.fetch
         self._exec = backend.execute
         self._klass = klass
@@ -155,10 +168,16 @@ def _snapshot(self, fetcher: ResultFn, loader: ResultFn, *, force_refresh: bool)
         except NotFound as e:
             logger.debug("Inventory table not found", exc_info=e)
         logger.debug(f"[{self.full_name}] crawling new set of snapshot data for {self._table}")
+        crawl_start_time = dt.datetime.now(tz=dt.timezone.utc)
         loaded_records = list(loader())
-        self._update_snapshot(loaded_records, mode="overwrite")
+        self._update_snapshot(loaded_records, crawl_start_time=crawl_start_time, mode="overwrite")
         return loaded_records

-    def _update_snapshot(self, items: Sequence[Result], mode: Literal["append", "overwrite"] = "append") -> None:
+    def _update_snapshot(
+        self, items: Sequence[Result], *, crawl_start_time: dt.datetime, mode: Literal["append", "overwrite"]
+    ) -> None:
         logger.debug(f"[{self.full_name}] found {len(items)} new records for {self._table}")
         self._backend.save_table(self.full_name, items, self._klass, mode=mode)
+        if self._history_log:
+            appender = self._history_log.appender(self._klass)
+            appender.append_snapshot(items, run_start_time=crawl_start_time)
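
Taken together, these changes make journalling strictly opt-in: a crawler only writes history records when it is constructed with a history log. Below is a minimal sketch (not part of this commit) of how a concrete crawler could opt in; ThingCrawler, Thing, and the "things" table name are hypothetical, and the abstract crawl/fetch hooks of CrawlerBase are left out because this commit does not touch them.

# Sketch only: a hypothetical CrawlerBase subclass forwarding the new optional history_log.
from dataclasses import dataclass

from databricks.labs.lsql.backends import SqlBackend

from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.ucx.framework.history import HistoryLog


@dataclass(frozen=True)
class Thing:
    name: str


class ThingCrawler(CrawlerBase[Thing]):
    def __init__(self, backend: SqlBackend, schema: str, history_log: HistoryLog | None = None):
        # Forwarding the (optional) history log is all that is needed: _update_snapshot()
        # then journals every refresh via history_log.appender(Thing).append_snapshot(...).
        super().__init__(backend, "hive_metastore", schema, "things", Thing, history_log=history_log)

    # The abstract crawl/fetch hooks of CrawlerBase are omitted here; they are unchanged by this commit.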

src/databricks/labs/ucx/framework/history.py

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
+from __future__ import annotations
+import dataclasses
+import datetime as dt
+import json
+import logging
+import uuid
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from functools import cached_property
+from typing import ClassVar, Protocol, TypeVar
+
+from databricks.labs.lsql.backends import SqlBackend
+from databricks.sdk import WorkspaceClient
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True, kw_only=True)
+class HistoricalRecord:
+    workspace_id: int
+    """The identifier of the workspace where this record was generated."""
+
+    run_id: str
+    """An identifier of the workflow run that generated this record."""
+
+    snapshot_id: str
+    """An identifier that is unique to the records produced for a given snapshot."""
+
+    run_start_time: dt.datetime
+    """When this record was generated."""
+
+    object_type: str
+    """The inventory table for which this record was generated."""
+
+    object_type_version: int
+    """Versioning of the inventory table, for forward compatibility."""
+
+    object_id: str
+    """The type-specific identifier for this inventory record."""
+
+    object_data: str
+    """Type-specific JSON-encoded data of the inventory record."""
+
+    failures: list[str]
+    """The list of problems associated with the object that this inventory record covers."""
+
+    owner: str
+    """The identity of the account that created this inventory record."""
+
+
+class DataclassInstance(Protocol):
+    __dataclass_fields__: ClassVar[dict]
+
+
+Record = TypeVar("Record", bound=DataclassInstance)
+
+
+class HistoryLog:
+    __slots__ = ("_ws", "_backend", "_run_id", "_catalog", "_schema", "_table")
+
+    def __init__(
+        self,
+        ws: WorkspaceClient,
+        backend: SqlBackend,
+        run_id: str,
+        catalog: str,
+        schema: str,
+        table: str,
+    ) -> None:
+        self._ws = ws
+        self._backend = backend
+        self._run_id = run_id
+        self._catalog = catalog
+        self._schema = schema
+        self._table = table
+
+    @property
+    def full_name(self) -> str:
+        return f"{self._catalog}.{self._schema}.{self._table}"
+
+    def _append_history_snapshot(self, object_type: str, snapshot: list[HistoricalRecord]) -> None:
+        logger.debug(f"[{self.full_name}] appending {len(snapshot)} new records for {object_type}")
+        # Concurrent writes do not need to be handled here; appends cannot conflict.
+        # TODO: Although documented as conflict-free, verify that this truly is the case.
+        self._backend.save_table(self.full_name, snapshot, HistoricalRecord, mode="append")
+
+    class Appender:
+        # No __slots__ here: functools.cached_property (used below) needs an instance __dict__ to cache into.
+
+        def __init__(
+            self,
+            ws: WorkspaceClient,
+            run_id: str,
+            klass: type[Record],
+            key_from: Callable[[Record], str],
+            persist: Callable[[str, list[HistoricalRecord]], None],
+        ) -> None:
+            self._ws = ws
+            self._run_id = run_id
+            self._object_type = klass.__name__
+            # Versioning support: if the dataclass has a _ucx_version class attribute, that is the current version.
+            self._object_type_version = getattr(klass, "_ucx_version") if hasattr(klass, "_ucx_version") else 0
+            self._key_from = key_from
+            self._persist = persist
+
+        @cached_property
+        def _workspace_id(self) -> int:
+            return self._ws.get_workspace_id()
+
+        @cached_property
+        def _owner(self) -> str:
+            current_user = self._ws.current_user.me()
+            owner = current_user.user_name or current_user.id
+            assert owner
+            return owner
+
+        def append_snapshot(self, records: Sequence[Record], *, run_start_time: dt.datetime) -> None:
+            snapshot_id = uuid.uuid4()
+            historical_records = [
+                self._inventory_record_to_historical(record, snapshot_id=snapshot_id, run_start_time=run_start_time)
+                for record in records
+            ]
+            self._persist(self._object_type, historical_records)
+
+        def _inventory_record_to_historical(
+            self, record: Record, *, snapshot_id: uuid.UUID, run_start_time: dt.datetime
+        ) -> HistoricalRecord:
+            object_id = self._key_from(record)
+            object_as_dict = dataclasses.asdict(record)
+            object_as_json = json.dumps(object_as_dict)
+            # TODO: Get failures.
+            failures: list[str] = []
+            return HistoricalRecord(
+                workspace_id=self._workspace_id,
+                run_id=self._run_id,
+                snapshot_id=str(snapshot_id),
+                run_start_time=run_start_time,
+                object_type=self._object_type,
+                object_type_version=self._object_type_version,
+                object_id=object_id,
+                object_data=object_as_json,
+                failures=failures,
+                owner=self._owner,
+            )
+
+    def appender(self, klass: type[Record]) -> Appender:
+        # TODO: Make this part of the protocol so the type-checker can enforce it.
+        key_from = getattr(klass, "key_fields")
+        return self.Appender(self._ws, self._run_id, klass, key_from, self._append_history_snapshot)
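
As a rough end-to-end sketch of the API above (not part of the commit): the record class, run id, table coordinates, and user values below are hypothetical stand-ins, the workspace client is mocked, and MockBackend stands in for a real SqlBackend. The record class supplies the key_fields callable that appender() looks up, and may carry an optional _ucx_version.

# Sketch only: journalling one snapshot directly through HistoryLog.appender().
import datetime as dt
from dataclasses import dataclass
from unittest.mock import create_autospec

from databricks.labs.lsql.backends import MockBackend
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.framework.history import HistoryLog


@dataclass(frozen=True)
class Thing:
    name: str

    _ucx_version = 1  # optional class attribute; the appender falls back to version 0 when absent

    @staticmethod
    def key_fields(record: "Thing") -> str:
        # Looked up via getattr(klass, "key_fields") and called per record to derive object_id.
        return record.name


ws = create_autospec(WorkspaceClient)  # mocked workspace client (hypothetical values below)
ws.get_workspace_id.return_value = 123
ws.current_user.me.return_value.user_name = "user@example.com"

backend = MockBackend()
history_log = HistoryLog(ws, backend, run_id="run-1", catalog="hive_metastore", schema="ucx", table="history")

appender = history_log.appender(Thing)
appender.append_snapshot(
    [Thing(name="a"), Thing(name="b")],
    run_start_time=dt.datetime.now(tz=dt.timezone.utc),
)

# Each record becomes one HistoricalRecord row appended to hive_metastore.ucx.history,
# with object_type="Thing", object_id derived by key_fields, and the record itself as JSON in object_data.
rows = backend.rows_written_for(history_log.full_name, "append")
assert len(rows) == 2

This is the call sequence that CrawlerBase._update_snapshot() performs whenever a history log was supplied.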

src/databricks/labs/ucx/source_code/directfs_access.py

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import dataclasses
+import datetime as dt
 import logging
 import sys
 from collections.abc import Sequence, Iterable
@@ -97,14 +98,14 @@ def __init__(self, backend: SqlBackend, schema: str, table: str):
         """
         super().__init__(backend=backend, catalog="hive_metastore", schema=schema, table=table, klass=DirectFsAccess)

-    def dump_all(self, dfsas: Sequence[DirectFsAccess]):
+    def dump_all(self, dfsas: Sequence[DirectFsAccess], crawl_start_time: dt.datetime):
        """This crawler doesn't follow the pull model because the fetcher fetches data for 2 crawlers, not just one
        It's not **bad** because all records are pushed at once.
        Providing a multi-entity crawler is out-of-scope of this PR
        """
         try:
             # TODO until we historize data, we append all DFSAs
-            self._update_snapshot(dfsas, mode="append")
+            self._update_snapshot(dfsas, crawl_start_time=crawl_start_time, mode="append")
         except DatabricksError as e:
             logger.error("Failed to store DFSAs", exc_info=e)

src/databricks/labs/ucx/source_code/jobs.py

Lines changed: 2 additions & 1 deletion
@@ -352,6 +352,7 @@ def __init__(
         self._include_job_ids = include_job_ids

     def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
+        crawl_start_time = datetime.now(tz=timezone.utc)
         tasks = []
         all_jobs = list(self._ws.jobs.list())
         logger.info(f"Preparing {len(all_jobs)} linting tasks...")
@@ -374,7 +375,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
             JobProblem,
             mode='overwrite',
         )
-        self._directfs_crawler.dump_all(job_dfsas)
+        self._directfs_crawler.dump_all(job_dfsas, crawl_start_time=crawl_start_time)
         if len(errors) > 0:
             raise ManyError(errors)

src/databricks/labs/ucx/source_code/queries.py

Lines changed: 1 addition & 1 deletion
@@ -84,7 +84,7 @@ def refresh_report(self, sql_backend: SqlBackend, inventory_database: str):
             )
             for dfsa in all_dfsas
         ]
-        self._directfs_crawler.dump_all(all_dfsas)
+        self._directfs_crawler.dump_all(all_dfsas, crawl_start_time=assessment_start)

     def _dashboard_ids_in_scope(self) -> list[str]:
         if self._include_dashboard_ids is not None:  # an empty list is accepted

tests/unit/source_code/test_directfs_access.py

Lines changed: 6 additions & 5 deletions
@@ -1,4 +1,4 @@
-from datetime import datetime
+import datetime as dt

 from databricks.labs.lsql.backends import MockBackend

@@ -12,19 +12,20 @@
 def test_crawler_appends_dfsas():
     backend = MockBackend()
     crawler = DirectFsAccessCrawler.for_paths(backend, "schema")
+    now = dt.datetime.now(tz=dt.timezone.utc)
     dfsas = list(
         DirectFsAccess(
             path=path,
             is_read=False,
             is_write=False,
             source_id="ID",
-            source_timestamp=datetime.now(),
+            source_timestamp=now,
             source_lineage=[LineageAtom(object_type="LINEAGE", object_id="ID")],
-            assessment_start_timestamp=datetime.now(),
-            assessment_end_timestamp=datetime.now(),
+            assessment_start_timestamp=now,
+            assessment_end_timestamp=now,
         )
         for path in ("a", "b", "c")
     )
-    crawler.dump_all(dfsas)
+    crawler.dump_all(dfsas, now)
     rows = backend.rows_written_for(crawler.full_name, "append")
     assert len(rows) == 3
