From 8eba90396fb7c41fc86372e63b8d2c32c2d554c3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 10:51:55 +0200 Subject: [PATCH 01/17] make simple_dependency_resolver available more broadly --- tests/unit/conftest.py | 15 ++++++++++++++- tests/unit/source_code/conftest.py | 15 --------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2c8cbfd3b2..92e86ec73e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,13 +10,17 @@ from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler -from databricks.labs.ucx.source_code.graph import BaseNotebookResolver +from databricks.labs.ucx.source_code.graph import BaseNotebookResolver, DependencyResolver +from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.sdk import AccountClient from databricks.sdk.config import Config from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.workflow_task import RuntimeContext +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from . import mock_workspace_client @@ -201,3 +205,12 @@ def mock_backend() -> MockBackend: @pytest.fixture def ws(): return mock_workspace_client() + + +@pytest.fixture +def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolver: + allow_list = KnownList() + library_resolver = PythonLibraryResolver(allow_list) + notebook_resolver = NotebookResolver(NotebookLoader()) + import_resolver = ImportFileResolver(FileLoader(), allow_list) + return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index 6029ce4d82..9c999d92dc 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,12 +1,6 @@ import pytest from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus -from databricks.labs.ucx.source_code.graph import DependencyResolver -from databricks.labs.ucx.source_code.known import KnownList -from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver -from databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver @pytest.fixture @@ -51,12 +45,3 @@ def extended_test_index(): ), ] ) - - -@pytest.fixture -def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolver: - allow_list = KnownList() - library_resolver = PythonLibraryResolver(allow_list) - notebook_resolver = NotebookResolver(NotebookLoader()) - import_resolver = ImportFileResolver(FileLoader(), allow_list) - return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) From 5f6583134e2e577c1c50dc842e0f2807676329c2 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 10:54:19 +0200 Subject: [PATCH 02/17] build migration steps for workflow task --- 
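Note for reviewers: the sequencer introduced below numbers steps bottom-up (leaf nodes get step_number 1, each dependent node gets the highest required step number plus 1). A minimal usage sketch, reusing the task, job and graph objects built in the unit test added by this patch:

    sequencer = MigrationSequencer()
    sequencer.register_workflow_task(task, job, graph)
    for step in sequencer.generate_steps():  # deduplicated and sorted, lowest step_number first
        print(step.step_number, step.object_type, step.object_id, step.required_step_ids)

For a task pinned to an existing cluster this yields the task as step 1, its job as step 2 and the cluster as step 3, which is what test_cluster_from_task_has_children asserts.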
.../labs/ucx/sequencing/__init__.py | 0 .../labs/ucx/sequencing/sequencing.py | 145 ++++++++++++++++++ tests/unit/sequencing/__init__.py | 0 tests/unit/sequencing/test_sequencing.py | 22 +++ 4 files changed, 167 insertions(+) create mode 100644 src/databricks/labs/ucx/sequencing/__init__.py create mode 100644 src/databricks/labs/ucx/sequencing/sequencing.py create mode 100644 tests/unit/sequencing/__init__.py create mode 100644 tests/unit/sequencing/test_sequencing.py diff --git a/src/databricks/labs/ucx/sequencing/__init__.py b/src/databricks/labs/ucx/sequencing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py new file mode 100644 index 0000000000..f28400c3cd --- /dev/null +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +import itertools +from collections.abc import Iterable +from dataclasses import dataclass, field + +from databricks.sdk.service import jobs + +from databricks.labs.ucx.source_code.graph import DependencyGraph + + +@dataclass +class MigrationStep: + step_id: int + step_number: int + object_type: str + object_id: str + object_owner: str + required_step_ids: list[int] = field(default_factory=list) + + +@dataclass +class MigrationNode: + last_node_id = 0 + node_id: int + object_type: str + object_id: str + object_owner: str + required_steps: list[MigrationNode] = field(default_factory=list) + + def generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: + # traverse the nodes using a depth-first algorithm + # ultimate leaves have a step number of 1 + # use highest required step number + 1 for this step + highest_step_number = 0 + required_step_ids: list[int] = [] + all_generated_steps: list[Iterable[MigrationStep]] = [] + for required_step in self.required_steps: + step, generated_steps = required_step.generate_steps() + highest_step_number = max(highest_step_number, step.step_number) + required_step_ids.append(step.step_id) + all_generated_steps.append(generated_steps) + all_generated_steps.append([step]) + this_step = MigrationStep( + step_id=self.node_id, + step_number=highest_step_number + 1, + object_type=self.object_type, + object_id=self.object_id, + object_owner=self.object_owner, + required_step_ids=required_step_ids, + ) + return this_step, itertools.chain(*all_generated_steps) + + def find(self, object_type: str, object_id: str) -> MigrationNode | None: + if object_type == self.object_type and object_id == self.object_id: + return self + for step in self.required_steps: + found = step.find(object_type, object_id) + if found: + return found + return None + + +class MigrationSequencer: + + def __init__(self): + self._root = MigrationNode(node_id=0, object_type="ROOT", object_id="ROOT", object_owner="NONE") + + def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: + task_node = self._find_node(object_type="TASK", object_id=task.task_key) + if task_node: + return task_node + job_node = self.register_workflow_job(job) + MigrationNode.last_node_id += 1 + task_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="TASK", object_id=task.task_key, object_owner="NONE" + ) # TODO object_owner + job_node.required_steps.append(task_node) + if task.existing_cluster_id: + cluster_node = self.register_cluster(task.existing_cluster_id) + cluster_node.required_steps.append(task_node) + if job_node not in 
cluster_node.required_steps: + cluster_node.required_steps.append(job_node) + # TODO register dependency graph + return task_node + + def register_workflow_job(self, job: jobs.Job) -> MigrationNode: + job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) + if job_node: + return job_node + MigrationNode.last_node_id += 1 + job_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="JOB", object_id=str(job.job_id), object_owner="NONE" + ) # TODO object_owner + top_level = True + if job.settings and job.settings.job_clusters: + for job_cluster in job.settings.job_clusters: + cluster_node = self.register_job_cluster(job_cluster) + if cluster_node: + top_level = False + cluster_node.required_steps.append(job_node) + if top_level: + self._root.required_steps.append(job_node) + return job_node + + def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: + if cluster.new_cluster: + return None + return self.register_cluster(cluster.job_cluster_key) + + def register_cluster(self, cluster_key: str) -> MigrationNode: + cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) + if cluster_node: + return cluster_node + MigrationNode.last_node_id += 1 + cluster_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, object_owner="NONE" + ) # TODO object_owner + # TODO register warehouses and policies + self._root.required_steps.append(cluster_node) + return cluster_node + + def generate_steps(self) -> Iterable[MigrationStep]: + _root_step, generated_steps = self._root.generate_steps() + unique_steps = self._deduplicate_steps(generated_steps) + return self._sorted_steps(unique_steps) + + @staticmethod + def _sorted_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: + # sort by step number, lowest first + return sorted(steps, key=lambda step: step.step_number) + + @staticmethod + def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: + best_steps: dict[int, MigrationStep] = {} + for step in steps: + existing = best_steps.get(step.step_id, None) + # keep the step with the highest step number + if existing and existing.step_number >= step.step_number: + continue + best_steps[step.step_id] = step + return best_steps.values() + + def _find_node(self, object_type: str, object_id: str) -> MigrationNode | None: + return self._root.find(object_type, object_id) diff --git a/tests/unit/sequencing/__init__.py b/tests/unit/sequencing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py new file mode 100644 index 0000000000..094767fc3e --- /dev/null +++ b/tests/unit/sequencing/test_sequencing.py @@ -0,0 +1,22 @@ +from databricks.sdk.service import jobs + +from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.graph import DependencyGraph +from databricks.labs.ucx.source_code.jobs import WorkflowTask + + +def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup): + task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + ws.jobs.get.return_value = job + dependency = WorkflowTask(ws, task, job) + graph = DependencyGraph(dependency, None, 
simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + sequencer = MigrationSequencer() + sequencer.register_workflow_task(task, job, graph) + steps = list(sequencer.generate_steps()) + step = steps[-1] + assert step.object_type == "CLUSTER" + assert step.object_id == "cluster-123" + assert step.step_number == 3 From 52c549549f544f61d5465c30a986e7fa370fff97 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 11:54:04 +0200 Subject: [PATCH 03/17] fix pylint warnings --- tests/unit/conftest.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 92e86ec73e..4eee45fca3 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -61,8 +61,10 @@ class CustomIterator: def __init__(self, values): self._values = iter(values) self._has_next = True + self._next_value = None - def hasNext(self): # pylint: disable=invalid-name + # pylint: disable=invalid-name + def hasNext(self): try: self._next_value = next(self._values) self._has_next = True @@ -154,9 +156,11 @@ def inner(cb, **replace) -> RuntimeContext: ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = ( mock_list_databases_iterator ) + # pylint: disable=protected-access ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listTables.return_value = ( mock_list_tables_iterator ) + # pylint: disable=protected-access ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().getTable.return_value = ( get_table_mock ) @@ -169,8 +173,9 @@ def inner(cb, **replace) -> RuntimeContext: @pytest.fixture def acc_client(): - acc = create_autospec(AccountClient) # pylint: disable=mock-no-usage + acc = create_autospec(AccountClient) acc.config = Config(host="https://accounts.cloud.databricks.com", account_id="123", token="123") + acc.asset_not_called() return acc From 18609178378f6f507f48fd608ece43506e40f6bf Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:05:36 +0200 Subject: [PATCH 04/17] fix pylint warnings --- tests/unit/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 4eee45fca3..675f2012e6 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -175,7 +175,7 @@ def inner(cb, **replace) -> RuntimeContext: def acc_client(): acc = create_autospec(AccountClient) acc.config = Config(host="https://accounts.cloud.databricks.com", account_id="123", token="123") - acc.asset_not_called() + acc.assert_not_called() return acc From ae23d20173def4d2efa8b2f4430cae9b285373b4 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:06:04 +0200 Subject: [PATCH 05/17] add object name --- .../labs/ucx/sequencing/sequencing.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index f28400c3cd..b12fbcd0b2 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -15,6 +15,7 @@ class MigrationStep: step_number: int object_type: str object_id: str + object_name: str object_owner: str required_step_ids: list[int] = field(default_factory=list) @@ -25,6 +26,7 @@ class MigrationNode: node_id: int object_type: str object_id: str + object_name: str object_owner: str required_steps: list[MigrationNode] = field(default_factory=list) @@ -46,6 +48,7 @@ def 
generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: step_number=highest_step_number + 1, object_type=self.object_type, object_id=self.object_id, + object_name=self.object_name, object_owner=self.object_owner, required_step_ids=required_step_ids, ) @@ -64,16 +67,23 @@ def find(self, object_type: str, object_id: str) -> MigrationNode | None: class MigrationSequencer: def __init__(self): - self._root = MigrationNode(node_id=0, object_type="ROOT", object_id="ROOT", object_owner="NONE") + self._root = MigrationNode( + node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" + ) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: - task_node = self._find_node(object_type="TASK", object_id=task.task_key) + task_id = f"{job.job_id}/{task.task_key}" + task_node = self._find_node(object_type="TASK", object_id=task_id) if task_node: return task_node job_node = self.register_workflow_job(job) MigrationNode.last_node_id += 1 task_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="TASK", object_id=task.task_key, object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="TASK", + object_id=task_id, + object_name=task.task_key, + object_owner="NONE", ) # TODO object_owner job_node.required_steps.append(task_node) if task.existing_cluster_id: @@ -89,8 +99,13 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: if job_node: return job_node MigrationNode.last_node_id += 1 + job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="JOB", object_id=str(job.job_id), object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="JOB", + object_id=str(job.job_id), + object_name=job_name, + object_owner="NONE", ) # TODO object_owner top_level = True if job.settings and job.settings.job_clusters: @@ -114,7 +129,11 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: return cluster_node MigrationNode.last_node_id += 1 cluster_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="CLUSTER", + object_id=cluster_key, + object_name=cluster_key, + object_owner="NONE", ) # TODO object_owner # TODO register warehouses and policies self._root.required_steps.append(cluster_node) From 9c63b8b40901f6a08b9ac682a710fd09b2596322 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:14:11 +0200 Subject: [PATCH 06/17] populate object owner --- src/databricks/labs/ucx/sequencing/sequencing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index b12fbcd0b2..f66563de80 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -83,8 +83,8 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_type="TASK", object_id=task_id, object_name=task.task_key, - object_owner="NONE", - ) # TODO object_owner + object_owner=job_node.object_owner, # no task owner so use job one + ) job_node.required_steps.append(task_node) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) @@ -105,8 +105,8 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: 
object_type="JOB", object_id=str(job.job_id), object_name=job_name, - object_owner="NONE", - ) # TODO object_owner + object_owner=job.creator_user_name or "", + ) top_level = True if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: From a1734b5a57d730be1b534404b90eb8dc126cafc2 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:48:00 +0200 Subject: [PATCH 07/17] be more defensive --- .../labs/ucx/sequencing/sequencing.py | 17 ++++++++++++----- tests/unit/sequencing/test_sequencing.py | 8 +++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index f66563de80..113b704d30 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from dataclasses import dataclass, field +from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs from databricks.labs.ucx.source_code.graph import DependencyGraph @@ -66,7 +67,8 @@ def find(self, object_type: str, object_id: str) -> MigrationNode | None: class MigrationSequencer: - def __init__(self): + def __init__(self, ws: WorkspaceClient): + self._ws = ws self._root = MigrationNode( node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" ) @@ -83,7 +85,7 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_type="TASK", object_id=task_id, object_name=task.task_key, - object_owner=job_node.object_owner, # no task owner so use job one + object_owner=job_node.object_owner, # no task owner so use job one ) job_node.required_steps.append(task_node) if task.existing_cluster_id: @@ -127,14 +129,17 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) if cluster_node: return cluster_node + details = self._ws.clusters.get(cluster_key) + object_name = details.cluster_name if details and details.cluster_name else cluster_key + object_owner = details.creator_user_name if details and details.creator_user_name else "" MigrationNode.last_node_id += 1 cluster_node = MigrationNode( node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, - object_name=cluster_key, - object_owner="NONE", - ) # TODO object_owner + object_name=object_name, + object_owner=object_owner, + ) # TODO register warehouses and policies self._root.required_steps.append(cluster_node) return cluster_node @@ -155,6 +160,8 @@ def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep for step in steps: existing = best_steps.get(step.step_id, None) # keep the step with the highest step number + # TODO this possibly affects the step_number of steps that depend on this one + # but it's probably OK to not be 100% accurate initially if existing and existing.step_number >= step.step_number: continue best_steps[step.step_id] = step diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py index 094767fc3e..fa7271164e 100644 --- a/tests/unit/sequencing/test_sequencing.py +++ b/tests/unit/sequencing/test_sequencing.py @@ -1,4 +1,5 @@ from databricks.sdk.service import jobs +from databricks.sdk.service.compute import ClusterDetails from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer from databricks.labs.ucx.source_code.base import CurrentSessionState 
@@ -7,16 +8,21 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup): + ws.clusters.get.return_value = ClusterDetails(cluster_name="my-cluster", creator_user_name="John Doe") task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) - sequencer = MigrationSequencer() + sequencer = MigrationSequencer(ws) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) step = steps[-1] + assert step.step_id assert step.object_type == "CLUSTER" assert step.object_id == "cluster-123" + assert step.object_name == "my-cluster" + assert step.object_owner == "John Doe" assert step.step_number == 3 + assert len(step.required_step_ids) == 2 From 872d74c6f105df1e0c9ce6d0aa51f47ffea136fb Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 10:53:39 +0200 Subject: [PATCH 08/17] move last_node_id to sequencer --- src/databricks/labs/ucx/sequencing/sequencing.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 113b704d30..4037a29697 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -23,7 +23,6 @@ class MigrationStep: @dataclass class MigrationNode: - last_node_id = 0 node_id: int object_type: str object_id: str @@ -69,6 +68,7 @@ class MigrationSequencer: def __init__(self, ws: WorkspaceClient): self._ws = ws + self._last_node_id = 0 self._root = MigrationNode( node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" ) @@ -79,9 +79,9 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende if task_node: return task_node job_node = self.register_workflow_job(job) - MigrationNode.last_node_id += 1 + self._last_node_id += 1 task_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="TASK", object_id=task_id, object_name=task.task_key, @@ -100,10 +100,10 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) if job_node: return job_node - MigrationNode.last_node_id += 1 + self._last_node_id += 1 job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="JOB", object_id=str(job.job_id), object_name=job_name, @@ -132,9 +132,9 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: details = self._ws.clusters.get(cluster_key) object_name = details.cluster_name if details and details.cluster_name else cluster_key object_owner = details.creator_user_name if details and details.creator_user_name else "" - MigrationNode.last_node_id += 1 + self._last_node_id += 1 cluster_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="CLUSTER", object_id=cluster_key, object_name=object_name, From 18acdc0f4257a985c31ad42906a0af43a04ed909 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 11:27:53 +0200 Subject: [PATCH 09/17] rename JobOwnership to 
JobInfoOwnership and add JobOwnership --- src/databricks/labs/ucx/assessment/jobs.py | 13 ++++++++- tests/integration/assessment/test_jobs.py | 4 +-- tests/unit/assessment/test_jobs.py | 33 ++++++++++++++++++---- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 667647d967..290fdbe106 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -20,6 +20,7 @@ RunType, SparkJarTask, SqlTask, + Job, ) from databricks.labs.ucx.assessment.clusters import CheckClusterMixin @@ -149,7 +150,7 @@ def _check_jar_task(self, all_task: list[RunTask]) -> list[str]: return task_failures -class JobOwnership(Ownership[JobInfo]): +class JobInfoOwnership(Ownership[JobInfo]): """Determine ownership of jobs (workflows) in the inventory. This is the job creator (if known). @@ -159,6 +160,16 @@ def _maybe_direct_owner(self, record: JobInfo) -> str | None: return record.creator +class JobOwnership(Ownership[Job]): + """Determine ownership of jobs (workflows) in the workspace. + + This is the job creator (if known). + """ + + def _maybe_direct_owner(self, record: Job) -> str | None: + return record.creator_user_name + + @dataclass class SubmitRunInfo: run_ids: str # JSON-encoded list of run ids diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index 47fa6f1b81..3f3104765f 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -7,7 +7,7 @@ from databricks.sdk.service.jobs import NotebookTask, RunTask from databricks.sdk.service.workspace import ImportFormat -from databricks.labs.ucx.assessment.jobs import JobOwnership, JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobInfoOwnership, JobsCrawler, SubmitRunsCrawler from .test_assessment import _SPARK_CONF @@ -80,5 +80,5 @@ def test_job_ownership(ws, runtime_ctx, make_job, inventory_schema, sql_backend) job_record = next(record for record in records if record.job_id == str(job.job_id)) # Verify ownership is as expected. - ownership = JobOwnership(runtime_ctx.administrator_locator) + ownership = JobInfoOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(job_record) == ws.current_user.me().user_name diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 8ec3e89077..862e2bcf07 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -2,9 +2,9 @@ import pytest from databricks.labs.lsql.backends import MockBackend -from databricks.sdk.service.jobs import BaseJob, JobSettings +from databricks.sdk.service.jobs import BaseJob, JobSettings, Job -from databricks.labs.ucx.assessment.jobs import JobInfo, JobOwnership, JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobInfo, JobInfoOwnership, JobsCrawler, SubmitRunsCrawler, JobOwnership from databricks.labs.ucx.framework.owners import AdministratorLocator from ..
import mock_workspace_client @@ -135,22 +135,43 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): assert result[0].failures == failures -def test_pipeline_owner_creator() -> None: +def test_jobinfo_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) - ownership = JobOwnership(admin_locator) + ownership = JobInfoOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) assert owner == "bob" admin_locator.get_workspace_administrator.assert_not_called() -def test_pipeline_owner_creator_unknown() -> None: +def test_jobinfo_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" - ownership = JobOwnership(admin_locator) + ownership = JobInfoOwnership(admin_locator) owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() + + +def test_job_owner_creator() -> None: + admin_locator = create_autospec(AdministratorLocator) + + ownership = JobOwnership(admin_locator) + owner = ownership.owner_of(Job(creator_user_name="bob", job_id=1)) + + assert owner == "bob" + admin_locator.get_workspace_administrator.assert_not_called() + + +def test_job_owner_creator_unknown() -> None: + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" + + ownership = JobOwnership(admin_locator) + owner = ownership.owner_of(Job(job_id=1)) + + assert owner == "an_admin" + admin_locator.get_workspace_administrator.assert_called_once() From b7b0bb26ec446b1bea959ed46eacd7bcbbeceef3 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 11:30:56 +0200 Subject: [PATCH 10/17] rename ClusterOwnership to ClusterInfoOwnership --- src/databricks/labs/ucx/assessment/clusters.py | 2 +- tests/integration/assessment/test_clusters.py | 6 +++--- tests/unit/assessment/test_clusters.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 0e0624d3c2..18613c8ac3 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -182,7 +182,7 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: yield ClusterInfo(*row) -class ClusterOwnership(Ownership[ClusterInfo]): +class ClusterInfoOwnership(Ownership[ClusterInfo]): """Determine ownership of clusters in the inventory. This is the cluster creator (if known). 
diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 8cf0622220..2501b95251 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, PoliciesCrawler, - ClusterOwnership, + ClusterInfoOwnership, ClusterPolicyOwnership, ) @@ -53,7 +53,7 @@ def _change_cluster_owner(ws, cluster_id: str, owner_user_name: str) -> None: ws.api_client.do('POST', '/api/2.1/clusters/change-owner', body=body, headers=headers) -def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: +def test_clusterinfo_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled clusters.""" # Set up two clusters: one with us as owner and one for a different user. @@ -76,7 +76,7 @@ def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_s # Verify ownership is as expected. administrator_locator = runtime_ctx.administrator_locator - ownership = ClusterOwnership(administrator_locator) + ownership = ClusterInfoOwnership(administrator_locator) assert ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name assert ownership.owner_of(their_cluster_record) == another_user.user_name diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index c86c3f60f0..e0072b2060 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -10,7 +10,7 @@ from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, PoliciesCrawler, - ClusterOwnership, + ClusterInfoOwnership, ClusterInfo, ClusterPolicyOwnership, PolicyInfo, @@ -185,21 +185,21 @@ def test_unsupported_clusters(): assert result_set[0].failures == '["cluster type not supported : LEGACY_PASSTHROUGH"]' -def test_cluster_owner_creator() -> None: +def test_clusterinfo_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) - ownership = ClusterOwnership(admin_locator) + ownership = ClusterInfoOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) assert owner == "bob" admin_locator.get_workspace_administrator.assert_not_called() -def test_cluster_owner_creator_unknown() -> None: +def test_clusterinfo_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" - ownership = ClusterOwnership(admin_locator) + ownership = ClusterInfoOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" From d0a6f6d1a51afd5c723535a5a929fb2f4ba245b5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 11:43:42 +0200 Subject: [PATCH 11/17] add ClusterDetailsOwnership --- .../labs/ucx/assessment/clusters.py | 10 +++++++ tests/integration/assessment/test_clusters.py | 14 ++++++---- tests/unit/assessment/test_clusters.py | 27 ++++++++++++++++--- 3 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 18613c8ac3..4c0ee8cf3b 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -192,6 +192,16 @@ def 
_maybe_direct_owner(self, record: ClusterInfo) -> str | None: return record.creator +class ClusterDetailsOwnership(Ownership[ClusterDetails]): + """Determine ownership of clusters in the workspace. + + This is the cluster creator (if known). + """ + + def _maybe_direct_owner(self, record: ClusterDetails) -> str | None: + return record.creator_user_name + + @dataclass class PolicyInfo: policy_id: str diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 2501b95251..cf413a0bc0 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -9,8 +9,9 @@ from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, PoliciesCrawler, + ClusterDetailsOwnership, ClusterInfoOwnership, - ClusterPolicyOwnership, + ClusterPolicyOwnership, ) from .test_assessment import _SPARK_CONF @@ -53,7 +54,7 @@ def _change_cluster_owner(ws, cluster_id: str, owner_user_name: str) -> None: ws.api_client.do('POST', '/api/2.1/clusters/change-owner', body=body, headers=headers) -def test_clusterinfo_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: +def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_schema, sql_backend) -> None: """Verify the ownership can be determined for crawled clusters.""" # Set up two clusters: one with us as owner and one for a different user. @@ -76,9 +77,12 @@ def test_clusterinfo_ownership(ws, runtime_ctx, make_cluster, make_user, invento # Verify ownership is as expected. administrator_locator = runtime_ctx.administrator_locator - ownership = ClusterInfoOwnership(administrator_locator) - assert ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name - assert ownership.owner_of(their_cluster_record) == another_user.user_name + info_ownership = ClusterInfoOwnership(administrator_locator) + assert info_ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name + assert info_ownership.owner_of(their_cluster_record) == another_user.user_name + details_ownership = ClusterDetailsOwnership(administrator_locator) + assert details_ownership.owner_of(ws.clusters.get(my_cluster.cluster_id)) == ws.current_user.me().user_name + assert details_ownership.owner_of(ws.clusters.get(their_cluster.cluster_id)) == another_user.user_name def test_cluster_crawler_mlr_no_isolation(ws, make_cluster, inventory_schema, sql_backend): diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index e0072b2060..1a4594f2ce 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -13,7 +13,7 @@ ClusterInfoOwnership, ClusterInfo, ClusterPolicyOwnership, - PolicyInfo, + PolicyInfo, ClusterDetailsOwnership, ) from databricks.labs.ucx.framework.crawlers import SqlBackend from databricks.labs.ucx.framework.owners import AdministratorLocator @@ -185,7 +185,7 @@ def test_unsupported_clusters(): assert result_set[0].failures == '["cluster type not supported : LEGACY_PASSTHROUGH"]' -def test_clusterinfo_owner_creator() -> None: +def test_cluster_info_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) ownership = ClusterInfoOwnership(admin_locator) @@ -195,7 +195,7 @@ def test_clusterinfo_owner_creator() -> None: admin_locator.get_workspace_administrator.assert_not_called() -def test_clusterinfo_owner_creator_unknown() -> None: +def test_cluster_info_owner_creator_unknown() -> None: admin_locator = 
create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" @@ -206,6 +206,27 @@ def test_clusterinfo_owner_creator_unknown() -> None: admin_locator.get_workspace_administrator.assert_called_once() +def test_cluster_details_owner_creator() -> None: + admin_locator = create_autospec(AdministratorLocator) + + ownership = ClusterDetailsOwnership(admin_locator) + owner = ownership.owner_of(ClusterDetails(creator_user_name="bob", cluster_id="1")) + + assert owner == "bob" + admin_locator.get_workspace_administrator.assert_not_called() + + +def test_cluster_details_owner_creator_unknown() -> None: + admin_locator = create_autospec(AdministratorLocator) + admin_locator.get_workspace_administrator.return_value = "an_admin" + + ownership = ClusterDetailsOwnership(admin_locator) + owner = ownership.owner_of(ClusterDetails(cluster_id="1")) + + assert owner == "an_admin" + admin_locator.get_workspace_administrator.assert_called_once() + + def test_policy_crawler(): ws = mock_workspace_client( policy_ids=['single-user-with-spn', 'single-user-with-spn-policyid', 'single-user-with-spn-no-sparkversion'], From 3a411c654f0cc0a818228f1483f486199133ccf5 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 11:48:17 +0200 Subject: [PATCH 12/17] formatting --- tests/integration/assessment/test_clusters.py | 2 +- tests/unit/assessment/test_clusters.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index cf413a0bc0..75c6bf1ffa 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -11,7 +11,7 @@ PoliciesCrawler, ClusterDetailsOwnership, ClusterInfoOwnership, - ClusterPolicyOwnership, + ClusterPolicyOwnership, ) from .test_assessment import _SPARK_CONF diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index 1a4594f2ce..af4e1f5c20 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -9,11 +9,12 @@ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, - PoliciesCrawler, + ClusterDetailsOwnership, ClusterInfoOwnership, ClusterInfo, ClusterPolicyOwnership, - PolicyInfo, ClusterDetailsOwnership, + PoliciesCrawler, + PolicyInfo, ) from databricks.labs.ucx.framework.crawlers import SqlBackend from databricks.labs.ucx.framework.owners import AdministratorLocator From 082602bb93642d26e60c389214ae843ae62ffcf7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 12:15:47 +0200 Subject: [PATCH 13/17] Use 'Ownership' classes --- src/databricks/labs/ucx/sequencing/sequencing.py | 11 +++++++---- tests/unit/sequencing/test_sequencing.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 4037a29697..04bb273fdb 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -7,6 +7,9 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs +from databricks.labs.ucx.assessment.clusters import ClusterDetailsOwnership +from databricks.labs.ucx.assessment.jobs import JobOwnership +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.source_code.graph 
import DependencyGraph @@ -66,8 +69,9 @@ def find(self, object_type: str, object_id: str) -> MigrationNode | None: class MigrationSequencer: - def __init__(self, ws: WorkspaceClient): + def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws + self._admin_locator = admin_locator self._last_node_id = 0 self._root = MigrationNode( node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" @@ -107,7 +111,7 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: object_type="JOB", object_id=str(job.job_id), object_name=job_name, - object_owner=job.creator_user_name or "", + object_owner=JobOwnership(self._admin_locator).owner_of(job), ) top_level = True if job.settings and job.settings.job_clusters: @@ -131,14 +135,13 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: return cluster_node details = self._ws.clusters.get(cluster_key) object_name = details.cluster_name if details and details.cluster_name else cluster_key - object_owner = details.creator_user_name if details and details.creator_user_name else "" self._last_node_id += 1 cluster_node = MigrationNode( node_id=self._last_node_id, object_type="CLUSTER", object_id=cluster_key, object_name=object_name, - object_owner=object_owner, + object_owner=ClusterDetailsOwnership(self._admin_locator).owner_of(details), ) # TODO register warehouses and policies self._root.required_steps.append(cluster_node) diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py index fa7271164e..21d2a612d0 100644 --- a/tests/unit/sequencing/test_sequencing.py +++ b/tests/unit/sequencing/test_sequencing.py @@ -1,6 +1,9 @@ -from databricks.sdk.service import jobs +from unittest.mock import create_autospec + +from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails +from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import DependencyGraph @@ -15,7 +18,10 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_pat ws.jobs.get.return_value = job dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) - sequencer = MigrationSequencer(ws) + admin_finder = create_autospec(AdministratorFinder) + admin_user = iam.User(user_name="John Doe", active=True, roles=[iam.ComplexValue(value="account_admin")]) + admin_finder.find_admin_users.return_value = (admin_user,) + sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) step = steps[-1] From bb56fbae8a7924ad73490e26a395ae19e5bd5fb9 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 15:15:38 +0200 Subject: [PATCH 14/17] sort using adapted Kahn algo --- .../labs/ucx/sequencing/sequencing.py | 135 +++++++++--------- 1 file changed, 64 insertions(+), 71 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 04bb273fdb..932c791f2c 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -1,8 +1,8 @@ from __future__ import annotations 
-import itertools +from collections import defaultdict from collections.abc import Iterable -from dataclasses import dataclass, field +from dataclasses import dataclass from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs @@ -21,7 +21,7 @@ class MigrationStep: object_id: str object_name: str object_owner: str - required_step_ids: list[int] = field(default_factory=list) + required_step_ids: list[int] @dataclass @@ -31,40 +31,21 @@ class MigrationNode: object_id: str object_name: str object_owner: str - required_steps: list[MigrationNode] = field(default_factory=list) - - def generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: - # traverse the nodes using a depth-first algorithm - # ultimate leaves have a step number of 1 - # use highest required step number + 1 for this step - highest_step_number = 0 - required_step_ids: list[int] = [] - all_generated_steps: list[Iterable[MigrationStep]] = [] - for required_step in self.required_steps: - step, generated_steps = required_step.generate_steps() - highest_step_number = max(highest_step_number, step.step_number) - required_step_ids.append(step.step_id) - all_generated_steps.append(generated_steps) - all_generated_steps.append([step]) - this_step = MigrationStep( + + @property + def key(self) -> tuple[str, str]: + return self.object_type, self.object_id + + def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationStep: + return MigrationStep( step_id=self.node_id, - step_number=highest_step_number + 1, + step_number=step_number, object_type=self.object_type, object_id=self.object_id, object_name=self.object_name, object_owner=self.object_owner, required_step_ids=required_step_ids, ) - return this_step, itertools.chain(*all_generated_steps) - - def find(self, object_type: str, object_id: str) -> MigrationNode | None: - if object_type == self.object_type and object_id == self.object_id: - return self - for step in self.required_steps: - found = step.find(object_type, object_id) - if found: - return found - return None class MigrationSequencer: @@ -73,13 +54,13 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws self._admin_locator = admin_locator self._last_node_id = 0 - self._root = MigrationNode( - node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" - ) + self._nodes: dict[tuple[str, str], MigrationNode] = {} + self._incoming: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) + self._outgoing: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: task_id = f"{job.job_id}/{task.task_key}" - task_node = self._find_node(object_type="TASK", object_id=task_id) + task_node = self._nodes.get(("TASK", task_id), None) if task_node: return task_node job_node = self.register_workflow_job(job) @@ -91,17 +72,22 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_name=task.task_key, object_owner=job_node.object_owner, # no task owner so use job one ) - job_node.required_steps.append(task_node) + self._nodes[task_node.key] = task_node + self._incoming[job_node.key].add(task_node.key) + self._outgoing[task_node.key].add(job_node.key) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) - cluster_node.required_steps.append(task_node) - if job_node not in cluster_node.required_steps: - 
cluster_node.required_steps.append(job_node) + if cluster_node: + self._incoming[cluster_node.key].add(task_node.key) + self._outgoing[task_node.key].add(cluster_node.key) + # also make the cluster dependent on the job + self._incoming[cluster_node.key].add(job_node.key) + self._outgoing[job_node.key].add(cluster_node.key) # TODO register dependency graph return task_node def register_workflow_job(self, job: jobs.Job) -> MigrationNode: - job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) + job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: return job_node self._last_node_id += 1 @@ -113,15 +99,13 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: object_name=job_name, object_owner=JobOwnership(self._admin_locator).owner_of(job), ) - top_level = True + self._nodes[job_node.key] = job_node if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: cluster_node = self.register_job_cluster(job_cluster) if cluster_node: - top_level = False - cluster_node.required_steps.append(job_node) - if top_level: - self._root.required_steps.append(job_node) + self._incoming[cluster_node.key].add(job_node.key) + self._outgoing[job_node.key].add(cluster_node.key) return job_node def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: @@ -129,46 +113,55 @@ def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None return None return self.register_cluster(cluster.job_cluster_key) - def register_cluster(self, cluster_key: str) -> MigrationNode: - cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) + def register_cluster(self, cluster_id: str) -> MigrationNode: + cluster_node = self._nodes.get(("CLUSTER", cluster_id), None) if cluster_node: return cluster_node - details = self._ws.clusters.get(cluster_key) - object_name = details.cluster_name if details and details.cluster_name else cluster_key + details = self._ws.clusters.get(cluster_id) + object_name = details.cluster_name if details and details.cluster_name else cluster_id self._last_node_id += 1 cluster_node = MigrationNode( node_id=self._last_node_id, object_type="CLUSTER", - object_id=cluster_key, + object_id=cluster_id, object_name=object_name, object_owner=ClusterDetailsOwnership(self._admin_locator).owner_of(details), ) + self._nodes[cluster_node.key] = cluster_node # TODO register warehouses and policies - self._root.required_steps.append(cluster_node) return cluster_node def generate_steps(self) -> Iterable[MigrationStep]: - _root_step, generated_steps = self._root.generate_steps() - unique_steps = self._deduplicate_steps(generated_steps) - return self._sorted_steps(unique_steps) + # algo adapted from Kahn topological sort. The main differences is that + # we want the same step number for all nodes with same dependency depth + # so instead of pushing to a queue, we rebuild it once all leaf nodes are processed + # (these are transient leaf nodes i.e. 
they only become leaf during processing) + incoming_counts = self._populate_incoming_counts() + step_number = 1 + sorted_steps: list[MigrationStep] = [] + while len(incoming_counts) > 0: + leaf_keys = list(self._get_leaf_keys(incoming_counts)) + for leaf_key in leaf_keys: + del incoming_counts[leaf_key] + sorted_steps.append(self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(leaf_key)))) + for dependency_key in self._outgoing[leaf_key]: + incoming_counts[dependency_key] -= 1 + step_number += 1 + return sorted_steps + + def _required_step_ids(self, node_key: tuple[str, str]) -> Iterable[int]: + for leaf_key in self._incoming[node_key]: + yield self._nodes[leaf_key].node_id + + def _populate_incoming_counts(self) -> dict[tuple[str, str], int]: + result = defaultdict(int) + for node_key in self._nodes: + result[node_key] = len(self._incoming[node_key]) + return result @staticmethod - def _sorted_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: - # sort by step number, lowest first - return sorted(steps, key=lambda step: step.step_number) - - @staticmethod - def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: - best_steps: dict[int, MigrationStep] = {} - for step in steps: - existing = best_steps.get(step.step_id, None) - # keep the step with the highest step number - # TODO this possibly affects the step_number of steps that depend on this one - # but it's probably OK to not be 100% accurate initially - if existing and existing.step_number >= step.step_number: + def _get_leaf_keys(incoming_counts: dict[tuple[str, str], int]) -> Iterable[tuple[str, str]]: + for node_key, incoming_count in incoming_counts.items(): + if incoming_count > 0: continue - best_steps[step.step_id] = step - return best_steps.values() - - def _find_node(self, object_type: str, object_id: str) -> MigrationNode | None: - return self._root.find(object_type, object_id) + yield node_key From bfce4749707c56e4c0e7cf9b1afabde185ca028a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 15:57:05 +0200 Subject: [PATCH 15/17] revert merge --- CHANGELOG.md | 20 - README.md | 18 +- pyproject.toml | 2 +- src/databricks/labs/ucx/__about__.py | 2 +- src/databricks/labs/ucx/assessment/azure.py | 2 +- .../labs/ucx/assessment/clusters.py | 12 +- src/databricks/labs/ucx/assessment/jobs.py | 39 +- .../labs/ucx/contexts/application.py | 16 +- .../labs/ucx/hive_metastore/catalog_schema.py | 401 +++-- .../labs/ucx/hive_metastore/grants.py | 64 +- .../labs/ucx/hive_metastore/locations.py | 215 +-- .../labs/ucx/hive_metastore/mapping.py | 10 - .../labs/ucx/hive_metastore/table_migrate.py | 116 +- .../labs/ucx/hive_metastore/tables.py | 1 - .../labs/ucx/hive_metastore/workflows.py | 4 +- src/databricks/labs/ucx/install.py | 1 - .../labs/ucx/mixins/cached_workspace_path.py | 173 +- src/databricks/labs/ucx/source_code/base.py | 84 +- src/databricks/labs/ucx/source_code/jobs.py | 36 +- .../labs/ucx/source_code/known.json | 1422 ----------------- .../labs/ucx/source_code/notebooks/sources.py | 8 +- tests/integration/assessment/test_clusters.py | 12 +- tests/integration/assessment/test_jobs.py | 4 +- .../hive_metastore/test_catalog_schema.py | 125 +- .../hive_metastore/test_external_locations.py | 22 +- .../hive_metastore/test_migrate.py | 28 +- .../integration/install/test_installation.py | 4 +- .../workspace_access/test_groups.py | 4 +- tests/unit/assessment/test_clusters.py | 34 +- tests/unit/assessment/test_jobs.py | 33 +- .../hive_metastore/test_catalog_schema.py 
| 295 ++-- tests/unit/hive_metastore/test_grants.py | 198 +-- tests/unit/hive_metastore/test_locations.py | 77 +- tests/unit/hive_metastore/test_mapping.py | 13 - .../unit/hive_metastore/test_migrate_acls.py | 23 +- .../unit/hive_metastore/test_table_migrate.py | 158 +- tests/unit/install/test_install.py | 6 +- .../unit/mixins/test_cached_workspace_path.py | 50 +- .../source_code/notebooks/test_sources.py | 4 +- tests/unit/source_code/test_jobs.py | 17 +- tests/unit/test_cli.py | 11 +- 41 files changed, 804 insertions(+), 2960 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a3c954bb7..4434394178 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,25 +1,5 @@ # Version changelog -## 0.45.0 - -* Added DBFS Root resolution when HMS Federation is enabled ([#2947](https://github.com/databrickslabs/ucx/issues/2947)). This commit introduces a DBFS resolver for use with HMS (Hive Metastore) federation, enabling accurate resolution of DBFS root locations when HMS federation is enabled. A new `_resolve_dbfs_root()` class method is added to the `MountsCrawler` class, and a boolean argument `enable_hms_federation` is included in the `MountsCrawler` constructor, providing better handling of federation functionality. The commit also adds a test function, `test_resolve_dbfs_root_in_hms_federation`, to validate the resolution of DBFS roots with HMS federation. The test covers special cases, such as the `/user/hive/metastore` path, and utilizes `LocationTrie` for more accurate location guessing. These changes aim to improve the overall DBFS root resolution when using HMS federation. -* Added `jax-jumpy` to known list ([#2959](https://github.com/databrickslabs/ucx/issues/2959)). In this release, we have added the `jax-jumpy` package to the list of known packages in our system. `jax-jumpy` is a Python-based numerical computation library, which includes modules such as `jumpy`, `jumpy._base_fns`, `jumpy.core`, `jumpy.lax`, `jumpy.numpy`, `jumpy.numpy._factory_fns`, `jumpy.numpy._transform_data`, `jumpy.numpy._types`, `jumpy.numpy.linalg`, `jumpy.ops`, and `jumpy.random`. These modules are now recognized by our system, which partially resolves issue [#1931](https://github.com/databrickslabs/ucx/issues/1931), which may have been caused by the integration of the `jax-jumpy` package. Engineers can now utilize the capabilities of this library in their numerical computations. -* Added `joblibspark` to known list ([#2960](https://github.com/databrickslabs/ucx/issues/2960)). In this release, we have added support for the `joblibspark` library in our system by updating the `known.json` file, which keeps track of various libraries and their associated components. This change is a part of the resolution for issue [#1931](https://github.com/databrickslabs/ucx/issues/1931) and includes new elements such as `doc`, `doc.conf`, `joblibspark`, `joblibspark.backend`, and `joblibspark.utils`. These additions enable the system to recognize and manage the new components related to `joblibspark`, allowing for improved compatibility and functionality. -* Added `jsonpatch` to known list ([#2969](https://github.com/databrickslabs/ucx/issues/2969)). In this release, we have added `jsonpatch` to the list of known libraries in the `known.json` file. Jsonpatch is a library used for applying JSON patches, which allow for partial updates to a JSON document. By including jsonpatch in the known list, developers can now easily utilize its functionality for JSON patching, and any necessary dependencies will be handled automatically. 
This change partially addresses issue [#1931](https://github.com/databrickslabs/ucx/issues/1931), which may have been caused by the use or integration of jsonpatch. We encourage developers to take advantage of this new addition to enhance their code and efficiently make partial updates to JSON documents. -* Added `langchain-community` to known list ([#2970](https://github.com/databrickslabs/ucx/issues/2970)). A new entry for `langchain-community` has been added to the configuration file for known language chain components in this release. This entry includes several sub-components such as 'langchain_community.agents', 'langchain_community.callbacks', 'langchain_community.chat_loaders', 'langchain_community.chat_message_histories', 'langchain_community.chat_models', 'langchain_community.cross_encoders', 'langchain_community.docstore', 'langchain_community.document_compressors', 'langchain_community.document_loaders', 'langchain_community.document_transformers', 'langchain_community.embeddings', 'langchain_community.example_selectors', 'langchain_community.graph_vectorstores', 'langchain_community.graphs', 'langchain_community.indexes', 'langchain_community.llms', 'langchain_community.memory', 'langchain_community.output_parsers', 'langchain_community.query_constructors', 'langchain_community.retrievers', 'langchain_community.storage', 'langchain_community.tools', 'langchain_community.utilities', and 'langchain_community.utils'. Currently, these sub-components are empty and have no additional configuration or code. This change partially resolves issue [#1931](https://github.com/databrickslabs/ucx/issues/1931), but the specifics of the issue and how these components will be used are still unclear. -* Added `langcodes` to known list ([#2971](https://github.com/databrickslabs/ucx/issues/2971)). A new `langcodes` library has been added to the project, addressing part of issue [#1931](https://github.com/databrickslabs/ucx/issues/1931). This library includes several modules that provide functionalities related to language codes and their manipulation, including `langcodes`, `langcodes.build_data`, `langcodes.data_dicts`, `langcodes.language_distance`, `langcodes.language_lists`, `langcodes.registry_parser`, `langcodes.tag_parser`, and `langcodes.util`. Additionally, the memory-efficient trie (prefix tree) data structure library, `marisa-trie`, has been included in the known list. It is important to note that no existing functionality has been altered in this commit. -* Addressing Ownership Conflict when creating catalog/schemas ([#2956](https://github.com/databrickslabs/ucx/issues/2956)). This release introduces new functionality to handle ownership conflicts during catalog/schema creation in our open-source library. The `_apply_from_legacy_table_acls` method has been enhanced with two loops to address non-own grants and own grants separately. This ensures proper handling of ownership conflicts by generating and executing UC grant SQL for each grant type, with appropriate exceptions. Additionally, a new helper function, `this_type_and_key()`, has been added to improve code readability. The release also introduces new methods, GrantsCrawler and Rule, in the hive_metastore package of the labs.ucx module, responsible for populating views and mapping source and destination objects. The test_catalog_schema.py file has been updated to include tests for creating catalogs and schemas with legacy ACLs, utilizing the new Rule method and GrantsCrawler. 
Issue [#2932](https://github.com/databrickslabs/ucx/issues/2932) has been addressed with these changes, which include adding new methods and updating existing tests for hive_metastore. -* Clarify `skip` and `unskip` commands work on views ([#2962](https://github.com/databrickslabs/ucx/issues/2962)). In this release, the `skip` and `unskip` commands in the databricks labs UCX tool have been updated to clarify their functionality on views and to make it more explicit with the addition of the `--view` flag. These commands allow users to skip or unskip certain schemas, tables, or views during the table migration process. This is useful for temporarily disabling migration of a particular schema, table, or view. Unit tests have been added to ensure the correct behavior of these commands when working with views. Two new methods have been added to test the behavior of the `unskip` command when a schema or table is specified, and two additional methods test the behavior of the `unskip` command when a view or no schema is specified. Finally, two methods test that an error message is logged when both the `--table` and `--view` flags are specified. -* Fixed issue with migrating MANAGED hive_metastore table to UC ([#2928](https://github.com/databrickslabs/ucx/issues/2928)). This commit addresses an issue with migrating Hive Metastore (HMS) MANAGED tables to Unity Catalog (UC) as EXTERNAL, where deleting a MANAGED table can result in data loss. To prevent this, a new option `CONVERT_TO_EXTERNAL` has been added to the `migrate_tables` method for migrating managed tables to UC as external, ensuring that the HMS managed table is converted to an external table in HMS and UC, and protecting against data loss when deleting a managed table that has been migrated to UC as external. Additionally, new caching properties have been added for better performance, and existing methods have been modified to handle the migration of managed tables to UC as external. Tests, including unit and integration tests, have been added to ensure the proper functioning of these changes. It is important to note that changing MANAGED tables to EXTERNAL can have potential consequences on regulatory data cleanup, and the impact of this change should be carefully validated for existing workloads. -* Let `create-catalogs-schemas` reuse `MigrateGrants` so that it applies group renaming ([#2955](https://github.com/databrickslabs/ucx/issues/2955)). The `create-catalogs-schemas` command in the `databricks labs ucx` package has been enhanced to reuse the `MigrateGrants` function, enabling group renaming and eliminating redundant code. The `migrate-tables` workflow remains functionally the same. Changes include modifying the `CatalogSchema` class to accept a `migrate_grants` argument, introducing new `Catalog` and `Schema` dataclasses, and updating various methods in the `hive_metastore` module. Unit and integration tests have been added and manually verified to ensure proper functionality. The `MigrateGrants` class has been updated to accept two `SecurableObject` arguments and sort matched grants. The `from_src_dst` function in `mapping.py` now includes a new `as_uc_table` method and updates to `as_uc_table_key`. 
Addressing issues [#2934](https://github.com/databrickslabs/ucx/issues/2934), [#2932](https://github.com/databrickslabs/ucx/issues/2932), and [#2955](https://github.com/databrickslabs/ucx/issues/2955), the changes also include a new `key` property for the `tables.py` file, and updates to the `test_create_catalogs_schemas` and `test_migrate_tables` test functions. -* Updated sqlglot requirement from <25.25,>=25.5.0 to >=25.5.0,<25.26 ([#2968](https://github.com/databrickslabs/ucx/issues/2968)). A update has been made to the sqlglot requirement in the pyproject.toml file, changing the version range from allowing versions 25.5.0 and later, but less than 25.25, to a range that allows for versions 25.5.0 and later, but less than 25.26. This change was implemented to allow for the latest version of sqlglot compatible with the project's requirements. The commit message includes a detailed changelog for sqlglot version 25.25.0, highlighting breaking changes, new features, bug fixes, and refactors. The commits included in the pull request are also outlined in the message, providing a comprehensive overview of the updates made. -* Use `LocationTrie` to infer a list of UC external locations ([#2965](https://github.com/databrickslabs/ucx/issues/2965)). This pull request introduces the `LocationTrie` class to improve the efficiency of determining external locations in UC storage. The `LocationTrie` class, implemented as a dataclass, maintains a hierarchical tree structure to store location paths and offers methods for inserting, finding, and checking if a table exists in the trie. It also refactors the existing `_parse_location` method and adds the `_parse_url` method to handle specific cases of JDBC URLs. The `find` method and `_external_locations` method are updated to use the new `LocationTrie` class, ensuring a more efficient way of determining overlapping storage prefixes. Additionally, the test file for external locations has been modified to include new URL formats and correct previously incorrect ones, enhancing compatibility with various URL formats. Overall, these changes are instrumental in preparing the codebase for future federated SQL connections by optimizing the determination of external locations and improving compatibility with diverse URL formats. -* Warn when table has column without no name during table migration ([#2984](https://github.com/databrickslabs/ucx/issues/2984)). In this release, we have introduced changes to the table migration feature in the databricks project to address issue [#2891](https://github.com/databrickslabs/ucx/issues/2891). We have renamed the `_migrate_table` method in the `TableMigration` class to `_safe_migrate_table` and added a new `_migrate_table` method that includes a try-except block to catch a specific Spark AnalysisException caused by an invalid column name. If this exception is caught, a warning is logged, and the function returns False. This change ensures that the migration process continues even when it encounters a table with a column without a name, generating a warning instead of raising an error. Additionally, we have added unit tests to verify the new functionality, including the introduction of a new mock backend `MockBackendWithGeneralException` to raise a general exception for testing purposes and a new test case `test_migrate_tables_handles_table_with_empty_column` to validate the warning message generated when encountering a table with an empty column during migration. 
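As an illustration of the prefix-trie idea described in the `LocationTrie` entry above, here is a minimal, self-contained sketch. It is not the UCX class itself (the real `LocationTrie` in `locations.py` is a dataclass that stores `Table` records and also handles JDBC URLs); the names `LocationNode` and `table_count` are illustrative only.

```python
from urllib.parse import urlparse


class LocationNode:
    """One path segment in a trie of storage locations (simplified sketch)."""

    def __init__(self, key: str = ""):
        self.key = key
        self.children: dict[str, "LocationNode"] = {}
        self.table_count = 0  # number of tables registered exactly at this prefix

    @staticmethod
    def _parts(location: str) -> list[str]:
        # Split a URL into scheme, netloc and non-empty path segments
        parsed = urlparse(location.rstrip("/"))
        return [parsed.scheme, parsed.netloc, *[p for p in parsed.path.split("/") if p]]

    def insert(self, location: str) -> None:
        node = self
        for part in self._parts(location):
            node = node.children.setdefault(part, LocationNode(part))
        node.table_count += 1

    def find(self, location: str) -> "LocationNode | None":
        node = self
        for part in self._parts(location):
            node = node.children.get(part)
            if node is None:
                return None
        return node


# Usage: overlapping locations share nodes, which makes the common storage
# prefixes that become UC external locations cheap to discover.
root = LocationNode()
root.insert("abfss://container@account.dfs.core.windows.net/raw/sales")
root.insert("abfss://container@account.dfs.core.windows.net/raw/hr")
assert root.find("abfss://container@account.dfs.core.windows.net/raw/sales") is not None
```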
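The empty-column-name guard described in the last entry above can be sketched as follows. This is a simplified stand-in, not the shipped method: `safe_migrate_table`, its `migrate` callable and the `table_key` parameter are illustrative, while the regex mirrors the pattern referenced for issue [#2891](https://github.com/databrickslabs/ucx/issues/2891).

```python
import logging
import re

logger = logging.getLogger(__name__)

# Spark raises an AnalysisException with this message when a column has an empty name.
# Without a Spark dependency the exception type cannot be caught explicitly, so the
# message text is matched instead.
_EMPTY_COLUMN_NAME = re.compile(
    r"INVALID_PARAMETER_VALUE: Invalid input: RPC CreateTable Field managedcatalog.ColumnInfo.name: "
    r'At columns.\d+: name "" is not a valid name'
)


def safe_migrate_table(migrate, table_key: str) -> bool:
    """Run one table migration, downgrading known failures to warnings instead of aborting the batch."""
    try:
        return migrate()
    except Exception as e:  # pylint: disable=broad-exception-caught
        if _EMPTY_COLUMN_NAME.match(str(e)):
            logger.warning(f"failed-to-migrate: Table with empty column name '{table_key}'", exc_info=e)
        else:
            logger.warning(f"failed-to-migrate: Unknown reason for table '{table_key}'", exc_info=e)
        return False
```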
- -Dependency updates: - - * Updated sqlglot requirement from <25.25,>=25.5.0 to >=25.5.0,<25.26 ([#2968](https://github.com/databrickslabs/ucx/pull/2968)). - ## 0.44.0 * Added `imbalanced-learn` to known list ([#2943](https://github.com/databrickslabs/ucx/issues/2943)). A new open-source library, "imbalanced-learn," has been added to the project's known list of libraries, providing various functionalities for handling imbalanced datasets. The addition includes modules such as "imblearn", "imblearn._config", "imblearn._min_dependencies", "imblearn._version", "imblearn.base", and many others, enabling features such as over-sampling, under-sampling, combining sampling techniques, and creating ensembles. This change partially resolves issue [#1931](https://github.com/databrickslabs/ucx/issues/1931), which may have been related to the handling of imbalanced datasets, thereby enhancing the project's ability to manage such datasets. diff --git a/README.md b/README.md index a530b3d25a..59566786a4 100644 --- a/README.md +++ b/README.md @@ -588,15 +588,15 @@ Each of the upgraded objects will be marked with an `upgraded_from` property. This property will be used to identify the original location of the object in the metastore. We also add a `upgraded_from_workspace_id` property to the upgraded object, to identify the source workspace. -| Object Type | Description | Upgrade Method | -|---------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| EXTERNAL_SYNC | Tables not saved to the DBFS file system that are supported by the sync command.
These tables are in one of the following formats: DELTA, PARQUET, CSV, JSON, ORC, TEXT, AVRO | During the upgrade process, these table contents will remain intact and the metadata will be recreated in UC using the sync SQL command.
More information about the sync command can be found [here](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-sync.html) | -| EXTERNAL_HIVESERDE | Tables with table type "HIVE" that are not supported by the sync command | We provide two workflows for hiveserde table migration:
1. Migrate all hiveserde tables using CTAS, which we officially support.
2. Migrate certain types of hiveserde in place, which is technically working, but the user needs to accept the risk that the old files created by hiveserde may not be processed correctly by the Spark datasource in corner cases. The user will need to decide which workflow to run first; that workflow will migrate the hiveserde tables and mark the `upgraded_to` property, and hence those tables will be skipped in later migration workflow runs. |
-| EXTERNAL_NO_SYNC | Tables not saved to the DBFS file system that are not supported by the sync command | The current upgrade process will migrate these tables to UC by creating a new managed table in UC and copying the data from the old table to the new table. The new table's format will be Delta. |
-| DBFS_ROOT_DELTA | Tables saved to the DBFS file system that are in Delta format | The current upgrade process will create a copy of these tables in UC using the "deep clone" command.
More information about the deep clone command can be found [here](https://docs.databricks.com/en/sql/language-manual/delta-clone.html) | -| DBFS_ROOT_NON_DELTA | Tables saved to the DBFS file system that are not in Delta format | The current upgrade process will create a managed table using CTAS | | -| VIEW | Datbase Views | Views are recreated during the upgrade process. The view's definition will be modified to repoint to the new UC tables. Views should be migrated only after all the dependent tables have been migrated. The upgrade process account for View to View dependencies. | -| MANAGED | Tables that are created as managed table in hive_metastore. | Depending on the WorkspaceConfig property managed_table_external_storage: 1. If the property is set to default CLONE (selected during installation). The UC Table will be created as CTAS which will created a copy of the data in UC. 2 If the property is set to SYNC_AS_EXTERNAL, the UC Table will be created as a EXTERNAL table. There is a risk, if the managed HMS table is dropped, which will drop the data and it will affect the UC table as well. 3 If the property is set to CONVERT_TO_EXTERNAL, the managed HMS table is converted to external and the tabel is created as external table in UC. This gives the advantage that dropping the HMS table doesnt drop the data, but impact of this should be carefully validate in existing workloads | +| Object Type | Description | Upgrade Method | +|---------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| EXTERNAL_SYNC | Tables not saved to the DBFS file system that are supported by the sync command.
These tables are in one of the following formats: DELTA, PARQUET, CSV, JSON, ORC, TEXT, AVRO | During the upgrade process, these table contents will remain intact and the metadata will be recreated in UC using the sync SQL command.
More information about the sync command can be found [here](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-aux-sync.html) | +| EXTERNAL_HIVESERDE | Tables with table type "HIVE" that are not supported by the sync command | We provide two workflows for hiveserde table migration:
1. Migrate all hiveserde tables using CTAS, which we officially support.
2. Migrate certain types of hiveserde in place, which is technically working, but the user needs to accept the risk that the old files created by hiveserde may not be processed correctly by the Spark datasource in corner cases. The user will need to decide which workflow to run first; that workflow will migrate the hiveserde tables and mark the `upgraded_to` property, and hence those tables will be skipped in later migration workflow runs. |
+| EXTERNAL_NO_SYNC | Tables not saved to the DBFS file system that are not supported by the sync command | The current upgrade process will migrate these tables to UC by creating a new managed table in UC and copying the data from the old table to the new table. The new table's format will be Delta. |
+| DBFS_ROOT_DELTA | Tables saved to the DBFS file system that are in Delta format | The current upgrade process will create a copy of these tables in UC using the "deep clone" command.
More information about the deep clone command can be found [here](https://docs.databricks.com/en/sql/language-manual/delta-clone.html) | +| DBFS_ROOT_NON_DELTA | Tables saved to the DBFS file system that are not in Delta format | The current upgrade process will create a managed table using CTAS | | +| VIEW | Datbase Views | Views are recreated during the upgrade process. The view's definition will be modified to repoint to the new UC tables. Views should be migrated only after all the dependent tables have been migrated. The upgrade process account for View to View dependencies. | +| MANAGED | Tables that are created as managed table in hive_metastore. | Depending on the WorkspaceConfig property managed_table_external_storage: 1. If the property is set to default CLONE (selected during installation). The UC Table will be created as CTAS which will created a copy of the data in UC. 2 If the property is set to SYNC_AS_EXTERNAL, the UC Table will be created as a EXTERNAL table. There is a risk, if the managed HMS table is dropped, which will drop the data and it will affect the UC table as well. | The upgrade process can be triggered using the `migrate-tables` [UCX command](#migrate-tables-command) Or by running the table migration workflows deployed to the workspace. diff --git a/pyproject.toml b/pyproject.toml index 5361092418..ac356489fd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ dependencies = ["databricks-sdk~=0.30", "databricks-labs-lsql>=0.5,<0.13", "databricks-labs-blueprint>=0.9.1,<0.10", "PyYAML>=6.0.0,<7.0.0", - "sqlglot>=25.5.0,<25.26", + "sqlglot>=25.5.0,<25.25", "astroid>=3.3.1"] [project.optional-dependencies] diff --git a/src/databricks/labs/ucx/__about__.py b/src/databricks/labs/ucx/__about__.py index 6398b38d94..6a5486e0ec 100644 --- a/src/databricks/labs/ucx/__about__.py +++ b/src/databricks/labs/ucx/__about__.py @@ -1,2 +1,2 @@ # DO NOT MODIFY THIS FILE -__version__ = "0.45.0" +__version__ = "0.44.0" diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index 68233d958c..81c99e784b 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -73,7 +73,7 @@ def _get_relevant_service_principals(self) -> list[AzureServicePrincipalInfo]: # list all relevant service principals in jobs all_jobs = list(self._ws.jobs.list(expand_tasks=True)) - all_clusters_by_id = {c.cluster_id: c for c in self._ws.clusters.list() if c.cluster_id} + all_clusters_by_id = {c.cluster_id: c for c in self._ws.clusters.list()} for _job, cluster_config in self._get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id): set_service_principals.update(self._get_azure_spn_from_cluster_config(cluster_config)) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 4c0ee8cf3b..0e0624d3c2 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -182,7 +182,7 @@ def _try_fetch(self) -> Iterable[ClusterInfo]: yield ClusterInfo(*row) -class ClusterInfoOwnership(Ownership[ClusterInfo]): +class ClusterOwnership(Ownership[ClusterInfo]): """Determine ownership of clusters in the inventory. This is the cluster creator (if known). @@ -192,16 +192,6 @@ def _maybe_direct_owner(self, record: ClusterInfo) -> str | None: return record.creator -class ClusterDetailsOwnership(Ownership[ClusterDetails]): - """Determine ownership of clusters in the workspace. 
- - This is the cluster creator (if known). - """ - - def _maybe_direct_owner(self, record: ClusterDetails) -> str | None: - return record.creator_user_name - - @dataclass class PolicyInfo: policy_id: str diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 290fdbe106..3c6a4afa84 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -8,7 +8,7 @@ from databricks.labs.lsql.backends import SqlBackend from databricks.sdk import WorkspaceClient from databricks.sdk.service import compute -from databricks.sdk.service.compute import ClusterDetails, ClusterSpec +from databricks.sdk.service.compute import ClusterDetails from databricks.sdk.service.jobs import ( BaseJob, BaseRun, @@ -20,7 +20,6 @@ RunType, SparkJarTask, SqlTask, - Job, ) from databricks.labs.ucx.assessment.clusters import CheckClusterMixin @@ -44,7 +43,7 @@ class JobInfo: class JobsMixin: @classmethod - def _get_cluster_configs_from_all_jobs(cls, all_jobs: list[BaseJob], all_clusters_by_id: dict[str, ClusterDetails]): + def _get_cluster_configs_from_all_jobs(cls, all_jobs, all_clusters_by_id): for job in all_jobs: if job.settings is None: continue @@ -55,11 +54,7 @@ def _get_cluster_configs_from_all_jobs(cls, all_jobs: list[BaseJob], all_cluster yield from cls._task_clusters(job, all_clusters_by_id) @classmethod - def _task_clusters( - cls, job: BaseJob, all_clusters_by_id: dict[str, ClusterDetails] - ) -> Iterable[tuple[BaseJob, ClusterDetails | ClusterSpec]]: - if not job.settings or not job.settings.tasks: - return + def _task_clusters(cls, job, all_clusters_by_id): for task in job.settings.tasks: if task.existing_cluster_id is not None: interactive_cluster = all_clusters_by_id.get(task.existing_cluster_id, None) @@ -70,9 +65,7 @@ def _task_clusters( yield job, task.new_cluster @staticmethod - def _job_clusters(job: BaseJob) -> Iterable[tuple[BaseJob, ClusterSpec]]: - if not job.settings or not job.settings.job_clusters: - return + def _job_clusters(job): for job_cluster in job.settings.job_clusters: if job_cluster.new_cluster is None: continue @@ -86,7 +79,7 @@ def __init__(self, ws: WorkspaceClient, sbe: SqlBackend, schema): def _crawl(self) -> Iterable[JobInfo]: all_jobs = list(self._ws.jobs.list(expand_tasks=True)) - all_clusters = {c.cluster_id: c for c in self._ws.clusters.list() if c.cluster_id} + all_clusters = {c.cluster_id: c for c in self._ws.clusters.list()} return self._assess_jobs(all_jobs, all_clusters) def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> Iterable[JobInfo]: @@ -100,11 +93,11 @@ def _assess_jobs(self, all_jobs: list[BaseJob], all_clusters_by_id) -> Iterable[ cluster_failures.extend(self._check_jar_task(job.settings.tasks)) job_assessment[job_id].update(cluster_failures) - for job_key, job_info in job_details.items(): - failures = job_assessment[job_key] - job_info.failures = json.dumps(list(failures)) - if len(failures) > 0: - job_info.success = 0 + # TODO: next person looking at this - rewrite, as this code makes no sense + for job_key in job_details.keys(): # pylint: disable=consider-using-dict-items,consider-iterating-dictionary + job_details[job_key].failures = json.dumps(list(job_assessment[job_key])) + if len(job_assessment[job_key]) > 0: + job_details[job_key].success = 0 return list(job_details.values()) @staticmethod @@ -150,7 +143,7 @@ def _check_jar_task(self, all_task: list[RunTask]) -> list[str]: return task_failures -class JobInfoOwnership(Ownership[JobInfo]): 
+class JobOwnership(Ownership[JobInfo]): """Determine ownership of jobs (workflows) in the inventory. This is the job creator (if known). @@ -160,16 +153,6 @@ def _maybe_direct_owner(self, record: JobInfo) -> str | None: return record.creator -class JobOwnership(Ownership[Job]): - """Determine ownership of jobs (workflows) in the workspace. - - This is the job creator (if known). - """ - - def _maybe_direct_owner(self, record: Job) -> str | None: - return record.creator_user_name - - @dataclass class SubmitRunInfo: run_ids: str # JSON-encoded list of run ids diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 88640988de..ef51d192be 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -294,12 +294,7 @@ def table_move(self) -> TableMove: @cached_property def mounts_crawler(self) -> MountsCrawler: - return MountsCrawler( - self.sql_backend, - self.workspace_client, - self.inventory_database, - self.config.enable_hms_federation, - ) + return MountsCrawler(self.sql_backend, self.workspace_client, self.inventory_database) @cached_property def azure_service_principal_crawler(self) -> AzureServicePrincipalCrawler: @@ -373,7 +368,14 @@ def table_mapping(self) -> TableMapping: @cached_property def catalog_schema(self) -> CatalogSchema: - return CatalogSchema(self.workspace_client, self.table_mapping, self.migrate_grants, self.config.ucx_catalog) + return CatalogSchema( + self.workspace_client, + self.table_mapping, + self.principal_acl, + self.sql_backend, + self.grants_crawler, + self.config.ucx_catalog, + ) @cached_property def verify_timeout(self) -> timedelta: diff --git a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py index 91201a562b..d96068ca4f 100644 --- a/src/databricks/labs/ucx/hive_metastore/catalog_schema.py +++ b/src/databricks/labs/ucx/hive_metastore/catalog_schema.py @@ -1,127 +1,39 @@ import collections -import datetime as dt import logging -from dataclasses import dataclass +from dataclasses import replace from pathlib import PurePath from databricks.labs.blueprint.tui import Prompts +from databricks.labs.lsql.backends import SqlBackend +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import DatabricksError, NotFound -from databricks.sdk.retries import retried +from databricks.sdk.errors import NotFound +from databricks.sdk.service.catalog import SchemaInfo +from databricks.sdk.errors.platform import BadRequest -from databricks.labs.ucx.hive_metastore.grants import MigrateGrants from databricks.labs.ucx.hive_metastore.mapping import TableMapping logger = logging.getLogger(__name__) -@dataclass(frozen=True) -class Catalog: - """Represents a catalog from Unity Catalog. - - The Databricks SDK also comes with a representation for a catalog: `databricks.sdk.service.catalog.CatalogInfo`. - However, we introduce this dataclass to have a minimal, extensible representation required for UCX. - - Docs: - https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/privileges.html#securable-objects-in-unity-catalog - """ - - name: str - """The catalog name""" - - @property - def full_name(self) -> str: - """The full name of the catalog. - - For a catalog, this is same as the attr:name as the catalog is the top of the object hierarchy (see doc link - above). 
- """ - return self.name - - @property - def key(self) -> str: - """Synonym for attr:full_name:.""" - return self.full_name - - @property - def kind(self) -> str: - """The object kind. - - Note: - In the SDK this maps to attr:securable_type. - - TODO: - https://github.com/databrickslabs/ucx/issues/2975 - """ - return "CATALOG" - - -@dataclass(frozen=True) -class Schema: - """Represents a schema from Unity Catalog. - - The Databricks SDK also comes with a representation for a schema: `databricks.sdk.service.catalog.SchemaInfo`. - However, we introduce this dataclass to have a minimal, extensible representation required for UCX. - - Docs: - https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/privileges.html#securable-objects-in-unity-catalog - """ - - catalog: str - """The catalog the schema is part of. - - Note: - Maps to `SchemaInfo.catalog_name`, when introducing this class `catalog` is consistent with - `databricks.labs.ucx.hive_metastore.tables.Table.catalog`. - """ - - name: str - """The schema name""" - - @property - def full_name(self) -> str: - """The full name of the schema. - - For a schema, the second layer of the object hierarchy (see doc link above). - """ - return f"{self.catalog}.{self.name}" - - @property - def key(self) -> str: - """Synonym for attr:full_name.""" - return self.full_name - - @property - def kind(self) -> str: - """The object kind. - - Below "DATABASE" is chosen as this is the kind used in the grants module. However, more consistent with - Databricks documentation would be to use "SCHEMA" instead: - https://docs.databricks.com/en/data-governance/table-acls/object-privileges.html#securable-objects-in-the-hive-metastore - - TODO: - https://github.com/databrickslabs/ucx/issues/2975 - """ - return "DATABASE" # TODO: https://github.com/databrickslabs/ucx/issues/2974 - - class CatalogSchema: def __init__( self, ws: WorkspaceClient, table_mapping: TableMapping, - migrate_grants: MigrateGrants, + principal_grants: PrincipalACL, + sql_backend: SqlBackend, + grants_crawler: GrantsCrawler, ucx_catalog: str, - *, - timeout: dt.timedelta | None = dt.timedelta(seconds=30), ): self._ws = ws self._table_mapping = table_mapping - self._migrate_grants = migrate_grants self._external_locations = list(self._ws.external_locations.list()) + self._principal_grants = principal_grants + self._backend = sql_backend + self._hive_grants_crawler = grants_crawler self._ucx_catalog = ucx_catalog - self._timeout = timeout def create_ucx_catalog(self, prompts: Prompts, *, properties: dict[str, str] | None = None) -> None: """Create the UCX catalog. @@ -132,67 +44,176 @@ def create_ucx_catalog(self, prompts: Prompts, *, properties: dict[str, str] | N properties : (dict[str, str] | None), default None The properties to pass to the catalog. If None, no properties are passed. """ - self._create_catalog_validate(Catalog(self._ucx_catalog), prompts, properties=properties) - - def create_all_catalogs_schemas(self, prompts: Prompts, *, properties: dict[str, str] | None = None) -> None: - """Create all UC catalogs and schemas reference by the table mapping file. - - After creation, the grants from the HIVE metastore schemas are applied to the matching UC catalogs and schemas. 
- """ - catalogs, schemas = self._catalogs_schemas_from_table_mapping() - for dst_catalog, src_schemas in catalogs.items(): - self._create_catalog_validate(dst_catalog, prompts, properties=properties) - for dst_schema, src_schemas in schemas.items(): - self._create_schema(dst_schema) - for src_schema in src_schemas: - self._migrate_grants.apply(src_schema, dst_schema) - # Apply catalog grants as last to avoid transferring ownership before schema grants are applied - for dst_catalog, src_schemas in catalogs.items(): - for src_schema in src_schemas: - self._migrate_grants.apply(src_schema, dst_catalog) - - def _catalogs_schemas_from_table_mapping(self) -> tuple[dict[Catalog, set[Schema]], dict[Schema, set[Schema]]]: - """Generate a list of catalogs and schema to be created from table mapping. - - For applying grants after creating the catalogs and schemas, we track the HIVE metastore schemas from which the - UC catalog or schema is mapped. - - :returns - dict[Catalog, set[Schema]] : The UC catalogs to create with the schemas it is mapped from. - dict[Schema, set[Schema]] : The UC schemas to create with the schemas it is mapped from. - """ - catalogs, schemas = collections.defaultdict(set), collections.defaultdict(set) - for mappings in self._table_mapping.load(): - src_schema = Schema("hive_metastore", mappings.src_schema) - dst_catalog = Catalog(mappings.catalog_name) - dst_schema = Schema(mappings.catalog_name, mappings.dst_schema) - catalogs[dst_catalog].add(src_schema) - schemas[dst_schema].add(src_schema) - return catalogs, schemas + self._create_catalog_validate(self._ucx_catalog, prompts, properties=properties) + + def create_all_catalogs_schemas(self, prompts: Prompts) -> None: + candidate_catalogs, candidate_schemas = self._get_missing_catalogs_schemas() + for candidate_catalog in candidate_catalogs: + self._create_catalog_validate(candidate_catalog, prompts, properties=None) + for candidate_catalog, schemas in candidate_schemas.items(): + for candidate_schema in schemas: + try: + self._create_schema(candidate_catalog, candidate_schema) + except BadRequest as e: + if "already exists" in str(e): + logger.warning(f"Skipping already existing schema: {candidate_catalog}.{candidate_schema}") + continue + self._apply_from_legacy_table_acls() + self._update_principal_acl() + + def _apply_from_legacy_table_acls(self) -> None: + grants = self._get_catalog_schema_hive_grants() + # Scanning for non own grants first + for grant in grants: + if grant.action_type == "OWN": + continue + acl_migrate_sql = grant.uc_grant_sql() + if acl_migrate_sql is None: + logger.warning( + f"Skipping legacy grant that is not supported in UC: {grant.action_type} on {grant.this_type_and_key()}" + ) + continue + logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") + self._backend.execute(acl_migrate_sql) + for grant in grants: + if grant.action_type != "OWN": + continue + own_acl_migrate_sql = grant.uc_grant_sql() + logger.debug(f"Migrating ownership on {grant.this_type_and_key()} using SQL query: {own_acl_migrate_sql}") + if own_acl_migrate_sql is None: + logger.warning( + f"Skipping legacy ownership migration: {grant.action_type} on {grant.this_type_and_key()}" + ) + continue + self._backend.execute(own_acl_migrate_sql) + + def _update_principal_acl(self) -> None: + grants = self._get_catalog_schema_principal_acl_grants() + for grant in grants: + acl_migrate_sql = grant.uc_grant_sql() + if acl_migrate_sql is None: + logger.warning( + f"Skipping legacy grant that is not 
supported in UC: {grant.action_type} on {grant.this_type_and_key()}" + ) + continue + logger.debug(f"Migrating acls on {grant.this_type_and_key()} using SQL query: {acl_migrate_sql}") + self._backend.execute(acl_migrate_sql) + + def _get_catalog_schema_hive_grants(self) -> list[Grant]: + src_dst_schema_mapping = self._get_database_source_target_mapping() + hive_grants = self._hive_grants_crawler.snapshot() + new_grants: list[Grant] = [] + for grant in hive_grants: + if grant.this_type_and_key()[0] == "DATABASE" and grant.database: + for schema in src_dst_schema_mapping[grant.database]: + new_grants.append(replace(grant, catalog=schema.catalog_name, database=schema.name)) + catalog_grants: set[Grant] = set() + for grant in new_grants: + catalog_grants.add(replace(grant, database=None)) + new_grants.extend(catalog_grants) + return new_grants + + def _get_catalog_schema_principal_acl_grants(self) -> list[Grant]: + src_trg_schema_mapping = self._get_database_source_target_mapping() + grants = self._principal_grants.get_interactive_cluster_grants() + # filter on grants to only get database level grants + new_grants: list[Grant] = [] + for grant in grants: + # For a database grant the table/view are not set, while the database is. + if grant.table is None and grant.view is None: + database = grant.database + if database is not None: + new_grants.extend( + replace(grant, catalog=schema.catalog_name, database=schema.name) + for schema in src_trg_schema_mapping[database] + ) + catalog_grants: set[Grant] = set() + for grant in new_grants: + catalog_grants.add(replace(grant, database=None)) + new_grants.extend(catalog_grants) + return new_grants + + def _get_database_source_target_mapping(self) -> dict[str, list[SchemaInfo]]: + """Generate a dictionary of source database in hive_metastore and its + mapping of target UC catalog and schema combinations from the table mappings.""" + src_trg_schema_mapping: dict[str, list[SchemaInfo]] = collections.defaultdict(list) + table_mappings = self._table_mapping.load() + for table_mapping in table_mappings: + schema = SchemaInfo(catalog_name=table_mapping.catalog_name, name=table_mapping.dst_schema) + if schema not in src_trg_schema_mapping[table_mapping.src_schema]: + src_trg_schema_mapping[table_mapping.src_schema].append(schema) + return src_trg_schema_mapping def _create_catalog_validate( - self, - catalog: Catalog, - prompts: Prompts, - *, - properties: dict[str, str] | None, - ) -> Catalog: - catalog_existing = self._get_catalog(catalog) - if catalog_existing: - logger.warning(f"Skipping already existing catalog: {catalog.name}") - return catalog_existing - logger.info(f"Validating UC catalog: {catalog.name}") + self, catalog_name: str, prompts: Prompts, *, properties: dict[str, str] | None + ) -> None: + try: + catalog = self._ws.catalogs.get(catalog_name) + except NotFound: + catalog = None + if catalog: + logger.warning(f"Skipping already existing catalog: {catalog_name}") + return + logger.info(f"Validating UC catalog: {catalog_name}") attempts = 3 while True: catalog_storage = prompts.question( - f"Please provide storage location url for catalog: {catalog.name}", default="metastore" + f"Please provide storage location url for catalog: {catalog_name}", default="metastore" ) if self._validate_location(catalog_storage): break attempts -= 1 if attempts == 0: - raise NotFound(f"Failed to validate location for catalog: {catalog.name}") - return self._create_catalog(catalog, catalog_storage, properties=properties) + raise NotFound(f"Failed to validate 
location for catalog: {catalog_name}") + self._create_catalog(catalog_name, catalog_storage, properties=properties) + + def _list_existing(self) -> tuple[set[str], dict[str, set[str]]]: + """generate a list of existing UC catalogs and schema.""" + logger.info("Listing existing UC catalogs and schemas") + existing_catalogs: set[str] = set() + for catalog_info in self._ws.catalogs.list(): + if catalog_info.name: + existing_catalogs.add(catalog_info.name) + + existing_schemas: dict[str, set[str]] = {} # catalog -> set[schema] + for catalog in existing_catalogs: + existing_schemas[catalog] = set() + for schema in self._ws.schemas.list(catalog, max_results=0): + if schema.name: + existing_schemas[catalog].add(schema.name) + + return existing_catalogs, existing_schemas + + def _list_target(self) -> tuple[set[str], dict[str, set[str]]]: + """generate a list of catalogs and schema to be created from table mappings.""" + target_catalogs: set[str] = set() + target_schemas: dict[str, set[str]] = {} # catalog -> set[schema] + table_mappings = self._table_mapping.load() + for mappings in table_mappings: + target_catalog = mappings.catalog_name + target_schema = mappings.dst_schema + target_catalogs.add(target_catalog) + if target_catalog not in target_schemas: + target_schemas[target_catalog] = {target_schema} + continue + target_schemas[target_catalog].add(target_schema) + return target_catalogs, target_schemas + + def _get_missing_catalogs_schemas(self) -> tuple[set[str], dict[str, set[str]]]: + """prepare a list of catalogs and schema to be created""" + existing_catalogs, existing_schemas = self._list_existing() + target_catalogs, target_schemas = self._list_target() + + logger.info("Preparing a list of UC catalogs and schema to be created") + # filter out existing catalogs and schemas from target catalogs and schemas to be created. + for existing_catalog in existing_catalogs: + if existing_catalog in target_catalogs: + target_catalogs.remove(existing_catalog) + + for catalog, schemas in existing_schemas.items(): + if catalog in target_schemas: + target_schemas[catalog] = target_schemas[catalog] - schemas + return target_catalogs, target_schemas def _validate_location(self, location: str) -> bool: if location == "metastore": @@ -208,82 +229,18 @@ def _validate_location(self, location: str) -> bool: logger.warning(f"No matching external location found for: {location}") return False - def _get_catalog(self, catalog: Catalog) -> Catalog | None: - """Get a catalog. - - Args: - catalog (Catalog) : The catalog to get. - - Returns: - Catalog : The catalog it got. - None : If the catalog does not exist. 
- """ - if self._timeout: - get = retried(on=[NotFound], timeout=self._timeout)(self._ws.catalogs.get) - else: - get = self._ws.catalogs.get - try: - catalog_info = get(catalog.name) - return Catalog(catalog_info.name) - except (NotFound, TimeoutError): - return None - except DatabricksError as e: - logger.warning(f"Unexpected error when getting catalog: {catalog.name}", exc_info=e) - return None - - def _create_catalog( - self, - catalog: Catalog, - catalog_storage: str, - *, - properties: dict[str, str] | None, - ) -> Catalog: - logger.info(f"Creating UC catalog: {catalog.name}") + def _create_catalog(self, catalog: str, catalog_storage: str, *, properties: dict[str, str] | None) -> None: + logger.info(f"Creating UC catalog: {catalog}") if catalog_storage == "metastore": - self._ws.catalogs.create(catalog.name, comment="Created by UCX", properties=properties) + self._ws.catalogs.create(catalog, comment="Created by UCX", properties=properties) else: self._ws.catalogs.create( - catalog.name, + catalog, storage_root=catalog_storage, comment="Created by UCX", properties=properties, ) - catalog_created = self._get_catalog(catalog) - if catalog_created is None: - raise NotFound(f"Created catalog '{catalog.name}' does not exist.") - return catalog_created - - def _get_schema(self, schema: Schema) -> Schema | None: - """Get a schema. - - Args: - schema (Schema) : The schema to get. - - Returns: - Schema : The schema it got. - None : If the catalog does not exist. - """ - if self._timeout: - get = retried(on=[NotFound], timeout=self._timeout)(self._ws.schemas.get) - else: - get = self._ws.schemas.get - try: - schema_info = get(schema.full_name) - return Schema(schema_info.catalog_name, schema_info.name) - except (NotFound, TimeoutError): - return None - except DatabricksError as e: - logger.warning(f"Unexpected error when getting schema: {schema.full_name}", exc_info=e) - return None - def _create_schema(self, schema: Schema) -> Schema: - schema_existing = self._get_schema(schema) - if schema_existing: - logger.warning(f"Skipping already existing schema: {schema.full_name}") - return schema_existing - logger.info(f"Creating UC schema: {schema.full_name}") - self._ws.schemas.create(schema.name, schema.catalog, comment="Created by UCX") - schema_created = self._get_schema(schema) - if schema_created is None: - raise NotFound(f"Created schema '{schema.full_name}' does not exist.") - return schema_created + def _create_schema(self, catalog, schema): + logger.info(f"Creating UC schema: {schema} in catalog: {catalog}") + self._ws.schemas.create(schema, catalog, comment="Created by UCX") diff --git a/src/databricks/labs/ucx/hive_metastore/grants.py b/src/databricks/labs/ucx/hive_metastore/grants.py index db0049f328..f8c0a620a7 100644 --- a/src/databricks/labs/ucx/hive_metastore/grants.py +++ b/src/databricks/labs/ucx/hive_metastore/grants.py @@ -3,7 +3,6 @@ from collections.abc import Callable, Iterable from dataclasses import dataclass, replace from functools import partial, cached_property -from typing import Protocol from databricks.labs.blueprint.installation import Installation from databricks.labs.blueprint.parallel import ManyError, Threads @@ -110,15 +109,6 @@ def object_key(self) -> str: _, key = self.this_type_and_key() return key.lower() - @property - def order(self) -> int: - """Order of the grants to be upheld when applying.""" - match self.action_type: - case "OWN": # Apply ownership as last to avoid losing permissions for applying grants - return 1 - case _: - return 0 - def 
this_type_and_key(self): return self.type_and_key( catalog=self.catalog, @@ -608,22 +598,17 @@ def __init__( self._external_locations = external_locations self._compute_locations = cluster_locations - def get_interactive_cluster_grants(self) -> set[Grant]: + def get_interactive_cluster_grants(self) -> list[Grant]: tables = list(self._tables_crawler.snapshot()) - grants = set[Grant]() + grants: set[Grant] = set() - try: - compute_locations = self._compute_locations() - except DatabricksError as e: - logger.warning("No compute locations found.", exc_info=e) - return grants - for compute_location in compute_locations: + for compute_location in self._compute_locations(): principals = self._get_cluster_principal_mapping(compute_location.compute_id, compute_location.compute_type) if len(principals) == 0: continue cluster_usage = self._get_grants(compute_location.locations, principals, tables) grants.update(cluster_usage) - return grants + return list(grants) def _get_privilege(self, table: Table, locations: dict[str, str]) -> str | None: if table.view_text is not None: @@ -741,27 +726,6 @@ def _get_location_name(self, location_url: str): return None -class SecurableObject(Protocol): - """A protocol for a securable object. - - Docs: - https://docs.databricks.com/en/data-governance/table-acls/object-privileges.html#securable-objects-in-the-hive-metastore - """ - - @property - def kind(self) -> str: - """The type of securable objects, see doc referenced above.""" - - @property - def full_name(self) -> str: - """The object name often a synonym for `key`""" - - @property - def key(self) -> str: - """The object identifier often a synonym for `full_name`""" - return self.full_name - - class MigrateGrants: def __init__( self, @@ -773,22 +737,20 @@ def __init__( self._group_manager = group_manager self._grant_loaders = grant_loaders - def apply(self, src: SecurableObject, dst: SecurableObject) -> bool: + def apply(self, src: Table, uc_table_key: str) -> bool: for grant in self._match_grants(src): - acl_migrate_sql = grant.uc_grant_sql(dst.kind, dst.full_name) + acl_migrate_sql = grant.uc_grant_sql(src.kind, uc_table_key) if acl_migrate_sql is None: logger.warning( f"failed-to-migrate: Hive metastore grant '{grant.action_type}' cannot be mapped to UC grant for " - f"{dst.kind} '{dst.full_name}'. Skipping." + f"{src.kind} '{uc_table_key}'. Skipping." 
) continue - logger.debug(f"Migrating acls on {dst.full_name} using SQL query: {acl_migrate_sql}") + logger.debug(f"Migrating acls on {uc_table_key} using SQL query: {acl_migrate_sql}") try: self._sql_backend.execute(acl_migrate_sql) except DatabricksError as e: - logger.warning( - f"failed-to-migrate: Failed to migrate ACL for {src.full_name} to {dst.full_name}", exc_info=e - ) + logger.warning(f"failed-to-migrate: Failed to migrate ACL for {src.key} to {uc_table_key}: {e}") return True @cached_property @@ -803,14 +765,16 @@ def _grants(self) -> list[Grant]: grants.append(grant) return grants - def _match_grants(self, src: SecurableObject) -> list[Grant]: + def _match_grants(self, table: Table) -> list[Grant]: matched_grants = [] for grant in self._grants: - if grant.object_key != src.key: + if grant.database != table.database: + continue + if table.name not in (grant.table, grant.view): continue grant = self._replace_account_group(grant) matched_grants.append(grant) - return sorted(matched_grants, key=lambda g: g.order) + return matched_grants def _replace_account_group(self, grant: Grant) -> Grant: target_principal = self._workspace_to_account_group_names.get(grant.principal) diff --git a/src/databricks/labs/ucx/hive_metastore/locations.py b/src/databricks/labs/ucx/hive_metastore/locations.py index ff011fb535..1ba6e632f1 100644 --- a/src/databricks/labs/ucx/hive_metastore/locations.py +++ b/src/databricks/labs/ucx/hive_metastore/locations.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from functools import cached_property from typing import ClassVar, Optional -from urllib.parse import urlparse, ParseResult +from urllib.parse import urlparse from databricks.labs.blueprint.installation import Installation from databricks.labs.lsql.backends import SqlBackend @@ -42,9 +42,9 @@ class LocationTrie: """ key: str = "" - parent: OptionalLocationTrie = dataclasses.field(repr=False, default=None) + parent: OptionalLocationTrie = None children: dict[str, "LocationTrie"] = dataclasses.field(default_factory=dict) - tables: list[Table] = dataclasses.field(repr=False, default_factory=list) + tables: list[Table] = dataclasses.field(default_factory=list) @cached_property def _path(self) -> list[str]: @@ -57,47 +57,19 @@ def _path(self) -> list[str]: return list(reversed(parts))[1:] @property - def location(self) -> str | None: - if not self.is_valid(): - return None - try: - scheme, netloc, *path = self._path - return f"{scheme}://{netloc}/{'/'.join(path)}".rstrip("/") - except ValueError: - return None + def location(self): + scheme, netloc, *path = self._path + return f"{scheme}://{netloc}/{'/'.join(path)}" - @classmethod - def _parse_location(cls, location: str | None) -> list[str]: + @staticmethod + def _parse_location(location: str | None) -> list[str]: if not location: return [] - parse_result = cls._parse_url(location.rstrip("/")) - if not parse_result: - return [] + parse_result = urlparse(location) parts = [parse_result.scheme, parse_result.netloc] - for part in parse_result.path.split("/"): - if not part: - continue # remove empty strings - parts.append(part) + parts.extend(parse_result.path.strip("/").split("/")) return parts - @staticmethod - def _parse_url(location: str) -> ParseResult | None: - parse_result = urlparse(location) - if parse_result.scheme == 'jdbc': - jdbc_path = parse_result.path.split('://') - if len(jdbc_path) != 2: - return None - netloc, path = jdbc_path[1].split('/', 1) - parse_result = ParseResult( - scheme=f'{parse_result.scheme}:{jdbc_path[0]}', - netloc=netloc, 
- path=path, - params='', - query='', - fragment='', - ) - return parse_result - def insert(self, table: Table) -> None: current = self for part in self._parse_location(table.location): @@ -119,22 +91,11 @@ def find(self, table: Table) -> OptionalLocationTrie: def is_valid(self) -> bool: """A valid location has a scheme and netloc; the path is optional.""" - if len(self._path) < 2: + if len(self._path) < 3: return False scheme, netloc, *_ = self._path - if scheme.startswith('jdbc:') and len(netloc) > 0: - return True return scheme in _EXTERNAL_FILE_LOCATION_SCHEMES and len(netloc) > 0 - def is_jdbc(self) -> bool: - if not self.is_valid(): - return False - return self._path[0].startswith('jdbc:') - - def all_tables(self) -> Iterable[Table]: - for node in self: - yield from node.tables - def has_children(self): return len(self.children) > 0 @@ -164,59 +125,64 @@ def __init__( @cached_property def _mounts_snapshot(self) -> list['Mount']: """Returns all mounts, sorted by longest prefixes first.""" - return sorted(self._mounts_crawler.snapshot(), key=lambda _: (len(_.name), _.name), reverse=True) + return sorted(self._mounts_crawler.snapshot(), key=lambda _: len(_.name), reverse=True) def _external_locations(self) -> Iterable[ExternalLocation]: - trie = LocationTrie() + min_slash = 2 + external_locations: list[ExternalLocation] = [] for table in self._tables_crawler.snapshot(): - table = self._resolve_location(table) - if not table.location: + location = table.location + if not location: continue - trie.insert(table) - queue = list(trie.children.values()) - external_locations = [] - while queue: - curr = queue.pop() - num_children = len(curr.children) # 0 - take parent - if curr.location and (num_children > 1 or num_children == 0): - if curr.parent and num_children == 0 and not curr.is_jdbc(): # one table having the prefix - curr = curr.parent - assert curr.location is not None - external_location = ExternalLocation(curr.location, len(list(curr.all_tables()))) - external_locations.append(external_location) + # TODO: refactor this with LocationTrie + if location.startswith("dbfs:/mnt"): + location = self.resolve_mount(location) + if not location: continue - queue.extend(curr.children.values()) - return sorted(external_locations, key=lambda _: _.location) - - def _resolve_location(self, table: Table) -> Table: - location = table.location - if not location: - return table - location = self._resolve_jdbc(table) - location = self.resolve_mount(location) - return dataclasses.replace(table, location=location) + if ( + not location.startswith("dbfs") + and (self._prefix_size[0] < location.find(":/") < self._prefix_size[1]) + and not location.startswith("jdbc") + ): + self._dbfs_locations(external_locations, location, min_slash) + if location.startswith("jdbc"): + self._add_jdbc_location(external_locations, location, table) + return external_locations def resolve_mount(self, location: str | None) -> str | None: if not location: return None - if location.startswith('/dbfs'): - location = 'dbfs:' + location[5:] # convert FUSE path to DBFS path - if not location.startswith('dbfs:'): - return location # not a mount, save some cycles for mount in self._mounts_snapshot: - prefix = mount.as_scheme_prefix() - if not location.startswith(prefix): - continue - logger.debug(f"Replacing location {prefix} with {mount.source} in {location}") - location = location.replace(prefix, mount.source) - return location + for prefix in (mount.as_scheme_prefix(), mount.as_fuse_prefix()): + if not location.startswith(prefix): + 
continue + logger.debug(f"Replacing location {prefix} with {mount.source} in {location}") + location = location.replace(prefix, mount.source) + return location logger.debug(f"Mount not found for location {location}. Skipping replacement.") return location - def _resolve_jdbc(self, table: Table) -> str | None: - location = table.location - if not location or not table.storage_properties or not location.startswith('jdbc:'): - return location + @staticmethod + def _dbfs_locations(external_locations, location, min_slash): + dupe = False + loc = 0 + while loc < len(external_locations) and not dupe: + common = ( + os.path.commonpath([external_locations[loc].location, os.path.dirname(location) + "/"]).replace( + ":/", "://" + ) + + "/" + ) + if common.count("/") > min_slash: + table_count = external_locations[loc].table_count + external_locations[loc] = ExternalLocation(common, table_count + 1) + dupe = True + loc += 1 + if not dupe: + external_locations.append(ExternalLocation(os.path.dirname(location) + "/", 1)) + + def _add_jdbc_location(self, external_locations, location, table): + dupe = False pattern = r"(\w+)=(.*?)(?=\s*,|\s*\])" # Find all matches in the input string # Storage properties is of the format @@ -235,12 +201,20 @@ def _resolve_jdbc(self, table: Table) -> str | None: # currently supporting databricks and mysql external tables # add other jdbc types if "databricks" in location.lower(): - return f"jdbc:databricks://{host};httpPath={httppath}" - if "mysql" in location.lower(): - return f"jdbc:mysql://{host}:{port}/{database}" - if not provider == "": - return f"jdbc:{provider.lower()}://{host}:{port}/{database}" - return f"{location.lower()}/{host}:{port}/{database}" + jdbc_location = f"jdbc:databricks://{host};httpPath={httppath}" + elif "mysql" in location.lower(): + jdbc_location = f"jdbc:mysql://{host}:{port}/{database}" + elif not provider == "": + jdbc_location = f"jdbc:{provider.lower()}://{host}:{port}/{database}" + else: + jdbc_location = f"{location.lower()}/{host}:{port}/{database}" + for ext_loc in external_locations: + if ext_loc.location == jdbc_location: + ext_loc.table_count += 1 + dupe = True + break + if not dupe: + external_locations.append(ExternalLocation(jdbc_location, 1)) def _crawl(self) -> Iterable[ExternalLocation]: return self._external_locations() @@ -341,18 +315,14 @@ class Mount: def as_scheme_prefix(self) -> str: return f'dbfs:{self.name}' # dbfs:/mnt/mount-name + def as_fuse_prefix(self) -> str: + return f'/dbfs{self.name}' # /dbfs/mnt/mount-name + class MountsCrawler(CrawlerBase[Mount]): - def __init__( - self, - backend: SqlBackend, - ws: WorkspaceClient, - inventory_database: str, - enable_hms_federation: bool = False, - ): + def __init__(self, backend: SqlBackend, ws: WorkspaceClient, inventory_database: str): super().__init__(backend, "hive_metastore", inventory_database, "mounts", Mount) self._dbutils = ws.dbutils - self._enable_hms_federation = enable_hms_federation @staticmethod def _deduplicate_mounts(mounts: list) -> list: @@ -368,48 +338,11 @@ def _deduplicate_mounts(mounts: list) -> list: deduplicated_mounts.append(obj) return deduplicated_mounts - @cached_property - def _jvm(self): - # pylint: disable=import-error,import-outside-toplevel,broad-exception-caught - try: - from pyspark.sql.session import SparkSession # type: ignore[import-not-found] - - spark = SparkSession.builder.getOrCreate() - return spark._jvm # pylint: disable=protected-access - except Exception as err: - logger.warning(f"Cannot create Py4j proxy: {err}") - return 
None - - def _resolve_dbfs_root(self) -> Mount | None: - # pylint: disable=broad-exception-caught,too-many-try-statements - try: - jvm = self._jvm - if not jvm: - return None - uri = jvm.java.net.URI - some = jvm.scala.Some - hms_fed_dbfs_utils = jvm.com.databricks.sql.managedcatalog.connections.HmsFedDbfsUtils - root_location_opt = hms_fed_dbfs_utils.resolveDbfsPath(some(uri("dbfs:/user/hive/warehouse"))) - if root_location_opt.isDefined(): - source: str = root_location_opt.get().toString() - source = source.removesuffix('user/hive/warehouse') - return Mount("/", source) - return None - except Exception as err: - logger.warning(f"Failed to resolve DBFS root location: {err}") - return None - def _crawl(self) -> Iterable[Mount]: mounts = [] try: for mount_point, source, _ in self._dbutils.fs.mounts(): mounts.append(Mount(mount_point, source)) - if self._enable_hms_federation: - root_mount = self._resolve_dbfs_root() - if root_mount: - # filter out DatabricksRoot, otherwise ExternalLocations.resolve_mount() won't work - mounts = list(filter(lambda _: _.source != 'DatabricksRoot', mounts)) - mounts.append(root_mount) except Exception as error: # pylint: disable=broad-except if "com.databricks.backend.daemon.dbutils.DBUtilsCore.mounts() is not whitelisted" in str(error): logger.warning( diff --git a/src/databricks/labs/ucx/hive_metastore/mapping.py b/src/databricks/labs/ucx/hive_metastore/mapping.py index 48f3c30b22..04ee92ec94 100644 --- a/src/databricks/labs/ucx/hive_metastore/mapping.py +++ b/src/databricks/labs/ucx/hive_metastore/mapping.py @@ -58,16 +58,6 @@ def from_src_dst(cls, src_table: TableInfo, dst_schema: SchemaInfo) -> "Rule": def match(self, table: TableIdentifier) -> bool: return table.catalog == "hive_metastore" and self.src_schema == table.schema and self.src_table == table.table - @property - def as_uc_table(self) -> Table: - return Table( - self.catalog_name, - self.dst_schema, - self.dst_table, - object_type="UNKNOWN", - table_format="UNKNOWN", - ) - @property def as_uc_table_key(self) -> str: return f"{self.catalog_name}.{self.dst_schema}.{self.dst_table}" diff --git a/src/databricks/labs/ucx/hive_metastore/table_migrate.py b/src/databricks/labs/ucx/hive_metastore/table_migrate.py index 4819d36707..45f6e70bee 100644 --- a/src/databricks/labs/ucx/hive_metastore/table_migrate.py +++ b/src/databricks/labs/ucx/hive_metastore/table_migrate.py @@ -1,15 +1,14 @@ import dataclasses import logging -import re from collections import defaultdict -from functools import partial, cached_property +from functools import partial from databricks.labs.blueprint.parallel import Threads from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk import WorkspaceClient -from databricks.sdk.errors.platform import DatabricksError from databricks.labs.ucx.framework.utils import escape_sql_identifier +from databricks.sdk import WorkspaceClient + from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.grants import MigrateGrants from databricks.labs.ucx.hive_metastore.locations import ExternalLocations @@ -18,7 +17,6 @@ TableMapping, TableToMigrate, ) - from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationStatusRefresher from databricks.labs.ucx.hive_metastore.tables import ( MigrationCount, @@ -30,6 +28,7 @@ ViewsMigrationSequencer, ViewToMigrate, ) +from databricks.sdk.errors.platform import DatabricksError logger = logging.getLogger(__name__) @@ -45,7 +44,6 @@ def __init__( migrate_grants: MigrateGrants, 
external_locations: ExternalLocations, ): - self._tc = table_crawler self._backend = backend self._ws = ws @@ -73,8 +71,6 @@ def migrate_tables( hiveserde_in_place_migrate: bool = False, managed_table_external_storage: str = "CLONE", ): - if managed_table_external_storage == "CONVERT_TO_EXTERNAL": - self._spark = self._spark_session if what in [What.DB_DATASET, What.UNKNOWN]: logger.error(f"Can't migrate tables with type {what.name}") return None @@ -99,7 +95,7 @@ def _migrate_tables( for table in tables_in_scope: tasks.append( partial( - self._safe_migrate_table, + self._migrate_table, table, managed_table_external_storage, hiveserde_in_place_migrate, @@ -127,19 +123,7 @@ def _migrate_views(self): self.index(force_refresh=True) return all_tasks - @cached_property - def _spark_session(self): - # pylint: disable-next=import-error,import-outside-toplevel - from pyspark.sql.session import SparkSession # type: ignore[import-not-found] - - return SparkSession.builder.getOrCreate() - def _migrate_managed_table(self, managed_table_external_storage: str, src_table: TableToMigrate): - if managed_table_external_storage == 'CONVERT_TO_EXTERNAL': - if self._convert_hms_table_to_external(src_table.src): - return self._migrate_external_table( - src_table.src, src_table.rule - ) # _migrate_external_table remains unchanged if managed_table_external_storage == 'SYNC_AS_EXTERNAL': return self._migrate_managed_as_external_table(src_table.src, src_table.rule) # new method if managed_table_external_storage == 'CLONE': @@ -147,35 +131,15 @@ def _migrate_managed_table(self, managed_table_external_storage: str, src_table: logger.warning(f"failed-to-migrate: unknown managed_table_external_storage: {managed_table_external_storage}") return True - def _safe_migrate_table( + def _migrate_table( self, src_table: TableToMigrate, managed_table_external_storage: str, hiveserde_in_place_migrate: bool = False, - ) -> bool: + ): if self._table_already_migrated(src_table.rule.as_uc_table_key): logger.info(f"Table {src_table.src.key} already migrated to {src_table.rule.as_uc_table_key}") return True - try: - return self._migrate_table(src_table, managed_table_external_storage, hiveserde_in_place_migrate) - except Exception as e: # pylint: disable=broad-exception-caught - # Catching a Spark AnalysisException here, for which we do not have the dependency to catch explicitly - pattern = ( # See https://github.com/databrickslabs/ucx/issues/2891 - r"INVALID_PARAMETER_VALUE: Invalid input: RPC CreateTable Field managedcatalog.ColumnInfo.name: " - r'At columns.\d+: name "" is not a valid name`' - ) - if re.match(pattern, str(e)): - logger.warning(f"failed-to-migrate: Table with empty column name '{src_table.src.key}'", exc_info=e) - else: - logger.warning(f"failed-to-migrate: Unknown reason for table '{src_table.src.key}'", exc_info=e) - return False - - def _migrate_table( - self, - src_table: TableToMigrate, - managed_table_external_storage: str, - hiveserde_in_place_migrate: bool = False, - ) -> bool: if src_table.src.what == What.DBFS_ROOT_DELTA: return self._migrate_dbfs_root_table(src_table.src, src_table.rule) if src_table.src.what == What.DBFS_ROOT_NON_DELTA: @@ -233,7 +197,7 @@ def _migrate_view_table(self, src_view: ViewToMigrate): except DatabricksError as e: logger.warning(f"Failed to migrate view {src_view.src.key} to {src_view.rule.as_uc_table_key}: {e}") return False - return self._migrate_grants.apply(src_view.src, src_view.rule.as_uc_table) + return self._migrate_grants.apply(src_view.src, 
src_view.rule.as_uc_table_key) def _sql_migrate_view(self, src_view: ViewToMigrate) -> str: # We have to fetch create statement this way because of columns in: @@ -243,58 +207,8 @@ def _sql_migrate_view(self, src_view: ViewToMigrate) -> str: # this does not require the index to be refreshed because the dependencies have already been validated return src_view.sql_migrate_view(self.index()) - @cached_property - def _catalog(self): - return self._spark._jsparkSession.sessionState().catalog() # pylint: disable=protected-access - - @cached_property - def _table_identifier(self): - return self._spark._jvm.org.apache.spark.sql.catalyst.TableIdentifier # pylint: disable=protected-access - - @cached_property - def _catalog_type(self): - return ( - self._spark._jvm.org.apache.spark.sql.catalyst.catalog.CatalogTableType # pylint: disable=protected-access - ) - - @cached_property - def _catalog_table(self): - return self._spark._jvm.org.apache.spark.sql.catalyst.catalog.CatalogTable # pylint: disable=protected-access - def _convert_hms_table_to_external(self, src_table: Table): - try: - logger.info(f"Changing HMS managed table {src_table.name} to External Table type.") - database = self._spark._jvm.scala.Some(src_table.database) # pylint: disable=protected-access - table_identifier = self._table_identifier(src_table.name, database) - old_table = self._catalog.getTableMetadata(table_identifier) - new_table = self._catalog_table( - old_table.identifier(), - self._catalog_type('EXTERNAL'), - old_table.storage(), - old_table.schema(), - old_table.provider(), - old_table.partitionColumnNames(), - old_table.bucketSpec(), - old_table.owner(), - old_table.createTime(), - old_table.lastAccessTime(), - old_table.createVersion(), - old_table.properties(), - old_table.stats(), - old_table.viewText(), - old_table.comment(), - old_table.unsupportedFeatures(), - old_table.tracksPartitionsInCatalog(), - old_table.schemaPreservesCase(), - old_table.ignoredProperties(), - old_table.viewOriginalText(), - ) - self._catalog.alterTable(new_table) - logger.info(f"Converted {src_table.name} to External Table type.") - except Exception as e: # pylint: disable=broad-exception-caught - logger.warning(f"Error converting HMS table {src_table.name} to external: {e}", exc_info=True) - return False - return True + pass def _migrate_managed_as_external_table(self, src_table: Table, rule: Rule): target_table_key = rule.as_uc_table_key @@ -309,7 +223,7 @@ def _migrate_managed_as_external_table(self, src_table: Table, rule: Rule): ) return False self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _migrate_external_table(self, src_table: Table, rule: Rule): target_table_key = rule.as_uc_table_key @@ -324,7 +238,7 @@ def _migrate_external_table(self, src_table: Table, rule: Rule): ) return False self._backend.execute(self._sql_alter_from(src_table, rule.as_uc_table_key, self._ws.get_workspace_id())) - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _migrate_external_table_hiveserde_in_place(self, src_table: Table, rule: Rule): # verify hive serde type @@ -362,7 +276,7 @@ def _migrate_external_table_hiveserde_in_place(self, src_table: Table, rule: Rul except DatabricksError as e: logger.warning(f"failed-to-migrate: Failed to migrate table {src_table.key} to 
{rule.as_uc_table_key}: {e}") return False - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _migrate_dbfs_root_table(self, src_table: Table, rule: Rule): target_table_key = rule.as_uc_table_key @@ -378,7 +292,7 @@ def _migrate_dbfs_root_table(self, src_table: Table, rule: Rule): except DatabricksError as e: logger.warning(f"failed-to-migrate: Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _migrate_table_create_ctas(self, src_table: Table, rule: Rule): if src_table.what not in [What.EXTERNAL_NO_SYNC, What.EXTERNAL_HIVESERDE]: @@ -402,7 +316,7 @@ def _migrate_table_create_ctas(self, src_table: Table, rule: Rule): except DatabricksError as e: logger.warning(f"failed-to-migrate: Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _migrate_table_in_mount(self, src_table: Table, rule: Rule): target_table_key = rule.as_uc_table_key @@ -417,7 +331,7 @@ def _migrate_table_in_mount(self, src_table: Table, rule: Rule): except DatabricksError as e: logger.warning(f"failed-to-migrate: Failed to migrate table {src_table.key} to {rule.as_uc_table_key}: {e}") return False - return self._migrate_grants.apply(src_table, rule.as_uc_table) + return self._migrate_grants.apply(src_table, rule.as_uc_table_key) def _table_already_migrated(self, target) -> bool: return target in self._seen_tables diff --git a/src/databricks/labs/ucx/hive_metastore/tables.py b/src/databricks/labs/ucx/hive_metastore/tables.py index de8457503a..2e6d6e92c0 100644 --- a/src/databricks/labs/ucx/hive_metastore/tables.py +++ b/src/databricks/labs/ucx/hive_metastore/tables.py @@ -99,7 +99,6 @@ def is_hive(self) -> bool: @property def key(self) -> str: - # TODO: https://github.com/databrickslabs/ucx/issues/2979 if self.is_table_in_mount: return f"{self.catalog}.{self.database}.{self.location}".lower() return f"{self.catalog}.{self.database}.{self.name}".lower() diff --git a/src/databricks/labs/ucx/hive_metastore/workflows.py b/src/databricks/labs/ucx/hive_metastore/workflows.py index 4fd0add564..3021217fbb 100644 --- a/src/databricks/labs/ucx/hive_metastore/workflows.py +++ b/src/databricks/labs/ucx/hive_metastore/workflows.py @@ -13,9 +13,7 @@ def migrate_external_tables_sync(self, ctx: RuntimeContext): """This workflow task migrates the external tables that are supported by SYNC command from the Hive Metastore to the Unity Catalog. """ - ctx.tables_migrator.migrate_tables( - what=What.EXTERNAL_SYNC, managed_table_external_storage=ctx.config.managed_table_external_storage - ) + ctx.tables_migrator.migrate_tables(what=What.EXTERNAL_SYNC) @job_task(job_cluster="table_migration", depends_on=[Assessment.crawl_tables]) def migrate_dbfs_root_delta_tables(self, ctx: RuntimeContext): diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index 90e285f698..95754e82b0 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -379,7 +379,6 @@ def _config_table_migration(self, spark_conf_dict) -> tuple[int, int, dict, str] managed_table_migration_choices = { "Migrate MANAGED HMS table as EXTERNAL UC table. 
This option would require you to convert MANAGED HMS tables to EXTERNAL HMS tables once UC migration is complete, otherwise deleting HMS MANAGED table would delete the migrated UC table": 'SYNC_AS_EXTERNAL', "Copy data from MANAGED HMS to MANAGED UC table": 'CLONE', - "Convert MANAGED HMS table to EXTERNAL HMS table and migrate as EXTERNAL UC table. This risks data leakage, as once the relevant HMS tables are deleted, the underlying data won't get deleted anymore.": 'CONVERT_TO_EXTERNAL', } managed_table_migration_choice = self.prompts.choice_from_dict( "If hive_metastore contains managed table with external" diff --git a/src/databricks/labs/ucx/mixins/cached_workspace_path.py b/src/databricks/labs/ucx/mixins/cached_workspace_path.py index 50abd71d03..ac8565e8bf 100644 --- a/src/databricks/labs/ucx/mixins/cached_workspace_path.py +++ b/src/databricks/labs/ucx/mixins/cached_workspace_path.py @@ -3,136 +3,137 @@ import os from collections import OrderedDict from collections.abc import Generator -from io import BytesIO -from pathlib import PurePosixPath -from typing import IO, TypeVar +from io import StringIO, BytesIO from databricks.sdk import WorkspaceClient from databricks.sdk.service.workspace import ObjectInfo from databricks.labs.blueprint.paths import WorkspacePath -from databricks.labs.ucx.source_code.base import decode_with_bom +class _CachedIO: -# lru_cache won't let us invalidate cache entries -# so we provide our own custom lru_cache -class _PathLruCache: + def __init__(self, content): + self._content = content + self._index = 0 - _datas: OrderedDict[PurePosixPath, bytes] - """Cached binary data of files, keyed by workspace path, ordered from oldest to newest.""" + def __enter__(self): + return self - _max_entries: int - """The maximum number of entries to hold in the cache.""" + def __exit__(self, exc_type, exc_val, exc_tb): + return False - def __init__(self, max_entries: int) -> None: - # Ordered from oldest to newest. - self._datas = OrderedDict() - self._max_entries = max_entries + def read(self, *args, **_kwargs): + count = -1 if len(args) < 1 or args[0] < 1 else args[0] + if count == -1: + return self._content + start = self._index + end = self._index + count + if start >= len(self._content): + return None + self._index = self._index + count + return self._content[start:end] + + def __iter__(self): + if isinstance(self._content, str): + yield from StringIO(self._content) + return + yield from self._as_string_io().__iter__() - @classmethod - def _normalize(cls, path: _CachedPath) -> PurePosixPath: - # Note: must not return the same instance that was passed in, to avoid circular references (and memory leaks). - return PurePosixPath(*path.parts) + def with_mode(self, mode: str): + if 'b' in mode: + return self._as_bytes_io() + return self._as_string_io() + + def _as_bytes_io(self): + if isinstance(self._content, bytes): + return self + return BytesIO(self._content.encode("utf-8-sig")) - def load(self, cached_path: _CachedPath, buffering: int = -1) -> bytes: - normalized_path = self._normalize(cached_path) + def _as_string_io(self): + if isinstance(self._content, str): + return self + return StringIO(self._content.decode("utf-8")) - data = self._datas.get(normalized_path, None) - if data is not None: - self._datas.move_to_end(normalized_path) - return data - # Need to bypass the _CachedPath.open() override to actually open and retrieve the file content. 
- with WorkspacePath.open(cached_path, mode="rb", buffering=buffering) as workspace_file: - data = workspace_file.read() - if self._max_entries <= len(self._datas): +# lru_cache won't let us invalidate cache entries +# so we provide our own custom lru_cache +class _PathLruCache: + + def __init__(self, max_entries: int): + self._datas: OrderedDict[str, bytes | str] = OrderedDict() + self._max_entries = max_entries + + def open(self, cached_path: _CachedPath, mode, buffering, encoding, errors, newline): + path = str(cached_path) + if path in self._datas: + self._datas.move_to_end(path) + return _CachedIO(self._datas[path]).with_mode(mode) + io_obj = WorkspacePath.open(cached_path, mode, buffering, encoding, errors, newline) + # can't read twice from an IO so need to cache data rather than the io object + data = io_obj.read() + self._datas[path] = data + result = _CachedIO(data).with_mode(mode) + if len(self._datas) > self._max_entries: self._datas.popitem(last=False) - self._datas[normalized_path] = data - return data + return result - def clear(self) -> None: + def clear(self): self._datas.clear() - def remove(self, path: _CachedPath) -> None: - del self._datas[self._normalize(path)] + def remove(self, path: str): + if path in self._datas: + self._datas.pop(path) class _CachedPath(WorkspacePath): - def __init__(self, cache: _PathLruCache, ws: WorkspaceClient, *args: str | bytes | os.PathLike) -> None: + def __init__(self, cache: _PathLruCache, ws: WorkspaceClient, *args: str | bytes | os.PathLike): super().__init__(ws, *args) self._cache = cache - @classmethod - def _from_object_info_with_cache( - cls, - cache: _PathLruCache, - ws: WorkspaceClient, - object_info: ObjectInfo, - ) -> _CachedPath: - assert object_info.path - path = cls(cache, ws, object_info.path) - path._cached_object_info = object_info - return path - - def with_segments(self: _CachedPathT, *path_segments: bytes | str | os.PathLike) -> _CachedPathT: + def with_object_info(self, object_info: ObjectInfo): + self._cached_object_info = object_info + return self + + def with_segments(self, *path_segments: bytes | str | os.PathLike) -> _CachedPath: return type(self)(self._cache, self._ws, *path_segments) def iterdir(self) -> Generator[_CachedPath, None, None]: - # Variant of the superclass implementation that preserves the cache, as well as the client. for object_info in self._ws.workspace.list(self.as_posix()): - yield self._from_object_info_with_cache(self._cache, self._ws, object_info) - - def open( # type: ignore[override] + path = object_info.path + if path is None: + msg = f"Cannot initialise without object path: {object_info}" + raise ValueError(msg) + child = _CachedPath(self._cache, self._ws, path) + yield child.with_object_info(object_info) + + def open( self, mode: str = "r", buffering: int = -1, encoding: str | None = None, errors: str | None = None, newline: str | None = None, - ) -> IO: - # We only cache reads; if a write happens we use the default implementation (and evict any cache entry). 
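# --- Illustrative sketch, separate from the patch hunks above and below: the plain
# OrderedDict LRU pattern that _PathLruCache (in both the removed and the restored
# version) is built on. All names here are hypothetical; only the stdlib is used.
from collections import OrderedDict


class TinyLruCache:
    """Minimal LRU cache: recently used keys live at the end of the ordered dict."""

    def __init__(self, max_entries: int) -> None:
        self._entries: OrderedDict[str, bytes] = OrderedDict()
        self._max_entries = max_entries

    def get(self, key: str) -> bytes | None:
        data = self._entries.get(key)
        if data is not None:
            self._entries.move_to_end(key)  # mark as most recently used
        return data

    def put(self, key: str, data: bytes) -> None:
        self._entries[key] = data
        self._entries.move_to_end(key)
        if len(self._entries) > self._max_entries:
            self._entries.popitem(last=False)  # evict the least recently used entry


# usage: cache = TinyLruCache(2); cache.put("/a", b"1"); cache.put("/b", b"2")
# cache.get("/a"); cache.put("/c", b"3")  # "/b" is evicted, "/a" survives
# --- end of illustrative sketch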
- if 'w' in mode: - self._cache.remove(self) - return super().open(mode, buffering, encoding, errors, newline) - - binary_data = self._cache.load(self, buffering=buffering) - binary_io = BytesIO(binary_data) - if 'b' in mode: - return binary_io + ): + # only cache reads + if 'r' in mode: + return self._cache.open(self, mode, buffering, encoding, errors, newline) + self._cache.remove(str(self)) + return super().open(mode, buffering, encoding, errors, newline) - return decode_with_bom(binary_io, encoding, errors, newline) + def _cached_open(self, mode: str, buffering: int, encoding: str | None, errors: str | None, newline: str | None): + return super().open(mode, buffering, encoding, errors, newline) # _rename calls unlink so no need to override it def unlink(self, missing_ok: bool = False) -> None: - self._cache.remove(self) + self._cache.remove(str(self)) return super().unlink(missing_ok) -_CachedPathT = TypeVar("_CachedPathT", bound=_CachedPath) - - class WorkspaceCache: - class InvalidWorkspacePath(ValueError): - pass - - def __init__(self, ws: WorkspaceClient, max_entries: int = 2048) -> None: + def __init__(self, ws: WorkspaceClient, max_entries=2048): self._ws = ws self._cache = _PathLruCache(max_entries) - def get_workspace_path(self, path: str) -> WorkspacePath: - """Obtain a `WorkspacePath` instance for a path that refers to a workspace file or notebook. - - The instance returned participates in this content cache: the first time the path is opened the content will - be immediately retrieved (prior to reading) and cached. - - Args: - path: a valid workspace path (must be absolute) - Raises: - WorkspaceCache.InvalidWorkspacePath: this is raised immediately if the supplied path is not a syntactically - valid workspace path. (This is not raised if the path is syntactically valid but does not exist.) - """ - if not path.startswith("/"): - msg = f"Invalid workspace path; must be absolute and start with a slash ('/'): {path}" - raise WorkspaceCache.InvalidWorkspacePath(msg) + def get_path(self, path: str): return _CachedPath(self._cache, self._ws, path) diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index b0f9c91b21..71f76e6085 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -2,7 +2,7 @@ import codecs import dataclasses -import io +import locale import logging import sys from abc import abstractmethod, ABC @@ -10,7 +10,7 @@ from dataclasses import dataclass, field from datetime import datetime from pathlib import Path -from typing import Any, BinaryIO, TextIO +from typing import Any from astroid import AstroidSyntaxError, NodeNG # type: ignore from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError @@ -482,71 +482,18 @@ def file_language(path: Path) -> Language | None: return SUPPORTED_EXTENSION_LANGUAGES.get(path.suffix.lower()) -def _detect_encoding_bom(binary_io: BinaryIO, *, preserve_position: bool) -> str | None: - # Peek at the first (up to) 4 bytes, preserving the file position if requested. - position = binary_io.tell() if preserve_position else None - try: - maybe_bom = binary_io.read(4) - finally: - if position is not None: - binary_io.seek(position) - # For these encodings, TextIOWrapper will skip over the BOM during decoding. 
- if maybe_bom.startswith(codecs.BOM_UTF32_LE) or maybe_bom.startswith(codecs.BOM_UTF32_BE): - return "utf-32" - if maybe_bom.startswith(codecs.BOM_UTF16_LE) or maybe_bom.startswith(codecs.BOM_UTF16_BE): - return "utf-16" - if maybe_bom.startswith(codecs.BOM_UTF8): - return "utf-8-sig" - return None - - -def decode_with_bom( - file: BinaryIO, - encoding: str | None = None, - errors: str | None = None, - newline: str | None = None, -) -> TextIO: - """Wrap an open binary file with a text decoder. - - This has the same semantics as the built-in `open()` call, except that if the encoding is not specified and the - file is seekable then it will be checked for a BOM. If a BOM marker is found, that encoding is used. When neither - an encoding nor a BOM are present the encoding of the system locale is used. - - Args: - file: the open (binary) file to wrap in text mode. - encoding: force decoding with a specific locale. If not present the file BOM and system locale are used. - errors: how decoding errors should be handled, as per open(). - newline: how newlines should be handled, as per open(). - Raises: - ValueError: if the encoding should be detected via potential BOM marker but the file is not seekable. - Returns: - a text-based IO wrapper that will decode the underlying binary-mode file as text. - """ - use_encoding = _detect_encoding_bom(file, preserve_position=True) if encoding is None else encoding - return io.TextIOWrapper(file, encoding=use_encoding, errors=errors, newline=newline) - - -def read_text(path: Path, size: int = -1) -> str: - """Read a file as text, decoding according to the BOM marker if that is present. - - This differs to the normal `.read_text()` method on path which does not support BOM markers. - - Arguments: - path: the path to read text from. - size: how much text (measured in characters) to read. If negative, all text is read. Less may be read if the - file is smaller than the specified size. - Returns: - The string content of the file, up to the specified size. - """ - with path.open("rb") as binary_io: - # If the open file is seekable, we can detect the BOM and decode without re-opening. - if binary_io.seekable(): - with decode_with_bom(binary_io) as f: - return f.read(size) - encoding = _detect_encoding_bom(binary_io, preserve_position=False) - # Otherwise having read the BOM there's no way to rewind so we need to re-open and read from that. 
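# --- Illustrative sketch, separate from the patch hunks above and below: how BOM-based
# encoding detection works, the technique behind both the removed _detect_encoding_bom
# and the restored guess_encoding. The function name is hypothetical; only the stdlib
# is used.
import codecs
import locale
from pathlib import Path


def sniff_encoding(path: Path) -> str:
    """Guess a text file's encoding from its byte-order mark, if it has one."""
    with path.open("rb") as file:
        head = file.read(4)  # the longest BOM (UTF-32) is four bytes
    # check UTF-32 before UTF-16: the UTF-32-LE BOM begins with the UTF-16-LE BOM bytes
    if head.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        return "utf-32"
    if head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        return "utf-16"
    if head.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"  # 'utf-8-sig' makes the decoder skip the BOM
    return locale.getpreferredencoding(False)  # no BOM: fall back to the locale default
# --- end of illustrative sketch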
- with path.open("rt", encoding=encoding) as f: - return f.read(size) +def guess_encoding(path: Path) -> str: + # some files encode a unicode BOM (byte-order-mark), so let's use that if available + with path.open('rb') as _file: + raw = _file.read(4) + if raw.startswith(codecs.BOM_UTF32_LE) or raw.startswith(codecs.BOM_UTF32_BE): + return 'utf-32' + if raw.startswith(codecs.BOM_UTF16_LE) or raw.startswith(codecs.BOM_UTF16_BE): + return 'utf-16' + if raw.startswith(codecs.BOM_UTF8): + return 'utf-8-sig' + # no BOM, let's use default encoding + return locale.getpreferredencoding(False) # duplicated from CellLanguage to prevent cyclic import @@ -566,7 +513,8 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool: if content is not None: return content.startswith(magic_header) try: - file_header = read_text(path, size=len(magic_header)) + with path.open('rt', encoding=guess_encoding(path)) as f: + file_header = f.read(len(magic_header)) except (FileNotFoundError, UnicodeDecodeError, PermissionError): logger.warning(f"Could not read file {path}") return False diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 5c00795442..bfd72ccfa5 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -31,11 +31,11 @@ LocatedAdvice, is_a_notebook, file_language, + guess_encoding, SourceInfo, UsedTable, LineageAtom, PythonSequentialLinter, - read_text, ) from databricks.labs.ucx.source_code.directfs_access import ( DirectFsAccessCrawler, @@ -156,7 +156,7 @@ def _as_path(self, path: str) -> Path: parsed_path = parse.urlparse(path) match parsed_path.scheme: case "": - return self._cache.get_workspace_path(path) + return self._cache.get_path(path) case "dbfs": return DBFSPath(self._ws, parsed_path.path) case other: @@ -184,8 +184,6 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) -> yield from self._register_whl(graph, library) if library.requirements: yield from self._register_requirements_txt(graph, library) - except WorkspaceCache.InvalidWorkspacePath as e: - yield DependencyProblem('cannot-load-file', str(e)) except BadRequest as e: # see https://github.com/databrickslabs/ucx/issues/2916 yield DependencyProblem('library-error', f'Cannot retrieve library: {e}') @@ -235,11 +233,8 @@ def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProbl self._named_parameters = self._task.notebook_task.base_parameters notebook_path = self._task.notebook_task.notebook_path logger.info(f'Discovering {self._task.task_key} entrypoint: {notebook_path}') - try: - # Notebooks can't be on DBFS. - path = self._cache.get_workspace_path(notebook_path) - except WorkspaceCache.InvalidWorkspacePath as e: - return [DependencyProblem('cannot-load-notebook', str(e))] + # Notebooks can't be on DBFS. 
+ path = self._cache.get_path(notebook_path) return graph.register_notebook(path, False) def _register_spark_python_task(self, graph: DependencyGraph) -> Iterable[DependencyProblem]: @@ -248,10 +243,7 @@ def _register_spark_python_task(self, graph: DependencyGraph) -> Iterable[Depend self._parameters = self._task.spark_python_task.parameters python_file = self._task.spark_python_task.python_file logger.info(f'Discovering {self._task.task_key} entrypoint: {python_file}') - try: - path = self._as_path(python_file) - except WorkspaceCache.InvalidWorkspacePath as e: - return [DependencyProblem('cannot-load-file', str(e))] + path = self._as_path(python_file) return graph.register_file(path) @staticmethod @@ -315,7 +307,11 @@ def _register_pipeline_task(self, graph: DependencyGraph) -> Iterable[Dependency if not library.notebook: return if library.notebook.path: - yield from self._register_notebook_path(graph, library.notebook.path) + notebook_path = library.notebook.path + # Notebooks can't be on DBFS. + path = self._cache.get_path(notebook_path) + # the notebook is the root of the graph, so there's no context to inherit + yield from graph.register_notebook(path, inherit_context=False) if library.jar: yield from self._register_library(graph, compute.Library(jar=library.jar)) if library.maven: @@ -323,16 +319,6 @@ def _register_pipeline_task(self, graph: DependencyGraph) -> Iterable[Dependency if library.file: yield DependencyProblem('not-yet-implemented', 'File library is not yet implemented') - def _register_notebook_path(self, graph: DependencyGraph, notebook_path: str) -> Iterable[DependencyProblem]: - try: - # Notebooks can't be on DBFS. - path = self._cache.get_workspace_path(notebook_path) - except WorkspaceCache.InvalidWorkspacePath as e: - yield DependencyProblem('cannot-load-notebook', str(e)) - return - # the notebook is the root of the graph, so there's no context to inherit - yield from graph.register_notebook(path, inherit_context=False) - def _register_existing_cluster_id(self, graph: DependencyGraph) -> Iterable[DependencyProblem]: if not self._task.existing_cluster_id: return @@ -616,7 +602,7 @@ def _process_dependency( logger.warning(f"Unknown language for {dependency.path}") return cell_language = CellLanguage.of_language(language) - source = read_text(dependency.path) + source = dependency.path.read_text(guess_encoding(dependency.path)) if is_a_notebook(dependency.path): yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree) elif dependency.path.is_file(): diff --git a/src/databricks/labs/ucx/source_code/known.json b/src/databricks/labs/ucx/source_code/known.json index 484f6019aa..6230f2f824 100644 --- a/src/databricks/labs/ucx/source_code/known.json +++ b/src/databricks/labs/ucx/source_code/known.json @@ -7129,9 +7129,6 @@ "json5.tool": [], "json5.version": [] }, - "jsonpatch": { - "jsonpatch": [] - }, "jsonpointer": { "jsonpointer": [] }, @@ -8773,1225 +8770,6 @@ "langchain.vectorstores.zep": [], "langchain.vectorstores.zilliz": [] }, - "langchain-community": { - "langchain_community": [], - "langchain_community.adapters": [], - "langchain_community.adapters.openai": [], - "langchain_community.agent_toolkits": [], - "langchain_community.agent_toolkits.ainetwork": [], - "langchain_community.agent_toolkits.ainetwork.toolkit": [], - "langchain_community.agent_toolkits.amadeus": [], - "langchain_community.agent_toolkits.amadeus.toolkit": [], - "langchain_community.agent_toolkits.azure_ai_services": [], - 
"langchain_community.agent_toolkits.azure_cognitive_services": [], - "langchain_community.agent_toolkits.base": [], - "langchain_community.agent_toolkits.cassandra_database": [], - "langchain_community.agent_toolkits.cassandra_database.toolkit": [], - "langchain_community.agent_toolkits.clickup": [], - "langchain_community.agent_toolkits.clickup.toolkit": [], - "langchain_community.agent_toolkits.cogniswitch": [], - "langchain_community.agent_toolkits.cogniswitch.toolkit": [], - "langchain_community.agent_toolkits.connery": [], - "langchain_community.agent_toolkits.connery.toolkit": [], - "langchain_community.agent_toolkits.csv": [], - "langchain_community.agent_toolkits.file_management": [], - "langchain_community.agent_toolkits.file_management.toolkit": [], - "langchain_community.agent_toolkits.financial_datasets": [], - "langchain_community.agent_toolkits.financial_datasets.toolkit": [], - "langchain_community.agent_toolkits.github": [], - "langchain_community.agent_toolkits.github.toolkit": [], - "langchain_community.agent_toolkits.gitlab": [], - "langchain_community.agent_toolkits.gitlab.toolkit": [], - "langchain_community.agent_toolkits.gmail": [], - "langchain_community.agent_toolkits.gmail.toolkit": [], - "langchain_community.agent_toolkits.jira": [], - "langchain_community.agent_toolkits.jira.toolkit": [], - "langchain_community.agent_toolkits.json": [], - "langchain_community.agent_toolkits.json.base": [], - "langchain_community.agent_toolkits.json.prompt": [], - "langchain_community.agent_toolkits.json.toolkit": [], - "langchain_community.agent_toolkits.load_tools": [], - "langchain_community.agent_toolkits.multion": [], - "langchain_community.agent_toolkits.multion.toolkit": [], - "langchain_community.agent_toolkits.nasa": [], - "langchain_community.agent_toolkits.nasa.toolkit": [], - "langchain_community.agent_toolkits.nla": [], - "langchain_community.agent_toolkits.nla.tool": [], - "langchain_community.agent_toolkits.nla.toolkit": [], - "langchain_community.agent_toolkits.office365": [], - "langchain_community.agent_toolkits.office365.toolkit": [], - "langchain_community.agent_toolkits.openapi": [], - "langchain_community.agent_toolkits.openapi.base": [], - "langchain_community.agent_toolkits.openapi.planner": [], - "langchain_community.agent_toolkits.openapi.planner_prompt": [], - "langchain_community.agent_toolkits.openapi.prompt": [], - "langchain_community.agent_toolkits.openapi.spec": [], - "langchain_community.agent_toolkits.openapi.toolkit": [], - "langchain_community.agent_toolkits.playwright": [], - "langchain_community.agent_toolkits.playwright.toolkit": [], - "langchain_community.agent_toolkits.polygon": [], - "langchain_community.agent_toolkits.polygon.toolkit": [], - "langchain_community.agent_toolkits.powerbi": [], - "langchain_community.agent_toolkits.powerbi.base": [], - "langchain_community.agent_toolkits.powerbi.chat_base": [], - "langchain_community.agent_toolkits.powerbi.prompt": [], - "langchain_community.agent_toolkits.powerbi.toolkit": [], - "langchain_community.agent_toolkits.slack": [], - "langchain_community.agent_toolkits.slack.toolkit": [], - "langchain_community.agent_toolkits.spark_sql": [], - "langchain_community.agent_toolkits.spark_sql.base": [], - "langchain_community.agent_toolkits.spark_sql.prompt": [], - "langchain_community.agent_toolkits.spark_sql.toolkit": [], - "langchain_community.agent_toolkits.sql": [], - "langchain_community.agent_toolkits.sql.base": [], - "langchain_community.agent_toolkits.sql.prompt": [], - 
"langchain_community.agent_toolkits.sql.toolkit": [], - "langchain_community.agent_toolkits.steam": [], - "langchain_community.agent_toolkits.steam.toolkit": [], - "langchain_community.agent_toolkits.xorbits": [], - "langchain_community.agent_toolkits.zapier": [], - "langchain_community.agent_toolkits.zapier.toolkit": [], - "langchain_community.agents": [], - "langchain_community.agents.openai_assistant": [], - "langchain_community.agents.openai_assistant.base": [], - "langchain_community.cache": [], - "langchain_community.callbacks": [], - "langchain_community.callbacks.aim_callback": [], - "langchain_community.callbacks.argilla_callback": [], - "langchain_community.callbacks.arize_callback": [], - "langchain_community.callbacks.arthur_callback": [], - "langchain_community.callbacks.bedrock_anthropic_callback": [], - "langchain_community.callbacks.clearml_callback": [], - "langchain_community.callbacks.comet_ml_callback": [], - "langchain_community.callbacks.confident_callback": [], - "langchain_community.callbacks.context_callback": [], - "langchain_community.callbacks.fiddler_callback": [], - "langchain_community.callbacks.flyte_callback": [], - "langchain_community.callbacks.human": [], - "langchain_community.callbacks.infino_callback": [], - "langchain_community.callbacks.labelstudio_callback": [], - "langchain_community.callbacks.llmonitor_callback": [], - "langchain_community.callbacks.manager": [], - "langchain_community.callbacks.mlflow_callback": [], - "langchain_community.callbacks.openai_info": [], - "langchain_community.callbacks.promptlayer_callback": [], - "langchain_community.callbacks.sagemaker_callback": [], - "langchain_community.callbacks.streamlit": [], - "langchain_community.callbacks.streamlit.mutable_expander": [], - "langchain_community.callbacks.streamlit.streamlit_callback_handler": [], - "langchain_community.callbacks.tracers": [], - "langchain_community.callbacks.tracers.comet": [], - "langchain_community.callbacks.tracers.wandb": [], - "langchain_community.callbacks.trubrics_callback": [], - "langchain_community.callbacks.upstash_ratelimit_callback": [], - "langchain_community.callbacks.uptrain_callback": [], - "langchain_community.callbacks.utils": [], - "langchain_community.callbacks.wandb_callback": [], - "langchain_community.callbacks.whylabs_callback": [], - "langchain_community.chains": [], - "langchain_community.chains.ernie_functions": [], - "langchain_community.chains.ernie_functions.base": [], - "langchain_community.chains.graph_qa": [], - "langchain_community.chains.graph_qa.arangodb": [], - "langchain_community.chains.graph_qa.base": [], - "langchain_community.chains.graph_qa.cypher": [], - "langchain_community.chains.graph_qa.cypher_utils": [], - "langchain_community.chains.graph_qa.falkordb": [], - "langchain_community.chains.graph_qa.gremlin": [], - "langchain_community.chains.graph_qa.hugegraph": [], - "langchain_community.chains.graph_qa.kuzu": [], - "langchain_community.chains.graph_qa.nebulagraph": [], - "langchain_community.chains.graph_qa.neptune_cypher": [], - "langchain_community.chains.graph_qa.neptune_sparql": [], - "langchain_community.chains.graph_qa.ontotext_graphdb": [], - "langchain_community.chains.graph_qa.prompts": [], - "langchain_community.chains.graph_qa.sparql": [], - "langchain_community.chains.llm_requests": [], - "langchain_community.chains.natbot": [], - "langchain_community.chains.natbot.base": [], - "langchain_community.chains.natbot.crawler": [], - "langchain_community.chains.natbot.prompt": [], - 
"langchain_community.chains.openapi": [], - "langchain_community.chains.openapi.chain": [], - "langchain_community.chains.openapi.prompts": [], - "langchain_community.chains.openapi.requests_chain": [], - "langchain_community.chains.openapi.response_chain": [], - "langchain_community.chains.pebblo_retrieval": [], - "langchain_community.chains.pebblo_retrieval.base": [], - "langchain_community.chains.pebblo_retrieval.enforcement_filters": [], - "langchain_community.chains.pebblo_retrieval.models": [], - "langchain_community.chains.pebblo_retrieval.utilities": [], - "langchain_community.chat_loaders": [], - "langchain_community.chat_loaders.base": [], - "langchain_community.chat_loaders.facebook_messenger": [], - "langchain_community.chat_loaders.gmail": [], - "langchain_community.chat_loaders.imessage": [], - "langchain_community.chat_loaders.langsmith": [], - "langchain_community.chat_loaders.slack": [], - "langchain_community.chat_loaders.telegram": [], - "langchain_community.chat_loaders.utils": [], - "langchain_community.chat_loaders.whatsapp": [], - "langchain_community.chat_message_histories": [], - "langchain_community.chat_message_histories.astradb": [], - "langchain_community.chat_message_histories.cassandra": [], - "langchain_community.chat_message_histories.cosmos_db": [], - "langchain_community.chat_message_histories.dynamodb": [], - "langchain_community.chat_message_histories.elasticsearch": [], - "langchain_community.chat_message_histories.file": [], - "langchain_community.chat_message_histories.firestore": [], - "langchain_community.chat_message_histories.in_memory": [], - "langchain_community.chat_message_histories.kafka": [], - "langchain_community.chat_message_histories.momento": [], - "langchain_community.chat_message_histories.mongodb": [], - "langchain_community.chat_message_histories.neo4j": [], - "langchain_community.chat_message_histories.postgres": [], - "langchain_community.chat_message_histories.redis": [], - "langchain_community.chat_message_histories.rocksetdb": [], - "langchain_community.chat_message_histories.singlestoredb": [], - "langchain_community.chat_message_histories.sql": [], - "langchain_community.chat_message_histories.streamlit": [], - "langchain_community.chat_message_histories.tidb": [], - "langchain_community.chat_message_histories.upstash_redis": [], - "langchain_community.chat_message_histories.xata": [], - "langchain_community.chat_message_histories.zep": [], - "langchain_community.chat_message_histories.zep_cloud": [], - "langchain_community.chat_models": [], - "langchain_community.chat_models.anthropic": [], - "langchain_community.chat_models.anyscale": [], - "langchain_community.chat_models.azure_openai": [], - "langchain_community.chat_models.azureml_endpoint": [], - "langchain_community.chat_models.baichuan": [], - "langchain_community.chat_models.baidu_qianfan_endpoint": [], - "langchain_community.chat_models.bedrock": [], - "langchain_community.chat_models.cohere": [], - "langchain_community.chat_models.coze": [], - "langchain_community.chat_models.dappier": [], - "langchain_community.chat_models.databricks": [], - "langchain_community.chat_models.deepinfra": [], - "langchain_community.chat_models.edenai": [], - "langchain_community.chat_models.ernie": [], - "langchain_community.chat_models.everlyai": [], - "langchain_community.chat_models.fake": [], - "langchain_community.chat_models.fireworks": [], - "langchain_community.chat_models.friendli": [], - "langchain_community.chat_models.gigachat": [], - 
"langchain_community.chat_models.google_palm": [], - "langchain_community.chat_models.gpt_router": [], - "langchain_community.chat_models.huggingface": [], - "langchain_community.chat_models.human": [], - "langchain_community.chat_models.hunyuan": [], - "langchain_community.chat_models.javelin_ai_gateway": [], - "langchain_community.chat_models.jinachat": [], - "langchain_community.chat_models.kinetica": [], - "langchain_community.chat_models.konko": [], - "langchain_community.chat_models.litellm": [], - "langchain_community.chat_models.litellm_router": [], - "langchain_community.chat_models.llama_edge": [], - "langchain_community.chat_models.llamacpp": [], - "langchain_community.chat_models.maritalk": [], - "langchain_community.chat_models.meta": [], - "langchain_community.chat_models.minimax": [], - "langchain_community.chat_models.mlflow": [], - "langchain_community.chat_models.mlflow_ai_gateway": [], - "langchain_community.chat_models.mlx": [], - "langchain_community.chat_models.moonshot": [], - "langchain_community.chat_models.oci_generative_ai": [], - "langchain_community.chat_models.octoai": [], - "langchain_community.chat_models.ollama": [], - "langchain_community.chat_models.openai": [], - "langchain_community.chat_models.pai_eas_endpoint": [], - "langchain_community.chat_models.perplexity": [], - "langchain_community.chat_models.premai": [], - "langchain_community.chat_models.promptlayer_openai": [], - "langchain_community.chat_models.sambanova": [], - "langchain_community.chat_models.snowflake": [], - "langchain_community.chat_models.solar": [], - "langchain_community.chat_models.sparkllm": [], - "langchain_community.chat_models.symblai_nebula": [], - "langchain_community.chat_models.tongyi": [], - "langchain_community.chat_models.vertexai": [], - "langchain_community.chat_models.volcengine_maas": [], - "langchain_community.chat_models.yandex": [], - "langchain_community.chat_models.yi": [], - "langchain_community.chat_models.yuan2": [], - "langchain_community.chat_models.zhipuai": [], - "langchain_community.cross_encoders": [], - "langchain_community.cross_encoders.base": [], - "langchain_community.cross_encoders.fake": [], - "langchain_community.cross_encoders.huggingface": [], - "langchain_community.cross_encoders.sagemaker_endpoint": [], - "langchain_community.docstore": [], - "langchain_community.docstore.arbitrary_fn": [], - "langchain_community.docstore.base": [], - "langchain_community.docstore.document": [], - "langchain_community.docstore.in_memory": [], - "langchain_community.docstore.wikipedia": [], - "langchain_community.document_compressors": [], - "langchain_community.document_compressors.dashscope_rerank": [], - "langchain_community.document_compressors.flashrank_rerank": [], - "langchain_community.document_compressors.jina_rerank": [], - "langchain_community.document_compressors.llmlingua_filter": [], - "langchain_community.document_compressors.openvino_rerank": [], - "langchain_community.document_compressors.rankllm_rerank": [], - "langchain_community.document_compressors.volcengine_rerank": [], - "langchain_community.document_loaders": [], - "langchain_community.document_loaders.acreom": [], - "langchain_community.document_loaders.airbyte": [], - "langchain_community.document_loaders.airbyte_json": [], - "langchain_community.document_loaders.airtable": [], - "langchain_community.document_loaders.apify_dataset": [], - "langchain_community.document_loaders.arcgis_loader": [], - "langchain_community.document_loaders.arxiv": [], - 
"langchain_community.document_loaders.assemblyai": [], - "langchain_community.document_loaders.astradb": [], - "langchain_community.document_loaders.async_html": [], - "langchain_community.document_loaders.athena": [], - "langchain_community.document_loaders.azlyrics": [], - "langchain_community.document_loaders.azure_ai_data": [], - "langchain_community.document_loaders.azure_blob_storage_container": [], - "langchain_community.document_loaders.azure_blob_storage_file": [], - "langchain_community.document_loaders.baiducloud_bos_directory": [], - "langchain_community.document_loaders.baiducloud_bos_file": [], - "langchain_community.document_loaders.base": [], - "langchain_community.document_loaders.base_o365": [], - "langchain_community.document_loaders.bibtex": [], - "langchain_community.document_loaders.bigquery": [], - "langchain_community.document_loaders.bilibili": [], - "langchain_community.document_loaders.blackboard": [], - "langchain_community.document_loaders.blob_loaders": [], - "langchain_community.document_loaders.blob_loaders.cloud_blob_loader": [], - "langchain_community.document_loaders.blob_loaders.file_system": [], - "langchain_community.document_loaders.blob_loaders.schema": [], - "langchain_community.document_loaders.blob_loaders.youtube_audio": [], - "langchain_community.document_loaders.blockchain": [], - "langchain_community.document_loaders.brave_search": [], - "langchain_community.document_loaders.browserbase": [], - "langchain_community.document_loaders.browserless": [], - "langchain_community.document_loaders.cassandra": [], - "langchain_community.document_loaders.chatgpt": [], - "langchain_community.document_loaders.chm": [], - "langchain_community.document_loaders.chromium": [], - "langchain_community.document_loaders.college_confidential": [], - "langchain_community.document_loaders.concurrent": [], - "langchain_community.document_loaders.confluence": [], - "langchain_community.document_loaders.conllu": [], - "langchain_community.document_loaders.couchbase": [], - "langchain_community.document_loaders.csv_loader": [], - "langchain_community.document_loaders.cube_semantic": [], - "langchain_community.document_loaders.datadog_logs": [], - "langchain_community.document_loaders.dataframe": [], - "langchain_community.document_loaders.dedoc": [], - "langchain_community.document_loaders.diffbot": [], - "langchain_community.document_loaders.directory": [], - "langchain_community.document_loaders.discord": [], - "langchain_community.document_loaders.doc_intelligence": [], - "langchain_community.document_loaders.docugami": [], - "langchain_community.document_loaders.docusaurus": [], - "langchain_community.document_loaders.dropbox": [], - "langchain_community.document_loaders.duckdb_loader": [], - "langchain_community.document_loaders.email": [], - "langchain_community.document_loaders.epub": [], - "langchain_community.document_loaders.etherscan": [], - "langchain_community.document_loaders.evernote": [], - "langchain_community.document_loaders.excel": [], - "langchain_community.document_loaders.facebook_chat": [], - "langchain_community.document_loaders.fauna": [], - "langchain_community.document_loaders.figma": [], - "langchain_community.document_loaders.firecrawl": [], - "langchain_community.document_loaders.gcs_directory": [], - "langchain_community.document_loaders.gcs_file": [], - "langchain_community.document_loaders.generic": [], - "langchain_community.document_loaders.geodataframe": [], - "langchain_community.document_loaders.git": [], - 
"langchain_community.document_loaders.gitbook": [], - "langchain_community.document_loaders.github": [], - "langchain_community.document_loaders.glue_catalog": [], - "langchain_community.document_loaders.google_speech_to_text": [], - "langchain_community.document_loaders.googledrive": [], - "langchain_community.document_loaders.gutenberg": [], - "langchain_community.document_loaders.helpers": [], - "langchain_community.document_loaders.hn": [], - "langchain_community.document_loaders.html": [], - "langchain_community.document_loaders.html_bs": [], - "langchain_community.document_loaders.hugging_face_dataset": [], - "langchain_community.document_loaders.hugging_face_model": [], - "langchain_community.document_loaders.ifixit": [], - "langchain_community.document_loaders.image": [], - "langchain_community.document_loaders.image_captions": [], - "langchain_community.document_loaders.imsdb": [], - "langchain_community.document_loaders.iugu": [], - "langchain_community.document_loaders.joplin": [], - "langchain_community.document_loaders.json_loader": [], - "langchain_community.document_loaders.kinetica_loader": [], - "langchain_community.document_loaders.lakefs": [], - "langchain_community.document_loaders.larksuite": [], - "langchain_community.document_loaders.llmsherpa": [], - "langchain_community.document_loaders.markdown": [], - "langchain_community.document_loaders.mastodon": [], - "langchain_community.document_loaders.max_compute": [], - "langchain_community.document_loaders.mediawikidump": [], - "langchain_community.document_loaders.merge": [], - "langchain_community.document_loaders.mhtml": [], - "langchain_community.document_loaders.mintbase": [], - "langchain_community.document_loaders.modern_treasury": [], - "langchain_community.document_loaders.mongodb": [], - "langchain_community.document_loaders.news": [], - "langchain_community.document_loaders.notebook": [], - "langchain_community.document_loaders.notion": [], - "langchain_community.document_loaders.notiondb": [], - "langchain_community.document_loaders.nuclia": [], - "langchain_community.document_loaders.obs_directory": [], - "langchain_community.document_loaders.obs_file": [], - "langchain_community.document_loaders.obsidian": [], - "langchain_community.document_loaders.odt": [], - "langchain_community.document_loaders.onedrive": [], - "langchain_community.document_loaders.onedrive_file": [], - "langchain_community.document_loaders.onenote": [], - "langchain_community.document_loaders.open_city_data": [], - "langchain_community.document_loaders.oracleadb_loader": [], - "langchain_community.document_loaders.oracleai": [], - "langchain_community.document_loaders.org_mode": [], - "langchain_community.document_loaders.parsers": [], - "langchain_community.document_loaders.parsers.audio": [], - "langchain_community.document_loaders.parsers.doc_intelligence": [], - "langchain_community.document_loaders.parsers.docai": [], - "langchain_community.document_loaders.parsers.generic": [], - "langchain_community.document_loaders.parsers.grobid": [], - "langchain_community.document_loaders.parsers.html": [], - "langchain_community.document_loaders.parsers.html.bs4": [], - "langchain_community.document_loaders.parsers.language": [], - "langchain_community.document_loaders.parsers.language.c": [], - "langchain_community.document_loaders.parsers.language.cobol": [], - "langchain_community.document_loaders.parsers.language.code_segmenter": [], - "langchain_community.document_loaders.parsers.language.cpp": [], - 
"langchain_community.document_loaders.parsers.language.csharp": [], - "langchain_community.document_loaders.parsers.language.elixir": [], - "langchain_community.document_loaders.parsers.language.go": [], - "langchain_community.document_loaders.parsers.language.java": [], - "langchain_community.document_loaders.parsers.language.javascript": [], - "langchain_community.document_loaders.parsers.language.kotlin": [], - "langchain_community.document_loaders.parsers.language.language_parser": [], - "langchain_community.document_loaders.parsers.language.lua": [], - "langchain_community.document_loaders.parsers.language.perl": [], - "langchain_community.document_loaders.parsers.language.php": [], - "langchain_community.document_loaders.parsers.language.python": [], - "langchain_community.document_loaders.parsers.language.ruby": [], - "langchain_community.document_loaders.parsers.language.rust": [], - "langchain_community.document_loaders.parsers.language.scala": [], - "langchain_community.document_loaders.parsers.language.tree_sitter_segmenter": [], - "langchain_community.document_loaders.parsers.language.typescript": [], - "langchain_community.document_loaders.parsers.msword": [], - "langchain_community.document_loaders.parsers.pdf": [], - "langchain_community.document_loaders.parsers.registry": [], - "langchain_community.document_loaders.parsers.txt": [], - "langchain_community.document_loaders.parsers.vsdx": [], - "langchain_community.document_loaders.pdf": [], - "langchain_community.document_loaders.pebblo": [], - "langchain_community.document_loaders.polars_dataframe": [], - "langchain_community.document_loaders.powerpoint": [], - "langchain_community.document_loaders.psychic": [], - "langchain_community.document_loaders.pubmed": [], - "langchain_community.document_loaders.pyspark_dataframe": [], - "langchain_community.document_loaders.python": [], - "langchain_community.document_loaders.quip": [], - "langchain_community.document_loaders.readthedocs": [], - "langchain_community.document_loaders.recursive_url_loader": [], - "langchain_community.document_loaders.reddit": [], - "langchain_community.document_loaders.roam": [], - "langchain_community.document_loaders.rocksetdb": [], - "langchain_community.document_loaders.rspace": [], - "langchain_community.document_loaders.rss": [], - "langchain_community.document_loaders.rst": [], - "langchain_community.document_loaders.rtf": [], - "langchain_community.document_loaders.s3_directory": [], - "langchain_community.document_loaders.s3_file": [], - "langchain_community.document_loaders.scrapfly": [], - "langchain_community.document_loaders.scrapingant": [], - "langchain_community.document_loaders.sharepoint": [], - "langchain_community.document_loaders.sitemap": [], - "langchain_community.document_loaders.slack_directory": [], - "langchain_community.document_loaders.snowflake_loader": [], - "langchain_community.document_loaders.spider": [], - "langchain_community.document_loaders.spreedly": [], - "langchain_community.document_loaders.sql_database": [], - "langchain_community.document_loaders.srt": [], - "langchain_community.document_loaders.stripe": [], - "langchain_community.document_loaders.surrealdb": [], - "langchain_community.document_loaders.telegram": [], - "langchain_community.document_loaders.tencent_cos_directory": [], - "langchain_community.document_loaders.tencent_cos_file": [], - "langchain_community.document_loaders.tensorflow_datasets": [], - "langchain_community.document_loaders.text": [], - "langchain_community.document_loaders.tidb": 
[], - "langchain_community.document_loaders.tomarkdown": [], - "langchain_community.document_loaders.toml": [], - "langchain_community.document_loaders.trello": [], - "langchain_community.document_loaders.tsv": [], - "langchain_community.document_loaders.twitter": [], - "langchain_community.document_loaders.unstructured": [], - "langchain_community.document_loaders.url": [], - "langchain_community.document_loaders.url_playwright": [], - "langchain_community.document_loaders.url_selenium": [], - "langchain_community.document_loaders.vsdx": [], - "langchain_community.document_loaders.weather": [], - "langchain_community.document_loaders.web_base": [], - "langchain_community.document_loaders.whatsapp_chat": [], - "langchain_community.document_loaders.wikipedia": [], - "langchain_community.document_loaders.word_document": [], - "langchain_community.document_loaders.xml": [], - "langchain_community.document_loaders.xorbits": [], - "langchain_community.document_loaders.youtube": [], - "langchain_community.document_loaders.yuque": [], - "langchain_community.document_transformers": [], - "langchain_community.document_transformers.beautiful_soup_transformer": [], - "langchain_community.document_transformers.doctran_text_extract": [], - "langchain_community.document_transformers.doctran_text_qa": [], - "langchain_community.document_transformers.doctran_text_translate": [], - "langchain_community.document_transformers.embeddings_redundant_filter": [], - "langchain_community.document_transformers.google_translate": [], - "langchain_community.document_transformers.html2text": [], - "langchain_community.document_transformers.long_context_reorder": [], - "langchain_community.document_transformers.markdownify": [], - "langchain_community.document_transformers.nuclia_text_transform": [], - "langchain_community.document_transformers.openai_functions": [], - "langchain_community.embeddings": [], - "langchain_community.embeddings.aleph_alpha": [], - "langchain_community.embeddings.anyscale": [], - "langchain_community.embeddings.ascend": [], - "langchain_community.embeddings.awa": [], - "langchain_community.embeddings.azure_openai": [], - "langchain_community.embeddings.baichuan": [], - "langchain_community.embeddings.baidu_qianfan_endpoint": [], - "langchain_community.embeddings.bedrock": [], - "langchain_community.embeddings.bookend": [], - "langchain_community.embeddings.clarifai": [], - "langchain_community.embeddings.cloudflare_workersai": [], - "langchain_community.embeddings.clova": [], - "langchain_community.embeddings.cohere": [], - "langchain_community.embeddings.dashscope": [], - "langchain_community.embeddings.databricks": [], - "langchain_community.embeddings.deepinfra": [], - "langchain_community.embeddings.edenai": [], - "langchain_community.embeddings.elasticsearch": [], - "langchain_community.embeddings.embaas": [], - "langchain_community.embeddings.ernie": [], - "langchain_community.embeddings.fake": [], - "langchain_community.embeddings.fastembed": [], - "langchain_community.embeddings.gigachat": [], - "langchain_community.embeddings.google_palm": [], - "langchain_community.embeddings.gpt4all": [], - "langchain_community.embeddings.gradient_ai": [], - "langchain_community.embeddings.huggingface": [], - "langchain_community.embeddings.huggingface_hub": [], - "langchain_community.embeddings.infinity": [], - "langchain_community.embeddings.infinity_local": [], - "langchain_community.embeddings.ipex_llm": [], - "langchain_community.embeddings.itrex": [], - 
"langchain_community.embeddings.javelin_ai_gateway": [], - "langchain_community.embeddings.jina": [], - "langchain_community.embeddings.johnsnowlabs": [], - "langchain_community.embeddings.laser": [], - "langchain_community.embeddings.llamacpp": [], - "langchain_community.embeddings.llamafile": [], - "langchain_community.embeddings.llm_rails": [], - "langchain_community.embeddings.localai": [], - "langchain_community.embeddings.minimax": [], - "langchain_community.embeddings.mlflow": [], - "langchain_community.embeddings.mlflow_gateway": [], - "langchain_community.embeddings.modelscope_hub": [], - "langchain_community.embeddings.mosaicml": [], - "langchain_community.embeddings.nemo": [], - "langchain_community.embeddings.nlpcloud": [], - "langchain_community.embeddings.oci_generative_ai": [], - "langchain_community.embeddings.octoai_embeddings": [], - "langchain_community.embeddings.ollama": [], - "langchain_community.embeddings.openai": [], - "langchain_community.embeddings.openvino": [], - "langchain_community.embeddings.optimum_intel": [], - "langchain_community.embeddings.oracleai": [], - "langchain_community.embeddings.ovhcloud": [], - "langchain_community.embeddings.premai": [], - "langchain_community.embeddings.sagemaker_endpoint": [], - "langchain_community.embeddings.sambanova": [], - "langchain_community.embeddings.self_hosted": [], - "langchain_community.embeddings.self_hosted_hugging_face": [], - "langchain_community.embeddings.sentence_transformer": [], - "langchain_community.embeddings.solar": [], - "langchain_community.embeddings.spacy_embeddings": [], - "langchain_community.embeddings.sparkllm": [], - "langchain_community.embeddings.tensorflow_hub": [], - "langchain_community.embeddings.text2vec": [], - "langchain_community.embeddings.textembed": [], - "langchain_community.embeddings.titan_takeoff": [], - "langchain_community.embeddings.vertexai": [], - "langchain_community.embeddings.volcengine": [], - "langchain_community.embeddings.voyageai": [], - "langchain_community.embeddings.xinference": [], - "langchain_community.embeddings.yandex": [], - "langchain_community.embeddings.zhipuai": [], - "langchain_community.example_selectors": [], - "langchain_community.example_selectors.ngram_overlap": [], - "langchain_community.graph_vectorstores": [], - "langchain_community.graph_vectorstores.base": [], - "langchain_community.graph_vectorstores.cassandra": [], - "langchain_community.graph_vectorstores.extractors": [], - "langchain_community.graph_vectorstores.extractors.gliner_link_extractor": [], - "langchain_community.graph_vectorstores.extractors.hierarchy_link_extractor": [], - "langchain_community.graph_vectorstores.extractors.html_link_extractor": [], - "langchain_community.graph_vectorstores.extractors.keybert_link_extractor": [], - "langchain_community.graph_vectorstores.extractors.link_extractor": [], - "langchain_community.graph_vectorstores.extractors.link_extractor_adapter": [], - "langchain_community.graph_vectorstores.extractors.link_extractor_transformer": [], - "langchain_community.graph_vectorstores.links": [], - "langchain_community.graphs": [], - "langchain_community.graphs.age_graph": [], - "langchain_community.graphs.arangodb_graph": [], - "langchain_community.graphs.falkordb_graph": [], - "langchain_community.graphs.graph_document": [], - "langchain_community.graphs.graph_store": [], - "langchain_community.graphs.gremlin_graph": [], - "langchain_community.graphs.hugegraph": [], - "langchain_community.graphs.index_creator": [], - 
"langchain_community.graphs.kuzu_graph": [], - "langchain_community.graphs.memgraph_graph": [], - "langchain_community.graphs.nebula_graph": [], - "langchain_community.graphs.neo4j_graph": [], - "langchain_community.graphs.neptune_graph": [], - "langchain_community.graphs.neptune_rdf_graph": [], - "langchain_community.graphs.networkx_graph": [], - "langchain_community.graphs.ontotext_graphdb_graph": [], - "langchain_community.graphs.rdf_graph": [], - "langchain_community.graphs.tigergraph_graph": [], - "langchain_community.indexes": [], - "langchain_community.indexes._document_manager": [], - "langchain_community.indexes._sql_record_manager": [], - "langchain_community.indexes.base": [], - "langchain_community.llms": [], - "langchain_community.llms.ai21": [], - "langchain_community.llms.aleph_alpha": [], - "langchain_community.llms.amazon_api_gateway": [], - "langchain_community.llms.anthropic": [], - "langchain_community.llms.anyscale": [], - "langchain_community.llms.aphrodite": [], - "langchain_community.llms.arcee": [], - "langchain_community.llms.aviary": [], - "langchain_community.llms.azureml_endpoint": [], - "langchain_community.llms.baichuan": [], - "langchain_community.llms.baidu_qianfan_endpoint": [], - "langchain_community.llms.bananadev": [], - "langchain_community.llms.baseten": [], - "langchain_community.llms.beam": [], - "langchain_community.llms.bedrock": [], - "langchain_community.llms.bigdl_llm": [], - "langchain_community.llms.bittensor": [], - "langchain_community.llms.cerebriumai": [], - "langchain_community.llms.chatglm": [], - "langchain_community.llms.chatglm3": [], - "langchain_community.llms.clarifai": [], - "langchain_community.llms.cloudflare_workersai": [], - "langchain_community.llms.cohere": [], - "langchain_community.llms.ctransformers": [], - "langchain_community.llms.ctranslate2": [], - "langchain_community.llms.databricks": [], - "langchain_community.llms.deepinfra": [], - "langchain_community.llms.deepsparse": [], - "langchain_community.llms.edenai": [], - "langchain_community.llms.exllamav2": [], - "langchain_community.llms.fake": [], - "langchain_community.llms.fireworks": [], - "langchain_community.llms.forefrontai": [], - "langchain_community.llms.friendli": [], - "langchain_community.llms.gigachat": [], - "langchain_community.llms.google_palm": [], - "langchain_community.llms.gooseai": [], - "langchain_community.llms.gpt4all": [], - "langchain_community.llms.gradient_ai": [], - "langchain_community.llms.huggingface_endpoint": [], - "langchain_community.llms.huggingface_hub": [], - "langchain_community.llms.huggingface_pipeline": [], - "langchain_community.llms.huggingface_text_gen_inference": [], - "langchain_community.llms.human": [], - "langchain_community.llms.ipex_llm": [], - "langchain_community.llms.javelin_ai_gateway": [], - "langchain_community.llms.koboldai": [], - "langchain_community.llms.konko": [], - "langchain_community.llms.layerup_security": [], - "langchain_community.llms.llamacpp": [], - "langchain_community.llms.llamafile": [], - "langchain_community.llms.loading": [], - "langchain_community.llms.manifest": [], - "langchain_community.llms.minimax": [], - "langchain_community.llms.mlflow": [], - "langchain_community.llms.mlflow_ai_gateway": [], - "langchain_community.llms.mlx_pipeline": [], - "langchain_community.llms.modal": [], - "langchain_community.llms.moonshot": [], - "langchain_community.llms.mosaicml": [], - "langchain_community.llms.nlpcloud": [], - "langchain_community.llms.oci_data_science_model_deployment_endpoint": 
[], - "langchain_community.llms.oci_generative_ai": [], - "langchain_community.llms.octoai_endpoint": [], - "langchain_community.llms.ollama": [], - "langchain_community.llms.opaqueprompts": [], - "langchain_community.llms.openai": [], - "langchain_community.llms.openllm": [], - "langchain_community.llms.openlm": [], - "langchain_community.llms.pai_eas_endpoint": [], - "langchain_community.llms.petals": [], - "langchain_community.llms.pipelineai": [], - "langchain_community.llms.predibase": [], - "langchain_community.llms.predictionguard": [], - "langchain_community.llms.promptlayer_openai": [], - "langchain_community.llms.replicate": [], - "langchain_community.llms.rwkv": [], - "langchain_community.llms.sagemaker_endpoint": [], - "langchain_community.llms.sambanova": [], - "langchain_community.llms.self_hosted": [], - "langchain_community.llms.self_hosted_hugging_face": [], - "langchain_community.llms.solar": [], - "langchain_community.llms.sparkllm": [], - "langchain_community.llms.stochasticai": [], - "langchain_community.llms.symblai_nebula": [], - "langchain_community.llms.textgen": [], - "langchain_community.llms.titan_takeoff": [], - "langchain_community.llms.together": [], - "langchain_community.llms.tongyi": [], - "langchain_community.llms.utils": [], - "langchain_community.llms.vertexai": [], - "langchain_community.llms.vllm": [], - "langchain_community.llms.volcengine_maas": [], - "langchain_community.llms.watsonxllm": [], - "langchain_community.llms.weight_only_quantization": [], - "langchain_community.llms.writer": [], - "langchain_community.llms.xinference": [], - "langchain_community.llms.yandex": [], - "langchain_community.llms.yi": [], - "langchain_community.llms.you": [], - "langchain_community.llms.yuan2": [], - "langchain_community.memory": [], - "langchain_community.memory.kg": [], - "langchain_community.memory.motorhead_memory": [], - "langchain_community.memory.zep_cloud_memory": [], - "langchain_community.memory.zep_memory": [], - "langchain_community.output_parsers": [], - "langchain_community.output_parsers.ernie_functions": [], - "langchain_community.output_parsers.rail_parser": [], - "langchain_community.query_constructors": [], - "langchain_community.query_constructors.astradb": [], - "langchain_community.query_constructors.chroma": [], - "langchain_community.query_constructors.dashvector": [], - "langchain_community.query_constructors.databricks_vector_search": [], - "langchain_community.query_constructors.deeplake": [], - "langchain_community.query_constructors.dingo": [], - "langchain_community.query_constructors.elasticsearch": [], - "langchain_community.query_constructors.hanavector": [], - "langchain_community.query_constructors.milvus": [], - "langchain_community.query_constructors.mongodb_atlas": [], - "langchain_community.query_constructors.myscale": [], - "langchain_community.query_constructors.neo4j": [], - "langchain_community.query_constructors.opensearch": [], - "langchain_community.query_constructors.pgvector": [], - "langchain_community.query_constructors.pinecone": [], - "langchain_community.query_constructors.qdrant": [], - "langchain_community.query_constructors.redis": [], - "langchain_community.query_constructors.supabase": [], - "langchain_community.query_constructors.tencentvectordb": [], - "langchain_community.query_constructors.timescalevector": [], - "langchain_community.query_constructors.vectara": [], - "langchain_community.query_constructors.weaviate": [], - "langchain_community.retrievers": [], - 
"langchain_community.retrievers.arcee": [], - "langchain_community.retrievers.arxiv": [], - "langchain_community.retrievers.asknews": [], - "langchain_community.retrievers.azure_ai_search": [], - "langchain_community.retrievers.bedrock": [], - "langchain_community.retrievers.bm25": [], - "langchain_community.retrievers.breebs": [], - "langchain_community.retrievers.chaindesk": [], - "langchain_community.retrievers.chatgpt_plugin_retriever": [], - "langchain_community.retrievers.cohere_rag_retriever": [], - "langchain_community.retrievers.databerry": [], - "langchain_community.retrievers.docarray": [], - "langchain_community.retrievers.dria_index": [], - "langchain_community.retrievers.elastic_search_bm25": [], - "langchain_community.retrievers.embedchain": [], - "langchain_community.retrievers.google_cloud_documentai_warehouse": [], - "langchain_community.retrievers.google_vertex_ai_search": [], - "langchain_community.retrievers.kay": [], - "langchain_community.retrievers.kendra": [], - "langchain_community.retrievers.knn": [], - "langchain_community.retrievers.llama_index": [], - "langchain_community.retrievers.metal": [], - "langchain_community.retrievers.milvus": [], - "langchain_community.retrievers.nanopq": [], - "langchain_community.retrievers.outline": [], - "langchain_community.retrievers.pinecone_hybrid_search": [], - "langchain_community.retrievers.pubmed": [], - "langchain_community.retrievers.pupmed": [], - "langchain_community.retrievers.qdrant_sparse_vector_retriever": [], - "langchain_community.retrievers.rememberizer": [], - "langchain_community.retrievers.remote_retriever": [], - "langchain_community.retrievers.svm": [], - "langchain_community.retrievers.tavily_search_api": [], - "langchain_community.retrievers.tfidf": [], - "langchain_community.retrievers.thirdai_neuraldb": [], - "langchain_community.retrievers.vespa_retriever": [], - "langchain_community.retrievers.weaviate_hybrid_search": [], - "langchain_community.retrievers.web_research": [], - "langchain_community.retrievers.wikipedia": [], - "langchain_community.retrievers.you": [], - "langchain_community.retrievers.zep": [], - "langchain_community.retrievers.zep_cloud": [], - "langchain_community.retrievers.zilliz": [], - "langchain_community.storage": [], - "langchain_community.storage.astradb": [], - "langchain_community.storage.cassandra": [], - "langchain_community.storage.exceptions": [], - "langchain_community.storage.mongodb": [], - "langchain_community.storage.redis": [], - "langchain_community.storage.sql": [], - "langchain_community.storage.upstash_redis": [], - "langchain_community.tools": [], - "langchain_community.tools.ainetwork": [], - "langchain_community.tools.ainetwork.app": [], - "langchain_community.tools.ainetwork.base": [], - "langchain_community.tools.ainetwork.owner": [], - "langchain_community.tools.ainetwork.rule": [], - "langchain_community.tools.ainetwork.transfer": [], - "langchain_community.tools.ainetwork.utils": [], - "langchain_community.tools.ainetwork.value": [], - "langchain_community.tools.amadeus": [], - "langchain_community.tools.amadeus.base": [], - "langchain_community.tools.amadeus.closest_airport": [], - "langchain_community.tools.amadeus.flight_search": [], - "langchain_community.tools.amadeus.utils": [], - "langchain_community.tools.arxiv": [], - "langchain_community.tools.arxiv.tool": [], - "langchain_community.tools.asknews": [], - "langchain_community.tools.asknews.tool": [], - "langchain_community.tools.audio": [], - 
"langchain_community.tools.audio.huggingface_text_to_speech_inference": [], - "langchain_community.tools.azure_ai_services": [], - "langchain_community.tools.azure_ai_services.document_intelligence": [], - "langchain_community.tools.azure_ai_services.image_analysis": [], - "langchain_community.tools.azure_ai_services.speech_to_text": [], - "langchain_community.tools.azure_ai_services.text_analytics_for_health": [], - "langchain_community.tools.azure_ai_services.text_to_speech": [], - "langchain_community.tools.azure_ai_services.utils": [], - "langchain_community.tools.azure_cognitive_services": [], - "langchain_community.tools.azure_cognitive_services.form_recognizer": [], - "langchain_community.tools.azure_cognitive_services.image_analysis": [], - "langchain_community.tools.azure_cognitive_services.speech2text": [], - "langchain_community.tools.azure_cognitive_services.text2speech": [], - "langchain_community.tools.azure_cognitive_services.text_analytics_health": [], - "langchain_community.tools.azure_cognitive_services.utils": [], - "langchain_community.tools.bearly": [], - "langchain_community.tools.bearly.tool": [], - "langchain_community.tools.bing_search": [], - "langchain_community.tools.bing_search.tool": [], - "langchain_community.tools.brave_search": [], - "langchain_community.tools.brave_search.tool": [], - "langchain_community.tools.cassandra_database": [], - "langchain_community.tools.cassandra_database.prompt": [], - "langchain_community.tools.cassandra_database.tool": [], - "langchain_community.tools.clickup": [], - "langchain_community.tools.clickup.prompt": [], - "langchain_community.tools.clickup.tool": [], - "langchain_community.tools.cogniswitch": [], - "langchain_community.tools.cogniswitch.tool": [], - "langchain_community.tools.connery": [], - "langchain_community.tools.connery.models": [], - "langchain_community.tools.connery.service": [], - "langchain_community.tools.connery.tool": [], - "langchain_community.tools.convert_to_openai": [], - "langchain_community.tools.databricks": [], - "langchain_community.tools.databricks._execution": [], - "langchain_community.tools.databricks.tool": [], - "langchain_community.tools.dataforseo_api_search": [], - "langchain_community.tools.dataforseo_api_search.tool": [], - "langchain_community.tools.dataherald": [], - "langchain_community.tools.dataherald.tool": [], - "langchain_community.tools.ddg_search": [], - "langchain_community.tools.ddg_search.tool": [], - "langchain_community.tools.e2b_data_analysis": [], - "langchain_community.tools.e2b_data_analysis.tool": [], - "langchain_community.tools.e2b_data_analysis.unparse": [], - "langchain_community.tools.edenai": [], - "langchain_community.tools.edenai.audio_speech_to_text": [], - "langchain_community.tools.edenai.audio_text_to_speech": [], - "langchain_community.tools.edenai.edenai_base_tool": [], - "langchain_community.tools.edenai.image_explicitcontent": [], - "langchain_community.tools.edenai.image_objectdetection": [], - "langchain_community.tools.edenai.ocr_identityparser": [], - "langchain_community.tools.edenai.ocr_invoiceparser": [], - "langchain_community.tools.edenai.text_moderation": [], - "langchain_community.tools.eleven_labs": [], - "langchain_community.tools.eleven_labs.models": [], - "langchain_community.tools.eleven_labs.text2speech": [], - "langchain_community.tools.file_management": [], - "langchain_community.tools.file_management.copy": [], - "langchain_community.tools.file_management.delete": [], - "langchain_community.tools.file_management.file_search": 
[], - "langchain_community.tools.file_management.list_dir": [], - "langchain_community.tools.file_management.move": [], - "langchain_community.tools.file_management.read": [], - "langchain_community.tools.file_management.utils": [], - "langchain_community.tools.file_management.write": [], - "langchain_community.tools.financial_datasets": [], - "langchain_community.tools.financial_datasets.balance_sheets": [], - "langchain_community.tools.financial_datasets.cash_flow_statements": [], - "langchain_community.tools.financial_datasets.income_statements": [], - "langchain_community.tools.github": [], - "langchain_community.tools.github.prompt": [], - "langchain_community.tools.github.tool": [], - "langchain_community.tools.gitlab": [], - "langchain_community.tools.gitlab.prompt": [], - "langchain_community.tools.gitlab.tool": [], - "langchain_community.tools.gmail": [], - "langchain_community.tools.gmail.base": [], - "langchain_community.tools.gmail.create_draft": [], - "langchain_community.tools.gmail.get_message": [], - "langchain_community.tools.gmail.get_thread": [], - "langchain_community.tools.gmail.search": [], - "langchain_community.tools.gmail.send_message": [], - "langchain_community.tools.gmail.utils": [], - "langchain_community.tools.golden_query": [], - "langchain_community.tools.golden_query.tool": [], - "langchain_community.tools.google_cloud": [], - "langchain_community.tools.google_cloud.texttospeech": [], - "langchain_community.tools.google_finance": [], - "langchain_community.tools.google_finance.tool": [], - "langchain_community.tools.google_jobs": [], - "langchain_community.tools.google_jobs.tool": [], - "langchain_community.tools.google_lens": [], - "langchain_community.tools.google_lens.tool": [], - "langchain_community.tools.google_places": [], - "langchain_community.tools.google_places.tool": [], - "langchain_community.tools.google_scholar": [], - "langchain_community.tools.google_scholar.tool": [], - "langchain_community.tools.google_search": [], - "langchain_community.tools.google_search.tool": [], - "langchain_community.tools.google_serper": [], - "langchain_community.tools.google_serper.tool": [], - "langchain_community.tools.google_trends": [], - "langchain_community.tools.google_trends.tool": [], - "langchain_community.tools.graphql": [], - "langchain_community.tools.graphql.tool": [], - "langchain_community.tools.human": [], - "langchain_community.tools.human.tool": [], - "langchain_community.tools.ifttt": [], - "langchain_community.tools.interaction": [], - "langchain_community.tools.interaction.tool": [], - "langchain_community.tools.jina_search": [], - "langchain_community.tools.jina_search.tool": [], - "langchain_community.tools.jira": [], - "langchain_community.tools.jira.prompt": [], - "langchain_community.tools.jira.tool": [], - "langchain_community.tools.json": [], - "langchain_community.tools.json.tool": [], - "langchain_community.tools.memorize": [], - "langchain_community.tools.memorize.tool": [], - "langchain_community.tools.merriam_webster": [], - "langchain_community.tools.merriam_webster.tool": [], - "langchain_community.tools.metaphor_search": [], - "langchain_community.tools.metaphor_search.tool": [], - "langchain_community.tools.mojeek_search": [], - "langchain_community.tools.mojeek_search.tool": [], - "langchain_community.tools.multion": [], - "langchain_community.tools.multion.close_session": [], - "langchain_community.tools.multion.create_session": [], - "langchain_community.tools.multion.update_session": [], - 
"langchain_community.tools.nasa": [], - "langchain_community.tools.nasa.prompt": [], - "langchain_community.tools.nasa.tool": [], - "langchain_community.tools.nuclia": [], - "langchain_community.tools.nuclia.tool": [], - "langchain_community.tools.office365": [], - "langchain_community.tools.office365.base": [], - "langchain_community.tools.office365.create_draft_message": [], - "langchain_community.tools.office365.events_search": [], - "langchain_community.tools.office365.messages_search": [], - "langchain_community.tools.office365.send_event": [], - "langchain_community.tools.office365.send_message": [], - "langchain_community.tools.office365.utils": [], - "langchain_community.tools.openai_dalle_image_generation": [], - "langchain_community.tools.openai_dalle_image_generation.tool": [], - "langchain_community.tools.openapi": [], - "langchain_community.tools.openapi.utils": [], - "langchain_community.tools.openapi.utils.api_models": [], - "langchain_community.tools.openapi.utils.openapi_utils": [], - "langchain_community.tools.openweathermap": [], - "langchain_community.tools.openweathermap.tool": [], - "langchain_community.tools.passio_nutrition_ai": [], - "langchain_community.tools.passio_nutrition_ai.tool": [], - "langchain_community.tools.playwright": [], - "langchain_community.tools.playwright.base": [], - "langchain_community.tools.playwright.click": [], - "langchain_community.tools.playwright.current_page": [], - "langchain_community.tools.playwright.extract_hyperlinks": [], - "langchain_community.tools.playwright.extract_text": [], - "langchain_community.tools.playwright.get_elements": [], - "langchain_community.tools.playwright.navigate": [], - "langchain_community.tools.playwright.navigate_back": [], - "langchain_community.tools.playwright.utils": [], - "langchain_community.tools.plugin": [], - "langchain_community.tools.polygon": [], - "langchain_community.tools.polygon.aggregates": [], - "langchain_community.tools.polygon.financials": [], - "langchain_community.tools.polygon.last_quote": [], - "langchain_community.tools.polygon.ticker_news": [], - "langchain_community.tools.powerbi": [], - "langchain_community.tools.powerbi.prompt": [], - "langchain_community.tools.powerbi.tool": [], - "langchain_community.tools.pubmed": [], - "langchain_community.tools.pubmed.tool": [], - "langchain_community.tools.reddit_search.tool": [], - "langchain_community.tools.render": [], - "langchain_community.tools.requests": [], - "langchain_community.tools.requests.tool": [], - "langchain_community.tools.riza": [], - "langchain_community.tools.riza.command": [], - "langchain_community.tools.scenexplain": [], - "langchain_community.tools.scenexplain.tool": [], - "langchain_community.tools.searchapi": [], - "langchain_community.tools.searchapi.tool": [], - "langchain_community.tools.searx_search": [], - "langchain_community.tools.searx_search.tool": [], - "langchain_community.tools.semanticscholar": [], - "langchain_community.tools.semanticscholar.tool": [], - "langchain_community.tools.shell": [], - "langchain_community.tools.shell.tool": [], - "langchain_community.tools.slack": [], - "langchain_community.tools.slack.base": [], - "langchain_community.tools.slack.get_channel": [], - "langchain_community.tools.slack.get_message": [], - "langchain_community.tools.slack.schedule_message": [], - "langchain_community.tools.slack.send_message": [], - "langchain_community.tools.slack.utils": [], - "langchain_community.tools.sleep": [], - "langchain_community.tools.sleep.tool": [], - 
"langchain_community.tools.spark_sql": [], - "langchain_community.tools.spark_sql.prompt": [], - "langchain_community.tools.spark_sql.tool": [], - "langchain_community.tools.sql_database": [], - "langchain_community.tools.sql_database.prompt": [], - "langchain_community.tools.sql_database.tool": [], - "langchain_community.tools.stackexchange": [], - "langchain_community.tools.stackexchange.tool": [], - "langchain_community.tools.steam": [], - "langchain_community.tools.steam.prompt": [], - "langchain_community.tools.steam.tool": [], - "langchain_community.tools.steamship_image_generation": [], - "langchain_community.tools.steamship_image_generation.tool": [], - "langchain_community.tools.steamship_image_generation.utils": [], - "langchain_community.tools.tavily_search": [], - "langchain_community.tools.tavily_search.tool": [], - "langchain_community.tools.vectorstore": [], - "langchain_community.tools.vectorstore.tool": [], - "langchain_community.tools.wikidata": [], - "langchain_community.tools.wikidata.tool": [], - "langchain_community.tools.wikipedia": [], - "langchain_community.tools.wikipedia.tool": [], - "langchain_community.tools.wolfram_alpha": [], - "langchain_community.tools.wolfram_alpha.tool": [], - "langchain_community.tools.yahoo_finance_news": [], - "langchain_community.tools.you": [], - "langchain_community.tools.you.tool": [], - "langchain_community.tools.youtube": [], - "langchain_community.tools.youtube.search": [], - "langchain_community.tools.zapier": [], - "langchain_community.tools.zapier.prompt": [], - "langchain_community.tools.zapier.tool": [], - "langchain_community.tools.zenguard": [], - "langchain_community.tools.zenguard.tool": [], - "langchain_community.utilities": [], - "langchain_community.utilities.alpha_vantage": [], - "langchain_community.utilities.anthropic": [], - "langchain_community.utilities.apify": [], - "langchain_community.utilities.arcee": [], - "langchain_community.utilities.arxiv": [], - "langchain_community.utilities.asknews": [], - "langchain_community.utilities.astradb": [], - "langchain_community.utilities.awslambda": [], - "langchain_community.utilities.bibtex": [], - "langchain_community.utilities.bing_search": [], - "langchain_community.utilities.brave_search": [], - "langchain_community.utilities.cassandra": [], - "langchain_community.utilities.cassandra_database": [], - "langchain_community.utilities.clickup": [], - "langchain_community.utilities.dalle_image_generator": [], - "langchain_community.utilities.dataforseo_api_search": [], - "langchain_community.utilities.dataherald": [], - "langchain_community.utilities.dria_index": [], - "langchain_community.utilities.duckduckgo_search": [], - "langchain_community.utilities.financial_datasets": [], - "langchain_community.utilities.github": [], - "langchain_community.utilities.gitlab": [], - "langchain_community.utilities.golden_query": [], - "langchain_community.utilities.google_finance": [], - "langchain_community.utilities.google_jobs": [], - "langchain_community.utilities.google_lens": [], - "langchain_community.utilities.google_places_api": [], - "langchain_community.utilities.google_scholar": [], - "langchain_community.utilities.google_search": [], - "langchain_community.utilities.google_serper": [], - "langchain_community.utilities.google_trends": [], - "langchain_community.utilities.graphql": [], - "langchain_community.utilities.infobip": [], - "langchain_community.utilities.jina_search": [], - "langchain_community.utilities.jira": [], - "langchain_community.utilities.max_compute": 
[], - "langchain_community.utilities.merriam_webster": [], - "langchain_community.utilities.metaphor_search": [], - "langchain_community.utilities.mojeek_search": [], - "langchain_community.utilities.nasa": [], - "langchain_community.utilities.nvidia_riva": [], - "langchain_community.utilities.opaqueprompts": [], - "langchain_community.utilities.openapi": [], - "langchain_community.utilities.openweathermap": [], - "langchain_community.utilities.oracleai": [], - "langchain_community.utilities.outline": [], - "langchain_community.utilities.passio_nutrition_ai": [], - "langchain_community.utilities.pebblo": [], - "langchain_community.utilities.polygon": [], - "langchain_community.utilities.portkey": [], - "langchain_community.utilities.powerbi": [], - "langchain_community.utilities.pubmed": [], - "langchain_community.utilities.python": [], - "langchain_community.utilities.reddit_search": [], - "langchain_community.utilities.redis": [], - "langchain_community.utilities.rememberizer": [], - "langchain_community.utilities.requests": [], - "langchain_community.utilities.scenexplain": [], - "langchain_community.utilities.searchapi": [], - "langchain_community.utilities.searx_search": [], - "langchain_community.utilities.semanticscholar": [], - "langchain_community.utilities.spark_sql": [], - "langchain_community.utilities.sql_database": [], - "langchain_community.utilities.stackexchange": [], - "langchain_community.utilities.steam": [], - "langchain_community.utilities.tavily_search": [], - "langchain_community.utilities.tensorflow_datasets": [], - "langchain_community.utilities.twilio": [], - "langchain_community.utilities.vertexai": [], - "langchain_community.utilities.wikidata": [], - "langchain_community.utilities.wikipedia": [], - "langchain_community.utilities.wolfram_alpha": [], - "langchain_community.utilities.you": [], - "langchain_community.utilities.zapier": [], - "langchain_community.utils": [], - "langchain_community.utils.ernie_functions": [], - "langchain_community.utils.google": [], - "langchain_community.utils.math": [], - "langchain_community.utils.openai": [], - "langchain_community.utils.openai_functions": [], - "langchain_community.utils.user_agent": [], - "langchain_community.vectorstores": [], - "langchain_community.vectorstores.aerospike": [], - "langchain_community.vectorstores.alibabacloud_opensearch": [], - "langchain_community.vectorstores.analyticdb": [], - "langchain_community.vectorstores.annoy": [], - "langchain_community.vectorstores.apache_doris": [], - "langchain_community.vectorstores.aperturedb": [], - "langchain_community.vectorstores.astradb": [], - "langchain_community.vectorstores.atlas": [], - "langchain_community.vectorstores.awadb": [], - "langchain_community.vectorstores.azure_cosmos_db": [], - "langchain_community.vectorstores.azure_cosmos_db_no_sql": [], - "langchain_community.vectorstores.azuresearch": [], - "langchain_community.vectorstores.bagel": [], - "langchain_community.vectorstores.bageldb": [], - "langchain_community.vectorstores.baiducloud_vector_search": [], - "langchain_community.vectorstores.baiduvectordb": [], - "langchain_community.vectorstores.bigquery_vector_search": [], - "langchain_community.vectorstores.cassandra": [], - "langchain_community.vectorstores.chroma": [], - "langchain_community.vectorstores.clarifai": [], - "langchain_community.vectorstores.clickhouse": [], - "langchain_community.vectorstores.couchbase": [], - "langchain_community.vectorstores.dashvector": [], - 
"langchain_community.vectorstores.databricks_vector_search": [], - "langchain_community.vectorstores.deeplake": [], - "langchain_community.vectorstores.dingo": [], - "langchain_community.vectorstores.docarray": [], - "langchain_community.vectorstores.docarray.base": [], - "langchain_community.vectorstores.docarray.hnsw": [], - "langchain_community.vectorstores.docarray.in_memory": [], - "langchain_community.vectorstores.documentdb": [], - "langchain_community.vectorstores.duckdb": [], - "langchain_community.vectorstores.ecloud_vector_search": [], - "langchain_community.vectorstores.elastic_vector_search": [], - "langchain_community.vectorstores.elasticsearch": [], - "langchain_community.vectorstores.epsilla": [], - "langchain_community.vectorstores.faiss": [], - "langchain_community.vectorstores.hanavector": [], - "langchain_community.vectorstores.hippo": [], - "langchain_community.vectorstores.hologres": [], - "langchain_community.vectorstores.infinispanvs": [], - "langchain_community.vectorstores.inmemory": [], - "langchain_community.vectorstores.kdbai": [], - "langchain_community.vectorstores.kinetica": [], - "langchain_community.vectorstores.lancedb": [], - "langchain_community.vectorstores.lantern": [], - "langchain_community.vectorstores.llm_rails": [], - "langchain_community.vectorstores.manticore_search": [], - "langchain_community.vectorstores.marqo": [], - "langchain_community.vectorstores.matching_engine": [], - "langchain_community.vectorstores.meilisearch": [], - "langchain_community.vectorstores.milvus": [], - "langchain_community.vectorstores.momento_vector_index": [], - "langchain_community.vectorstores.mongodb_atlas": [], - "langchain_community.vectorstores.myscale": [], - "langchain_community.vectorstores.neo4j_vector": [], - "langchain_community.vectorstores.nucliadb": [], - "langchain_community.vectorstores.opensearch_vector_search": [], - "langchain_community.vectorstores.oraclevs": [], - "langchain_community.vectorstores.pathway": [], - "langchain_community.vectorstores.pgembedding": [], - "langchain_community.vectorstores.pgvecto_rs": [], - "langchain_community.vectorstores.pgvector": [], - "langchain_community.vectorstores.pinecone": [], - "langchain_community.vectorstores.qdrant": [], - "langchain_community.vectorstores.redis": [], - "langchain_community.vectorstores.redis.base": [], - "langchain_community.vectorstores.redis.constants": [], - "langchain_community.vectorstores.redis.filters": [], - "langchain_community.vectorstores.redis.schema": [], - "langchain_community.vectorstores.relyt": [], - "langchain_community.vectorstores.rocksetdb": [], - "langchain_community.vectorstores.scann": [], - "langchain_community.vectorstores.semadb": [], - "langchain_community.vectorstores.singlestoredb": [], - "langchain_community.vectorstores.sklearn": [], - "langchain_community.vectorstores.sqlitevss": [], - "langchain_community.vectorstores.starrocks": [], - "langchain_community.vectorstores.supabase": [], - "langchain_community.vectorstores.surrealdb": [], - "langchain_community.vectorstores.tair": [], - "langchain_community.vectorstores.tencentvectordb": [], - "langchain_community.vectorstores.thirdai_neuraldb": [], - "langchain_community.vectorstores.tidb_vector": [], - "langchain_community.vectorstores.tigris": [], - "langchain_community.vectorstores.tiledb": [], - "langchain_community.vectorstores.timescalevector": [], - "langchain_community.vectorstores.typesense": [], - "langchain_community.vectorstores.upstash": [], - "langchain_community.vectorstores.usearch": [], 
- "langchain_community.vectorstores.utils": [], - "langchain_community.vectorstores.vald": [], - "langchain_community.vectorstores.vdms": [], - "langchain_community.vectorstores.vearch": [], - "langchain_community.vectorstores.vectara": [], - "langchain_community.vectorstores.vespa": [], - "langchain_community.vectorstores.vikingdb": [], - "langchain_community.vectorstores.vlite": [], - "langchain_community.vectorstores.weaviate": [], - "langchain_community.vectorstores.xata": [], - "langchain_community.vectorstores.yellowbrick": [], - "langchain_community.vectorstores.zep": [], - "langchain_community.vectorstores.zep_cloud": [], - "langchain_community.vectorstores.zilliz": [] - }, "langchain-core": { "langchain_core": [], "langchain_core._api": [], @@ -10157,16 +8935,6 @@ "langchain_text_splitters.sentence_transformers": [], "langchain_text_splitters.spacy": [] }, - "langcodes": { - "langcodes": [], - "langcodes.build_data": [], - "langcodes.data_dicts": [], - "langcodes.language_distance": [], - "langcodes.language_lists": [], - "langcodes.registry_parser": [], - "langcodes.tag_parser": [], - "langcodes.util": [] - }, "langsmith": { "langsmith": [], "langsmith._expect": [], @@ -10200,16 +8968,6 @@ "langsmith.wrappers": [], "langsmith.wrappers._openai": [] }, - "language_data": { - "language_data": [], - "language_data.build_data": [], - "language_data.language_lists": [], - "language_data.name_data": [], - "language_data.names": [], - "language_data.population_data": [], - "language_data.registry_parser": [], - "language_data.util": [] - }, "launchpadlib": { "launchpadlib": [], "launchpadlib.apps": [], @@ -10251,53 +9009,12 @@ "lazr.uri.tests.test_docs": [], "lazr.uri.tests.test_uri": [] }, - "lazy_loader": { - "lazy_loader": [] - }, "libclang": { "clang": [], "clang.cindex": [], "clang.enumerations": [], "clang.native": [] }, - "librosa": { - "librosa": [], - "librosa._cache": [], - "librosa._typing": [], - "librosa.beat": [], - "librosa.core": [], - "librosa.core.audio": [], - "librosa.core.constantq": [], - "librosa.core.convert": [], - "librosa.core.fft": [], - "librosa.core.harmonic": [], - "librosa.core.intervals": [], - "librosa.core.notation": [], - "librosa.core.pitch": [], - "librosa.core.spectrum": [], - "librosa.decompose": [], - "librosa.display": [], - "librosa.effects": [], - "librosa.feature": [], - "librosa.feature.inverse": [], - "librosa.feature.rhythm": [], - "librosa.feature.spectral": [], - "librosa.feature.utils": [], - "librosa.filters": [], - "librosa.onset": [], - "librosa.segment": [], - "librosa.sequence": [], - "librosa.util": [], - "librosa.util._nnls": [], - "librosa.util.decorators": [], - "librosa.util.deprecation": [], - "librosa.util.example_data": [], - "librosa.util.exceptions": [], - "librosa.util.files": [], - "librosa.util.matching": [], - "librosa.util.utils": [], - "librosa.version": [] - }, "lifelines": { "lifelines": [], "lifelines.calibration": [], @@ -10350,12 +9067,6 @@ "lightgbm.plotting": [], "lightgbm.sklearn": [] }, - "linkify-it-py": { - "linkify_it": [], - "linkify_it.main": [], - "linkify_it.tlds": [], - "linkify_it.ucre": [] - }, "livereload": { "livereload": [], "livereload.cli": [], @@ -10365,51 +9076,12 @@ "livereload.server": [], "livereload.watcher": [] }, - "llvmlite": { - "llvmlite": [], - "llvmlite._version": [], - "llvmlite.binding": [], - "llvmlite.binding.analysis": [], - "llvmlite.binding.common": [], - "llvmlite.binding.context": [], - "llvmlite.binding.dylib": [], - "llvmlite.binding.executionengine": [], - 
"llvmlite.binding.ffi": [], - "llvmlite.binding.initfini": [], - "llvmlite.binding.linker": [], - "llvmlite.binding.module": [], - "llvmlite.binding.object_file": [], - "llvmlite.binding.options": [], - "llvmlite.binding.orcjit": [], - "llvmlite.binding.passmanagers": [], - "llvmlite.binding.targets": [], - "llvmlite.binding.transforms": [], - "llvmlite.binding.typeref": [], - "llvmlite.binding.value": [], - "llvmlite.ir": [], - "llvmlite.ir._utils": [], - "llvmlite.ir.builder": [], - "llvmlite.ir.context": [], - "llvmlite.ir.instructions": [], - "llvmlite.ir.module": [], - "llvmlite.ir.transforms": [], - "llvmlite.ir.types": [], - "llvmlite.ir.values": [], - "llvmlite.utils": [] - }, "lxml": { "lxml": [] }, - "lz4": { - "lz4": [], - "lz4.block": [], - "lz4.frame": [], - "lz4.version": [] - }, "mako": { "mako": [] }, - "marisa-trie": {}, "markdown": { "markdown": [] }, @@ -10686,12 +9358,6 @@ "msal.wstrust_request": [], "msal.wstrust_response": [] }, - "msgpack": { - "msgpack": [], - "msgpack.exceptions": [], - "msgpack.ext": [], - "msgpack.fallback": [] - }, "multidict": { "multidict": [] }, @@ -12168,9 +10834,6 @@ "optree": { "optree": [] }, - "orjson": { - "orjson": [] - }, "overrides": { "overrides": [], "overrides.enforce": [], @@ -25968,15 +24631,6 @@ "pluggy": { "pluggy": [] }, - "pooch": { - "pooch": [], - "pooch._version": [], - "pooch.core": [], - "pooch.downloaders": [], - "pooch.hashes": [], - "pooch.processors": [], - "pooch.utils": [] - }, "prometheus_client": { "prometheus_client": [], "prometheus_client.asgi": [], @@ -26338,13 +24992,6 @@ "pydantic.version": [], "pydantic.warnings": [] }, - "pydantic-settings": { - "pydantic_settings": [], - "pydantic_settings.main": [], - "pydantic_settings.sources": [], - "pydantic_settings.utils": [], - "pydantic_settings.version": [] - }, "pydantic_core": { "pydantic_core": [], "pydantic_core.core_schema": [] @@ -27351,15 +25998,6 @@ "python-dateutil": { "dateutil": [] }, - "python-dotenv": { - "dotenv": [], - "dotenv.cli": [], - "dotenv.ipython": [], - "dotenv.main": [], - "dotenv.parser": [], - "dotenv.variables": [], - "dotenv.version": [] - }, "python-json-logger": { "pythonjsonlogger": [], "pythonjsonlogger.jsonlogger": [] @@ -27477,42 +26115,6 @@ "requests_oauthlib.oauth2_auth": [], "requests_oauthlib.oauth2_session": [] }, - "requests-toolbelt": { - "requests_toolbelt": [], - "requests_toolbelt._compat": [], - "requests_toolbelt.adapters": [], - "requests_toolbelt.adapters.appengine": [], - "requests_toolbelt.adapters.fingerprint": [], - "requests_toolbelt.adapters.host_header_ssl": [], - "requests_toolbelt.adapters.socket_options": [], - "requests_toolbelt.adapters.source": [], - "requests_toolbelt.adapters.ssl": [], - "requests_toolbelt.adapters.x509": [], - "requests_toolbelt.auth": [], - "requests_toolbelt.auth._digest_auth_compat": [], - "requests_toolbelt.auth.guess": [], - "requests_toolbelt.auth.handler": [], - "requests_toolbelt.auth.http_proxy_digest": [], - "requests_toolbelt.cookies": [], - "requests_toolbelt.cookies.forgetful": [], - "requests_toolbelt.downloadutils": [], - "requests_toolbelt.downloadutils.stream": [], - "requests_toolbelt.downloadutils.tee": [], - "requests_toolbelt.exceptions": [], - "requests_toolbelt.multipart": [], - "requests_toolbelt.multipart.decoder": [], - "requests_toolbelt.multipart.encoder": [], - "requests_toolbelt.sessions": [], - "requests_toolbelt.streaming_iterator": [], - "requests_toolbelt.threaded": [], - "requests_toolbelt.threaded.pool": [], - 
"requests_toolbelt.threaded.thread": [], - "requests_toolbelt.utils": [], - "requests_toolbelt.utils.deprecated": [], - "requests_toolbelt.utils.dump": [], - "requests_toolbelt.utils.formdata": [], - "requests_toolbelt.utils.user_agent": [] - }, "rfc3339-validator": { "rfc3339_validator": [] }, @@ -28831,11 +27433,6 @@ "solacc": { "solacc.companion": [] }, - "soundfile": { - "_soundfile": [], - "_soundfile_data": [], - "soundfile": [] - }, "soupsieve": { "soupsieve": [], "soupsieve.__meta__": [], @@ -28845,10 +27442,6 @@ "soupsieve.pretty": [], "soupsieve.util": [] }, - "soxr": { - "soxr": [], - "soxr._version": [] - }, "spacy": { "spacy": [], "spacy.about": [], @@ -31774,21 +30367,6 @@ "tzdata": { "tzdata": [] }, - "uc-micro-py": { - "uc_micro": [], - "uc_micro.categories.Cc": [], - "uc_micro.categories.Cc.regex": [], - "uc_micro.categories.Cf": [], - "uc_micro.categories.Cf.regex": [], - "uc_micro.categories.P": [], - "uc_micro.categories.P.regex": [], - "uc_micro.categories.Z": [], - "uc_micro.categories.Z.regex": [], - "uc_micro.categories": [], - "uc_micro.properties.Any": [], - "uc_micro.properties.Any.regex": [], - "uc_micro.properties": [] - }, "ujson": {}, "umap": { "umap": [], diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index db257054cc..c7606c2698 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -20,9 +20,9 @@ PythonSequentialLinter, CurrentSessionState, Advisory, + guess_encoding, file_language, is_a_notebook, - read_text, ) from databricks.labs.ucx.source_code.graph import ( @@ -294,7 +294,7 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None: if language is not Language.PYTHON: logger.warning(f"Unsupported notebook language: {language}") return None - source = read_text(resolved) + source = resolved.read_text(guess_encoding(resolved)) return Notebook.parse(path, source, language) def _linter(self, language: Language) -> Linter: @@ -312,7 +312,7 @@ def _load_source_from_run_cell(self, run_cell: RunCell) -> None: language = file_language(resolved) if language is not Language.PYTHON: return - source = read_text(resolved) + source = resolved.read_text(guess_encoding(resolved)) notebook = Notebook.parse(path, source, language) for cell in notebook.cells: if isinstance(cell, RunCell): @@ -390,7 +390,7 @@ def __init__( @cached_property def _source_code(self) -> str: if self._content is None: - self._content = read_text(self._path) + self._content = self._path.read_text(guess_encoding(self._path)) return self._content def lint(self) -> Iterable[Advice]: diff --git a/tests/integration/assessment/test_clusters.py b/tests/integration/assessment/test_clusters.py index 75c6bf1ffa..8cf0622220 100644 --- a/tests/integration/assessment/test_clusters.py +++ b/tests/integration/assessment/test_clusters.py @@ -9,8 +9,7 @@ from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, PoliciesCrawler, - ClusterDetailsOwnership, - ClusterInfoOwnership, + ClusterOwnership, ClusterPolicyOwnership, ) @@ -77,12 +76,9 @@ def test_cluster_ownership(ws, runtime_ctx, make_cluster, make_user, inventory_s # Verify ownership is as expected. 
administrator_locator = runtime_ctx.administrator_locator - info_ownership = ClusterInfoOwnership(administrator_locator) - assert info_ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name - assert info_ownership.owner_of(their_cluster_record) == another_user.user_name - details_ownership = ClusterDetailsOwnership(administrator_locator) - assert details_ownership.owner_of(ws.clusters.get(my_cluster.cluster_id)) == ws.current_user.me().user_name - assert details_ownership.owner_of(ws.clusters.get(their_cluster.cluster_id)) == another_user.user_name + ownership = ClusterOwnership(administrator_locator) + assert ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name + assert ownership.owner_of(their_cluster_record) == another_user.user_name def test_cluster_crawler_mlr_no_isolation(ws, make_cluster, inventory_schema, sql_backend): diff --git a/tests/integration/assessment/test_jobs.py b/tests/integration/assessment/test_jobs.py index 3f3104765f..47fa6f1b81 100644 --- a/tests/integration/assessment/test_jobs.py +++ b/tests/integration/assessment/test_jobs.py @@ -7,7 +7,7 @@ from databricks.sdk.service.jobs import NotebookTask, RunTask from databricks.sdk.service.workspace import ImportFormat -from databricks.labs.ucx.assessment.jobs import JobInfoOwnership, JobsCrawler, SubmitRunsCrawler +from databricks.labs.ucx.assessment.jobs import JobOwnership, JobsCrawler, SubmitRunsCrawler from .test_assessment import _SPARK_CONF @@ -80,5 +80,5 @@ def test_job_ownership(ws, runtime_ctx, make_job, inventory_schema, sql_backend) job_record = next(record for record in records if record.job_id == str(job.job_id)) # Verify ownership is as expected. - ownership = JobInfoOwnership(runtime_ctx.administrator_locator) + ownership = JobOwnership(runtime_ctx.administrator_locator) assert ownership.owner_of(job_record) == ws.current_user.me().user_name diff --git a/tests/integration/hive_metastore/test_catalog_schema.py b/tests/integration/hive_metastore/test_catalog_schema.py index 93fb810c54..8354d967a7 100644 --- a/tests/integration/hive_metastore/test_catalog_schema.py +++ b/tests/integration/hive_metastore/test_catalog_schema.py @@ -3,15 +3,14 @@ import pytest from databricks.labs.blueprint.tui import MockPrompts -from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.sdk.service.catalog import PermissionsList +from databricks.sdk.service.catalog import CatalogInfo from databricks.sdk.service.compute import DataSecurityMode, AwsAttributes from databricks.sdk.service.catalog import Privilege, SecurableType, PrivilegeAssignment from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.hive_metastore.grants import Grant +from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler from databricks.labs.ucx.hive_metastore.mapping import Rule from ..conftest import get_azure_spark_conf @@ -20,55 +19,18 @@ @retried(on=[NotFound], timeout=timedelta(minutes=2)) -def test_create_ucx_catalog_creates_catalog(runtime_ctx, watchdog_remove_after) -> None: +def test_create_ucx_catalog_creates_catalog(ws, runtime_ctx, watchdog_remove_after) -> None: # Delete catalog created for testing to test the creation of a new catalog runtime_ctx.workspace_client.catalogs.delete(runtime_ctx.ucx_catalog, force=True) prompts = MockPrompts({f"Please provide storage location url for catalog: {runtime_ctx.ucx_catalog}": "metastore"}) - properties = {"RemoveAfter": watchdog_remove_after} - 
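The two assessment test hunks above replace the ClusterInfoOwnership/ClusterDetailsOwnership pair and JobInfoOwnership with single ClusterOwnership and JobOwnership classes, each built from an administrator locator and queried through owner_of(record). A rough sketch of that pattern, assuming the crawled record carries a creator field and that the locator exposes a get_workspace_administrator() method; both are assumptions for illustration and the real UCX classes may differ:

from dataclasses import dataclass


@dataclass
class ClusterRecord:  # stand-in for the crawled cluster row used in the tests
    cluster_id: str
    creator: str | None = None


class OwnershipSketch:
    """Resolve who owns a crawled record, falling back to a workspace admin."""

    def __init__(self, administrator_locator):
        self._administrator_locator = administrator_locator

    def owner_of(self, record: ClusterRecord) -> str:
        # Prefer the recorded creator; otherwise fall back to an administrator.
        if record.creator:
            return record.creator
        return self._administrator_locator.get_workspace_administrator()


# ownership = OwnershipSketch(runtime_ctx.administrator_locator)
# assert ownership.owner_of(my_cluster_record) == ws.current_user.me().user_name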
runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties=properties) + runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties={"RemoveAfter": watchdog_remove_after}) - catalog_info = runtime_ctx.workspace_client.catalogs.get(runtime_ctx.ucx_catalog) - assert catalog_info.name == runtime_ctx.ucx_catalog - assert catalog_info.properties == properties - - -@retried(on=[NotFound], timeout=timedelta(minutes=3)) -def test_create_all_catalogs_schemas(ws: WorkspaceClient, runtime_ctx, make_random, watchdog_remove_after) -> None: - """Create one catalog with two schemas mirroring the HIVE metastore schemas.""" - src_schema_1 = runtime_ctx.make_schema(catalog_name="hive_metastore") - src_schema_2 = runtime_ctx.make_schema(catalog_name="hive_metastore") - src_view = runtime_ctx.make_table( - catalog_name=src_schema_1.catalog_name, - schema_name=src_schema_1.name, - ctas="SELECT 2+2 AS four", - view=True, - ) - src_table = runtime_ctx.make_table(catalog_name=src_schema_2.catalog_name, schema_name=src_schema_2.name) - dst_catalog_name = f"ucx-{make_random()}" - rules = [ - Rule("workspace", dst_catalog_name, src_schema_1.name, src_schema_1.name, src_view.name, src_view.name), - Rule("workspace", dst_catalog_name, src_schema_2.name, src_schema_2.name, src_table.name, src_table.name), - ] - runtime_ctx.with_table_mapping_rules(rules) + @retried(on=[NotFound], timeout=timedelta(seconds=20)) + def get_catalog(name: str) -> CatalogInfo: + return ws.catalogs.get(name) - mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) - properties = {"RemoveAfter": watchdog_remove_after} - runtime_ctx.catalog_schema.create_all_catalogs_schemas(mock_prompts, properties=properties) - - try: - runtime_ctx.workspace_client.catalogs.get(dst_catalog_name) - except NotFound: - assert False, f"Catalog not created: {dst_catalog_name}" - else: - assert True, f"Catalog created: {dst_catalog_name}" - for dst_schema_full_name in f"{dst_catalog_name}.{src_schema_1.name}", f"{dst_catalog_name}.{src_schema_2.name}": - try: - ws.schemas.get(dst_schema_full_name) - except RuntimeError: - assert False, f"Schema not created: {dst_schema_full_name}" - else: - assert True, f"Schema created: {dst_schema_full_name}" + assert get_catalog(runtime_ctx.ucx_catalog) @retried(on=[NotFound], timeout=timedelta(minutes=2)) @@ -78,7 +40,7 @@ def test_create_catalog_schema_with_principal_acl_azure( prepared_principal_acl, make_cluster_permissions, make_cluster, -) -> None: +): if not ws.config.is_azure: pytest.skip("only works in azure test env") ctx, _, schema_name, catalog_name = prepared_principal_acl @@ -104,13 +66,8 @@ def test_create_catalog_schema_with_principal_acl_azure( @retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_create_catalog_schema_with_principal_acl_aws( - ws, - make_user, - prepared_principal_acl, - make_cluster_permissions, - make_cluster, - env_or_skip, -) -> None: + ws, make_user, prepared_principal_acl, make_cluster_permissions, make_cluster, env_or_skip +): ctx, _, schema_name, catalog_name = prepared_principal_acl cluster = make_cluster( @@ -138,39 +95,37 @@ def test_create_catalog_schema_with_principal_acl_aws( @retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_create_catalog_schema_with_legacy_acls( - ws: WorkspaceClient, - runtime_ctx, - make_random, - make_user, - watchdog_remove_after, -) -> None: - src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") - src_table = runtime_ctx.make_table(catalog_name=src_schema.catalog_name, 
schema_name=src_schema.name) - dst_catalog_name = f"ucx-{make_random()}" - dst_schema_name = "test" - rules = [Rule("workspace", dst_catalog_name, src_schema.name, dst_schema_name, src_table.name, src_table.name)] + ws, make_user, make_catalog, make_schema, make_mounted_location, runtime_ctx, sql_backend +): + src_schema = make_schema(catalog_name="hive_metastore") + src_external_table = runtime_ctx.make_table( + catalog_name=src_schema.catalog_name, + schema_name=src_schema.name, + external_csv=make_mounted_location, + ) + dst_catalog = make_catalog() + dst_schema = make_schema(catalog_name=dst_catalog.name, name=src_schema.name) + rules = [Rule.from_src_dst(src_external_table, dst_schema)] runtime_ctx.with_table_mapping_rules(rules) + runtime_ctx.with_dummy_resource_permission() - schema_owner, table_owner = make_user(), make_user() - grants = [ - Grant(schema_owner.user_name, "USAGE", src_schema.catalog_name, src_schema.name), - Grant(table_owner.user_name, "USAGE", src_table.catalog_name, src_table.schema_name), - Grant(schema_owner.user_name, "OWN", src_schema.catalog_name, src_schema.name), - Grant(table_owner.user_name, "OWN", src_table.catalog_name, src_table.schema_name, src_table.name), - ] - for grant in grants: - for sql in grant.hive_grant_sql(): - runtime_ctx.sql_backend.execute(sql) + user_a = make_user() + user_b = make_user() - mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) - properties = {"RemoveAfter": watchdog_remove_after} - runtime_ctx.catalog_schema.create_all_catalogs_schemas(mock_prompts, properties=properties) + sql_backend.execute(f"GRANT USAGE ON DATABASE {src_schema.name} TO `{user_a.user_name}`;") + sql_backend.execute(f"GRANT SELECT ON {src_external_table.full_name} TO `{user_b.user_name}`;") + sql_backend.execute(f"ALTER DATABASE {src_schema.name} OWNER TO `{user_b.user_name}`;") + sql_backend.execute(f"ALTER TABLE {src_external_table.full_name} OWNER TO `{user_a.user_name}`;") - @retried(on=[NotFound], timeout=timedelta(seconds=20)) - def get_schema_permissions_list(full_name: str) -> PermissionsList: - return ws.grants.get(SecurableType.SCHEMA, full_name) + # Ensure the view is populated (it's based on the crawled grants) and fetch the content. 
+ GrantsCrawler(runtime_ctx.tables_crawler, runtime_ctx.udfs_crawler).snapshot() - assert ws.schemas.get(f"{dst_catalog_name}.{dst_schema_name}").owner == schema_owner.user_name - schema_grants = get_schema_permissions_list(f"{dst_catalog_name}.{dst_schema_name}") - assert schema_grants.privilege_assignments is not None - assert PrivilegeAssignment(table_owner.user_name, [Privilege.USE_SCHEMA]) in schema_grants.privilege_assignments + catalog_schema = runtime_ctx.catalog_schema + mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) + catalog_schema.create_all_catalogs_schemas(mock_prompts) + + schema_grants = ws.grants.get(SecurableType.SCHEMA, f"{dst_catalog.name}.{dst_schema.name}") + schema_grant = PrivilegeAssignment(user_a.user_name, [Privilege.USE_SCHEMA]) + assert schema_grant in schema_grants.privilege_assignments + schema_info = ws.schemas.get(f"{dst_schema.full_name}") + assert schema_info.owner == user_b.user_name diff --git a/tests/integration/hive_metastore/test_external_locations.py b/tests/integration/hive_metastore/test_external_locations.py index ec6b346386..45f6e019b5 100644 --- a/tests/integration/hive_metastore/test_external_locations.py +++ b/tests/integration/hive_metastore/test_external_locations.py @@ -68,7 +68,7 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip): "bar", "EXTERNAL", "delta", - location="jdbc:providerunknown:/", + location="jdbc://providerunknown/", storage_properties="[database=test_db, host=somedb.us-east-1.rds.amazonaws.com, \ port=1234, dbtable=sometable, user=*********(redacted), password=*********(redacted)]", ), @@ -80,16 +80,16 @@ def test_external_locations(ws, sql_backend, inventory_schema, env_or_skip): mounts_crawler = MountsCrawler(sql_backend, ws, inventory_schema) crawler = ExternalLocations(ws, sql_backend, inventory_schema, tables_crawler, mounts_crawler) results = crawler.snapshot() - assert results == [ - ExternalLocation( - 'jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be', 1 - ), - ExternalLocation('jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db', 1), - ExternalLocation('jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db', table_count=2), - ExternalLocation('jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db', 1), - ExternalLocation('s3://bar/test3', 1), - ExternalLocation('s3://test_location', 2), - ] + assert len(results) == 6 + assert results[1].location == "s3://bar/test3/" + assert ( + results[2].location + == "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be" + ) + assert results[3].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db" + assert results[4].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db" + assert results[4].table_count == 2 + assert results[5].location == "jdbc://providerunknown//somedb.us-east-1.rds.amazonaws.com:1234/test_db" def test_save_external_location_mapping_missing_location(ws, sql_backend, inventory_schema, make_random): diff --git a/tests/integration/hive_metastore/test_migrate.py b/tests/integration/hive_metastore/test_migrate.py index 11bf1930e6..1cc14dddf7 100644 --- a/tests/integration/hive_metastore/test_migrate.py +++ b/tests/integration/hive_metastore/test_migrate.py @@ -172,14 +172,18 @@ def test_migrate_external_table( @retried(on=[NotFound], timeout=timedelta(minutes=2)) def 
test_migrate_managed_table_to_external_table_without_conversion( - ws, sql_backend, runtime_ctx, make_catalog, make_mounted_location, make_random, env_or_skip + ws, + sql_backend, + runtime_ctx, + make_catalog, + make_mounted_location, ): - src_schema_name = f"dummy_s{make_random(4)}".lower() - src_schema_location = f"{env_or_skip('TEST_MOUNT_CONTAINER')}/a/{src_schema_name}" - src_schema = runtime_ctx.make_schema(name=src_schema_name, location=src_schema_location) + # TODO: update pytest fixture for make_schema to take location as parameter to create managed schema + # TODO: update azure blueprint to add spn in sql warehouse data access config + src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") src_external_table = runtime_ctx.make_table( schema_name=src_schema.name, - external=False, + external_csv=make_mounted_location, columns=[("`back`ticks`", "STRING")], # Test with column that needs escaping ) dst_catalog = make_catalog() @@ -210,14 +214,18 @@ def test_migrate_managed_table_to_external_table_without_conversion( @retried(on=[NotFound], timeout=timedelta(minutes=2)) def test_migrate_managed_table_to_external_table_with_clone( - ws, sql_backend, runtime_ctx, make_catalog, make_mounted_location, make_random, env_or_skip + ws, + sql_backend, + runtime_ctx, + make_catalog, + make_mounted_location, ): - src_schema_name = f"dummy_s{make_random(4)}".lower() - src_schema_location = f"{env_or_skip('TEST_MOUNT_CONTAINER')}/a/{src_schema_name}" - src_schema = runtime_ctx.make_schema(name=src_schema_name, location=src_schema_location) + # TODO: update pytest fixture for make_schema to take location as parameter to create managed schema + # TODO: update azure blueprint to add spn in sql warehouse data access config + src_schema = runtime_ctx.make_schema(catalog_name="hive_metastore") src_external_table = runtime_ctx.make_table( schema_name=src_schema.name, - external=False, + external_csv=make_mounted_location, columns=[("`back`ticks`", "STRING")], # Test with column that needs escaping ) dst_catalog = make_catalog() diff --git a/tests/integration/install/test_installation.py b/tests/integration/install/test_installation.py index ff3378d9c5..5908f13113 100644 --- a/tests/integration/install/test_installation.py +++ b/tests/integration/install/test_installation.py @@ -171,7 +171,7 @@ def test_job_cluster_policy(ws, installation_ctx): assert policy_definition["aws_attributes.availability"]["value"] == compute.AwsAvailability.ON_DEMAND.value -@retried(on=[NotFound, InvalidParameterValue], timeout=timedelta(minutes=5)) +@retried(on=[NotFound, InvalidParameterValue]) def test_running_real_remove_backup_groups_job(ws: WorkspaceClient, installation_ctx: MockInstallationContext) -> None: ws_group_a, _ = installation_ctx.make_ucx_group(wait_for_provisioning=True) @@ -190,7 +190,7 @@ def test_running_real_remove_backup_groups_job(ws: WorkspaceClient, installation # API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not # happened. 
# Note: If you are adjusting this, also look at: test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only - @retried(on=[KeyError], timeout=timedelta(minutes=5)) + @retried(on=[KeyError], timeout=timedelta(minutes=3)) def get_group(group_id: str) -> NoReturn: ws.groups.get(group_id) raise KeyError(f"Group is not deleted: {group_id}") diff --git a/tests/integration/workspace_access/test_groups.py b/tests/integration/workspace_access/test_groups.py index 1e4baf9174..f4be1117b7 100644 --- a/tests/integration/workspace_access/test_groups.py +++ b/tests/integration/workspace_access/test_groups.py @@ -129,7 +129,7 @@ def test_reflect_account_groups_on_workspace(ws, make_ucx_group, sql_backend, in # At this time previous ws level groups aren't deleted -@retried(on=[NotFound], timeout=timedelta(minutes=5)) +@retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only( ws, make_ucx_group, sql_backend, inventory_schema ): @@ -150,7 +150,7 @@ def test_delete_ws_groups_should_delete_renamed_and_reflected_groups_only( # API internals have a 60s timeout. As such we should wait at least that long before concluding deletion has not # happened. # Note: If you are adjusting this, also look at: test_running_real_remove_backup_groups_job - @retried(on=[KeyError], timeout=timedelta(minutes=5)) + @retried(on=[KeyError], timeout=timedelta(seconds=90)) def get_group(group_id: str) -> NoReturn: ws.groups.get(group_id) raise KeyError(f"Group is not deleted: {group_id}") diff --git a/tests/unit/assessment/test_clusters.py b/tests/unit/assessment/test_clusters.py index af4e1f5c20..c86c3f60f0 100644 --- a/tests/unit/assessment/test_clusters.py +++ b/tests/unit/assessment/test_clusters.py @@ -9,11 +9,10 @@ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalCrawler from databricks.labs.ucx.assessment.clusters import ( ClustersCrawler, - ClusterDetailsOwnership, - ClusterInfoOwnership, + PoliciesCrawler, + ClusterOwnership, ClusterInfo, ClusterPolicyOwnership, - PoliciesCrawler, PolicyInfo, ) from databricks.labs.ucx.framework.crawlers import SqlBackend @@ -186,48 +185,27 @@ def test_unsupported_clusters(): assert result_set[0].failures == '["cluster type not supported : LEGACY_PASSTHROUGH"]' -def test_cluster_info_owner_creator() -> None: +def test_cluster_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) - ownership = ClusterInfoOwnership(admin_locator) + ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator="bob", cluster_id="1", success=1, failures="[]")) assert owner == "bob" admin_locator.get_workspace_administrator.assert_not_called() -def test_cluster_info_owner_creator_unknown() -> None: +def test_cluster_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" - ownership = ClusterInfoOwnership(admin_locator) + ownership = ClusterOwnership(admin_locator) owner = ownership.owner_of(ClusterInfo(creator=None, cluster_id="1", success=1, failures="[]")) assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() -def test_cluster_details_owner_creator() -> None: - admin_locator = create_autospec(AdministratorLocator) - - ownership = ClusterDetailsOwnership(admin_locator) - owner = ownership.owner_of(ClusterDetails(creator_user_name="bob", cluster_id="1")) - - assert owner == "bob" - 
admin_locator.get_workspace_administrator.assert_not_called() - - -def test_cluster_details_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) - admin_locator.get_workspace_administrator.return_value = "an_admin" - - ownership = ClusterDetailsOwnership(admin_locator) - owner = ownership.owner_of(ClusterDetails(cluster_id="1")) - - assert owner == "an_admin" - admin_locator.get_workspace_administrator.assert_called_once() - - def test_policy_crawler(): ws = mock_workspace_client( policy_ids=['single-user-with-spn', 'single-user-with-spn-policyid', 'single-user-with-spn-no-sparkversion'], diff --git a/tests/unit/assessment/test_jobs.py b/tests/unit/assessment/test_jobs.py index 862e2bcf07..8ec3e89077 100644 --- a/tests/unit/assessment/test_jobs.py +++ b/tests/unit/assessment/test_jobs.py @@ -2,9 +2,9 @@ import pytest from databricks.labs.lsql.backends import MockBackend -from databricks.sdk.service.jobs import BaseJob, JobSettings, Job +from databricks.sdk.service.jobs import BaseJob, JobSettings -from databricks.labs.ucx.assessment.jobs import JobInfo, JobInfoOwnership, JobsCrawler, SubmitRunsCrawler, JobOwnership +from databricks.labs.ucx.assessment.jobs import JobInfo, JobOwnership, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.framework.owners import AdministratorLocator from .. import mock_workspace_client @@ -135,43 +135,22 @@ def test_job_run_crawler(jobruns_ids, cluster_ids, run_ids, failures): assert result[0].failures == failures -def test_jobinfo_owner_creator() -> None: - admin_locator = create_autospec(AdministratorLocator) - - ownership = JobInfoOwnership(admin_locator) - owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) - - assert owner == "bob" - admin_locator.get_workspace_administrator.assert_not_called() - - -def test_jobinfo_owner_creator_unknown() -> None: - admin_locator = create_autospec(AdministratorLocator) - admin_locator.get_workspace_administrator.return_value = "an_admin" - - ownership = JobInfoOwnership(admin_locator) - owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) - - assert owner == "an_admin" - admin_locator.get_workspace_administrator.assert_called_once() - - -def test_job_owner_creator() -> None: +def test_pipeline_owner_creator() -> None: admin_locator = create_autospec(AdministratorLocator) ownership = JobOwnership(admin_locator) - owner = ownership.owner_of(Job(creator_user_name="bob", job_id=1)) + owner = ownership.owner_of(JobInfo(creator="bob", job_id="1", success=1, failures="[]")) assert owner == "bob" admin_locator.get_workspace_administrator.assert_not_called() -def test_job_owner_creator_unknown() -> None: +def test_pipeline_owner_creator_unknown() -> None: admin_locator = create_autospec(AdministratorLocator) admin_locator.get_workspace_administrator.return_value = "an_admin" ownership = JobOwnership(admin_locator) - owner = ownership.owner_of(Job(job_id=1)) + owner = ownership.owner_of(JobInfo(creator=None, job_id="1", success=1, failures="[]")) assert owner == "an_admin" admin_locator.get_workspace_administrator.assert_called_once() diff --git a/tests/unit/hive_metastore/test_catalog_schema.py b/tests/unit/hive_metastore/test_catalog_schema.py index 1bd69787eb..f81092a92d 100644 --- a/tests/unit/hive_metastore/test_catalog_schema.py +++ b/tests/unit/hive_metastore/test_catalog_schema.py @@ -6,181 +6,130 @@ from databricks.labs.blueprint.tui import MockPrompts from databricks.labs.lsql.backends import MockBackend from 
databricks.sdk import WorkspaceClient -from databricks.sdk.errors import AlreadyExists, BadRequest, NotFound +from databricks.sdk.errors import BadRequest, NotFound from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, SchemaInfo from databricks.labs.ucx.hive_metastore.catalog_schema import CatalogSchema -from databricks.labs.ucx.hive_metastore.grants import Grant, MigrateGrants +from databricks.labs.ucx.hive_metastore.grants import PrincipalACL, Grant, GrantsCrawler from databricks.labs.ucx.hive_metastore.mapping import TableMapping -from databricks.labs.ucx.workspace_access.groups import GroupManager - - -def prepare_test( # pylint: disable=too-complex - ws, - backend: MockBackend | None = None, -) -> CatalogSchema: - """Prepare tests with the following setup: - - Existing HIVE metastore resources: - - Schemas: `schema1`, `schema2`, `schema3` - - Tables: Irrelevant for creating catalogs and schemas. - - Legacy ACLS: - - Principal `principal1`: - - DENY on `schema2` - - USAGE on `schema3` - - Users: - - `user1` has USE on `hive_metastore` - - `user2` has USAGE on `hive_metastore.schema2` - - `user3` has USAGE on `hive_metastore.schema3` - - `user4` has DENY on `hive_metastore.schema3` - - `user5` has SELECT on a table and view (Irrelevant for creating catalogs and schemas) - - Existing UC resources: - - Catalog `catalog1` - - Schema `catalog1.schema1` - - External locations (can be referenced when creating catalogs): - - `"s3://foo/bar"` - - `"abfss://container@storageaccount.dfs.core.windows.net"` - - To be created UC resources inferred from the mapping.csv - - Catalogs `catalog1`, `catalog2`, `catalog3` and `catalog4` - - Schemas: - - `catalog1.schema2` - - `catalog1.schema3` - - `catalog2.schema2` - - `catalog2.schema3` - - `catalog3.schema3` - - `catalog4.schema4` - """ - backend = backend or MockBackend() - existing_catalogs = {"catalog1"} - existing_schemas = {"catalog1.schema1"} - def get_catalog(catalog_name: str) -> CatalogInfo: - if catalog_name not in existing_catalogs: - raise NotFound(f"Catalog '{catalog_name}' does not exists.") - return CatalogInfo(name=catalog_name, full_name=catalog_name) - - def create_catalog(catalog_name: str, *_, **__) -> None: - if catalog_name in existing_catalogs: - raise AlreadyExists(f"Catalog '{catalog_name}' already exists.") - existing_catalogs.add(catalog_name) - - def get_schema(full_name: str) -> SchemaInfo: - if full_name not in existing_schemas: - raise NotFound(f"Schema '{full_name}' does not exists.") - catalog_name, schema_name = full_name.split(".", maxsplit=1) - return SchemaInfo(catalog_name=catalog_name, name=schema_name, full_name=full_name) - - def create_schema(schema: str, catalog: str, *_, **__) -> None: - full_name = f"{catalog}.{schema}" - if full_name in existing_schemas: - raise AlreadyExists(f"Schema '{full_name}' already exists.") - existing_schemas.add(full_name) +def prepare_test(ws, backend: MockBackend | None = None) -> CatalogSchema: ws.catalogs.list.return_value = [CatalogInfo(name="catalog1")] + + def get_catalog(catalog_name: str) -> CatalogInfo: + if catalog_name == "catalog1": + return CatalogInfo(name="catalog1") + raise NotFound(f"Catalog: {catalog_name}") + ws.catalogs.get.side_effect = get_catalog - ws.catalogs.create.side_effect = create_catalog - ws.schemas.list.return_value = [SchemaInfo(catalog_name="catalog1", name="schema1")] - ws.schemas.get.side_effect = get_schema - ws.schemas.create.side_effect = create_schema + + def raise_catalog_exists(catalog: str, *_, **__) -> None: + 
if catalog == "catalog1": + raise BadRequest("Catalog 'catalog1' already exists") + + ws.catalogs.create.side_effect = raise_catalog_exists + ws.schemas.list.return_value = [SchemaInfo(name="schema1")] ws.external_locations.list.return_value = [ ExternalLocationInfo(url="s3://foo/bar"), ExternalLocationInfo(url="abfss://container@storageaccount.dfs.core.windows.net"), ] + if backend is None: + backend = MockBackend() installation = MockInstallation( { - "mapping.csv": [ + 'mapping.csv': [ { - "catalog_name": "catalog1", - "dst_schema": "schema1", - "dst_table": "table1", - "src_schema": "schema1", - "src_table": "table1", - "workspace_name": "workspace", + 'catalog_name': 'catalog1', + 'dst_schema': 'schema3', + 'dst_table': 'table', + 'src_schema': 'schema3', + 'src_table': 'table', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog1", - "dst_schema": "schema2", - "dst_table": "table1", - "src_schema": "schema1", - "src_table": "abfss://container@msft/path/dest1", - "workspace_name": "workspace", + 'catalog_name': 'catalog2', + 'dst_schema': 'schema2', + 'dst_table': 'table', + 'src_schema': 'schema2', + 'src_table': 'table', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog1", - "dst_schema": "schema3", - "dst_table": "table1", - "src_schema": "schema3", - "src_table": "table", - "workspace_name": "workspace", + 'catalog_name': 'catalog2', + 'dst_schema': 'schema3', + 'dst_table': 'table2', + 'src_schema': 'schema2', + 'src_table': 'table2', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog2", - "dst_schema": "schema2", - "dst_table": "table1", - "src_schema": "schema2", - "src_table": "table", - "workspace_name": "workspace", + 'catalog_name': 'catalog1', + 'dst_schema': 'schema2', + 'dst_table': 'table3', + 'src_schema': 'schema1', + 'src_table': 'abfss://container@msft/path/dest1', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog2", - "dst_schema": "schema2", - "dst_table": "table2", - "src_schema": "schema2", - "src_table": "abfss://container@msft/path/dest2", - "workspace_name": "workspace", + 'catalog_name': 'catalog2', + 'dst_schema': 'schema2', + 'dst_table': 'table1', + 'src_schema': 'schema2', + 'src_table': 'abfss://container@msft/path/dest2', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog2", - "dst_schema": "schema3", - "dst_table": "table1", - "src_schema": "schema2", - "src_table": "table2", - "workspace_name": "workspace", + 'catalog_name': 'catalog3', + 'dst_schema': 'schema3', + 'dst_table': 'table1', + 'src_schema': 'schema1', + 'src_table': 'abfss://container@msft/path/dest3', + 'workspace_name': 'workspace', }, { - "catalog_name": "catalog3", - "dst_schema": "schema3", - "dst_table": "table1", - "src_schema": "schema1", - "src_table": "abfss://container@msft/path/dest3", - "workspace_name": "workspace", - }, - { - "catalog_name": "catalog4", - "dst_schema": "schema4", - "dst_table": "table1", - "src_schema": "schema1", - "src_table": "abfss://container@msft/path/dest4", - "workspace_name": "workspace", + 'catalog_name': 'catalog4', + 'dst_schema': 'schema4', + 'dst_table': 'table1', + 'src_schema': 'schema1', + 'src_table': 'abfss://container@msft/path/dest4', + 'workspace_name': 'workspace', }, ] } ) table_mapping = TableMapping(installation, ws, backend) + principal_acl = create_autospec(PrincipalACL) + interactive_cluster_grants = [ + Grant('princ1', 'SELECT', 'catalog1', 'schema3', 'table'), + Grant('princ1', 'MODIFY', 'catalog2', 'schema2', 'table'), + Grant('princ1', 'SELECT', 
'catalog2', 'schema3', 'table2'), + Grant('princ1', 'USAGE', 'hive_metastore', 'schema3'), + Grant('princ1', 'DENY', 'hive_metastore', 'schema2'), + ] + principal_acl.get_interactive_cluster_grants.return_value = interactive_cluster_grants + hive_acl = create_autospec(GrantsCrawler) + hive_grants = [ + Grant(principal="user1", catalog="hive_metastore", action_type="USE"), + Grant(principal="user2", catalog="hive_metastore", database="schema3", action_type="USAGE"), + Grant( + principal="user3", + catalog="hive_metastore", + database="database_one", + view="table_one", + action_type="SELECT", + ), + Grant(principal="user4", catalog="hive_metastore", database="schema3", action_type="DENY"), + Grant( + principal="user5", + catalog="hive_metastore", + database="schema2", + action_type="USAGE", + ), + ] + hive_acl.snapshot.return_value = hive_grants - def interactive_cluster_grants_loader() -> list[Grant]: - return [ - Grant("principal1", "DENY", "hive_metastore", "schema2"), - Grant("principal1", "USAGE", "hive_metastore", "schema3"), - ] - - def hive_grants_loader() -> list[Grant]: - return [ - Grant("user1", "USE", "hive_metastore"), - Grant("user2", "USAGE", "hive_metastore", "schema2"), - Grant("user3", "USAGE", "hive_metastore", "schema3"), - Grant("user4", "DENY", "hive_metastore", "schema3"), - Grant("user5", "SELECT", "hive_metastore", "schema2", table="table"), - Grant("user5", "SELECT", "hive_metastore", "schema2", view="view"), - ] - - group_manager = create_autospec(GroupManager) - group_manager.snapshot.return_value = [] - migrate_grants = MigrateGrants(backend, group_manager, [interactive_cluster_grants_loader, hive_grants_loader]) - - return CatalogSchema(ws, table_mapping, migrate_grants, "ucx", timeout=None) + return CatalogSchema(ws, table_mapping, principal_acl, backend, hive_acl, "ucx") def test_create_ucx_catalog_creates_ucx_catalog() -> None: @@ -260,7 +209,7 @@ def test_create_all_catalogs_schemas_creates_catalogs_with_different_locations() "catalog,schema", [("catalog1", "schema2"), ("catalog1", "schema3"), ("catalog2", "schema2"), ("catalog3", "schema3")], ) -def test_create_all_catalogs_schemas_creates_schemas(catalog: str, schema: str) -> None: +def test_create_all_catalogs_schemas_creates_schemas(catalog: str, schema: str): """Non-existing schemas should be created.""" ws = create_autospec(WorkspaceClient) mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": "metastore"}) @@ -271,21 +220,18 @@ def test_create_all_catalogs_schemas_creates_schemas(catalog: str, schema: str) ws.schemas.create.assert_any_call(schema, catalog, comment="Created by UCX") -def test_create_catalogs_and_schemas_with_invalid_storage_location() -> None: +def test_create_bad_location(): ws = create_autospec(WorkspaceClient) mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": "s3://foo/fail"}) catalog_schema = prepare_test(ws) - with pytest.raises(NotFound): catalog_schema.create_all_catalogs_schemas(mock_prompts) - # `catalog3` and `catalog4` are not reached as the logic breaks when the users fails to supply a valid location - calls = [call("catalog1"), call("catalog2")] - ws.catalogs.get.assert_has_calls(calls) ws.catalogs.create.assert_not_called() + ws.catalogs.list.assert_called_once() ws.schemas.create.assert_not_called() -def test_no_catalog_storage() -> None: +def test_no_catalog_storage(): ws = create_autospec(WorkspaceClient) mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) @@ -295,7 +241,6 
@@ def test_no_catalog_storage() -> None: calls = [ call("catalog2", comment="Created by UCX", properties=None), call("catalog3", comment="Created by UCX", properties=None), - call("catalog4", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) @@ -304,29 +249,28 @@ def test_catalog_schema_acl() -> None: ws = create_autospec(WorkspaceClient) backend = MockBackend() mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) - catalog_schema = prepare_test(ws, backend) + catalog_schema = prepare_test(ws, backend) catalog_schema.create_all_catalogs_schemas(mock_prompts) calls = [ call("catalog2", comment="Created by UCX", properties=None), call("catalog3", comment="Created by UCX", properties=None), - call("catalog4", comment="Created by UCX", properties=None), ] ws.catalogs.create.assert_has_calls(calls, any_order=True) ws.schemas.create.assert_any_call("schema2", "catalog2", comment="Created by UCX") queries = [ - 'GRANT USE CATALOG ON CATALOG `catalog1` TO `principal1`', - 'GRANT USE CATALOG ON CATALOG `catalog1` TO `user3`', - 'GRANT USE CATALOG ON CATALOG `catalog2` TO `user2`', - 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `principal1`', - 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `user3`', - 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `user2`', - 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `user2`', + 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `princ1`', + 'GRANT USE CATALOG ON CATALOG `catalog1` TO `princ1`', + 'GRANT USE CATALOG ON CATALOG `catalog1` TO `user2`', + 'GRANT USE SCHEMA ON DATABASE `catalog1`.`schema3` TO `user2`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema2` TO `user5`', + 'GRANT USE SCHEMA ON DATABASE `catalog2`.`schema3` TO `user5`', + 'GRANT USE CATALOG ON CATALOG `catalog2` TO `user5`', ] - - assert not set(backend.queries) - set(queries), f"Additional queries {set(backend.queries) - set(queries)}" - assert not set(queries) - set(backend.queries), f"Missing queries {set(queries) - set(backend.queries)}" + assert len(backend.queries) == len(queries) + for query in queries: + assert query in backend.queries def test_create_all_catalogs_schemas_logs_untranslatable_grant(caplog) -> None: @@ -337,21 +281,14 @@ def test_create_all_catalogs_schemas_logs_untranslatable_grant(caplog) -> None: with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore.catalog_schema"): catalog_schema.create_all_catalogs_schemas(mock_prompts) - message_prefix = "failed-to-migrate: Hive metastore grant 'DENY' cannot be mapped to UC grant for" - assert f"{message_prefix} DATABASE 'catalog1.schema3'. Skipping." in caplog.messages - assert f"{message_prefix} CATALOG 'catalog2'. Skipping." in caplog.messages - assert f"{message_prefix} DATABASE 'catalog2.schema2'. Skipping." in caplog.messages - assert f"{message_prefix} DATABASE 'catalog2.schema3'. Skipping." 
in caplog.messages - ws.assert_not_called() - - -def test_create_catalogs_and_schemas_logs_skipping_already_existing_unity_catalog_resources(caplog) -> None: - ws = create_autospec(WorkspaceClient) - mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) - catalog_schema = prepare_test(ws) - - with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.hive_metastore.catalog_schema"): - catalog_schema.create_all_catalogs_schemas(mock_prompts) - assert "Skipping already existing catalog: catalog1" in caplog.text - assert "Skipping already existing schema: catalog1.schema1" in caplog.text + assert ( + "Skipping legacy grant that is not supported in UC: DENY on ('DATABASE', 'catalog1.schema3')" in caplog.messages + ) + assert "Skipping legacy grant that is not supported in UC: DENY on ('CATALOG', 'catalog2')" in caplog.messages + assert ( + "Skipping legacy grant that is not supported in UC: DENY on ('DATABASE', 'catalog2.schema2')" in caplog.messages + ) + assert ( + "Skipping legacy grant that is not supported in UC: DENY on ('DATABASE', 'catalog2.schema3')" in caplog.messages + ) ws.assert_not_called() diff --git a/tests/unit/hive_metastore/test_grants.py b/tests/unit/hive_metastore/test_grants.py index f4c568630c..7f31824e02 100644 --- a/tests/unit/hive_metastore/test_grants.py +++ b/tests/unit/hive_metastore/test_grants.py @@ -1,4 +1,3 @@ -import dataclasses import logging from unittest.mock import create_autospec @@ -6,113 +5,112 @@ from databricks.labs.lsql.backends import MockBackend from databricks.labs.ucx.framework.owners import AdministratorLocator -from databricks.labs.ucx.hive_metastore.catalog_schema import Catalog, Schema from databricks.labs.ucx.hive_metastore.grants import Grant, GrantsCrawler, MigrateGrants, GrantOwnership from databricks.labs.ucx.hive_metastore.tables import Table, TablesCrawler from databricks.labs.ucx.hive_metastore.udfs import UdfsCrawler from databricks.labs.ucx.workspace_access.groups import GroupManager -def test_type_and_key_table() -> None: - type_and_key = Grant.type_and_key(catalog="hive_metastore", database="mydb", table="mytable") - assert type_and_key == ("TABLE", "hive_metastore.mydb.mytable") +def test_type_and_key_table(): + grant = Grant.type_and_key(catalog="hive_metastore", database="mydb", table="mytable") + assert grant == ("TABLE", "hive_metastore.mydb.mytable") grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", table="mytable") assert grant.this_type_and_key()[0] == "TABLE" assert grant.object_key == "hive_metastore.mydb.mytable" -def test_type_and_key_view() -> None: - type_and_key = Grant.type_and_key(catalog="hive_metastore", database="mydb", view="myview") - assert type_and_key == ("VIEW", "hive_metastore.mydb.myview") +def test_type_and_key_view(): + grant = Grant.type_and_key(catalog="hive_metastore", database="mydb", view="myview") + assert grant == ("VIEW", "hive_metastore.mydb.myview") grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", view="myview") assert grant.this_type_and_key()[0] == "VIEW" assert grant.object_key == "hive_metastore.mydb.myview" -def test_type_and_key_database() -> None: - type_and_key = Grant.type_and_key(catalog="hive_metastore", database="mydb") - assert type_and_key == ("DATABASE", "hive_metastore.mydb") +def test_type_and_key_database(): + grant = Grant.type_and_key(catalog="hive_metastore", database="mydb") + assert grant == ("DATABASE", "hive_metastore.mydb") grant = 
Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb") assert grant.this_type_and_key()[0] == "DATABASE" assert grant.object_key == "hive_metastore.mydb" -def test_type_and_key_catalog() -> None: - type_and_key = Grant.type_and_key(catalog="mycatalog") - assert type_and_key == ("CATALOG", "mycatalog") +def test_type_and_key_catalog(): + grant = Grant.type_and_key(catalog="mycatalog") + assert grant == ("CATALOG", "mycatalog") grant = Grant(principal="user", action_type="SELECT", catalog="mycatalog") assert grant.this_type_and_key()[0] == "CATALOG" assert grant.object_key == "mycatalog" -def test_type_and_key_any_file() -> None: - type_and_key = Grant.type_and_key(any_file=True) - assert type_and_key == ("ANY FILE", "") +def test_type_and_key_any_file(): + grant = Grant.type_and_key(any_file=True) + assert grant == ("ANY FILE", "") grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", any_file=True) assert grant.this_type_and_key()[0] == "ANY FILE" assert grant.object_key == "" -def test_type_and_key_anonymous_function() -> None: - type_and_key = Grant.type_and_key(anonymous_function=True) - assert type_and_key == ("ANONYMOUS FUNCTION", "") +def test_type_and_key_anonymous_function(): + grant = Grant.type_and_key(anonymous_function=True) + assert grant == ("ANONYMOUS FUNCTION", "") grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", anonymous_function=True) assert grant.this_type_and_key()[0] == "ANONYMOUS FUNCTION" assert grant.object_key == "" -def test_type_and_key_udf() -> None: - type_and_key = Grant.type_and_key(catalog="hive_metastore", database="mydb", udf="myfunction") - assert type_and_key == ("FUNCTION", "hive_metastore.mydb.myfunction") +def test_type_and_key_udf(): + grant = Grant.type_and_key(catalog="hive_metastore", database="mydb", udf="myfunction") + assert grant == ("FUNCTION", "hive_metastore.mydb.myfunction") grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", udf="myfunction") assert grant.this_type_and_key()[0] == "FUNCTION" assert grant.object_key == "hive_metastore.mydb.myfunction" -def test_type_and_key_invalid() -> None: +def test_type_and_key_invalid(): with pytest.raises(ValueError): Grant.type_and_key() -def test_object_key() -> None: +def test_object_key(): grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", table="mytable") assert grant.object_key == "hive_metastore.mydb.mytable" -def test_hive_sql() -> None: +def test_hive_sql(): grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", table="mytable") assert grant.hive_grant_sql() == ["GRANT SELECT ON TABLE `hive_metastore`.`mydb`.`mytable` TO `user`"] assert grant.hive_revoke_sql() == "REVOKE SELECT ON TABLE `hive_metastore`.`mydb`.`mytable` FROM `user`" -def test_hive_table_own_sql() -> None: +def test_hive_table_own_sql(): grant = Grant(principal="user", action_type="OWN", catalog="hive_metastore", database="mydb", table="mytable") assert grant.hive_grant_sql() == ["ALTER TABLE `hive_metastore`.`mydb`.`mytable` OWNER TO `user`"] -def test_hive_database_own_sql() -> None: +def test_hive_database_own_sql(): grant = Grant(principal="user", action_type="OWN", catalog="hive_metastore", database="mydb") assert grant.hive_grant_sql() == ["ALTER DATABASE `hive_metastore`.`mydb` OWNER TO `user`"] -def test_hive_udf_own_sql() -> None: +def test_hive_udf_own_sql(): grant = Grant(principal="user", 
action_type="OWN", catalog="hive_metastore", database="mydb", udf="myfunction") assert grant.hive_grant_sql() == ["ALTER FUNCTION `hive_metastore`.`mydb`.`myfunction` OWNER TO `user`"] -def test_hive_revoke_sql() -> None: +def test_hive_revoke_sql(): grant = Grant(principal="user", action_type="SELECT", catalog="hive_metastore", database="mydb", table="mytable") assert grant.hive_revoke_sql() == "REVOKE SELECT ON TABLE `hive_metastore`.`mydb`.`mytable` FROM `user`" -def test_hive_deny_sql() -> None: +def test_hive_deny_sql(): grant = Grant( principal="user", action_type="DENIED_SELECT", catalog="hive_metastore", database="mydb", table="mytable" ) @@ -144,7 +142,7 @@ def test_hive_deny_sql() -> None: ), ], ) -def test_uc_sql(grant, query) -> None: +def test_uc_sql(grant, query): assert grant.uc_grant_sql() == query @@ -177,7 +175,7 @@ def test_uc_sql(grant, query) -> None: } -def test_crawler_no_data() -> None: +def test_crawler_no_data(): sql_backend = MockBackend() table = TablesCrawler(sql_backend, "schema") udf = UdfsCrawler(sql_backend, "schema") @@ -186,7 +184,7 @@ def test_crawler_no_data() -> None: assert len(grants) == 0 -def test_crawler_crawl() -> None: +def test_crawler_crawl(): sql_backend = MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[ @@ -248,7 +246,7 @@ def test_crawler_crawl() -> None: assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_udf_crawl() -> None: +def test_crawler_udf_crawl(): sql_backend = MockBackend( rows={ "SHOW DATABASES": SHOW_DATABASES[("database_one",),], @@ -298,7 +296,7 @@ def test_crawler_udf_crawl() -> None: assert len(grants) == len(expected_grants) and set(grants) == expected_grants -def test_crawler_snapshot_when_no_data() -> None: +def test_crawler_snapshot_when_no_data(): sql_backend = MockBackend() table = TablesCrawler(sql_backend, "schema") udf = UdfsCrawler(sql_backend, "schema") @@ -307,7 +305,7 @@ def test_crawler_snapshot_when_no_data() -> None: assert len(snapshot) == 0 -def test_crawler_snapshot_with_data() -> None: +def test_crawler_snapshot_with_data(): sql_backend = MockBackend(rows=ROWS) table = TablesCrawler(sql_backend, "schema") udf = UdfsCrawler(sql_backend, "schema") @@ -316,7 +314,7 @@ def test_crawler_snapshot_with_data() -> None: assert len(snapshot) == 3 -def test_grants_returning_error_when_showing_grants() -> None: +def test_grants_returning_error_when_showing_grants(): errors = {"SHOW GRANTS ON TABLE `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -355,7 +353,7 @@ def test_grants_returning_error_when_showing_grants() -> None: ] -def test_grants_returning_error_when_describing() -> None: +def test_grants_returning_error_when_describing(): errors = {"DESCRIBE TABLE EXTENDED `hive_metastore`.`test_database`.`table1`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("test_database",),], @@ -391,7 +389,7 @@ def test_grants_returning_error_when_describing() -> None: ] -def test_udf_grants_returning_error_when_showing_grants() -> None: +def test_udf_grants_returning_error_when_showing_grants(): errors = {"SHOW GRANTS ON FUNCTION `hive_metastore`.`test_database`.`function_bad`": "error"} rows = { "SHOW DATABASES": SHOW_DATABASES[ @@ -430,7 +428,7 @@ def test_udf_grants_returning_error_when_showing_grants() -> None: ] -def test_udf_grants_returning_error_when_describing() -> None: +def test_udf_grants_returning_error_when_describing(): errors = {"DESCRIBE FUNCTION EXTENDED `hive_metastore`.`test_database`.`function_bad`": 
"error"} rows = { "SHOW DATABASES": SHOW_DATABASES[("test_database",),], @@ -466,7 +464,7 @@ def test_udf_grants_returning_error_when_describing() -> None: ] -def test_crawler_should_filter_databases() -> None: +def test_crawler_should_filter_databases(): sql_backend = MockBackend( rows={ "SHOW TABLES FROM `hive_metastore`\\.`database_one`": SHOW_TABLES[("database_one", "table_one", "true"),], @@ -502,118 +500,6 @@ def test_crawler_should_filter_databases() -> None: assert len(grants) == len(expected_grants) and set(grants) == expected_grants -@pytest.mark.parametrize( - "src, grant, dst, query", - [ - ( - Catalog("hive_metastore"), - Grant("user", "USAGE"), - Catalog("catalog"), - "GRANT USE CATALOG ON CATALOG `catalog` TO `user`", - ), - ( - Schema("hive_metastore", "schema"), - Grant("user", "USAGE"), - Schema("catalog", "schema"), - "GRANT USE SCHEMA ON DATABASE `catalog`.`schema` TO `user`", - ), - ( - Table("hive_metastore", "database", "table", "MANAGED", "DELTA"), - Grant("user", "SELECT"), - Table("catalog", "database", "table", "MANAGED", "DELTA"), - "GRANT SELECT ON TABLE `catalog`.`database`.`table` TO `user`", - ), - ( - Catalog("hive_metastore"), - Grant("user", "OWN"), - Catalog("catalog"), - "ALTER CATALOG `catalog` OWNER TO `user`", - ), - ( - Schema("hive_metastore", "schema"), - Grant("user", "OWN"), - Schema("catalog", "schema"), - "ALTER DATABASE `catalog`.`schema` OWNER TO `user`", - ), - ( - Table("hive_metastore", "database", "table", "MANAGED", "DELTA"), - Grant("user", "OWN"), - Table("catalog", "database", "table", "MANAGED", "DELTA"), - "ALTER TABLE `catalog`.`database`.`table` OWNER TO `user`", - ), - ], -) -def test_migrate_grants_applies_query( - src: Catalog | Schema | Table, - grant: Grant, - dst: Catalog | Schema | Table, - query: str, -) -> None: - group_manager = create_autospec(GroupManager) - backend = MockBackend() - - def grant_loader() -> list[Grant]: - database = table = None - if isinstance(src, Catalog): - catalog = src.name - elif isinstance(src, Schema): - catalog = src.catalog - database = src.name - elif isinstance(src, Table): - catalog = src.catalog - database = src.database - table = src.name - else: - raise TypeError(f"Unsupported source type: {type(src)}") - return [ - dataclasses.replace( - grant, - catalog=catalog, - database=database, - table=table, - ), - ] - - migrate_grants = MigrateGrants( - backend, - group_manager, - [grant_loader], - ) - - migrate_grants.apply(src, dst) - - assert query in backend.queries - group_manager.assert_not_called() - - -def test_migrate_grants_alters_ownership_as_last() -> None: - queries = [ - "GRANT USE SCHEMA ON DATABASE `catalog`.`schema` TO `user`", - "ALTER DATABASE `catalog`.`schema` OWNER TO `user`", - ] - group_manager = create_autospec(GroupManager) - backend = MockBackend() - - def grant_loader() -> list[Grant]: - return [ - Grant("user", "OWN", "hive_metastore", "schema"), - Grant("user", "USAGE", "hive_metastore", "schema"), - ] - - migrate_grants = MigrateGrants( - backend, - group_manager, - [grant_loader], - ) - src = Schema("hive_metastore", "schema") - dst = Schema("catalog", "schema") - - migrate_grants.apply(src, dst) - - assert backend.queries == queries - group_manager.assert_not_called() - - def test_migrate_grants_logs_unmapped_acl(caplog) -> None: group_manager = create_autospec(GroupManager) table = Table("hive_metastore", "database", "table", "MANAGED", "DELTA") @@ -636,9 +522,9 @@ def grant_loader() -> list[Grant]: ) with caplog.at_level(logging.WARNING, 
logger="databricks.labs.ucx.hive_metastore.grants"): - migrate_grants.apply(table, dataclasses.replace(table, catalog="catalog")) + migrate_grants.apply(table, f"uc.{table.database}.{table.name}") assert ( - "failed-to-migrate: Hive metastore grant 'READ_METADATA' cannot be mapped to UC grant for TABLE 'catalog.database.table'" + "failed-to-migrate: Hive metastore grant 'READ_METADATA' cannot be mapped to UC grant for TABLE 'uc.database.table'" in caplog.text ) group_manager.assert_not_called() diff --git a/tests/unit/hive_metastore/test_locations.py b/tests/unit/hive_metastore/test_locations.py index 2e0a6306a4..f878ad5cb7 100644 --- a/tests/unit/hive_metastore/test_locations.py +++ b/tests/unit/hive_metastore/test_locations.py @@ -22,7 +22,7 @@ @pytest.mark.parametrize( "location", [ - "s3://databricks-e2demofieldengwest/b169/b50", + "s3://databricks-e2demofieldengwest/b169/b50" "s3a://databricks-datasets-oregon/delta-sharing/share/open-datasets.share", "s3n://bucket-name/path-to-file-in-bucket", "gcs://test_location2/test2/table2", @@ -166,9 +166,8 @@ def test_external_locations(): table_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/Table2", ""]), table_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/testloc/Table3", ""]), table_factory(["s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/anotherloc/Table4", ""]), - table_factory(["gcs://test_location2/a/b/table2", ""]), table_factory(["dbfs:/mnt/ucx/database1/table1", ""]), - table_factory(["/dbfs/mnt/ucx/database2/table2", ""]), + table_factory(["dbfs:/mnt/ucx/database2/table2", ""]), table_factory(["DatabricksRootmntDatabricksRoot", ""]), table_factory( [ @@ -213,18 +212,18 @@ def test_external_locations(): mounts_crawler.snapshot.return_value = [Mount("/mnt/ucx", "s3://us-east-1-ucx-container")] sql_backend = MockBackend() crawler = ExternalLocations(Mock(), sql_backend, "test", tables_crawler, mounts_crawler) - assert crawler.snapshot() == [ - ExternalLocation('gcs://test_location2/a/b', 1), - ExternalLocation( - 'jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be', 1 - ), - ExternalLocation('jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db', 1), - ExternalLocation('jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db', 2), - ExternalLocation('jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db', 1), - ExternalLocation('s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location', 2), - ExternalLocation('s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23', 2), - ExternalLocation('s3://us-east-1-ucx-container', 2), - ] + result_set = crawler.snapshot() + assert len(result_set) == 7 + assert result_set[0].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-1/Location/" + assert result_set[0].table_count == 2 + assert result_set[1].location == "s3://us-east-1-dev-account-staging-uc-ext-loc-bucket-23/" + assert ( + result_set[3].location + == "jdbc:databricks://dbc-test1-aa11.cloud.databricks.com;httpPath=/sql/1.0/warehouses/65b52fb5bd86a7be" + ) + assert result_set[4].location == "jdbc:mysql://somemysql.us-east-1.rds.amazonaws.com:3306/test_db" + assert result_set[5].location == "jdbc:providerknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db" + assert result_set[6].location == "jdbc:providerunknown://somedb.us-east-1.rds.amazonaws.com:1234/test_db" LOCATION_STORAGE = MockBackend.rows("location", "storage_properties") @@ -238,7 +237,10 @@ def 
test_save_external_location_mapping_missing_location(): tables_crawler = create_autospec(TablesCrawler) tables_crawler.snapshot.return_value = [ table_factory(["s3://test_location/test1/table1", ""]), - table_factory(["s3://test_location/test1/table2", ""]), + table_factory(["gcs://test_location2/test2/table2", ""]), + table_factory(["abfss://cont1@storagetest1.dfs.core.windows.net/test2/table3", ""]), + table_factory(["s3a://test_location_2/test1/table1", ""]), + table_factory(["s3n://test_location_3/test1/table1", ""]), ] mounts_crawler = create_autospec(MountsCrawler) mounts_crawler.snapshot.return_value = [] @@ -255,6 +257,26 @@ def test_save_external_location_mapping_missing_location(): ' name = "test_location_test1"\n' ' url = "s3://test_location/test1"\n' " credential_name = databricks_storage_credential..id\n" + "}\n\n" + 'resource "databricks_external_location" "test_location2_test2" { \n' + ' name = "test_location2_test2"\n' + ' url = "gcs://test_location2/test2"\n' + " credential_name = databricks_storage_credential..id\n" + "}\n\n" + 'resource "databricks_external_location" "cont1_storagetest1_test2" { \n' + ' name = "cont1_storagetest1_test2"\n' + ' url = "abfss://cont1@storagetest1.dfs.core.windows.net/test2"\n' + " credential_name = databricks_storage_credential..id\n" + "}\n\n" + 'resource "databricks_external_location" "test_location_2_test1" { \n' + ' name = "test_location_2_test1"\n' + ' url = "s3a://test_location_2/test1"\n' + " credential_name = databricks_storage_credential..id\n" + "}\n\n" + 'resource "databricks_external_location" "test_location_3_test1" { \n' + ' name = "test_location_3_test1"\n' + ' url = "s3n://test_location_3/test1"\n' + " credential_name = databricks_storage_credential..id\n" "}\n" ).encode("utf8"), ) @@ -295,10 +317,8 @@ def test_match_table_external_locations(): matching_locations, missing_locations = location_crawler.match_table_external_locations() assert len(matching_locations) == 1 - assert [ - ExternalLocation("abfss://cont1@storagetest1/a", 2), - ExternalLocation("gcs://test_location2/a/b", 1), - ] == missing_locations + assert ExternalLocation("gcs://test_location2/a/b/", 1) in missing_locations + assert ExternalLocation("abfss://cont1@storagetest1/a/", 2) in missing_locations def test_mount_listing_multiple_folders(): @@ -717,20 +737,3 @@ def my_side_effect(path, **_): assert results[0].location == "adls://bucket/table1" assert results[1].location == "dbfs:/mnt/test_mount/table2" assert results[2].location is None - - -def test_resolve_dbfs_root_in_hms_federation(): - jvm = Mock() - sql_backend = MockBackend() - client = create_autospec(WorkspaceClient) - client.dbutils.fs.mounts.return_value = [MountInfo('/', 'DatabricksRoot', '')] - - mounts_crawler = MountsCrawler(sql_backend, client, "test", enable_hms_federation=True) - mounts_crawler.__dict__['_jvm'] = jvm - - hms_fed_dbfs_utils = jvm.com.databricks.sql.managedcatalog.connections.HmsFedDbfsUtils - hms_fed_dbfs_utils.resolveDbfsPath().get().toString.return_value = 's3://original/bucket/user/hive/warehouse' - - mounts = mounts_crawler.snapshot() - - assert [Mount("/", 's3://original/bucket/')] == mounts diff --git a/tests/unit/hive_metastore/test_mapping.py b/tests/unit/hive_metastore/test_mapping.py index f07399287d..e0ac9f56ad 100644 --- a/tests/unit/hive_metastore/test_mapping.py +++ b/tests/unit/hive_metastore/test_mapping.py @@ -46,19 +46,6 @@ ) -def test_rule_as_uc_table() -> None: - table = Table("catalog", "destination_schema", "destination_table", "UNKNOWN", "UNKNOWN") - 
rule = Rule( - workspace_name="workspace", - catalog_name="catalog", - src_schema="source_schema", - dst_schema="destination_schema", - src_table="source_table", - dst_table="destination_table", - ) - assert rule.as_uc_table == table - - def test_current_tables_empty_fails(): ws = create_autospec(WorkspaceClient) errors = {} diff --git a/tests/unit/hive_metastore/test_migrate_acls.py b/tests/unit/hive_metastore/test_migrate_acls.py index 5e69fec945..d90c8e9bc3 100644 --- a/tests/unit/hive_metastore/test_migrate_acls.py +++ b/tests/unit/hive_metastore/test_migrate_acls.py @@ -1,7 +1,5 @@ import logging -from collections.abc import Callable, Iterable from unittest.mock import create_autospec - import pytest from databricks.labs.lsql.backends import SqlBackend @@ -73,21 +71,20 @@ def test_migrate_acls_hms_fed_proper_queries(ws, ws_info, caplog): migrate_grants.apply.assert_called_with(src, 'hms_fed.db1_src.managed_dbfs') -def test_migrate_matched_grants_applies() -> None: +def test_migrate_matched_grants_applies(): sql_backend = create_autospec(SqlBackend) group_manager = create_autospec(GroupManager) src = Table('hive_metastore', 'default', 'foo', 'MANAGED', 'DELTA') - dst = Table('catalog', 'schema', 'table', 'MANAGED', 'DELTA') - one_grant: list[Callable[[], Iterable[Grant]]] = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] + one_grant = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] migrate_grants = MigrateGrants(sql_backend, group_manager, one_grant) - migrate_grants.apply(src, dst) + migrate_grants.apply(src, 'catalog.schema.table') group_manager.snapshot.assert_called() sql_backend.execute.assert_called_with('GRANT SELECT ON TABLE `catalog`.`schema`.`table` TO `me`') -def test_migrate_matched_grants_applies_and_remaps_group() -> None: +def test_migrate_matched_grants_applies_and_remaps_group(): sql_backend = create_autospec(SqlBackend) group_manager = create_autospec(GroupManager) group_manager.snapshot.return_value = [ @@ -99,25 +96,23 @@ def test_migrate_matched_grants_applies_and_remaps_group() -> None: ), ] src = Table('hive_metastore', 'default', 'foo', 'MANAGED', 'DELTA') - dst = Table('catalog', 'schema', 'table', 'MANAGED', 'DELTA') - one_grant: list[Callable[[], Iterable[Grant]]] = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] + one_grant = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] migrate_grants = MigrateGrants(sql_backend, group_manager, one_grant) - migrate_grants.apply(src, dst) + migrate_grants.apply(src, 'catalog.schema.table') group_manager.snapshot.assert_called() sql_backend.execute.assert_called_with('GRANT SELECT ON TABLE `catalog`.`schema`.`table` TO `myself`') -def test_migrate_no_matched_grants_no_apply() -> None: +def test_migrate_no_matched_grants_no_apply(): sql_backend = create_autospec(SqlBackend) group_manager = create_autospec(GroupManager) src = Table('hive_metastore', 'default', 'bar', 'MANAGED', 'DELTA') - dst = Table('catalog', 'schema', 'table', 'MANAGED', 'DELTA') - one_grant: list[Callable[[], Iterable[Grant]]] = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] + one_grant = [lambda: [Grant('me', 'SELECT', database='default', table='foo')]] migrate_grants = MigrateGrants(sql_backend, group_manager, one_grant) - migrate_grants.apply(src, dst) + migrate_grants.apply(src, 'catalog.schema.table') group_manager.snapshot.assert_not_called() sql_backend.execute.assert_not_called() diff --git a/tests/unit/hive_metastore/test_table_migrate.py 
b/tests/unit/hive_metastore/test_table_migrate.py index 42d20fc63d..23c45447bb 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1,6 +1,5 @@ import datetime import logging -import sys from itertools import cycle from unittest.mock import create_autospec @@ -11,7 +10,6 @@ from databricks.sdk.service.catalog import CatalogInfo, SchemaInfo, TableInfo from databricks.labs.ucx.framework.owners import AdministratorLocator -from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.hive_metastore.grants import MigrateGrants from databricks.labs.ucx.hive_metastore.locations import ExternalLocations from databricks.labs.ucx.hive_metastore.mapping import ( @@ -43,14 +41,7 @@ logger = logging.getLogger(__name__) -@pytest.fixture -def mock_pyspark(mocker): - pyspark_sql_session = mocker.Mock() - sys.modules["pyspark.sql.session"] = pyspark_sql_session - - -def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws, mock_pyspark): - +def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws): errors = {} rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} backend = MockBackend(fails_on_first=errors, rows=rows) @@ -96,7 +87,7 @@ def test_migrate_dbfs_root_tables_should_produce_proper_queries(ws, mock_pyspark external_locations.resolve_mount.assert_not_called() -def test_dbfs_non_delta_tables_should_produce_proper_queries(ws, mock_pyspark): +def test_dbfs_non_delta_tables_should_produce_proper_queries(ws): errors = {} rows = { "SHOW CREATE TABLE": [ @@ -142,7 +133,7 @@ def test_dbfs_non_delta_tables_should_produce_proper_queries(ws, mock_pyspark): ) in backend.queries -def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws, mock_pyspark): +def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws): errors = {} rows = {} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) @@ -167,7 +158,7 @@ def test_migrate_dbfs_root_tables_should_be_skipped_when_upgrading_external(ws, external_locations.resolve_mount.assert_not_called() -def test_migrate_external_tables_should_produce_proper_queries(ws, mock_pyspark): +def test_migrate_external_tables_should_produce_proper_queries(ws): errors = {} rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} crawler_backend = MockBackend(fails_on_first=errors, rows=rows) @@ -201,35 +192,7 @@ def test_migrate_external_tables_should_produce_proper_queries(ws, mock_pyspark) ] -def test_migrate_managed_table_as_external_tables_with_conversion(ws, mock_pyspark): - errors = {} - rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]} - crawler_backend = MockBackend(fails_on_first=errors, rows=rows) - backend = MockBackend(fails_on_first=errors, rows=rows) - table_crawler = TablesCrawler(crawler_backend, "inventory_database") - table_mapping = mock_table_mapping(["managed_other"]) - migration_status_refresher = TableMigrationStatusRefresher(ws, backend, "inventory_database", table_crawler) - migrate_grants = create_autospec(MigrateGrants) - external_locations = create_autospec(ExternalLocations) - table_migrate = TablesMigrator( - table_crawler, ws, backend, table_mapping, migration_status_refresher, migrate_grants, external_locations - ) - table_migrate.migrate_tables(what=What.EXTERNAL_SYNC, managed_table_external_storage="CONVERT_TO_EXTERNAL") - - migrate_grants.apply.assert_called() - 
     external_locations.resolve_mount.assert_not_called()
-
-    assert backend.queries == [
-        "SYNC TABLE `ucx_default`.`db1_dst`.`managed_other` FROM `hive_metastore`.`db1_src`.`managed_other`;",
-        (
-            f"ALTER TABLE `ucx_default`.`db1_dst`.`managed_other` "
-            f"SET TBLPROPERTIES ('upgraded_from' = 'hive_metastore.db1_src.managed_other' , "
-            f"'{Table.UPGRADED_FROM_WS_PARAM}' = '123');"
-        ),
-    ]
-
-
-def test_migrate_managed_table_as_external_tables_without_conversion(ws, mock_pyspark):
+def test_migrate_managed_table_as_external_tables_without_conversion(ws):
     errors = {}
     rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]}
     crawler_backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -263,7 +226,7 @@ def test_migrate_managed_table_as_external_tables_without_conversion(ws, mock_py
     ]
 
 
-def test_migrate_managed_table_as_managed_tables_should_produce_proper_queries(ws, mock_pyspark):
+def test_migrate_managed_table_as_managed_tables_should_produce_proper_queries(ws):
     errors = {}
     rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("SUCCESS", "test")]}
     crawler_backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -302,7 +265,7 @@ def test_migrate_managed_table_as_managed_tables_should_produce_proper_queries(w
     ]
 
 
-def test_migrate_external_table_failed_sync(ws, caplog, mock_pyspark):
+def test_migrate_external_table_failed_sync(ws, caplog):
     errors = {}
     rows = {r"SYNC .*": MockBackend.rows("status_code", "description")[("LOCATION_OVERLAP", "test")]}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -398,7 +361,14 @@ def test_migrate_external_table_failed_sync(ws, caplog, mock_pyspark):
     ],
 )
 def test_migrate_external_hiveserde_table_in_place(
-    ws, caplog, hiveserde_in_place_migrate, describe, ddl, errors, migrated, expected_value, mock_pyspark
+    ws,
+    caplog,
+    hiveserde_in_place_migrate,
+    describe,
+    ddl,
+    errors,
+    migrated,
+    expected_value,
 ):
     caplog.set_level(logging.INFO)
     backend = MockBackend(
@@ -464,7 +434,7 @@ def resolve_mount(location: str) -> str:
         ),
     ],
 )
-def test_migrate_external_tables_ctas_should_produce_proper_queries(ws, what, test_table, expected_query, mock_pyspark):
+def test_migrate_external_tables_ctas_should_produce_proper_queries(ws, what, test_table, expected_query):
     backend = MockBackend()
     table_crawler = TablesCrawler(backend, "inventory_database")
     table_mapping = mock_table_mapping([test_table])
@@ -493,7 +463,7 @@ def resolve_mount(location: str) -> str:
     migrate_grants.apply.assert_called()
 
 
-def test_migrate_already_upgraded_table_should_produce_no_queries(ws, mock_pyspark):
+def test_migrate_already_upgraded_table_should_produce_no_queries(ws):
     errors = {}
     rows = {}
     crawler_backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -540,7 +510,7 @@ def test_migrate_already_upgraded_table_should_produce_no_queries(ws, mock_pyspa
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_migrate_unsupported_format_table_should_produce_no_queries(ws, mock_pyspark):
+def test_migrate_unsupported_format_table_should_produce_no_queries(ws):
     errors = {}
     rows = {}
     crawler_backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -566,7 +536,7 @@ def test_migrate_unsupported_format_table_should_produce_no_queries(ws, mock_pys
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_migrate_view_should_produce_proper_queries(ws, mock_pyspark):
+def test_migrate_view_should_produce_proper_queries(ws):
     errors = {}
     original_view = (
         "CREATE OR REPLACE VIEW `hive_metastore`.`db1_src`.`view_src` AS SELECT * FROM `db1_src`.`managed_dbfs`"
@@ -619,7 +589,7 @@ def test_migrate_view_with_local_dataset_should_be_skipped(ws):
     assert dependencies == [TableView(catalog="hive_metastore", schema="database", name="v")]
 
 
-def test_migrate_view_with_columns(ws, mock_pyspark):
+def test_migrate_view_with_columns(ws):
     errors = {}
     create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs"
     rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]}
@@ -757,7 +727,7 @@ def get_table_migrator(backend: SqlBackend) -> TablesMigrator:
     return table_migrate
 
 
-def test_revert_migrated_tables_skip_managed(mock_pyspark):
+def test_revert_migrated_tables_skip_managed():
     errors = {}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -776,7 +746,7 @@ def test_revert_migrated_tables_skip_managed(mock_pyspark):
     assert "DROP VIEW IF EXISTS `cat1`.`schema1`.`dest_view1`" in revert_queries
 
 
-def test_revert_migrated_tables_including_managed(mock_pyspark):
+def test_revert_migrated_tables_including_managed():
     errors = {}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -801,7 +771,7 @@ def test_revert_migrated_tables_including_managed(mock_pyspark):
     assert "DROP TABLE IF EXISTS `cat1`.`schema1`.`dest2`" in revert_with_managed_queries
 
 
-def test_no_migrated_tables(ws, mock_pyspark):
+def test_no_migrated_tables(ws):
     errors = {}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -831,7 +801,7 @@ def test_no_migrated_tables(ws, mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_revert_report(ws, capsys, mock_pyspark):
+def test_revert_report(ws, capsys):
     errors = {}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -847,7 +817,7 @@ def test_revert_report(ws, capsys, mock_pyspark):
     assert "- Migrated DBFS Root Tables will be left intact" in captured.out
 
 
-def test_empty_revert_report(ws, mock_pyspark):
+def test_empty_revert_report(ws):
     errors = {}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -874,7 +844,7 @@ def test_empty_revert_report(ws, mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_is_upgraded(ws, mock_pyspark):
+def test_is_upgraded(ws):
     errors = {}
     rows = {
         "SHOW TBLPROPERTIES `schema1`.`table1`": MockBackend.rows("key", "value")["upgrade_to", "fake_dest"],
@@ -1081,11 +1051,10 @@ def test_table_status_seen_tables(caplog):
 GRANTS = MockBackend.rows("principal", "action_type", "catalog", "database", "table", "view")
 
 
-def test_migrate_acls_should_produce_proper_queries(ws, caplog, mock_pyspark) -> None:
+def test_migrate_acls_should_produce_proper_queries(ws, caplog):
     # all grants succeed except for one
     table_crawler = create_autospec(TablesCrawler)
     src = Table('hive_metastore', 'db1_src', 'managed_dbfs', 'TABLE', 'DELTA', "/foo/bar/test")
-    dst = Table('ucx_default', 'db1_dst', 'managed_dbfs', 'MANAGED', 'DELTA')
     table_crawler.snapshot.return_value = [src]
     table_mapping = mock_table_mapping(["managed_dbfs"])
     migration_status_refresher = create_autospec(TableMigrationStatusRefresher)
@@ -1109,7 +1078,7 @@ def test_migrate_acls_should_produce_proper_queries(ws, caplog, mock_pyspark) ->
 
     table_migrate.migrate_tables(what=What.DBFS_ROOT_DELTA)
 
-    migrate_grants.apply.assert_called_with(src, dst)
+    migrate_grants.apply.assert_called_with(src, 'ucx_default.db1_dst.managed_dbfs')
     external_locations.resolve_mount.assert_not_called()
     assert sql_backend.queries == [
         'CREATE TABLE IF NOT EXISTS `ucx_default`.`db1_dst`.`managed_dbfs` DEEP CLONE `hive_metastore`.`db1_src`.`managed_dbfs`;',
@@ -1119,7 +1088,7 @@ def test_migrate_acls_should_produce_proper_queries(ws, caplog, mock_pyspark) ->
     ]
 
 
-def test_migrate_views_should_be_properly_sequenced(ws, mock_pyspark):
+def test_migrate_views_should_be_properly_sequenced(ws):
     errors = {}
     rows = {
         "SHOW CREATE TABLE `hive_metastore`.`db1_src`.`v1_src`": [
@@ -1192,7 +1161,7 @@ def test_migrate_views_should_be_properly_sequenced(ws, mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_table_in_mount_mapping_with_table_owner(mock_pyspark):
+def test_table_in_mount_mapping_with_table_owner():
     client = create_autospec(WorkspaceClient)
     client.tables.get.side_effect = NotFound()
     backend = MockBackend(
@@ -1235,7 +1204,7 @@ def test_table_in_mount_mapping_with_table_owner(mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_table_in_mount_mapping_with_partition_information(mock_pyspark):
+def test_table_in_mount_mapping_with_partition_information():
     client = create_autospec(WorkspaceClient)
     client.tables.get.side_effect = NotFound()
     backend = MockBackend(
@@ -1281,7 +1250,7 @@ def test_table_in_mount_mapping_with_partition_information(mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_migrate_view_failed(ws, caplog, mock_pyspark):
+def test_migrate_view_failed(ws, caplog):
     errors = {"CREATE OR REPLACE VIEW": "error"}
     create = "CREATE OR REPLACE VIEW hive_metastore.db1_src.view_src (a,b) AS SELECT * FROM db1_src.managed_dbfs"
     rows = {"SHOW CREATE TABLE": [{"createtab_stmt": create}]}
@@ -1319,7 +1288,7 @@ def test_migrate_view_failed(ws, caplog, mock_pyspark):
     external_locations.resolve_mount.assert_not_called()
 
 
-def test_migrate_dbfs_root_tables_failed(ws, caplog, mock_pyspark):
+def test_migrate_dbfs_root_tables_failed(ws, caplog):
     errors = {"CREATE TABLE IF NOT EXISTS": "error"}
     backend = MockBackend(fails_on_first=errors, rows={})
     table_crawler = TablesCrawler(backend, "inventory_database")
@@ -1347,7 +1316,7 @@ def test_migrate_dbfs_root_tables_failed(ws, caplog, mock_pyspark):
     )
 
 
-def test_revert_migrated_tables_failed(caplog, mock_pyspark):
+def test_revert_migrated_tables_failed(caplog):
     errors = {"ALTER TABLE": "error"}
     rows = {}
     backend = MockBackend(fails_on_first=errors, rows=rows)
@@ -1356,7 +1325,7 @@ def test_revert_migrated_tables_failed(caplog, mock_pyspark):
     assert "Failed to revert table hive_metastore.test_schema1.test_table1: error" in caplog.text
 
 
-def test_refresh_migration_status_published_remained_tables(caplog, mock_pyspark):
+def test_refresh_migration_status_published_remained_tables(caplog):
     backend = MockBackend()
     table_crawler = create_autospec(TablesCrawler)
     client = mock_workspace_client()
@@ -1514,60 +1483,3 @@ def test_table_migration_status_source_table_unknown() -> None:
     assert owner == "an_admin"
 
     table_ownership.owner_of.assert_not_called()
-
-
-class MockBackendWithGeneralException(MockBackend):
-    """Mock backend that allows raising a general exception.
-
-    Note: we want to raise a Spark AnalysisException, for which we do not have the dependency to raise explicitly.
-    """
-
-    @staticmethod
-    def _api_error_from_message(error_message: str):  # No return type to avoid mypy complains on different return type
-        return Exception(error_message)
-
-
-def test_migrate_tables_handles_table_with_empty_column(caplog) -> None:
-    table_crawler = create_autospec(TablesCrawler)
-    table = Table("hive_metastore", "schema", "table", "MANAGED", "DELTA")
-
-    error_message = (
-        "INVALID_PARAMETER_VALUE: Invalid input: RPC CreateTable Field managedcatalog.ColumnInfo.name: "
-        'At columns.21: name "" is not a valid name`'
-    )
-    query = f"ALTER TABLE {escape_sql_identifier(table.full_name)} SET TBLPROPERTIES ('upgraded_to' = 'catalog.schema.table');"
-    backend = MockBackendWithGeneralException(fails_on_first={query: error_message})
-
-    ws = create_autospec(WorkspaceClient)
-    ws.get_workspace_id.return_value = 123456789
-
-    table_mapping = create_autospec(TableMapping)
-    rule = Rule("workspace", "catalog", "schema", "schema", "table", "table")
-    table_to_migrate = TableToMigrate(table, rule)
-    table_mapping.get_tables_to_migrate.return_value = [table_to_migrate]
-
-    migration_status_refresher = create_autospec(TableMigrationStatusRefresher)
-    migration_status_refresher.get_seen_tables.return_value = {}
-    migration_status_refresher.index.return_value = []
-
-    migrate_grants = create_autospec(MigrateGrants)
-    external_locations = create_autospec(ExternalLocations)
-    table_migrator = TablesMigrator(
-        table_crawler,
-        ws,
-        backend,
-        table_mapping,
-        migration_status_refresher,
-        migrate_grants,
-        external_locations,
-    )
-
-    with caplog.at_level(logging.WARN, logger="databricks.labs.ucx.hive_metastore"):
-        table_migrator.migrate_tables(table.what)
-    assert "failed-to-migrate: Table with empty column name 'hive_metastore.schema.table'" in caplog.messages
-
-    table_crawler.snapshot.assert_not_called()  # Mocking table mapping instead
-    ws.get_workspace_id.assert_not_called()  # Errors before getting here
-    migration_status_refresher.index.assert_not_called()  # Only called when migrating view
-    migrate_grants.apply.assert_not_called()  # Errors before getting here
-    external_locations.resolve_mount.assert_not_called()  # Only called when migrating external table
diff --git a/tests/unit/install/test_install.py b/tests/unit/install/test_install.py
index 2b8087a525..4dda1cc839 100644
--- a/tests/unit/install/test_install.py
+++ b/tests/unit/install/test_install.py
@@ -381,7 +381,7 @@ def test_configure_sets_expected_workspace_configuration_values(
         {
             r".*PRO or SERVERLESS SQL warehouse.*": "1",
             r"Choose how to map the workspace groups.*": "2",  # specify names
-            r"If hive_metastore contains managed table with external.*": "1",
+            r"If hive_metastore contains managed table with external.*": "0",
             r".*": "",
             prompt_question: prompt_answer,
         }
@@ -434,7 +434,7 @@ def test_create_cluster_policy(ws, mock_installation) -> None:
            r".*We have identified one or more cluster.*": "No",
            r".*Choose a cluster policy.*": "0",
            r"Reconciliation threshold, in percentage.*": "5",
-            r"If hive_metastore contains managed table with external.*": "1",
+            r"If hive_metastore contains managed table with external.*": "0",
            r".*": "",
        }
    )
@@ -1673,7 +1673,7 @@ def test_save_config_ext_hms(ws, mock_installation) -> None:
            r"Choose how to map the workspace groups.*": "2",  # specify names
            r"Comma-separated list of databases to migrate.*": "db1,db2",
            r"Reconciliation threshold, in percentage.*": "5",
-            r"If hive_metastore contains managed table with 
external.*": "0", r".*": "", } ) diff --git a/tests/unit/mixins/test_cached_workspace_path.py b/tests/unit/mixins/test_cached_workspace_path.py index cf676f3cb1..4195f632ce 100644 --- a/tests/unit/mixins/test_cached_workspace_path.py +++ b/tests/unit/mixins/test_cached_workspace_path.py @@ -9,7 +9,7 @@ from databricks.sdk.service.workspace import ObjectInfo, ObjectType from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache -from databricks.labs.ucx.source_code.base import decode_with_bom +from databricks.labs.ucx.source_code.base import guess_encoding class _WorkspaceCacheFriend(WorkspaceCache): @@ -19,84 +19,76 @@ def data_cache(self): return self._cache -def test_path_like_returns_cached_instance() -> None: +def test_path_like_returns_cached_instance(): cache = _WorkspaceCacheFriend(mock_workspace_client()) - parent = cache.get_workspace_path("/some/path") + parent = cache.get_path("path") child = parent / "child" _cache = getattr(child, "_cache") assert _cache == cache.data_cache -def test_non_absolute_path_error() -> None: - cache = _WorkspaceCacheFriend(mock_workspace_client()) - with pytest.raises(WorkspaceCache.InvalidWorkspacePath, match="Invalid workspace path; must be absolute"): - _ = cache.get_workspace_path("not/an/absolute/path") - - -def test_iterdir_returns_cached_instances() -> None: +def test_iterdir_returns_cached_instances(): ws = create_autospec(WorkspaceClient) ws.workspace.get_status.return_value = ObjectInfo(object_type=ObjectType.DIRECTORY) - ws.workspace.list.return_value = [ObjectInfo(object_type=ObjectType.FILE, path=s) for s in ("a", "b", "c")] + ws.workspace.list.return_value = list(ObjectInfo(object_type=ObjectType.FILE, path=s) for s in ("a", "b", "c")) cache = _WorkspaceCacheFriend(ws) - parent = cache.get_workspace_path("/a/dir") + parent = cache.get_path("dir") assert parent.is_dir() for child in parent.iterdir(): _cache = getattr(child, "_cache") assert _cache == cache.data_cache -def test_download_is_only_called_once_per_instance() -> None: - ws = create_autospec(WorkspaceClient) +def test_download_is_only_called_once_per_instance(): + ws = mock_workspace_client() ws.workspace.download.side_effect = lambda _, *, format: io.BytesIO("abc".encode()) cache = WorkspaceCache(ws) - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") for _ in range(0, 4): _ = path.read_text() assert ws.workspace.download.call_count == 1 -def test_download_is_only_called_once_across_instances() -> None: +def test_download_is_only_called_once_across_instances(): ws = mock_workspace_client() ws.workspace.download.side_effect = lambda _, *, format: io.BytesIO("abc".encode()) cache = WorkspaceCache(ws) for _ in range(0, 4): - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") _ = path.read_text() assert ws.workspace.download.call_count == 1 -def test_download_is_called_again_after_unlink() -> None: +def test_download_is_called_again_after_unlink(): ws = mock_workspace_client() ws.workspace.download.side_effect = lambda _, *, format: io.BytesIO("abc".encode()) cache = WorkspaceCache(ws) - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") _ = path.read_text() - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") path.unlink() _ = path.read_text() assert ws.workspace.download.call_count == 2 -def test_download_is_called_again_after_rename() -> None: +def test_download_is_called_again_after_rename(): ws = mock_workspace_client() ws.workspace.download.side_effect = lambda 
_, *, format: io.BytesIO("abc".encode()) cache = WorkspaceCache(ws) - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") _ = path.read_text() path.rename("abcd") _ = path.read_text() assert ws.workspace.download.call_count == 3 # rename reads the old content -def test_encoding_is_guessed_after_download() -> None: +def test_encoding_is_guessed_after_download(): ws = mock_workspace_client() ws.workspace.download.side_effect = lambda _, *, format: io.BytesIO("abc".encode()) cache = WorkspaceCache(ws) - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") _ = path.read_text() - # TODO: Figure out what this is supposed to be testing? - with decode_with_bom(path.open("rb")) as f: - assert f.encoding + guess_encoding(path) @pytest.mark.parametrize( @@ -106,11 +98,11 @@ def test_encoding_is_guessed_after_download() -> None: ("rb", io.BytesIO("abc".encode("utf-8-sig"))), ], ) -def test_sequential_read_completes(mode: str, data: io.BytesIO) -> None: +def test_sequential_read_completes(mode, data): ws = mock_workspace_client() ws.workspace.download.side_effect = lambda _, *, format: data cache = WorkspaceCache(ws) - path = cache.get_workspace_path("/a/path") + path = cache.get_path("path") with path.open(mode) as file: count = 0 while _ := file.read(1): diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py index 084be31867..44fabc4b38 100644 --- a/tests/unit/source_code/notebooks/test_sources.py +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -96,7 +96,7 @@ def test_file_linter_lints_non_ascii_encoded_file(migration_index, mock_path_loo def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_lookup) -> None: path = create_autospec(Path) path.suffix = ".py" - path.open.side_effect = FileNotFoundError("No such file or directory: 'test.py'") + path.read_text.side_effect = FileNotFoundError("No such file or directory: 'test.py'") linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path) advices = list(linter.lint()) @@ -109,7 +109,7 @@ def test_file_linter_lints_file_with_missing_file(migration_index, mock_path_loo def test_file_linter_lints_file_with_missing_read_permission(migration_index, mock_path_lookup) -> None: path = create_autospec(Path) path.suffix = ".py" - path.open.side_effect = PermissionError("Permission denied") + path.read_text.side_effect = PermissionError("Permission denied") linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path) advices = list(linter.lint()) diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index cbd2f87394..a7d60d1a0c 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -15,7 +15,6 @@ from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import KnownList from databricks.sdk import WorkspaceClient -from databricks.sdk.errors import NotFound from databricks.sdk.service import compute, jobs, pipelines from databricks.sdk.service.workspace import ExportFormat @@ -265,7 +264,7 @@ def test_workflow_task_container_builds_dependency_graph_for_requirements_txt(mo ws = create_autospec(WorkspaceClient) ws.workspace.download.return_value = io.BytesIO(b"test") - libraries = [compute.Library(requirements="/path/to/requirements.txt")] + libraries = [compute.Library(requirements="requirements.txt")] task = 
jobs.Task(task_key="test", libraries=libraries) workflow_task_container = WorkflowTaskContainer(ws, task, Job()) @@ -286,7 +285,7 @@ def test_workflow_task_container_build_dependency_graph_warns_about_reference_to ws = create_autospec(WorkspaceClient) ws.workspace.download.return_value = io.BytesIO(b"-r other-requirements.txt") - libraries = [compute.Library(requirements="/path/to/requirements.txt")] + libraries = [compute.Library(requirements="requirements.txt")] task = jobs.Task(task_key="test", libraries=libraries) workflow_task_container = WorkflowTaskContainer(ws, task, Job()) @@ -295,7 +294,7 @@ def test_workflow_task_container_build_dependency_graph_warns_about_reference_to workflow_task_container.build_dependency_graph(graph) assert expected_message in caplog.messages - ws.workspace.download.assert_called_once_with("/path/to/requirements.txt", format=ExportFormat.AUTO) + ws.workspace.download.assert_called_once_with("requirements.txt", format=ExportFormat.AUTO) def test_workflow_task_container_build_dependency_graph_warns_about_reference_to_constraints( @@ -306,7 +305,7 @@ def test_workflow_task_container_build_dependency_graph_warns_about_reference_to ws = create_autospec(WorkspaceClient) ws.workspace.download.return_value = io.BytesIO(b"-c constraints.txt") - libraries = [compute.Library(requirements="/path/to/requirements.txt")] + libraries = [compute.Library(requirements="requirements.txt")] task = jobs.Task(task_key="test", libraries=libraries) workflow_task_container = WorkflowTaskContainer(ws, task, Job()) @@ -315,7 +314,7 @@ def test_workflow_task_container_build_dependency_graph_warns_about_reference_to workflow_task_container.build_dependency_graph(graph) assert expected_message in caplog.messages - ws.workspace.download.assert_called_once_with("/path/to/requirements.txt", format=ExportFormat.AUTO) + ws.workspace.download.assert_called_once_with("requirements.txt", format=ExportFormat.AUTO) def test_workflow_task_container_with_existing_cluster_builds_dependency_graph_pytest_pypi_library( @@ -474,13 +473,12 @@ def test_workflow_linter_dlt_pipeline_task(graph) -> None: PipelineLibrary( jar="some.jar", maven=compute.MavenLibrary(coordinates="com.example:example:1.0.0"), - notebook=NotebookLibrary(path="/path/to/test.py"), + notebook=NotebookLibrary(path="test.py"), file=FileLibrary(path="test.txt"), ) ] ), ) - ws.workspace.get_status.side_effect = NotFound("Simulated workspace file not found.") workflow_task_container = WorkflowTaskContainer(ws, task, Job()) problems = workflow_task_container.build_dependency_graph(graph) assert len(problems) == 4 @@ -489,9 +487,8 @@ def test_workflow_linter_dlt_pipeline_task(graph) -> None: def test_xxx(graph) -> None: ws = create_autospec(WorkspaceClient) - ws.workspace.get_status.side_effect = NotFound("Simulated workspace file not found.") notebook_task = jobs.NotebookTask( - notebook_path="/path/to/test", + notebook_path="test", base_parameters={"a": "b", "c": "dbfs:/mnt/foo"}, ) task = jobs.Task( diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 4c8cff52a1..9ae208bf70 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -13,7 +13,7 @@ from databricks.sdk.errors import NotFound from databricks.sdk.errors.platform import BadRequest from databricks.sdk.service import sql -from databricks.sdk.service.catalog import CatalogInfo, ExternalLocationInfo, MetastoreInfo, SchemaInfo +from databricks.sdk.service.catalog import ExternalLocationInfo, MetastoreInfo from databricks.sdk.service.compute import 
ClusterDetails, ClusterSource from databricks.sdk.service.iam import ComplexValue, User from databricks.sdk.service.jobs import Run, RunResultState, RunState @@ -863,7 +863,7 @@ def test_migrate_locations_gcp(ws): @pytest.mark.parametrize("run_as_collection", [False, True]) -def test_create_catalogs_schemas_gets_catalog(run_as_collection, workspace_clients, acc_client) -> None: +def test_create_catalogs_schemas_lists_catalogs(run_as_collection, workspace_clients, acc_client) -> None: if not run_as_collection: workspace_clients = [workspace_clients[0]] for workspace_client in workspace_clients: @@ -873,22 +873,19 @@ def test_create_catalogs_schemas_gets_catalog(run_as_collection, workspace_clien create_catalogs_schemas(workspace_clients[0], prompts, run_as_collection=run_as_collection, a=acc_client) for workspace_client in workspace_clients: - workspace_client.catalogs.get.assert_called() + workspace_client.catalogs.list.assert_called_once() def test_create_catalogs_schemas_handles_existing(ws, caplog) -> None: prompts = MockPrompts({'.*': 's3://test'}) ws.external_locations.list.return_value = [ExternalLocationInfo(url="s3://test")] - ws.catalogs.get.return_value = CatalogInfo(name="test") ws.catalogs.create.side_effect = [BadRequest("Catalog 'test' already exists")] - ws.schemas.get.return_value = SchemaInfo(full_name="test.test") ws.schemas.create.side_effect = [BadRequest("Schema 'test' already exists")] create_catalogs_schemas(ws, prompts, ctx=WorkspaceContext(ws)) + ws.catalogs.list.assert_called_once() assert "Skipping already existing catalog: test" in caplog.messages assert "Skipping already existing schema: test.test" in caplog.messages - ws.catalogs.get.assert_called() - ws.schemas.get.assert_called() def test_cluster_remap(ws, caplog): From d0a957b50ac41554398b177b437a35063fa0b360 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 16:03:11 +0200 Subject: [PATCH 16/17] fix merge issues --- src/databricks/labs/ucx/source_code/jobs.py | 1 - .../hive_metastore/test_catalog_schema.py | 103 +++++++++++++----- .../unit/hive_metastore/test_table_migrate.py | 1 - 3 files changed, 74 insertions(+), 31 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index 4cfc24e8bd..34cf147abd 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -35,7 +35,6 @@ SourceInfo, UsedTable, LineageAtom, - PythonSequentialLinter, ) from databricks.labs.ucx.source_code.directfs_access import ( DirectFsAccessCrawler, diff --git a/tests/integration/hive_metastore/test_catalog_schema.py b/tests/integration/hive_metastore/test_catalog_schema.py index f425105711..6be8dd36aa 100644 --- a/tests/integration/hive_metastore/test_catalog_schema.py +++ b/tests/integration/hive_metastore/test_catalog_schema.py @@ -3,14 +3,15 @@ import pytest from databricks.labs.blueprint.tui import MockPrompts +from databricks.sdk import WorkspaceClient from databricks.sdk.errors import NotFound from databricks.sdk.retries import retried -from databricks.sdk.service.catalog import CatalogInfo +from databricks.sdk.service.catalog import PermissionsList from databricks.sdk.service.compute import DataSecurityMode, AwsAttributes from databricks.sdk.service.catalog import Privilege, SecurableType, PrivilegeAssignment from databricks.sdk.service.iam import PermissionLevel -from databricks.labs.ucx.hive_metastore.grants import GrantsCrawler +from databricks.labs.ucx.hive_metastore.grants import Grant from 
databricks.labs.ucx.hive_metastore.mapping import Rule from ..conftest import get_azure_spark_conf @@ -19,18 +20,55 @@ @retried(on=[NotFound], timeout=timedelta(minutes=2)) -def test_create_ucx_catalog_creates_catalog(ws, runtime_ctx, watchdog_remove_after) -> None: +def test_create_ucx_catalog_creates_catalog(runtime_ctx, watchdog_remove_after) -> None: # Delete catalog created for testing to test the creation of a new catalog runtime_ctx.workspace_client.catalogs.delete(runtime_ctx.ucx_catalog, force=True) prompts = MockPrompts({f"Please provide storage location url for catalog: {runtime_ctx.ucx_catalog}": "metastore"}) + properties = {"RemoveAfter": watchdog_remove_after} - runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties={"RemoveAfter": watchdog_remove_after}) + runtime_ctx.catalog_schema.create_ucx_catalog(prompts, properties=properties) + + catalog_info = runtime_ctx.workspace_client.catalogs.get(runtime_ctx.ucx_catalog) + assert catalog_info.name == runtime_ctx.ucx_catalog + assert catalog_info.properties == properties - @retried(on=[NotFound], timeout=timedelta(seconds=20)) - def get_catalog(name: str) -> CatalogInfo: - return ws.catalogs.get(name) - assert get_catalog(runtime_ctx.ucx_catalog) +@retried(on=[NotFound], timeout=timedelta(minutes=3)) +def test_create_all_catalogs_schemas(ws: WorkspaceClient, runtime_ctx, make_random, watchdog_remove_after) -> None: + """Create one catalog with two schemas mirroring the HIVE metastore schemas.""" + src_schema_1 = runtime_ctx.make_schema(catalog_name="hive_metastore") + src_schema_2 = runtime_ctx.make_schema(catalog_name="hive_metastore") + src_view = runtime_ctx.make_table( + catalog_name=src_schema_1.catalog_name, + schema_name=src_schema_1.name, + ctas="SELECT 2+2 AS four", + view=True, + ) + src_table = runtime_ctx.make_table(catalog_name=src_schema_2.catalog_name, schema_name=src_schema_2.name) + dst_catalog_name = f"ucx-{make_random()}" + rules = [ + Rule("workspace", dst_catalog_name, src_schema_1.name, src_schema_1.name, src_view.name, src_view.name), + Rule("workspace", dst_catalog_name, src_schema_2.name, src_schema_2.name, src_table.name, src_table.name), + ] + runtime_ctx.with_table_mapping_rules(rules) + + mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) + properties = {"RemoveAfter": watchdog_remove_after} + runtime_ctx.catalog_schema.create_all_catalogs_schemas(mock_prompts, properties=properties) + + try: + runtime_ctx.workspace_client.catalogs.get(dst_catalog_name) + except NotFound: + assert False, f"Catalog not created: {dst_catalog_name}" + else: + assert True, f"Catalog created: {dst_catalog_name}" + for dst_schema_full_name in f"{dst_catalog_name}.{src_schema_1.name}", f"{dst_catalog_name}.{src_schema_2.name}": + try: + ws.schemas.get(dst_schema_full_name) + except RuntimeError: + assert False, f"Schema not created: {dst_schema_full_name}" + else: + assert True, f"Schema created: {dst_schema_full_name}" @retried(on=[NotFound], timeout=timedelta(minutes=2)) @@ -40,7 +78,7 @@ def test_create_catalog_schema_with_principal_acl_azure( prepared_principal_acl, make_cluster_permissions, make_cluster, -): +) -> None: if not ws.config.is_azure: pytest.skip("only works in azure test env") ctx, _, schema_name, catalog_name = prepared_principal_acl @@ -66,8 +104,13 @@ def test_create_catalog_schema_with_principal_acl_azure( @retried(on=[NotFound], timeout=timedelta(minutes=3)) def test_create_catalog_schema_with_principal_acl_aws( - ws, make_user, prepared_principal_acl, 
make_cluster_permissions, make_cluster, env_or_skip -): + ws, + make_user, + prepared_principal_acl, + make_cluster_permissions, + make_cluster, + env_or_skip, +) -> None: ctx, _, schema_name, catalog_name = prepared_principal_acl cluster = make_cluster( @@ -107,28 +150,30 @@ def test_create_catalog_schema_with_legacy_hive_metastore_privileges( dst_schema_name = "test" rules = [Rule("workspace", dst_catalog_name, src_schema.name, dst_schema_name, src_table.name, src_table.name)] runtime_ctx.with_table_mapping_rules(rules) - runtime_ctx.with_dummy_resource_permission() - - user_a = make_user() - user_b = make_user() - sql_backend.execute(f"GRANT USAGE ON DATABASE {src_schema.name} TO `{user_a.user_name}`;") - sql_backend.execute(f"GRANT SELECT ON {src_external_table.full_name} TO `{user_b.user_name}`;") - sql_backend.execute(f"ALTER DATABASE {src_schema.name} OWNER TO `{user_b.user_name}`;") - sql_backend.execute(f"ALTER TABLE {src_external_table.full_name} OWNER TO `{user_a.user_name}`;") + schema_owner, table_owner = make_user(), make_user() + grants = [ + Grant(schema_owner.user_name, "USAGE", src_schema.catalog_name, src_schema.name), + Grant(table_owner.user_name, "USAGE", src_table.catalog_name, src_table.schema_name), + Grant(schema_owner.user_name, "OWN", src_schema.catalog_name, src_schema.name), + Grant(table_owner.user_name, "OWN", src_table.catalog_name, src_table.schema_name, src_table.name), + ] + for grant in grants: + for sql in grant.hive_grant_sql(): + runtime_ctx.sql_backend.execute(sql) - # Ensure the view is populated (it's based on the crawled grants) and fetch the content. - GrantsCrawler(runtime_ctx.tables_crawler, runtime_ctx.udfs_crawler).snapshot() - - catalog_schema = runtime_ctx.catalog_schema mock_prompts = MockPrompts({"Please provide storage location url for catalog: *": ""}) - catalog_schema.create_all_catalogs_schemas(mock_prompts) + properties = {"RemoveAfter": watchdog_remove_after} + runtime_ctx.catalog_schema.create_all_catalogs_schemas(mock_prompts, properties=properties) - schema_grants = ws.grants.get(SecurableType.SCHEMA, f"{dst_catalog.name}.{dst_schema.name}") - schema_grant = PrivilegeAssignment(user_a.user_name, [Privilege.USE_SCHEMA]) - assert schema_grant in schema_grants.privilege_assignments - schema_info = ws.schemas.get(f"{dst_schema.full_name}") - assert schema_info.owner == user_b.user_name + @retried(on=[NotFound], timeout=timedelta(seconds=20)) + def get_schema_permissions_list(full_name: str) -> PermissionsList: + return ws.grants.get(SecurableType.SCHEMA, full_name) + + assert ws.schemas.get(f"{dst_catalog_name}.{dst_schema_name}").owner == schema_owner.user_name + schema_grants = get_schema_permissions_list(f"{dst_catalog_name}.{dst_schema_name}") + assert schema_grants.privilege_assignments is not None + assert PrivilegeAssignment(table_owner.user_name, [Privilege.USE_SCHEMA]) in schema_grants.privilege_assignments def test_create_catalog_schema_when_users_group_in_warehouse_acl( diff --git a/tests/unit/hive_metastore/test_table_migrate.py b/tests/unit/hive_metastore/test_table_migrate.py index 889cb5d88d..c4eac65c2e 100644 --- a/tests/unit/hive_metastore/test_table_migrate.py +++ b/tests/unit/hive_metastore/test_table_migrate.py @@ -1,6 +1,5 @@ import datetime import logging -import sys from collections.abc import Generator from itertools import cycle from unittest.mock import create_autospec From 4ee30a08df9e8a437b6eb23dcdc8f3377c70c214 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 16:25:48 +0200 Subject: 
[PATCH 17/17] use existing ownership classes --- src/databricks/labs/ucx/assessment/azure.py | 2 +- .../labs/ucx/assessment/clusters.py | 24 +++++++++++-------- src/databricks/labs/ucx/assessment/jobs.py | 24 ++++++++++--------- .../labs/ucx/sequencing/sequencing.py | 8 +++---- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/azure.py b/src/databricks/labs/ucx/assessment/azure.py index 81c99e784b..68233d958c 100644 --- a/src/databricks/labs/ucx/assessment/azure.py +++ b/src/databricks/labs/ucx/assessment/azure.py @@ -73,7 +73,7 @@ def _get_relevant_service_principals(self) -> list[AzureServicePrincipalInfo]: # list all relevant service principals in jobs all_jobs = list(self._ws.jobs.list(expand_tasks=True)) - all_clusters_by_id = {c.cluster_id: c for c in self._ws.clusters.list()} + all_clusters_by_id = {c.cluster_id: c for c in self._ws.clusters.list() if c.cluster_id} for _job, cluster_config in self._get_cluster_configs_from_all_jobs(all_jobs, all_clusters_by_id): set_service_principals.update(self._get_azure_spn_from_cluster_config(cluster_config)) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index 0e0624d3c2..95c825b04c 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -46,6 +46,18 @@ class ClusterInfo: creator: str | None = None """User-name of the creator of the cluster, if known.""" + @classmethod + def from_cluster_details(cls, details: ClusterDetails): + return ClusterInfo( + cluster_id=details.cluster_id if details.cluster_id else "", + cluster_name=details.cluster_name, + policy_id=details.policy_id, + spark_version=details.spark_version, + creator=details.creator_user_name or None, + success=1, + failures="[]", + ) + class CheckClusterMixin(CheckInitScriptMixin): _ws: WorkspaceClient @@ -152,7 +164,7 @@ def _crawl(self) -> Iterable[ClusterInfo]: all_clusters = list(self._ws.clusters.list()) return list(self._assess_clusters(all_clusters)) - def _assess_clusters(self, all_clusters): + def _assess_clusters(self, all_clusters: Iterable[ClusterDetails]): for cluster in all_clusters: if cluster.cluster_source == ClusterSource.JOB: continue @@ -162,15 +174,7 @@ def _assess_clusters(self, all_clusters): f"Cluster {cluster.cluster_id} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) - cluster_info = ClusterInfo( - cluster_id=cluster.cluster_id if cluster.cluster_id else "", - cluster_name=cluster.cluster_name, - policy_id=cluster.policy_id, - spark_version=cluster.spark_version, - creator=creator, - success=1, - failures="[]", - ) + cluster_info = ClusterInfo.from_cluster_details(cluster) failures = self._check_cluster_failures(cluster, "cluster") if len(failures) > 0: cluster_info.success = 0 diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 667647d967..ed2fac89d3 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -20,6 +20,7 @@ RunType, SparkJarTask, SqlTask, + Job, ) from databricks.labs.ucx.assessment.clusters import CheckClusterMixin @@ -40,6 +41,17 @@ class JobInfo: creator: str | None = None """User-name of the creator of the pipeline, if known.""" + @classmethod + def from_job(cls, job: Job): + job_name = job.settings.name if job.settings and job.settings.name else "Unknown" + return JobInfo( + job_id=str(job.job_id), + success=1, 
+            failures="[]",
+            job_name=job_name,
+            creator=job.creator_user_name or None,
+        )
+
 
 class JobsMixin:
     @classmethod
@@ -124,17 +136,7 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]:
             job_settings = job.settings
             if not job_settings:
                 continue
-            job_name = job_settings.name
-            if not job_name:
-                job_name = "Unknown"
-
-            job_details[job.job_id] = JobInfo(
-                job_id=str(job.job_id),
-                job_name=job_name,
-                creator=creator_user_name,
-                success=1,
-                failures="[]",
-            )
+            job_details[job.job_id] = JobInfo.from_job(job)
         return job_assessment, job_details
 
     def _try_fetch(self) -> Iterable[JobInfo]:
diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py
index 932c791f2c..a873ee1b7e 100644
--- a/src/databricks/labs/ucx/sequencing/sequencing.py
+++ b/src/databricks/labs/ucx/sequencing/sequencing.py
@@ -7,8 +7,8 @@ from databricks.sdk import WorkspaceClient
 from databricks.sdk.service import jobs
 
-from databricks.labs.ucx.assessment.clusters import ClusterDetailsOwnership
-from databricks.labs.ucx.assessment.jobs import JobOwnership
+from databricks.labs.ucx.assessment.clusters import ClusterOwnership, ClusterInfo
+from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo
 from databricks.labs.ucx.framework.owners import AdministratorLocator
 from databricks.labs.ucx.source_code.graph import DependencyGraph
@@ -97,7 +97,7 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode:
             object_type="JOB",
             object_id=str(job.job_id),
             object_name=job_name,
-            object_owner=JobOwnership(self._admin_locator).owner_of(job),
+            object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)),
         )
         self._nodes[job_node.key] = job_node
         if job.settings and job.settings.job_clusters:
@@ -125,7 +125,7 @@ def register_cluster(self, cluster_id: str) -> MigrationNode:
             object_type="CLUSTER",
             object_id=cluster_id,
             object_name=object_name,
-            object_owner=ClusterDetailsOwnership(self._admin_locator).owner_of(details),
+            object_owner=ClusterOwnership(self._admin_locator).owner_of(ClusterInfo.from_cluster_details(details)),
         )
         self._nodes[cluster_node.key] = cluster_node
         # TODO register warehouses and policies