From 95392442b1c6b02992a8ae6fb4da74abbeaff26a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 10:51:55 +0200 Subject: [PATCH 001/106] make simple_dependency_resolver available more broadly --- tests/unit/conftest.py | 15 ++++++++++++++- tests/unit/source_code/conftest.py | 15 --------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2c8cbfd3b2..92e86ec73e 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -10,13 +10,17 @@ from databricks.labs.ucx.hive_metastore import TablesCrawler from databricks.labs.ucx.hive_metastore.tables import FasterTableScanCrawler -from databricks.labs.ucx.source_code.graph import BaseNotebookResolver +from databricks.labs.ucx.source_code.graph import BaseNotebookResolver, DependencyResolver +from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader +from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.path_lookup import PathLookup from databricks.sdk import AccountClient from databricks.sdk.config import Config from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.workflow_task import RuntimeContext +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from . import mock_workspace_client @@ -201,3 +205,12 @@ def mock_backend() -> MockBackend: @pytest.fixture def ws(): return mock_workspace_client() + + +@pytest.fixture +def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolver: + allow_list = KnownList() + library_resolver = PythonLibraryResolver(allow_list) + notebook_resolver = NotebookResolver(NotebookLoader()) + import_resolver = ImportFileResolver(FileLoader(), allow_list) + return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) diff --git a/tests/unit/source_code/conftest.py b/tests/unit/source_code/conftest.py index 6029ce4d82..9c999d92dc 100644 --- a/tests/unit/source_code/conftest.py +++ b/tests/unit/source_code/conftest.py @@ -1,12 +1,6 @@ import pytest from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus -from databricks.labs.ucx.source_code.graph import DependencyResolver -from databricks.labs.ucx.source_code.known import KnownList -from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver -from databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver @pytest.fixture @@ -51,12 +45,3 @@ def extended_test_index(): ), ] ) - - -@pytest.fixture -def simple_dependency_resolver(mock_path_lookup: PathLookup) -> DependencyResolver: - allow_list = KnownList() - library_resolver = PythonLibraryResolver(allow_list) - notebook_resolver = NotebookResolver(NotebookLoader()) - import_resolver = ImportFileResolver(FileLoader(), allow_list) - return DependencyResolver(library_resolver, notebook_resolver, import_resolver, import_resolver, mock_path_lookup) From 1740e538c647a900f7f5f27d534a8707cbc596fa Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 10:54:19 +0200 Subject: [PATCH 002/106] build migration steps for workflow task --- 
.../labs/ucx/sequencing/__init__.py | 0 .../labs/ucx/sequencing/sequencing.py | 145 ++++++++++++++++++ tests/unit/sequencing/__init__.py | 0 tests/unit/sequencing/test_sequencing.py | 22 +++ 4 files changed, 167 insertions(+) create mode 100644 src/databricks/labs/ucx/sequencing/__init__.py create mode 100644 src/databricks/labs/ucx/sequencing/sequencing.py create mode 100644 tests/unit/sequencing/__init__.py create mode 100644 tests/unit/sequencing/test_sequencing.py diff --git a/src/databricks/labs/ucx/sequencing/__init__.py b/src/databricks/labs/ucx/sequencing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py new file mode 100644 index 0000000000..f28400c3cd --- /dev/null +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +import itertools +from collections.abc import Iterable +from dataclasses import dataclass, field + +from databricks.sdk.service import jobs + +from databricks.labs.ucx.source_code.graph import DependencyGraph + + +@dataclass +class MigrationStep: + step_id: int + step_number: int + object_type: str + object_id: str + object_owner: str + required_step_ids: list[int] = field(default_factory=list) + + +@dataclass +class MigrationNode: + last_node_id = 0 + node_id: int + object_type: str + object_id: str + object_owner: str + required_steps: list[MigrationNode] = field(default_factory=list) + + def generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: + # traverse the nodes using a depth-first algorithm + # ultimate leaves have a step number of 1 + # use highest required step number + 1 for this step + highest_step_number = 0 + required_step_ids: list[int] = [] + all_generated_steps: list[Iterable[MigrationStep]] = [] + for required_step in self.required_steps: + step, generated_steps = required_step.generate_steps() + highest_step_number = max(highest_step_number, step.step_number) + required_step_ids.append(step.step_id) + all_generated_steps.append(generated_steps) + all_generated_steps.append([step]) + this_step = MigrationStep( + step_id=self.node_id, + step_number=highest_step_number + 1, + object_type=self.object_type, + object_id=self.object_id, + object_owner=self.object_owner, + required_step_ids=required_step_ids, + ) + return this_step, itertools.chain(*all_generated_steps) + + def find(self, object_type: str, object_id: str) -> MigrationNode | None: + if object_type == self.object_type and object_id == self.object_id: + return self + for step in self.required_steps: + found = step.find(object_type, object_id) + if found: + return found + return None + + +class MigrationSequencer: + + def __init__(self): + self._root = MigrationNode(node_id=0, object_type="ROOT", object_id="ROOT", object_owner="NONE") + + def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: + task_node = self._find_node(object_type="TASK", object_id=task.task_key) + if task_node: + return task_node + job_node = self.register_workflow_job(job) + MigrationNode.last_node_id += 1 + task_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="TASK", object_id=task.task_key, object_owner="NONE" + ) # TODO object_owner + job_node.required_steps.append(task_node) + if task.existing_cluster_id: + cluster_node = self.register_cluster(task.existing_cluster_id) + cluster_node.required_steps.append(task_node) + if job_node not in 
cluster_node.required_steps: + cluster_node.required_steps.append(job_node) + # TODO register dependency graph + return task_node + + def register_workflow_job(self, job: jobs.Job) -> MigrationNode: + job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) + if job_node: + return job_node + MigrationNode.last_node_id += 1 + job_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="JOB", object_id=str(job.job_id), object_owner="NONE" + ) # TODO object_owner + top_level = True + if job.settings and job.settings.job_clusters: + for job_cluster in job.settings.job_clusters: + cluster_node = self.register_job_cluster(job_cluster) + if cluster_node: + top_level = False + cluster_node.required_steps.append(job_node) + if top_level: + self._root.required_steps.append(job_node) + return job_node + + def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: + if cluster.new_cluster: + return None + return self.register_cluster(cluster.job_cluster_key) + + def register_cluster(self, cluster_key: str) -> MigrationNode: + cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) + if cluster_node: + return cluster_node + MigrationNode.last_node_id += 1 + cluster_node = MigrationNode( + node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, object_owner="NONE" + ) # TODO object_owner + # TODO register warehouses and policies + self._root.required_steps.append(cluster_node) + return cluster_node + + def generate_steps(self) -> Iterable[MigrationStep]: + _root_step, generated_steps = self._root.generate_steps() + unique_steps = self._deduplicate_steps(generated_steps) + return self._sorted_steps(unique_steps) + + @staticmethod + def _sorted_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: + # sort by step number, lowest first + return sorted(steps, key=lambda step: step.step_number) + + @staticmethod + def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: + best_steps: dict[int, MigrationStep] = {} + for step in steps: + existing = best_steps.get(step.step_id, None) + # keep the step with the highest step number + if existing and existing.step_number >= step.step_number: + continue + best_steps[step.step_id] = step + return best_steps.values() + + def _find_node(self, object_type: str, object_id: str) -> MigrationNode | None: + return self._root.find(object_type, object_id) diff --git a/tests/unit/sequencing/__init__.py b/tests/unit/sequencing/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py new file mode 100644 index 0000000000..094767fc3e --- /dev/null +++ b/tests/unit/sequencing/test_sequencing.py @@ -0,0 +1,22 @@ +from databricks.sdk.service import jobs + +from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer +from databricks.labs.ucx.source_code.base import CurrentSessionState +from databricks.labs.ucx.source_code.graph import DependencyGraph +from databricks.labs.ucx.source_code.jobs import WorkflowTask + + +def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup): + task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + ws.jobs.get.return_value = job + dependency = WorkflowTask(ws, task, job) + graph = DependencyGraph(dependency, None, 
simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + sequencer = MigrationSequencer() + sequencer.register_workflow_task(task, job, graph) + steps = list(sequencer.generate_steps()) + step = steps[-1] + assert step.object_type == "CLUSTER" + assert step.object_id == "cluster-123" + assert step.step_number == 3 From 186ea87a48e5fb42927dc1000213f45f082c21f7 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 11:54:04 +0200 Subject: [PATCH 003/106] fix pylint warnings --- tests/unit/conftest.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 92e86ec73e..4eee45fca3 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -61,8 +61,10 @@ class CustomIterator: def __init__(self, values): self._values = iter(values) self._has_next = True + self._next_value = None - def hasNext(self): # pylint: disable=invalid-name + # pylint: disable=invalid-name + def hasNext(self): try: self._next_value = next(self._values) self._has_next = True @@ -154,9 +156,11 @@ def inner(cb, **replace) -> RuntimeContext: ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listDatabases.return_value = ( mock_list_databases_iterator ) + # pylint: disable=protected-access ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().listTables.return_value = ( mock_list_tables_iterator ) + # pylint: disable=protected-access ctx.tables_crawler._spark._jsparkSession.sharedState().externalCatalog().getTable.return_value = ( get_table_mock ) @@ -169,8 +173,9 @@ def inner(cb, **replace) -> RuntimeContext: @pytest.fixture def acc_client(): - acc = create_autospec(AccountClient) # pylint: disable=mock-no-usage + acc = create_autospec(AccountClient) acc.config = Config(host="https://accounts.cloud.databricks.com", account_id="123", token="123") + acc.asset_not_called() return acc From f01c4c026bd133630a043709e16df52bfa9b8caa Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:05:36 +0200 Subject: [PATCH 004/106] fix pylint warnings --- tests/unit/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 4eee45fca3..675f2012e6 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -175,7 +175,7 @@ def inner(cb, **replace) -> RuntimeContext: def acc_client(): acc = create_autospec(AccountClient) acc.config = Config(host="https://accounts.cloud.databricks.com", account_id="123", token="123") - acc.asset_not_called() + acc.assert_not_called() return acc From 6613d6e7ff6c876532f406a9d3a5cc35efd538fa Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:06:04 +0200 Subject: [PATCH 005/106] add object name --- .../labs/ucx/sequencing/sequencing.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index f28400c3cd..b12fbcd0b2 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -15,6 +15,7 @@ class MigrationStep: step_number: int object_type: str object_id: str + object_name: str object_owner: str required_step_ids: list[int] = field(default_factory=list) @@ -25,6 +26,7 @@ class MigrationNode: node_id: int object_type: str object_id: str + object_name: str object_owner: str required_steps: list[MigrationNode] = field(default_factory=list) @@ -46,6 +48,7 @@ def 
generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: step_number=highest_step_number + 1, object_type=self.object_type, object_id=self.object_id, + object_name=self.object_name, object_owner=self.object_owner, required_step_ids=required_step_ids, ) @@ -64,16 +67,23 @@ def find(self, object_type: str, object_id: str) -> MigrationNode | None: class MigrationSequencer: def __init__(self): - self._root = MigrationNode(node_id=0, object_type="ROOT", object_id="ROOT", object_owner="NONE") + self._root = MigrationNode( + node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" + ) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: - task_node = self._find_node(object_type="TASK", object_id=task.task_key) + task_id = f"{job.job_id}/{task.task_key}" + task_node = self._find_node(object_type="TASK", object_id=task_id) if task_node: return task_node job_node = self.register_workflow_job(job) MigrationNode.last_node_id += 1 task_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="TASK", object_id=task.task_key, object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="TASK", + object_id=task_id, + object_name=task.task_key, + object_owner="NONE", ) # TODO object_owner job_node.required_steps.append(task_node) if task.existing_cluster_id: @@ -89,8 +99,13 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: if job_node: return job_node MigrationNode.last_node_id += 1 + job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="JOB", object_id=str(job.job_id), object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="JOB", + object_id=str(job.job_id), + object_name=job_name, + object_owner="NONE", ) # TODO object_owner top_level = True if job.settings and job.settings.job_clusters: @@ -114,7 +129,11 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: return cluster_node MigrationNode.last_node_id += 1 cluster_node = MigrationNode( - node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, object_owner="NONE" + node_id=MigrationNode.last_node_id, + object_type="CLUSTER", + object_id=cluster_key, + object_name=cluster_key, + object_owner="NONE", ) # TODO object_owner # TODO register warehouses and policies self._root.required_steps.append(cluster_node) From 0252ed4f29cbf86e226e1fbc8ae0eeca027f5d1b Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:14:11 +0200 Subject: [PATCH 006/106] populate object owner --- src/databricks/labs/ucx/sequencing/sequencing.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index b12fbcd0b2..f66563de80 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -83,8 +83,8 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_type="TASK", object_id=task_id, object_name=task.task_key, - object_owner="NONE", - ) # TODO object_owner + object_owner=job_node.object_owner, # no task owner so use job one + ) job_node.required_steps.append(task_node) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) @@ -105,8 +105,8 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: 
object_type="JOB", object_id=str(job.job_id), object_name=job_name, - object_owner="NONE", - ) # TODO object_owner + object_owner=job.creator_user_name or "", + ) top_level = True if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: From 9f5705a3f3075ad7f497ddbd59572ff2203dd5be Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Wed, 16 Oct 2024 12:48:00 +0200 Subject: [PATCH 007/106] be more defensive --- .../labs/ucx/sequencing/sequencing.py | 17 ++++++++++++----- tests/unit/sequencing/test_sequencing.py | 8 +++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index f66563de80..113b704d30 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -4,6 +4,7 @@ from collections.abc import Iterable from dataclasses import dataclass, field +from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs from databricks.labs.ucx.source_code.graph import DependencyGraph @@ -66,7 +67,8 @@ def find(self, object_type: str, object_id: str) -> MigrationNode | None: class MigrationSequencer: - def __init__(self): + def __init__(self, ws: WorkspaceClient): + self._ws = ws self._root = MigrationNode( node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" ) @@ -83,7 +85,7 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_type="TASK", object_id=task_id, object_name=task.task_key, - object_owner=job_node.object_owner, # no task owner so use job one + object_owner=job_node.object_owner, # no task owner so use job one ) job_node.required_steps.append(task_node) if task.existing_cluster_id: @@ -127,14 +129,17 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) if cluster_node: return cluster_node + details = self._ws.clusters.get(cluster_key) + object_name = details.cluster_name if details and details.cluster_name else cluster_key + object_owner = details.creator_user_name if details and details.creator_user_name else "" MigrationNode.last_node_id += 1 cluster_node = MigrationNode( node_id=MigrationNode.last_node_id, object_type="CLUSTER", object_id=cluster_key, - object_name=cluster_key, - object_owner="NONE", - ) # TODO object_owner + object_name=object_name, + object_owner=object_owner, + ) # TODO register warehouses and policies self._root.required_steps.append(cluster_node) return cluster_node @@ -155,6 +160,8 @@ def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep for step in steps: existing = best_steps.get(step.step_id, None) # keep the step with the highest step number + # TODO this possibly affects the step_number of steps that depend on this one + # but it's probably OK to not be 100% accurate initially if existing and existing.step_number >= step.step_number: continue best_steps[step.step_id] = step diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py index 094767fc3e..fa7271164e 100644 --- a/tests/unit/sequencing/test_sequencing.py +++ b/tests/unit/sequencing/test_sequencing.py @@ -1,4 +1,5 @@ from databricks.sdk.service import jobs +from databricks.sdk.service.compute import ClusterDetails from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer from databricks.labs.ucx.source_code.base import 
CurrentSessionState @@ -7,16 +8,21 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup): + ws.clusters.get.return_value = ClusterDetails(cluster_name="my-cluster", creator_user_name="John Doe") task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) - sequencer = MigrationSequencer() + sequencer = MigrationSequencer(ws) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) step = steps[-1] + assert step.step_id assert step.object_type == "CLUSTER" assert step.object_id == "cluster-123" + assert step.object_name == "my-cluster" + assert step.object_owner == "John Doe" assert step.step_number == 3 + assert len(step.required_step_ids) == 2 From 1cee0ce9ea1f1c78b50f9ecdbe6c351acd3fd3fc Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 10:53:39 +0200 Subject: [PATCH 008/106] move last_node_id to sequencer --- src/databricks/labs/ucx/sequencing/sequencing.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 113b704d30..4037a29697 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -23,7 +23,6 @@ class MigrationStep: @dataclass class MigrationNode: - last_node_id = 0 node_id: int object_type: str object_id: str @@ -69,6 +68,7 @@ class MigrationSequencer: def __init__(self, ws: WorkspaceClient): self._ws = ws + self._last_node_id = 0 self._root = MigrationNode( node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" ) @@ -79,9 +79,9 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende if task_node: return task_node job_node = self.register_workflow_job(job) - MigrationNode.last_node_id += 1 + self._last_node_id += 1 task_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="TASK", object_id=task_id, object_name=task.task_key, @@ -100,10 +100,10 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) if job_node: return job_node - MigrationNode.last_node_id += 1 + self._last_node_id += 1 job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="JOB", object_id=str(job.job_id), object_name=job_name, @@ -132,9 +132,9 @@ def register_cluster(self, cluster_key: str) -> MigrationNode: details = self._ws.clusters.get(cluster_key) object_name = details.cluster_name if details and details.cluster_name else cluster_key object_owner = details.creator_user_name if details and details.creator_user_name else "" - MigrationNode.last_node_id += 1 + self._last_node_id += 1 cluster_node = MigrationNode( - node_id=MigrationNode.last_node_id, + node_id=self._last_node_id, object_type="CLUSTER", object_id=cluster_key, object_name=object_name, From 069b4d9d500fc9448365afb30bb1e930ccbd025a Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 15:15:38 +0200 Subject: [PATCH 009/106] 
cherry-pick changes --- .../labs/ucx/sequencing/sequencing.py | 136 +++++++++--------- 1 file changed, 64 insertions(+), 72 deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 4037a29697..063d0ca854 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -1,8 +1,8 @@ from __future__ import annotations -import itertools +from collections import defaultdict from collections.abc import Iterable -from dataclasses import dataclass, field +from dataclasses import dataclass from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs @@ -18,7 +18,7 @@ class MigrationStep: object_id: str object_name: str object_owner: str - required_step_ids: list[int] = field(default_factory=list) + required_step_ids: list[int] @dataclass @@ -28,40 +28,21 @@ class MigrationNode: object_id: str object_name: str object_owner: str - required_steps: list[MigrationNode] = field(default_factory=list) - - def generate_steps(self) -> tuple[MigrationStep, Iterable[MigrationStep]]: - # traverse the nodes using a depth-first algorithm - # ultimate leaves have a step number of 1 - # use highest required step number + 1 for this step - highest_step_number = 0 - required_step_ids: list[int] = [] - all_generated_steps: list[Iterable[MigrationStep]] = [] - for required_step in self.required_steps: - step, generated_steps = required_step.generate_steps() - highest_step_number = max(highest_step_number, step.step_number) - required_step_ids.append(step.step_id) - all_generated_steps.append(generated_steps) - all_generated_steps.append([step]) - this_step = MigrationStep( + + @property + def key(self) -> tuple[str, str]: + return self.object_type, self.object_id + + def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationStep: + return MigrationStep( step_id=self.node_id, - step_number=highest_step_number + 1, + step_number=step_number, object_type=self.object_type, object_id=self.object_id, object_name=self.object_name, object_owner=self.object_owner, required_step_ids=required_step_ids, ) - return this_step, itertools.chain(*all_generated_steps) - - def find(self, object_type: str, object_id: str) -> MigrationNode | None: - if object_type == self.object_type and object_id == self.object_id: - return self - for step in self.required_steps: - found = step.find(object_type, object_id) - if found: - return found - return None class MigrationSequencer: @@ -69,13 +50,13 @@ class MigrationSequencer: def __init__(self, ws: WorkspaceClient): self._ws = ws self._last_node_id = 0 - self._root = MigrationNode( - node_id=0, object_type="ROOT", object_id="ROOT", object_name="ROOT", object_owner="NONE" - ) + self._nodes: dict[tuple[str, str], MigrationNode] = {} + self._incoming: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) + self._outgoing: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: task_id = f"{job.job_id}/{task.task_key}" - task_node = self._find_node(object_type="TASK", object_id=task_id) + task_node = self._nodes.get(("TASK", task_id), None) if task_node: return task_node job_node = self.register_workflow_job(job) @@ -87,17 +68,22 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_name=task.task_key, object_owner=job_node.object_owner, # no task owner so use job one ) - 
job_node.required_steps.append(task_node) + self._nodes[task_node.key] = task_node + self._incoming[job_node.key].add(task_node.key) + self._outgoing[task_node.key].add(job_node.key) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) - cluster_node.required_steps.append(task_node) - if job_node not in cluster_node.required_steps: - cluster_node.required_steps.append(job_node) + if cluster_node: + self._incoming[cluster_node.key].add(task_node.key) + self._outgoing[task_node.key].add(cluster_node.key) + # also make the cluster dependent on the job + self._incoming[cluster_node.key].add(job_node.key) + self._outgoing[job_node.key].add(cluster_node.key) # TODO register dependency graph return task_node def register_workflow_job(self, job: jobs.Job) -> MigrationNode: - job_node = self._find_node(object_type="JOB", object_id=str(job.job_id)) + job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: return job_node self._last_node_id += 1 @@ -109,15 +95,13 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: object_name=job_name, object_owner=job.creator_user_name or "", ) - top_level = True + self._nodes[job_node.key] = job_node if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: cluster_node = self.register_job_cluster(job_cluster) if cluster_node: - top_level = False - cluster_node.required_steps.append(job_node) - if top_level: - self._root.required_steps.append(job_node) + self._incoming[cluster_node.key].add(job_node.key) + self._outgoing[job_node.key].add(cluster_node.key) return job_node def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: @@ -125,47 +109,55 @@ def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None return None return self.register_cluster(cluster.job_cluster_key) - def register_cluster(self, cluster_key: str) -> MigrationNode: - cluster_node = self._find_node(object_type="CLUSTER", object_id=cluster_key) + def register_cluster(self, cluster_id: str) -> MigrationNode: + cluster_node = self._nodes.get(("CLUSTER", cluster_id), None) if cluster_node: return cluster_node - details = self._ws.clusters.get(cluster_key) - object_name = details.cluster_name if details and details.cluster_name else cluster_key - object_owner = details.creator_user_name if details and details.creator_user_name else "" + details = self._ws.clusters.get(cluster_id) + object_name = details.cluster_name if details and details.cluster_name else cluster_id self._last_node_id += 1 cluster_node = MigrationNode( node_id=self._last_node_id, object_type="CLUSTER", - object_id=cluster_key, + object_id=cluster_id, object_name=object_name, object_owner=object_owner, ) + self._nodes[cluster_node.key] = cluster_node # TODO register warehouses and policies - self._root.required_steps.append(cluster_node) return cluster_node def generate_steps(self) -> Iterable[MigrationStep]: - _root_step, generated_steps = self._root.generate_steps() - unique_steps = self._deduplicate_steps(generated_steps) - return self._sorted_steps(unique_steps) + # algo adapted from Kahn topological sort. The main differences is that + # we want the same step number for all nodes with same dependency depth + # so instead of pushing to a queue, we rebuild it once all leaf nodes are processed + # (these are transient leaf nodes i.e. 
they only become leaf during processing) + incoming_counts = self._populate_incoming_counts() + step_number = 1 + sorted_steps: list[MigrationStep] = [] + while len(incoming_counts) > 0: + leaf_keys = list(self._get_leaf_keys(incoming_counts)) + for leaf_key in leaf_keys: + del incoming_counts[leaf_key] + sorted_steps.append(self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(leaf_key)))) + for dependency_key in self._outgoing[leaf_key]: + incoming_counts[dependency_key] -= 1 + step_number += 1 + return sorted_steps + + def _required_step_ids(self, node_key: tuple[str, str]) -> Iterable[int]: + for leaf_key in self._incoming[node_key]: + yield self._nodes[leaf_key].node_id + + def _populate_incoming_counts(self) -> dict[tuple[str, str], int]: + result = defaultdict(int) + for node_key in self._nodes: + result[node_key] = len(self._incoming[node_key]) + return result @staticmethod - def _sorted_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: - # sort by step number, lowest first - return sorted(steps, key=lambda step: step.step_number) - - @staticmethod - def _deduplicate_steps(steps: Iterable[MigrationStep]) -> Iterable[MigrationStep]: - best_steps: dict[int, MigrationStep] = {} - for step in steps: - existing = best_steps.get(step.step_id, None) - # keep the step with the highest step number - # TODO this possibly affects the step_number of steps that depend on this one - # but it's probably OK to not be 100% accurate initially - if existing and existing.step_number >= step.step_number: + def _get_leaf_keys(incoming_counts: dict[tuple[str, str], int]) -> Iterable[tuple[str, str]]: + for node_key, incoming_count in incoming_counts.items(): + if incoming_count > 0: continue - best_steps[step.step_id] = step - return best_steps.values() - - def _find_node(self, object_type: str, object_id: str) -> MigrationNode | None: - return self._root.find(object_type, object_id) + yield node_key From 2472d357d89182e507265827ef8f6a2621498e80 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 18:30:27 +0200 Subject: [PATCH 010/106] use existing Ownership classes --- .../labs/ucx/assessment/clusters.py | 24 +++++++++++-------- src/databricks/labs/ucx/assessment/jobs.py | 24 ++++++++++--------- .../labs/ucx/sequencing/sequencing.py | 7 ++++-- 3 files changed, 32 insertions(+), 23 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/clusters.py b/src/databricks/labs/ucx/assessment/clusters.py index cdb238a8ba..da6b184404 100644 --- a/src/databricks/labs/ucx/assessment/clusters.py +++ b/src/databricks/labs/ucx/assessment/clusters.py @@ -49,6 +49,18 @@ class ClusterInfo: __id_attributes__: ClassVar[tuple[str, ...]] = ("cluster_id",) + @classmethod + def from_cluster_details(cls, details: ClusterDetails): + return ClusterInfo( + cluster_id=details.cluster_id if details.cluster_id else "", + cluster_name=details.cluster_name, + policy_id=details.policy_id, + spark_version=details.spark_version, + creator=details.creator_user_name or None, + success=1, + failures="[]", + ) + class CheckClusterMixin(CheckInitScriptMixin): _ws: WorkspaceClient @@ -156,7 +168,7 @@ def _crawl(self) -> Iterable[ClusterInfo]: all_clusters = list(self._ws.clusters.list()) return list(self._assess_clusters(all_clusters)) - def _assess_clusters(self, all_clusters): + def _assess_clusters(self, all_clusters: Iterable[ClusterDetails]): for cluster in all_clusters: if cluster.cluster_source == ClusterSource.JOB: continue @@ -166,15 +178,7 @@ def _assess_clusters(self, 
all_clusters): f"Cluster {cluster.cluster_id} have Unknown creator, it means that the original creator " f"has been deleted and should be re-created" ) - cluster_info = ClusterInfo( - cluster_id=cluster.cluster_id if cluster.cluster_id else "", - cluster_name=cluster.cluster_name, - policy_id=cluster.policy_id, - spark_version=cluster.spark_version, - creator=creator, - success=1, - failures="[]", - ) + cluster_info = ClusterInfo.from_cluster_details(cluster) failures = self._check_cluster_failures(cluster, "cluster") if len(failures) > 0: cluster_info.success = 0 diff --git a/src/databricks/labs/ucx/assessment/jobs.py b/src/databricks/labs/ucx/assessment/jobs.py index 1f87b26770..fe23e42fa0 100644 --- a/src/databricks/labs/ucx/assessment/jobs.py +++ b/src/databricks/labs/ucx/assessment/jobs.py @@ -21,6 +21,7 @@ RunType, SparkJarTask, SqlTask, + Job, ) from databricks.labs.ucx.assessment.clusters import CheckClusterMixin @@ -43,6 +44,17 @@ class JobInfo: __id_attributes__: ClassVar[tuple[str, ...]] = ("job_id",) + @classmethod + def from_job(cls, job: Job): + job_name = job.settings.name if job.settings and job.settings.name else "Unknown" + return JobInfo( + job_id=str(job.job_id), + success=1, + failures="[]", + job_name=job_name, + creator=job.creator_user_name or None, + ) + class JobsMixin: @classmethod @@ -127,17 +139,7 @@ def _prepare(all_jobs) -> tuple[dict[int, set[str]], dict[int, JobInfo]]: job_settings = job.settings if not job_settings: continue - job_name = job_settings.name - if not job_name: - job_name = "Unknown" - - job_details[job.job_id] = JobInfo( - job_id=str(job.job_id), - job_name=job_name, - creator=creator_user_name, - success=1, - failures="[]", - ) + job_details[job.job_id] = JobInfo.from_job(job) return job_assessment, job_details def _try_fetch(self) -> Iterable[JobInfo]: diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 063d0ca854..0b1e384297 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -7,6 +7,9 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs +from databricks.labs.ucx.assessment.clusters import ClusterOwnership, ClusterInfo +from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo +from databricks.labs.ucx.framework.owners import AdministratorLocator from databricks.labs.ucx.source_code.graph import DependencyGraph @@ -93,7 +96,7 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: object_type="JOB", object_id=str(job.job_id), object_name=job_name, - object_owner=job.creator_user_name or "", + object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), ) self._nodes[job_node.key] = job_node if job.settings and job.settings.job_clusters: @@ -121,7 +124,7 @@ def register_cluster(self, cluster_id: str) -> MigrationNode: object_type="CLUSTER", object_id=cluster_id, object_name=object_name, - object_owner=object_owner, + object_owner=ClusterOwnership(self._admin_locator).owner_of(ClusterInfo.from_cluster_details(details)), ) self._nodes[cluster_node.key] = cluster_node # TODO register warehouses and policies From 446fab554085b77cfe42d3cdf1397093e3afa875 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Thu, 17 Oct 2024 18:37:57 +0200 Subject: [PATCH 011/106] fix merge issues --- src/databricks/labs/ucx/sequencing/sequencing.py | 3 ++- tests/unit/sequencing/test_sequencing.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 
deletions(-) diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/sequencing/sequencing.py index 0b1e384297..a873ee1b7e 100644 --- a/src/databricks/labs/ucx/sequencing/sequencing.py +++ b/src/databricks/labs/ucx/sequencing/sequencing.py @@ -50,8 +50,9 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt class MigrationSequencer: - def __init__(self, ws: WorkspaceClient): + def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws + self._admin_locator = admin_locator self._last_node_id = 0 self._nodes: dict[tuple[str, str], MigrationNode] = {} self._incoming: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/sequencing/test_sequencing.py index fa7271164e..21d2a612d0 100644 --- a/tests/unit/sequencing/test_sequencing.py +++ b/tests/unit/sequencing/test_sequencing.py @@ -1,6 +1,9 @@ -from databricks.sdk.service import jobs +from unittest.mock import create_autospec + +from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails +from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import DependencyGraph @@ -15,7 +18,10 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_pat ws.jobs.get.return_value = job dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) - sequencer = MigrationSequencer(ws) + admin_finder = create_autospec(AdministratorFinder) + admin_user = iam.User(user_name="John Doe", active=True, roles=[iam.ComplexValue(value="account_admin")]) + admin_finder.find_admin_users.return_value = (admin_user,) + sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) step = steps[-1] From a19f939fb540a53d20545bf35a0e31921e3b9912 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 21 Oct 2024 14:01:20 +0200 Subject: [PATCH 012/106] move package --- .../labs/ucx/{sequencing => assessment}/sequencing.py | 0 src/databricks/labs/ucx/sequencing/__init__.py | 0 tests/unit/{sequencing => assessment}/test_sequencing.py | 3 ++- tests/unit/sequencing/__init__.py | 0 4 files changed, 2 insertions(+), 1 deletion(-) rename src/databricks/labs/ucx/{sequencing => assessment}/sequencing.py (100%) delete mode 100644 src/databricks/labs/ucx/sequencing/__init__.py rename tests/unit/{sequencing => assessment}/test_sequencing.py (94%) delete mode 100644 tests/unit/sequencing/__init__.py diff --git a/src/databricks/labs/ucx/sequencing/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py similarity index 100% rename from src/databricks/labs/ucx/sequencing/sequencing.py rename to src/databricks/labs/ucx/assessment/sequencing.py diff --git a/src/databricks/labs/ucx/sequencing/__init__.py b/src/databricks/labs/ucx/sequencing/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/unit/sequencing/test_sequencing.py b/tests/unit/assessment/test_sequencing.py similarity index 94% rename from tests/unit/sequencing/test_sequencing.py rename to tests/unit/assessment/test_sequencing.py 
index 21d2a612d0..9410706788 100644 --- a/tests/unit/sequencing/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -1,10 +1,11 @@ +import dataclasses from unittest.mock import create_autospec from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails +from databricks.labs.ucx.assessment.sequencing import MigrationSequencer, MigrationStep from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder -from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer from databricks.labs.ucx.source_code.base import CurrentSessionState from databricks.labs.ucx.source_code.graph import DependencyGraph from databricks.labs.ucx.source_code.jobs import WorkflowTask diff --git a/tests/unit/sequencing/__init__.py b/tests/unit/sequencing/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 From 5bd3b675a92c13809e7fa29ad8c650d0aed8318c Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 21 Oct 2024 14:01:38 +0200 Subject: [PATCH 013/106] improve assert style --- tests/unit/assessment/test_sequencing.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 9410706788..e4da351392 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -25,11 +25,14 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_pat sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) - step = steps[-1] - assert step.step_id - assert step.object_type == "CLUSTER" - assert step.object_id == "cluster-123" - assert step.object_name == "my-cluster" - assert step.object_owner == "John Doe" - assert step.step_number == 3 - assert len(step.required_step_ids) == 2 + step = dataclasses.replace(steps[-1], step_id=0) + # we don't know the exact ids of the required steps, se let's zero them + step.required_step_ids = [0 for id in step.required_step_ids] + assert step == MigrationStep( + step_id=0, + step_number = 3, + object_type = "CLUSTER", + object_id = "cluster-123", + object_name = "my-cluster", + object_owner = "John Doe", + required_step_ids = [0, 0]) From 10874638916ed36d090915404a891275e31e5005 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 21 Oct 2024 14:11:34 +0200 Subject: [PATCH 014/106] formatting --- tests/unit/assessment/test_sequencing.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index e4da351392..3d2ff3a3e8 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -25,14 +25,15 @@ def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_pat sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) - step = dataclasses.replace(steps[-1], step_id=0) - # we don't know the exact ids of the required steps, se let's zero them - step.required_step_ids = [0 for id in step.required_step_ids] + step = steps[-1] + # we don't know the ids of the steps, se let's zero them + step = dataclasses.replace(step, step_id=0, required_step_ids=[0] * len(step.required_step_ids)) 
assert step == MigrationStep( step_id=0, - step_number = 3, - object_type = "CLUSTER", - object_id = "cluster-123", - object_name = "my-cluster", - object_owner = "John Doe", - required_step_ids = [0, 0]) + step_number=3, + object_type="CLUSTER", + object_id="cluster-123", + object_name="my-cluster", + object_owner="John Doe", + required_step_ids=[0, 0], + ) From 77af27898d38313dd043d014d266da8aff075371 Mon Sep 17 00:00:00 2001 From: Eric Vergnaud Date: Mon, 21 Oct 2024 16:24:49 +0200 Subject: [PATCH 015/106] make 'incoming' transient and improve comments --- .../labs/ucx/assessment/sequencing.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index a873ee1b7e..59fbfe2832 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -55,7 +55,6 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._admin_locator = admin_locator self._last_node_id = 0 self._nodes: dict[tuple[str, str], MigrationNode] = {} - self._incoming: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) self._outgoing: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: @@ -73,15 +72,12 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_owner=job_node.object_owner, # no task owner so use job one ) self._nodes[task_node.key] = task_node - self._incoming[job_node.key].add(task_node.key) self._outgoing[task_node.key].add(job_node.key) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) if cluster_node: - self._incoming[cluster_node.key].add(task_node.key) self._outgoing[task_node.key].add(cluster_node.key) # also make the cluster dependent on the job - self._incoming[cluster_node.key].add(job_node.key) self._outgoing[job_node.key].add(cluster_node.key) # TODO register dependency graph return task_node @@ -104,7 +100,6 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: for job_cluster in job.settings.job_clusters: cluster_node = self.register_job_cluster(job_cluster) if cluster_node: - self._incoming[cluster_node.key].add(job_node.key) self._outgoing[job_node.key].add(cluster_node.key) return job_node @@ -132,31 +127,47 @@ def register_cluster(self, cluster_id: str) -> MigrationNode: return cluster_node def generate_steps(self) -> Iterable[MigrationStep]: - # algo adapted from Kahn topological sort. The main differences is that - # we want the same step number for all nodes with same dependency depth - # so instead of pushing to a queue, we rebuild it once all leaf nodes are processed - # (these are transient leaf nodes i.e. they only become leaf during processing) - incoming_counts = self._populate_incoming_counts() + """algo adapted from Kahn topological sort. The differences are as follows: + - we want the same step number for all nodes with same dependency depth + so instead of pushing to a queue, we rebuild it once all leaf nodes are processed + (these are transient leaf nodes i.e. 
they only become leaf during processing) + - the inputs do not form a DAG so we need specialized handling of edge cases + (implemented in PR #3009) + """ + # pre-compute incoming keys for best performance of self._required_step_ids + incoming_keys = self._collect_incoming_keys() + incoming_counts = self.compute_incoming_counts(incoming_keys) step_number = 1 sorted_steps: list[MigrationStep] = [] while len(incoming_counts) > 0: leaf_keys = list(self._get_leaf_keys(incoming_counts)) for leaf_key in leaf_keys: del incoming_counts[leaf_key] - sorted_steps.append(self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(leaf_key)))) + sorted_steps.append( + self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(incoming_keys[leaf_key]))) + ) for dependency_key in self._outgoing[leaf_key]: incoming_counts[dependency_key] -= 1 step_number += 1 return sorted_steps - def _required_step_ids(self, node_key: tuple[str, str]) -> Iterable[int]: - for leaf_key in self._incoming[node_key]: - yield self._nodes[leaf_key].node_id + def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]: + result: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) + for source, outgoing in self._outgoing.items(): + for target in outgoing: + result[target].add(source) + return result + + def _required_step_ids(self, required_step_keys: set[tuple[str, str]]) -> Iterable[int]: + for source_key in required_step_keys: + yield self._nodes[source_key].node_id - def _populate_incoming_counts(self) -> dict[tuple[str, str], int]: + def compute_incoming_counts( + self, incoming: dict[tuple[str, str], set[tuple[str, str]]] + ) -> dict[tuple[str, str], int]: result = defaultdict(int) for node_key in self._nodes: - result[node_key] = len(self._incoming[node_key]) + result[node_key] = len(incoming[node_key]) return result @staticmethod From 8478d2678283407fab9f5d308536663192b2fc9c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 10:33:37 +0100 Subject: [PATCH 016/106] Lint unit test --- tests/unit/assessment/test_sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 3d2ff3a3e8..84f9e51ff9 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -11,7 +11,7 @@ from databricks.labs.ucx.source_code.jobs import WorkflowTask -def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup): +def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup) -> None: ws.clusters.get.return_value = ClusterDetails(cluster_name="my-cluster", creator_user_name="John Doe") task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) From 66e7122ea03f6f21460c84f5018ec6c56b3a8113 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 10:57:40 +0100 Subject: [PATCH 017/106] Test all steps in sequence --- tests/unit/assessment/test_sequencing.py | 72 ++++++++++++++++++------ 1 file changed, 55 insertions(+), 17 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 84f9e51ff9..df5991d990 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -1,6 +1,6 @@ -import dataclasses from unittest.mock import create_autospec +from databricks.sdk import WorkspaceClient from 
databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails @@ -11,29 +11,67 @@ from databricks.labs.ucx.source_code.jobs import WorkflowTask -def test_cluster_from_task_has_children(ws, simple_dependency_resolver, mock_path_lookup) -> None: - ws.clusters.get.return_value = ClusterDetails(cluster_name="my-cluster", creator_user_name="John Doe") +def test_sequence_steps_from_job_task_with_cluster( + ws: WorkspaceClient, simple_dependency_resolver, mock_path_lookup +) -> None: + """Sequence a job with a task referencing a cluster. + + Sequence: # TODO: @JCZuurmond: Would expect cluster first. + 1. Task + 2. Job + 3. Cluster + """ task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job - dependency = WorkflowTask(ws, task, job) - graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + + # Match task cluster above on cluster id + def get_cluster(cluster_id: str) -> ClusterDetails: + if cluster_id == "cluster-123": + return ClusterDetails(cluster_id="cluster-123", cluster_name="my-cluster", creator_user_name="John Doe") + raise ValueError(f"Unknown cluster: {cluster_id}") + + ws.clusters.get.side_effect = get_cluster + + # Match cluster creator above on username admin_finder = create_autospec(AdministratorFinder) admin_user = iam.User(user_name="John Doe", active=True, roles=[iam.ComplexValue(value="account_admin")]) admin_finder.find_admin_users.return_value = (admin_user,) + + dependency = WorkflowTask(ws, task, job) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) sequencer.register_workflow_task(task, job, graph) + steps = list(sequencer.generate_steps()) - step = steps[-1] - # we don't know the ids of the steps, se let's zero them - step = dataclasses.replace(step, step_id=0, required_step_ids=[0] * len(step.required_step_ids)) - assert step == MigrationStep( - step_id=0, - step_number=3, - object_type="CLUSTER", - object_id="cluster-123", - object_name="my-cluster", - object_owner="John Doe", - required_step_ids=[0, 0], - ) + + assert steps == [ + MigrationStep( + step_id=2, + step_number=1, + object_type="TASK", + object_id="1234/test-task", + object_name="test-task", + object_owner="John Doe", + required_step_ids=[], + ), + MigrationStep( + step_id=1, + step_number=2, + object_type="JOB", + object_id="1234", + object_name="test-job", + object_owner="John Doe", + required_step_ids=[2], + ), + MigrationStep( + step_id=3, + step_number=3, + object_type="CLUSTER", + object_id="cluster-123", + object_name="my-cluster", + object_owner="John Doe", + required_step_ids=[1, 2], + ), + ] From cfb555efb29ad3cff15db445679d78cc3de38ba3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 10:59:43 +0100 Subject: [PATCH 018/106] Sort step ids --- src/databricks/labs/ucx/assessment/sequencing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 59fbfe2832..d071605ec6 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -143,9 +143,9 @@ def generate_steps(self) -> Iterable[MigrationStep]: 
leaf_keys = list(self._get_leaf_keys(incoming_counts)) for leaf_key in leaf_keys: del incoming_counts[leaf_key] - sorted_steps.append( - self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(incoming_keys[leaf_key]))) - ) + required_step_ids = sorted(self._required_step_ids(incoming_keys[leaf_key])) + step = self._nodes[leaf_key].as_step(step_number, required_step_ids) + sorted_steps.append(step) for dependency_key in self._outgoing[leaf_key]: incoming_counts[dependency_key] -= 1 step_number += 1 From c5a27f46d20cfad526addb21e73591bea5a5065f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 11:15:22 +0100 Subject: [PATCH 019/106] Add docstring for migration sequencing --- src/databricks/labs/ucx/assessment/sequencing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index d071605ec6..e4b346cddb 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -49,6 +49,11 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt class MigrationSequencer: + """Sequence the migration dependencies in order to execute the migration. + + Similar to the other graph logic, we first build the graph by registering dependencies, then we analyse the graph. + Analysing the graph in this case means: computing the migration sequence in `meth:generate_steps`. + """ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws From e7ff13040919f6e4033956a44a4548d4a8aa24f5 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 11:15:57 +0100 Subject: [PATCH 020/106] Format --- tests/unit/assessment/test_sequencing.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index df5991d990..0afdb6289c 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -1,6 +1,5 @@ from unittest.mock import create_autospec -from databricks.sdk import WorkspaceClient from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails @@ -11,9 +10,7 @@ from databricks.labs.ucx.source_code.jobs import WorkflowTask -def test_sequence_steps_from_job_task_with_cluster( - ws: WorkspaceClient, simple_dependency_resolver, mock_path_lookup -) -> None: +def test_sequence_steps_from_job_task_with_cluster(ws, simple_dependency_resolver, mock_path_lookup) -> None: """Sequence a job with a task referencing a cluster. Sequence: # TODO: @JCZuurmond: Would expect cluster first. From ad8f4573f47de796bf8800a6968bb2f7bfdba0b1 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 11:18:39 +0100 Subject: [PATCH 021/106] Update docstring --- src/databricks/labs/ucx/assessment/sequencing.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index e4b346cddb..091493c67d 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -132,12 +132,13 @@ def register_cluster(self, cluster_id: str) -> MigrationNode: return cluster_node def generate_steps(self) -> Iterable[MigrationStep]: - """algo adapted from Kahn topological sort. 
The differences are as follows: - - we want the same step number for all nodes with same dependency depth - so instead of pushing to a queue, we rebuild it once all leaf nodes are processed - (these are transient leaf nodes i.e. they only become leaf during processing) - - the inputs do not form a DAG so we need specialized handling of edge cases - (implemented in PR #3009) + """Generate the migration steps. + + An adapted version of the Kahn topological sort is implemented. The differences are as follows: + - We want the same step number for all nodes with same dependency depth. Therefore, instead of pushing to a + queue, we rebuild it once all leaf nodes are processed (these are transient leaf nodes i.e. they only become + leaf during processing) + - We handle cyclic dependencies (implemented in PR #3009) """ # pre-compute incoming keys for best performance of self._required_step_ids incoming_keys = self._collect_incoming_keys() From 00f4dc161a70ff4412974eed4acb2ae1697c6c24 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 11:35:35 +0100 Subject: [PATCH 022/106] Use priority queue --- .../labs/ucx/assessment/sequencing.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 091493c67d..8934bf14ea 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -1,5 +1,6 @@ from __future__ import annotations +import queue from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass @@ -143,18 +144,19 @@ def generate_steps(self) -> Iterable[MigrationStep]: # pre-compute incoming keys for best performance of self._required_step_ids incoming_keys = self._collect_incoming_keys() incoming_counts = self.compute_incoming_counts(incoming_keys) + key_queue = self._create_key_queue(incoming_counts) step_number = 1 sorted_steps: list[MigrationStep] = [] - while len(incoming_counts) > 0: - leaf_keys = list(self._get_leaf_keys(incoming_counts)) - for leaf_key in leaf_keys: - del incoming_counts[leaf_key] - required_step_ids = sorted(self._required_step_ids(incoming_keys[leaf_key])) - step = self._nodes[leaf_key].as_step(step_number, required_step_ids) - sorted_steps.append(step) - for dependency_key in self._outgoing[leaf_key]: - incoming_counts[dependency_key] -= 1 + while not key_queue.empty(): + _, leaf_key = key_queue.get() + incoming_counts.pop(leaf_key) + required_step_ids = sorted(self._get_required_step_ids(incoming_keys[leaf_key])) + step = self._nodes[leaf_key].as_step(step_number, required_step_ids) + sorted_steps.append(step) + for dependency_key in self._outgoing[leaf_key]: + incoming_counts[dependency_key] -= 1 step_number += 1 + key_queue = self._create_key_queue(incoming_counts) # Reprioritize queue given new incoming counts return sorted_steps def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]: @@ -164,7 +166,7 @@ def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]: result[target].add(source) return result - def _required_step_ids(self, required_step_keys: set[tuple[str, str]]) -> Iterable[int]: + def _get_required_step_ids(self, required_step_keys: set[tuple[str, str]]) -> Iterable[int]: for source_key in required_step_keys: yield self._nodes[source_key].node_id @@ -177,8 +179,13 @@ def compute_incoming_counts( return result @staticmethod - def 
_get_leaf_keys(incoming_counts: dict[tuple[str, str], int]) -> Iterable[tuple[str, str]]: + def _create_key_queue(incoming_counts: dict[tuple[str, str], int]) -> queue.PriorityQueue: + """Create a priority queue given the keys and their incoming counts. + + A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved + first. + """ + priority_queue = queue.PriorityQueue() for node_key, incoming_count in incoming_counts.items(): - if incoming_count > 0: - continue - yield node_key + priority_queue.put((incoming_count, node_key)) + return priority_queue From 08a676a6f4495c9265a6b371450ae9cb40166468 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 12:04:48 +0100 Subject: [PATCH 023/106] Add custom queue --- .../labs/ucx/assessment/sequencing.py | 72 ++++++++++++++++--- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 8934bf14ea..e6edbe2c61 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -1,9 +1,10 @@ from __future__ import annotations -import queue +import heapq from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass +from typing import TypeVar from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs @@ -49,6 +50,55 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt ) +QueueTask = TypeVar("QueueTask") +QueueEntry = list[int, int, QueueTask | str] + + +class PriorityQueue: + """A priority queue supporting to update tasks. + + An adaption from class:queue.Priority to support updating tasks. + + Note: + This implementation does not support threading safety as that is not required. + """ + + _REMOVED = "" # Mark removed items + + def __init__(self): + self._entries: list[QueueEntry] = [] + self._entry_finder: dict[QueueTask, QueueEntry] = {} + self._counter = 0 + + def put(self, priority: int, task: QueueTask) -> None: + """Put or update task in the queue. + + The lowest priority is retrieved from the queue first. + """ + # Update a task by checking if it already exists, then removing it + if task in self._entry_finder: + self._remove(task) + entry = [priority, self._counter, task] + self._entry_finder[task] = entry + heapq.heappush(self._entries, entry) + self._counter += 1 + + def get(self) -> QueueTask | None: + """Gets the tasks with lowest priority.""" + while self._entries: + _, _, task = heapq.heappop(self._entries) + if task != self._REMOVED: + self._remove(task) + # Ignore type because heappop returns Any, while we know it is an QueueEntry + return task # type: ignore + return None + + def _remove(self, task: QueueTask) -> None: + """Remove a task from the queue.""" + entry = self._entry_finder.pop(task) + entry[2] = self._REMOVED + + class MigrationSequencer: """Sequence the migration dependencies in order to execute the migration. 
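# The generate_steps docstring above describes an adapted Kahn topological
# sort; for reference, a minimal sketch of the textbook version on a plain
# adjacency mapping. `outgoing` is a hypothetical stand-in for the sequencer's
# edge map, not its actual attribute.
from collections import deque

def kahn_sort(outgoing: dict[str, set[str]]) -> list[str]:
    # Count incoming edges, including nodes that only ever appear as targets.
    nodes = set(outgoing) | {t for targets in outgoing.values() for t in targets}
    incoming_count = {node: 0 for node in nodes}
    for targets in outgoing.values():
        for target in targets:
            incoming_count[target] += 1
    # Start from the nodes without prerequisites and peel the graph layer by layer.
    ready = deque(sorted(node for node, count in incoming_count.items() if count == 0))
    ordered: list[str] = []
    while ready:
        node = ready.popleft()
        ordered.append(node)
        for target in sorted(outgoing.get(node, set())):
            incoming_count[target] -= 1
            if incoming_count[target] == 0:
                ready.append(target)
    return ordered  # shorter than `nodes` when the graph contains a cycle

assert kahn_sort({"task": {"job"}, "job": {"cluster"}}) == ["task", "job", "cluster"]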
@@ -145,18 +195,20 @@ def generate_steps(self) -> Iterable[MigrationStep]: incoming_keys = self._collect_incoming_keys() incoming_counts = self.compute_incoming_counts(incoming_keys) key_queue = self._create_key_queue(incoming_counts) + key = key_queue.get() step_number = 1 sorted_steps: list[MigrationStep] = [] - while not key_queue.empty(): - _, leaf_key = key_queue.get() - incoming_counts.pop(leaf_key) - required_step_ids = sorted(self._get_required_step_ids(incoming_keys[leaf_key])) - step = self._nodes[leaf_key].as_step(step_number, required_step_ids) + while key is not None: + required_step_ids = sorted(self._get_required_step_ids(incoming_keys[key])) + step = self._nodes[key].as_step(step_number, required_step_ids) sorted_steps.append(step) - for dependency_key in self._outgoing[leaf_key]: + # Update the + for dependency_key in self._outgoing[key]: incoming_counts[dependency_key] -= 1 + key_queue.put(incoming_counts[dependency_key], dependency_key) step_number += 1 key_queue = self._create_key_queue(incoming_counts) # Reprioritize queue given new incoming counts + key = key_queue.get() return sorted_steps def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]: @@ -179,13 +231,13 @@ def compute_incoming_counts( return result @staticmethod - def _create_key_queue(incoming_counts: dict[tuple[str, str], int]) -> queue.PriorityQueue: + def _create_key_queue(incoming_counts: dict[tuple[str, str], int]) -> PriorityQueue: """Create a priority queue given the keys and their incoming counts. A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved first. """ - priority_queue = queue.PriorityQueue() + priority_queue = PriorityQueue() for node_key, incoming_count in incoming_counts.items(): - priority_queue.put((incoming_count, node_key)) + priority_queue.put(incoming_count, node_key) return priority_queue From 29555e3214cdef4c9f80c19e85d89945a2eda390 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 12:51:47 +0100 Subject: [PATCH 024/106] Add source --- src/databricks/labs/ucx/assessment/sequencing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index e6edbe2c61..33655758be 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -61,6 +61,9 @@ class PriorityQueue: Note: This implementation does not support threading safety as that is not required. + + Source: + https://docs.python.org/3/library/heapq.html """ _REMOVED = "" # Mark removed items From 7c9e770b555e2e2fa4c0cd7a100b8d2e2780e15f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:04:02 +0100 Subject: [PATCH 025/106] Fix implementation of custom queue --- .../labs/ucx/assessment/sequencing.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 33655758be..c9493050de 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -67,6 +67,7 @@ class PriorityQueue: """ _REMOVED = "" # Mark removed items + _UPDATED = "" # Mark updated items def __init__(self): self._entries: list[QueueEntry] = [] @@ -78,9 +79,8 @@ def put(self, priority: int, task: QueueTask) -> None: The lowest priority is retrieved from the queue first. 
""" - # Update a task by checking if it already exists, then removing it if task in self._entry_finder: - self._remove(task) + raise KeyError(f"Use `:meth:update` to update existing task: {task}") entry = [priority, self._counter, task] self._entry_finder[task] = entry heapq.heappush(self._entries, entry) @@ -90,10 +90,11 @@ def get(self) -> QueueTask | None: """Gets the tasks with lowest priority.""" while self._entries: _, _, task = heapq.heappop(self._entries) - if task != self._REMOVED: - self._remove(task) - # Ignore type because heappop returns Any, while we know it is an QueueEntry - return task # type: ignore + if task in (self._REMOVED, self._UPDATED): + continue + self._remove(task) + # Ignore type because heappop returns Any, while we know it is an QueueEntry + return task # type: ignore return None def _remove(self, task: QueueTask) -> None: @@ -101,6 +102,15 @@ def _remove(self, task: QueueTask) -> None: entry = self._entry_finder.pop(task) entry[2] = self._REMOVED + def update(self, priority: int, task: QueueTask) -> None: + """Update a task in the queue.""" + entry = self._entry_finder.pop(task) + if entry is None: + raise KeyError(f"Cannot update unknown task: {task}") + if entry[2] != self._REMOVED: # Do not update REMOVED tasks + entry[2] = self._UPDATED + self.put(priority, task) + class MigrationSequencer: """Sequence the migration dependencies in order to execute the migration. @@ -205,12 +215,11 @@ def generate_steps(self) -> Iterable[MigrationStep]: required_step_ids = sorted(self._get_required_step_ids(incoming_keys[key])) step = self._nodes[key].as_step(step_number, required_step_ids) sorted_steps.append(step) - # Update the + # Update queue priorities for dependency_key in self._outgoing[key]: incoming_counts[dependency_key] -= 1 - key_queue.put(incoming_counts[dependency_key], dependency_key) + key_queue.update(incoming_counts[dependency_key], dependency_key) step_number += 1 - key_queue = self._create_key_queue(incoming_counts) # Reprioritize queue given new incoming counts key = key_queue.get() return sorted_steps From e2be4a3303fee6ac10099cad347aa2ce1e6197bd Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:04:58 +0100 Subject: [PATCH 026/106] Update docs --- src/databricks/labs/ucx/assessment/sequencing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index c9493050de..e0e77d7c48 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -63,7 +63,8 @@ class PriorityQueue: This implementation does not support threading safety as that is not required. Source: - https://docs.python.org/3/library/heapq.html + See https://docs.python.org/3/library/heapq.html#priority-queue-implementation-notes on the changes below + to handle priority changes in the task. 
""" _REMOVED = "" # Mark removed items From 652d173b4d8fa7f1fc7952cb41ed9439f6a12aae Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:06:09 +0100 Subject: [PATCH 027/106] Comment counter --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index e0e77d7c48..834364d52c 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -73,7 +73,7 @@ class PriorityQueue: def __init__(self): self._entries: list[QueueEntry] = [] self._entry_finder: dict[QueueTask, QueueEntry] = {} - self._counter = 0 + self._counter = 0 # Tiebreaker with equal priorities, then "first in, first out" def put(self, priority: int, task: QueueTask) -> None: """Put or update task in the queue. From d2505af5d86e9469596766d452efba44c7dc4c3a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:14:29 +0100 Subject: [PATCH 028/106] Add typehint for migration node id --- src/databricks/labs/ucx/assessment/sequencing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 834364d52c..c968d42d8e 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -26,6 +26,9 @@ class MigrationStep: required_step_ids: list[int] +MigrationNodeKey = tuple[str, str] + + @dataclass class MigrationNode: node_id: int @@ -35,7 +38,7 @@ class MigrationNode: object_owner: str @property - def key(self) -> tuple[str, str]: + def key(self) -> MigrationNodeKey: return self.object_type, self.object_id def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationStep: @@ -124,8 +127,8 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws self._admin_locator = admin_locator self._last_node_id = 0 - self._nodes: dict[tuple[str, str], MigrationNode] = {} - self._outgoing: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) + self._nodes: dict[MigrationNodeKey, MigrationNode] = {} + self._outgoing: dict[MigrationNodeKey, set[MigrationNodeKey]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: task_id = f"{job.job_id}/{task.task_key}" From 4aed3da0dc53ef403b77dc1a24b3427b9a4f5a0c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:18:41 +0100 Subject: [PATCH 029/106] Add docstrings to MigrationNodes --- .../labs/ucx/assessment/sequencing.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index c968d42d8e..7ec14967e1 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -3,7 +3,7 @@ import heapq from collections import defaultdict from collections.abc import Iterable -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TypeVar from databricks.sdk import WorkspaceClient @@ -31,11 +31,20 @@ class MigrationStep: @dataclass class MigrationNode: - node_id: int + node_id: int = field(compare=False) + """Globally unique id.""" + object_type: str + """Object type. 
Together with `attr:object_id` a unique identifier.""" + object_id: str - object_name: str - object_owner: str + """Object id. Together with `attr:object_id` a unique identifier.""" + + object_name: str = field(compare=False) + """Object name, more human friendly than `attr:object_id`.""" + + object_owner: str = field(compare=False) + """Object owner.""" @property def key(self) -> MigrationNodeKey: From 183eba5adde2fb47e74b43e480fc2e04624c4a97 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:19:06 +0100 Subject: [PATCH 030/106] Add TODO --- src/databricks/labs/ucx/assessment/sequencing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 7ec14967e1..79fa865dec 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -31,6 +31,7 @@ class MigrationStep: @dataclass class MigrationNode: + # TODO: @JCZuurmond the prefixes look a bit redundant node_id: int = field(compare=False) """Globally unique id.""" From ace7ef245fa5e1df872148e9c28dcb84d410266b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:19:39 +0100 Subject: [PATCH 031/106] Add docstrings to methods --- src/databricks/labs/ucx/assessment/sequencing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 79fa865dec..3fe7fa0de5 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -49,9 +49,11 @@ class MigrationNode: @property def key(self) -> MigrationNodeKey: + """Unique identifier of the node.""" return self.object_type, self.object_id def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationStep: + """Convert to class:MigrationStep.""" return MigrationStep( step_id=self.node_id, step_number=step_number, From ee42ec3ac9eefbb1af95902d4cc28b0ffbdae514 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:21:40 +0100 Subject: [PATCH 032/106] Add docstrings on MigrationStep attributes --- src/databricks/labs/ucx/assessment/sequencing.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 3fe7fa0de5..cb4e4189c8 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -18,12 +18,25 @@ @dataclass class MigrationStep: step_id: int + """Globally unique id.""" + step_number: int + """The position in the migration sequence.""" + object_type: str + """Object type. Together with `attr:object_id` a unique identifier.""" + object_id: str + """Object id. 
Together with `attr:object_id` a unique identifier.""" + object_name: str + """Object name, more human friendly than `attr:object_id`.""" + object_owner: str + """Object owner.""" + required_step_ids: list[int] + """The step ids that should be completed before this step is started.""" MigrationNodeKey = tuple[str, str] From 3497b1cda7eb899b99bd5b605df5ca84db530da6 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:36:23 +0100 Subject: [PATCH 033/106] Make types more consistent --- .../labs/ucx/assessment/sequencing.py | 47 +++++++++---------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index cb4e4189c8..e459bd4ca0 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -79,7 +79,7 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt QueueTask = TypeVar("QueueTask") -QueueEntry = list[int, int, QueueTask | str] +QueueEntry = list[int, int, QueueTask | str] # type: ignore class PriorityQueue: @@ -153,7 +153,7 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._admin_locator = admin_locator self._last_node_id = 0 self._nodes: dict[MigrationNodeKey, MigrationNode] = {} - self._outgoing: dict[MigrationNodeKey, set[MigrationNodeKey]] = defaultdict(set) + self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: task_id = f"{job.job_id}/{task.task_key}" @@ -170,13 +170,13 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende object_owner=job_node.object_owner, # no task owner so use job one ) self._nodes[task_node.key] = task_node - self._outgoing[task_node.key].add(job_node.key) + self._outgoing[task_node.key].add(job_node) if task.existing_cluster_id: cluster_node = self.register_cluster(task.existing_cluster_id) if cluster_node: - self._outgoing[task_node.key].add(cluster_node.key) + self._outgoing[task_node.key].add(cluster_node) # also make the cluster dependent on the job - self._outgoing[job_node.key].add(cluster_node.key) + self._outgoing[job_node.key].add(cluster_node) # TODO register dependency graph return task_node @@ -198,7 +198,7 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: for job_cluster in job.settings.job_clusters: cluster_node = self.register_job_cluster(job_cluster) if cluster_node: - self._outgoing[job_node.key].add(cluster_node.key) + self._outgoing[job_node.key].add(cluster_node) return job_node def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: @@ -234,40 +234,35 @@ def generate_steps(self) -> Iterable[MigrationStep]: - We handle cyclic dependencies (implemented in PR #3009) """ # pre-compute incoming keys for best performance of self._required_step_ids - incoming_keys = self._collect_incoming_keys() - incoming_counts = self.compute_incoming_counts(incoming_keys) + incoming = self._invert_outgoing_to_incoming() + incoming_counts = self._compute_incoming_counts(incoming) key_queue = self._create_key_queue(incoming_counts) key = key_queue.get() step_number = 1 sorted_steps: list[MigrationStep] = [] while key is not None: - required_step_ids = sorted(self._get_required_step_ids(incoming_keys[key])) - step = self._nodes[key].as_step(step_number, required_step_ids) + step = 
self._nodes[key].as_step(step_number, sorted(n.node_id for n in incoming[key])) sorted_steps.append(step) # Update queue priorities - for dependency_key in self._outgoing[key]: - incoming_counts[dependency_key] -= 1 - key_queue.update(incoming_counts[dependency_key], dependency_key) + for dependency in self._outgoing[key]: + incoming_counts[dependency.key] -= 1 + key_queue.update(incoming_counts[dependency.key], dependency) step_number += 1 key = key_queue.get() return sorted_steps - def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]: - result: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set) - for source, outgoing in self._outgoing.items(): - for target in outgoing: - result[target].add(source) + def _invert_outgoing_to_incoming(self) -> dict[MigrationNodeKey, set[MigrationNode]]: + result: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) + for node_key, outgoing_nodes in self._outgoing.items(): + for target in outgoing_nodes: + result[target.key].add(self._nodes[node_key]) return result - def _get_required_step_ids(self, required_step_keys: set[tuple[str, str]]) -> Iterable[int]: - for source_key in required_step_keys: - yield self._nodes[source_key].node_id - - def compute_incoming_counts( - self, incoming: dict[tuple[str, str], set[tuple[str, str]]] - ) -> dict[tuple[str, str], int]: + def _compute_incoming_counts( + self, incoming: dict[MigrationNodeKey, set[MigrationNode]] + ) -> dict[MigrationNodeKey, int]: result = defaultdict(int) - for node_key in self._nodes: + for node_key in self._nodes.keys(): result[node_key] = len(incoming[node_key]) return result From f2324e82fc59ea310528f5c95beda85b0048a80f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:41:04 +0100 Subject: [PATCH 034/106] Make migration node frozen --- .../labs/ucx/assessment/sequencing.py | 29 +++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index e459bd4ca0..fa102fc440 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -42,7 +42,7 @@ class MigrationStep: MigrationNodeKey = tuple[str, str] -@dataclass +@dataclass(frozen=True) class MigrationNode: # TODO: @JCZuurmond the prefixes look a bit redundant node_id: int = field(compare=False) @@ -236,20 +236,20 @@ def generate_steps(self) -> Iterable[MigrationStep]: # pre-compute incoming keys for best performance of self._required_step_ids incoming = self._invert_outgoing_to_incoming() incoming_counts = self._compute_incoming_counts(incoming) - key_queue = self._create_key_queue(incoming_counts) - key = key_queue.get() + queue = self._create_node_queue(incoming_counts) + node = queue.get() step_number = 1 - sorted_steps: list[MigrationStep] = [] - while key is not None: - step = self._nodes[key].as_step(step_number, sorted(n.node_id for n in incoming[key])) - sorted_steps.append(step) + ordered_steps: list[MigrationStep] = [] + while node is not None: + step = node.as_step(step_number, sorted(n.node_id for n in incoming[node.key])) + ordered_steps.append(step) # Update queue priorities - for dependency in self._outgoing[key]: + for dependency in self._outgoing[node.key]: incoming_counts[dependency.key] -= 1 - key_queue.update(incoming_counts[dependency.key], dependency) + queue.update(incoming_counts[dependency.key], dependency) step_number += 1 - key = key_queue.get() - return sorted_steps + node = 
queue.get() + return ordered_steps def _invert_outgoing_to_incoming(self) -> dict[MigrationNodeKey, set[MigrationNode]]: result: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) @@ -266,14 +266,13 @@ def _compute_incoming_counts( result[node_key] = len(incoming[node_key]) return result - @staticmethod - def _create_key_queue(incoming_counts: dict[tuple[str, str], int]) -> PriorityQueue: - """Create a priority queue given the keys and their incoming counts. + def _create_node_queue(self, incoming_counts: dict[MigrationNodeKey, int]) -> PriorityQueue: + """Create a priority queue for their nodes using the incoming count as priority. A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved first. """ priority_queue = PriorityQueue() for node_key, incoming_count in incoming_counts.items(): - priority_queue.put(incoming_count, node_key) + priority_queue.put(incoming_count, self._nodes[node_key]) return priority_queue From 7fcaaee91026b376274eec3eaf080d6fc778b79d Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:45:18 +0100 Subject: [PATCH 035/106] Add generic to priorityqueue --- src/databricks/labs/ucx/assessment/sequencing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index fa102fc440..1c5e59796e 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -4,7 +4,7 @@ from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass, field -from typing import TypeVar +from typing import Generic, TypeVar from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs @@ -82,7 +82,7 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt QueueEntry = list[int, int, QueueTask | str] # type: ignore -class PriorityQueue: +class PriorityQueue(Generic[QueueTask]): """A priority queue supporting to update tasks. An adaption from class:queue.Priority to support updating tasks. @@ -272,7 +272,7 @@ def _create_node_queue(self, incoming_counts: dict[MigrationNodeKey, int]) -> Pr A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved first. 
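# The frozen=True change above, together with the earlier field(compare=False)
# markers, is what lets nodes sit directly in the sets and queues used here:
# equality and hashing collapse to the (object_type, object_id) key. A cut-down
# stand-in, not the real MigrationNode, to show the effect.
from dataclasses import dataclass, field

@dataclass(frozen=True)
class Node:
    node_id: int = field(compare=False)
    object_type: str
    object_id: str
    object_name: str = field(compare=False)

a = Node(1, "CLUSTER", "cluster-123", "my-cluster")
b = Node(2, "CLUSTER", "cluster-123", "renamed")
assert a == b            # compared on object_type + object_id only
assert len({a, b}) == 1  # frozen => hashable, so usable as set members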
""" - priority_queue = PriorityQueue() + priority_queue: PriorityQueue[MigrationNode] = PriorityQueue() for node_key, incoming_count in incoming_counts.items(): priority_queue.put(incoming_count, self._nodes[node_key]) return priority_queue From ad34395bb9988e9dda0146d01ad73ffafbe822aa Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:50:15 +0100 Subject: [PATCH 036/106] Fix PriorityQueue type hints --- .../labs/ucx/assessment/sequencing.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 1c5e59796e..be60f2e7a3 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -4,7 +4,6 @@ from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass, field -from typing import Generic, TypeVar from databricks.sdk import WorkspaceClient from databricks.sdk.service import jobs @@ -78,11 +77,13 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt ) -QueueTask = TypeVar("QueueTask") -QueueEntry = list[int, int, QueueTask | str] # type: ignore +# We expect `tuple[int, int, MigrationNode | str]` +# for `[priority, counter, MigrationNode | PriorityQueue._REMOVED | PriorityQueue_UPDATED]` +# but we use list for the required mutability +QueueEntry = list[int | MigrationNode | str] -class PriorityQueue(Generic[QueueTask]): +class PriorityQueue: """A priority queue supporting to update tasks. An adaption from class:queue.Priority to support updating tasks. @@ -100,38 +101,39 @@ class PriorityQueue(Generic[QueueTask]): def __init__(self): self._entries: list[QueueEntry] = [] - self._entry_finder: dict[QueueTask, QueueEntry] = {} + self._entry_finder: dict[MigrationNode, QueueEntry] = {} self._counter = 0 # Tiebreaker with equal priorities, then "first in, first out" - def put(self, priority: int, task: QueueTask) -> None: + def put(self, priority: int, task: MigrationNode) -> None: """Put or update task in the queue. The lowest priority is retrieved from the queue first. """ if task in self._entry_finder: raise KeyError(f"Use `:meth:update` to update existing task: {task}") - entry = [priority, self._counter, task] + entry: QueueEntry = [priority, self._counter, task] self._entry_finder[task] = entry heapq.heappush(self._entries, entry) self._counter += 1 - def get(self) -> QueueTask | None: + def get(self) -> MigrationNode | None: """Gets the tasks with lowest priority.""" while self._entries: _, _, task = heapq.heappop(self._entries) if task in (self._REMOVED, self._UPDATED): continue + assert isinstance(task, MigrationNode) self._remove(task) # Ignore type because heappop returns Any, while we know it is an QueueEntry - return task # type: ignore + return task return None - def _remove(self, task: QueueTask) -> None: + def _remove(self, task: MigrationNode) -> None: """Remove a task from the queue.""" entry = self._entry_finder.pop(task) entry[2] = self._REMOVED - def update(self, priority: int, task: QueueTask) -> None: + def update(self, priority: int, task: MigrationNode) -> None: """Update a task in the queue.""" entry = self._entry_finder.pop(task) if entry is None: @@ -272,7 +274,7 @@ def _create_node_queue(self, incoming_counts: dict[MigrationNodeKey, int]) -> Pr A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved first. 
""" - priority_queue: PriorityQueue[MigrationNode] = PriorityQueue() + priority_queue = PriorityQueue() for node_key, incoming_count in incoming_counts.items(): priority_queue.put(incoming_count, self._nodes[node_key]) return priority_queue From c04888090dea04f5c1b385ce979ff04f93920888 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:53:23 +0100 Subject: [PATCH 037/106] Update docs --- src/databricks/labs/ucx/assessment/sequencing.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index be60f2e7a3..59cf842997 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -84,16 +84,14 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt class PriorityQueue: - """A priority queue supporting to update tasks. - - An adaption from class:queue.Priority to support updating tasks. + """A migration node priority queue. Note: This implementation does not support threading safety as that is not required. Source: See https://docs.python.org/3/library/heapq.html#priority-queue-implementation-notes on the changes below - to handle priority changes in the task. + to handle priority changes in the task. Also, the _UPDATED marker is introduced to avoid updating removed nodes. """ _REMOVED = "" # Mark removed items From 4ba7408a9e64f77cbbaaf1e3f53e212392e5afe0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 13:55:20 +0100 Subject: [PATCH 038/106] Resolve TODO --- src/databricks/labs/ucx/assessment/sequencing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 59cf842997..2449b17d61 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -43,7 +43,6 @@ class MigrationStep: @dataclass(frozen=True) class MigrationNode: - # TODO: @JCZuurmond the prefixes look a bit redundant node_id: int = field(compare=False) """Globally unique id.""" From 9410c84bd183a923e5a66f2d291ca00bc23a9264 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 14:07:53 +0100 Subject: [PATCH 039/106] Add seen migration node set --- .../labs/ucx/assessment/sequencing.py | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 2449b17d61..d65b798bf5 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -234,18 +234,19 @@ def generate_steps(self) -> Iterable[MigrationStep]: """ # pre-compute incoming keys for best performance of self._required_step_ids incoming = self._invert_outgoing_to_incoming() - incoming_counts = self._compute_incoming_counts(incoming) - queue = self._create_node_queue(incoming_counts) + queue = self._create_node_queue(incoming) + seen = set[MigrationNode]() node = queue.get() step_number = 1 ordered_steps: list[MigrationStep] = [] while node is not None: step = node.as_step(step_number, sorted(n.node_id for n in incoming[node.key])) ordered_steps.append(step) - # Update queue priorities + seen.add(node) + # Update the queue priority as if the migration step was completed for dependency in self._outgoing[node.key]: - incoming_counts[dependency.key] -= 1 - 
queue.update(incoming_counts[dependency.key], dependency) + priority = len(incoming[dependency.key] - seen) + queue.update(priority, dependency) step_number += 1 node = queue.get() return ordered_steps @@ -257,21 +258,13 @@ def _invert_outgoing_to_incoming(self) -> dict[MigrationNodeKey, set[MigrationNo result[target.key].add(self._nodes[node_key]) return result - def _compute_incoming_counts( - self, incoming: dict[MigrationNodeKey, set[MigrationNode]] - ) -> dict[MigrationNodeKey, int]: - result = defaultdict(int) - for node_key in self._nodes.keys(): - result[node_key] = len(incoming[node_key]) - return result - - def _create_node_queue(self, incoming_counts: dict[MigrationNodeKey, int]) -> PriorityQueue: + def _create_node_queue(self, incoming: dict[MigrationNodeKey, set[MigrationNode]]) -> PriorityQueue: """Create a priority queue for their nodes using the incoming count as priority. A lower number means it is pulled from the queue first, i.e. the key with the lowest number of keys is retrieved first. """ priority_queue = PriorityQueue() - for node_key, incoming_count in incoming_counts.items(): - priority_queue.put(incoming_count, self._nodes[node_key]) + for node_key, incoming_nodes in incoming.items(): + priority_queue.put(len(incoming_nodes), self._nodes[node_key]) return priority_queue From 08dfe32148d7e5eeef9d03bba211ff7526f0ee30 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 14:08:35 +0100 Subject: [PATCH 040/106] Use ordered steps to compute step number --- src/databricks/labs/ucx/assessment/sequencing.py | 4 +--- tests/unit/assessment/test_sequencing.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index d65b798bf5..321b01ddce 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -237,17 +237,15 @@ def generate_steps(self) -> Iterable[MigrationStep]: queue = self._create_node_queue(incoming) seen = set[MigrationNode]() node = queue.get() - step_number = 1 ordered_steps: list[MigrationStep] = [] while node is not None: - step = node.as_step(step_number, sorted(n.node_id for n in incoming[node.key])) + step = node.as_step(len(ordered_steps), sorted(n.node_id for n in incoming[node.key])) ordered_steps.append(step) seen.add(node) # Update the queue priority as if the migration step was completed for dependency in self._outgoing[node.key]: priority = len(incoming[dependency.key] - seen) queue.update(priority, dependency) - step_number += 1 node = queue.get() return ordered_steps diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 0afdb6289c..9c7c8cc099 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -46,7 +46,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: assert steps == [ MigrationStep( step_id=2, - step_number=1, + step_number=0, object_type="TASK", object_id="1234/test-task", object_name="test-task", @@ -55,7 +55,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ), MigrationStep( step_id=1, - step_number=2, + step_number=1, object_type="JOB", object_id="1234", object_name="test-job", @@ -64,7 +64,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ), MigrationStep( step_id=3, - step_number=3, + step_number=2, object_type="CLUSTER", object_id="cluster-123", object_name="my-cluster", From 4b846526ff47e5d7a952694fff758703661eadc4 Mon Sep 17 
00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 14:10:34 +0100 Subject: [PATCH 041/106] Fix create node queue --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 321b01ddce..a219bf6970 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -263,6 +263,6 @@ def _create_node_queue(self, incoming: dict[MigrationNodeKey, set[MigrationNode] first. """ priority_queue = PriorityQueue() - for node_key, incoming_nodes in incoming.items(): - priority_queue.put(len(incoming_nodes), self._nodes[node_key]) + for node_key, node in self._nodes.items(): + priority_queue.put(len(incoming[node_key]), node) return priority_queue From c2271650c15c0339e7959552be72d896b79ea558 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 14:30:34 +0100 Subject: [PATCH 042/106] Remove update from queue --- .../labs/ucx/assessment/sequencing.py | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index a219bf6970..eb96217503 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -1,6 +1,7 @@ from __future__ import annotations import heapq +import itertools from collections import defaultdict from collections.abc import Iterable from dataclasses import dataclass, field @@ -94,12 +95,11 @@ class PriorityQueue: """ _REMOVED = "" # Mark removed items - _UPDATED = "" # Mark updated items def __init__(self): self._entries: list[QueueEntry] = [] self._entry_finder: dict[MigrationNode, QueueEntry] = {} - self._counter = 0 # Tiebreaker with equal priorities, then "first in, first out" + self._counter = itertools.count() # Tiebreaker with equal priorities, then "first in, first out" def put(self, priority: int, task: MigrationNode) -> None: """Put or update task in the queue. @@ -107,17 +107,17 @@ def put(self, priority: int, task: MigrationNode) -> None: The lowest priority is retrieved from the queue first. """ if task in self._entry_finder: - raise KeyError(f"Use `:meth:update` to update existing task: {task}") - entry: QueueEntry = [priority, self._counter, task] + self._remove(task) + count = next(self._counter) + entry = [priority, count, task] self._entry_finder[task] = entry heapq.heappush(self._entries, entry) - self._counter += 1 def get(self) -> MigrationNode | None: """Gets the tasks with lowest priority.""" while self._entries: _, _, task = heapq.heappop(self._entries) - if task in (self._REMOVED, self._UPDATED): + if task == self._REMOVED: continue assert isinstance(task, MigrationNode) self._remove(task) @@ -130,15 +130,6 @@ def _remove(self, task: MigrationNode) -> None: entry = self._entry_finder.pop(task) entry[2] = self._REMOVED - def update(self, priority: int, task: MigrationNode) -> None: - """Update a task in the queue.""" - entry = self._entry_finder.pop(task) - if entry is None: - raise KeyError(f"Cannot update unknown task: {task}") - if entry[2] != self._REMOVED: # Do not update REMOVED tasks - entry[2] = self._UPDATED - self.put(priority, task) - class MigrationSequencer: """Sequence the migration dependencies in order to execute the migration. 
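# The itertools.count tie-breaker above, isolated: with a unique counter in the
# middle of each entry, heapq never has to compare the payload itself, so
# objects with no ordering defined (such as MigrationNode) are safe to queue.
# Generic sketch, not the sequencer's own code.
import heapq
import itertools

class Payload:                         # deliberately not orderable
    pass

counter = itertools.count()
heap: list[tuple[int, int, Payload]] = []
for _ in range(3):
    heapq.heappush(heap, (1, next(counter), Payload()))   # equal priorities
priority, order, _payload = heapq.heappop(heap)
assert (priority, order) == (1, 0)     # first in, first out; no TypeError raised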
@@ -244,8 +235,10 @@ def generate_steps(self) -> Iterable[MigrationStep]: seen.add(node) # Update the queue priority as if the migration step was completed for dependency in self._outgoing[node.key]: + if dependency in seen: + continue priority = len(incoming[dependency.key] - seen) - queue.update(priority, dependency) + queue.put(priority, dependency) node = queue.get() return ordered_steps From 0138906b8f2b4efd8159df0ac898d680761518c8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 16:27:37 +0100 Subject: [PATCH 043/106] Move mock admin locator to fixture --- tests/unit/assessment/test_sequencing.py | 25 ++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 9c7c8cc099..1cdfc8ca46 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -1,5 +1,6 @@ from unittest.mock import create_autospec +import pytest from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails @@ -10,7 +11,18 @@ from databricks.labs.ucx.source_code.jobs import WorkflowTask -def test_sequence_steps_from_job_task_with_cluster(ws, simple_dependency_resolver, mock_path_lookup) -> None: +@pytest.fixture +def admin_locator(ws): + """Create a mock for an `class:AdminLocator`""" + admin_finder = create_autospec(AdministratorFinder) + admin_user = iam.User(user_name="John Doe", active=True, roles=[iam.ComplexValue(value="account_admin")]) + admin_finder.find_admin_users.return_value = (admin_user,) + return AdministratorLocator(ws, finders=[lambda _ws: admin_finder]) + + +def test_sequence_steps_from_job_task_with_cluster( + ws, simple_dependency_resolver, mock_path_lookup, admin_locator +) -> None: """Sequence a job with a task referencing a cluster. Sequence: # TODO: @JCZuurmond: Would expect cluster first. 
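# The fixture above leans on create_autospec; a generic illustration of what
# that buys over a bare MagicMock: mocked methods keep the spec's signatures,
# so a call with the wrong arguments fails inside the test. The Finder class
# below is made up for the sketch and is not the ucx AdministratorFinder.
from unittest.mock import create_autospec

class Finder:
    def find_admin_users(self, group: str) -> list[str]:
        raise NotImplementedError

finder = create_autospec(Finder)
finder.find_admin_users.return_value = ["John Doe"]
assert finder.find_admin_users("admins") == ["John Doe"]
try:
    finder.find_admin_users()          # missing argument -> TypeError
except TypeError:
    pass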
@@ -24,21 +36,18 @@ def test_sequence_steps_from_job_task_with_cluster(ws, simple_dependency_resolve ws.jobs.get.return_value = job # Match task cluster above on cluster id + admin_user = admin_locator.get_workspace_administrator() + def get_cluster(cluster_id: str) -> ClusterDetails: if cluster_id == "cluster-123": - return ClusterDetails(cluster_id="cluster-123", cluster_name="my-cluster", creator_user_name="John Doe") raise ValueError(f"Unknown cluster: {cluster_id}") + raise ResourceDoesNotExist(f"Unknown cluster: {cluster_id}") ws.clusters.get.side_effect = get_cluster - # Match cluster creator above on username - admin_finder = create_autospec(AdministratorFinder) - admin_user = iam.User(user_name="John Doe", active=True, roles=[iam.ComplexValue(value="account_admin")]) - admin_finder.find_admin_users.return_value = (admin_user,) - dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) - sequencer = MigrationSequencer(ws, AdministratorLocator(ws, finders=[lambda _ws: admin_finder])) + sequencer = MigrationSequencer(ws, admin_locator) sequencer.register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) From 74d115d4985336a53842ce662ffdd44259be1ce9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 16:27:57 +0100 Subject: [PATCH 044/106] Use ResourceDoesNotExists --- tests/unit/assessment/test_sequencing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 1cdfc8ca46..a432d64623 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -1,6 +1,7 @@ from unittest.mock import create_autospec import pytest +from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service import iam, jobs from databricks.sdk.service.compute import ClusterDetails @@ -40,7 +41,7 @@ def test_sequence_steps_from_job_task_with_cluster( def get_cluster(cluster_id: str) -> ClusterDetails: if cluster_id == "cluster-123": - raise ValueError(f"Unknown cluster: {cluster_id}") + return ClusterDetails(cluster_id="cluster-123", cluster_name="my-cluster", creator_user_name=admin_user) raise ResourceDoesNotExist(f"Unknown cluster: {cluster_id}") ws.clusters.get.side_effect = get_cluster From 12c624199f5ce11d7263fea654e3f4e5e0465ef4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 16:28:23 +0100 Subject: [PATCH 045/106] Introduce MaybeMigrationNode --- .../labs/ucx/assessment/sequencing.py | 49 ++++++++++++------- tests/unit/assessment/test_sequencing.py | 17 +++++++ 2 files changed, 49 insertions(+), 17 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index eb96217503..a802c99770 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -7,12 +7,13 @@ from dataclasses import dataclass, field from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import DatabricksError from databricks.sdk.service import jobs from databricks.labs.ucx.assessment.clusters import ClusterOwnership, ClusterInfo from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo from databricks.labs.ucx.framework.owners import AdministratorLocator -from databricks.labs.ucx.source_code.graph import DependencyGraph +from databricks.labs.ucx.source_code.graph import 
DependencyGraph, DependencyProblem @dataclass @@ -77,6 +78,16 @@ def as_step(self, step_number: int, required_step_ids: list[int]) -> MigrationSt ) +@dataclass +class MaybeMigrationNode: + node: MigrationNode | None + problems: list[DependencyProblem] + + @property + def failed(self) -> bool: + return len(self.problems) > 0 + + # We expect `tuple[int, int, MigrationNode | str]` # for `[priority, counter, MigrationNode | PriorityQueue._REMOVED | PriorityQueue_UPDATED]` # but we use list for the required mutability @@ -142,7 +153,7 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws self._admin_locator = admin_locator self._last_node_id = 0 - self._nodes: dict[MigrationNodeKey, MigrationNode] = {} + self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: @@ -162,11 +173,11 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: Depende self._nodes[task_node.key] = task_node self._outgoing[task_node.key].add(job_node) if task.existing_cluster_id: - cluster_node = self.register_cluster(task.existing_cluster_id) - if cluster_node: - self._outgoing[task_node.key].add(cluster_node) + maybe_cluster_node = self.register_cluster(task.existing_cluster_id) + if maybe_cluster_node.node: + self._outgoing[task_node.key].add(maybe_cluster_node.node) # also make the cluster dependent on the job - self._outgoing[job_node.key].add(cluster_node) + self._outgoing[job_node.key].add(maybe_cluster_node.node) # TODO register dependency graph return task_node @@ -186,21 +197,25 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: self._nodes[job_node.key] = job_node if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: - cluster_node = self.register_job_cluster(job_cluster) - if cluster_node: - self._outgoing[job_node.key].add(cluster_node) + maybe_cluster_node = self.register_job_cluster(job_cluster) + if maybe_cluster_node.node: + self._outgoing[job_node.key].add(maybe_cluster_node.node) return job_node - def register_job_cluster(self, cluster: jobs.JobCluster) -> MigrationNode | None: + def register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: if cluster.new_cluster: - return None + return MaybeMigrationNode(None, []) return self.register_cluster(cluster.job_cluster_key) - def register_cluster(self, cluster_id: str) -> MigrationNode: - cluster_node = self._nodes.get(("CLUSTER", cluster_id), None) - if cluster_node: - return cluster_node - details = self._ws.clusters.get(cluster_id) + def register_cluster(self, cluster_id: str) -> MaybeMigrationNode: + node_seen = self._nodes.get(("CLUSTER", cluster_id), None) + if node_seen: + return MaybeMigrationNode(node_seen, []) + try: + details = self._ws.clusters.get(cluster_id) + except DatabricksError: + message = f"Could not find cluster: {cluster_id}" + return MaybeMigrationNode(None, [DependencyProblem('cluster-not-found', message)]) object_name = details.cluster_name if details and details.cluster_name else cluster_id self._last_node_id += 1 cluster_node = MigrationNode( @@ -212,7 +227,7 @@ def register_cluster(self, cluster_id: str) -> MigrationNode: ) self._nodes[cluster_node.key] = cluster_node # TODO register warehouses and policies - return cluster_node + return MaybeMigrationNode(cluster_node, []) def 
generate_steps(self) -> Iterable[MigrationStep]: """Generate the migration steps. diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index a432d64623..8e30e6ece9 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -21,6 +21,23 @@ def admin_locator(ws): return AdministratorLocator(ws, finders=[lambda _ws: admin_finder]) +def test_register_non_existing_cluster( + ws, + simple_dependency_resolver, + mock_path_lookup, + admin_locator, +) -> None: + """Register a non existing cluster.""" + ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer.register_cluster("non-existing-id") + + assert maybe_node.node is None + assert maybe_node.failed + assert maybe_node.problems == ["Could not find cluster: non-existing-id"] + + def test_sequence_steps_from_job_task_with_cluster( ws, simple_dependency_resolver, mock_path_lookup, admin_locator ) -> None: From 8b02db137f72fe6b2f2fd52ae552aee945819c47 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 16:36:33 +0100 Subject: [PATCH 046/106] Test register job cluster --- tests/unit/assessment/test_sequencing.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 8e30e6ece9..041dc5ca31 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -3,7 +3,7 @@ import pytest from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service import iam, jobs -from databricks.sdk.service.compute import ClusterDetails +from databricks.sdk.service.compute import ClusterDetails, ClusterSpec from databricks.labs.ucx.assessment.sequencing import MigrationSequencer, MigrationStep from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder @@ -38,6 +38,23 @@ def test_register_non_existing_cluster( assert maybe_node.problems == ["Could not find cluster: non-existing-id"] +def test_register_non_existing_job_cluster( + ws, + simple_dependency_resolver, + mock_path_lookup, + admin_locator, +) -> None: + """Register a non-existing job cluster.""" + job_cluster = jobs.JobCluster(new_cluster=ClusterSpec(), job_cluster_key="non-existing-id") + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer.register_job_cluster(job_cluster) + + assert maybe_node.node is None + assert maybe_node.failed + assert maybe_node.problems == ["Could not find cluster: non-existing-id"] + + def test_sequence_steps_from_job_task_with_cluster( ws, simple_dependency_resolver, mock_path_lookup, admin_locator ) -> None: From f12c25b8a0ecca31acda09a6b382fbbc11dcc57b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 16:41:02 +0100 Subject: [PATCH 047/106] Add happy path test for cluster --- tests/unit/assessment/test_sequencing.py | 26 +++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 041dc5ca31..c0e34c25af 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -21,12 +21,25 @@ def admin_locator(ws): return AdministratorLocator(ws, finders=[lambda _ws: admin_finder]) -def test_register_non_existing_cluster( - ws, - simple_dependency_resolver, - mock_path_lookup, - admin_locator, -) 
-> None: +def test_register_existing_cluster(ws, admin_locator) -> None: + """Register a existing cluster.""" + + def get_cluster(cluster_id: str) -> ClusterDetails: + if cluster_id == "cluster-123": + return ClusterDetails(cluster_id="cluster-123", cluster_name="my-cluster") + raise ResourceDoesNotExist(f"Unknown cluster: {cluster_id}") + + ws.clusters.get.side_effect = get_cluster + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer.register_cluster("cluster-123") + + assert maybe_node.node is not None + assert not maybe_node.failed + assert not maybe_node.problems + + +def test_register_non_existing_cluster(ws, admin_locator) -> None: """Register a non existing cluster.""" ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") sequencer = MigrationSequencer(ws, admin_locator) @@ -40,7 +53,6 @@ def test_register_non_existing_cluster( def test_register_non_existing_job_cluster( ws, - simple_dependency_resolver, mock_path_lookup, admin_locator, ) -> None: From 346f253aaabd24cb7066a6d8d5c9cdf2127786be Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:22:23 +0100 Subject: [PATCH 048/106] Rewrite order of workflow task to job --- .../labs/ucx/assessment/sequencing.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index a802c99770..fa35b2f67b 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -156,30 +156,34 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - def register_workflow_task(self, task: jobs.Task, job: jobs.Job, _graph: DependencyGraph) -> MigrationNode: - task_id = f"{job.job_id}/{task.task_key}" + def register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: + """Register a workflow task. 
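# The Maybe-style result object the cluster tests above assert on, reduced to
# a generic sketch: instead of raising, registration hands back either a node
# or a list of problems, and `failed` is a convenience view on the latter.
# Stand-in classes only, not the ucx types.
from dataclasses import dataclass, field

@dataclass
class Problem:
    code: str
    message: str

@dataclass
class MaybeNode:
    node: object | None
    problems: list[Problem] = field(default_factory=list)

    @property
    def failed(self) -> bool:
        return len(self.problems) > 0

missing = MaybeNode(None, [Problem("cluster-not-found", "Could not find cluster: x")])
assert missing.failed and missing.node is None
assert not MaybeNode(node=object()).failed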
+ + Args: + task : jobs.Task + The task to register + parent : MigrationNode + The migration node for the parent job + """ + task_id = f"{parent.key}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) if task_node: - return task_node - job_node = self.register_workflow_job(job) + return MaybeMigrationNode(task_node, []) self._last_node_id += 1 task_node = MigrationNode( node_id=self._last_node_id, object_type="TASK", object_id=task_id, object_name=task.task_key, - object_owner=job_node.object_owner, # no task owner so use job one + object_owner=parent.object_owner, # No task owner so use parent job owner ) self._nodes[task_node.key] = task_node - self._outgoing[task_node.key].add(job_node) if task.existing_cluster_id: maybe_cluster_node = self.register_cluster(task.existing_cluster_id) if maybe_cluster_node.node: self._outgoing[task_node.key].add(maybe_cluster_node.node) - # also make the cluster dependent on the job - self._outgoing[job_node.key].add(maybe_cluster_node.node) - # TODO register dependency graph - return task_node + # TODO: register `job_cluster_key + return MaybeMigrationNode(task_node, []) def register_workflow_job(self, job: jobs.Job) -> MigrationNode: job_node = self._nodes.get(("JOB", str(job.job_id)), None) @@ -200,6 +204,10 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: maybe_cluster_node = self.register_job_cluster(job_cluster) if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) + for task in job.settings.tasks or []: + maybe_task_node = self.register_workflow_task(task, job) + if maybe_task_node.node: + self._outgoing[job_node.key] = maybe_task_node.node return job_node def register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: From f301839470cfe244dc7394257ff72855967a2657 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:22:53 +0100 Subject: [PATCH 049/106] Make register workflow task hidden --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- tests/unit/assessment/test_sequencing.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index fa35b2f67b..4e9eb93d8c 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -156,7 +156,7 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - def register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: + def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: """Register a workflow task. 
Args: @@ -205,7 +205,7 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode: if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: - maybe_task_node = self.register_workflow_task(task, job) + maybe_task_node = self._register_workflow_task(task, job) if maybe_task_node.node: self._outgoing[job_node.key] = maybe_task_node.node return job_node diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index c0e34c25af..19dbf613b0 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -95,7 +95,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: dependency = WorkflowTask(ws, task, job) graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) sequencer = MigrationSequencer(ws, admin_locator) - sequencer.register_workflow_task(task, job, graph) + sequencer._register_workflow_task(task, job, graph) steps = list(sequencer.generate_steps()) From f1225be3c5b889a8f8cf9539ac06cf2365c47e02 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:24:20 +0100 Subject: [PATCH 050/106] Use itertools.count --- src/databricks/labs/ucx/assessment/sequencing.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 4e9eb93d8c..793062912f 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -152,7 +152,7 @@ class MigrationSequencer: def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._ws = ws self._admin_locator = admin_locator - self._last_node_id = 0 + self._counter = itertools.count() self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) @@ -169,9 +169,8 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May task_node = self._nodes.get(("TASK", task_id), None) if task_node: return MaybeMigrationNode(task_node, []) - self._last_node_id += 1 task_node = MigrationNode( - node_id=self._last_node_id, + node_id=next(self._counter), object_type="TASK", object_id=task_id, object_name=task.task_key, @@ -189,10 +188,9 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: return job_node - self._last_node_id += 1 job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( - node_id=self._last_node_id, + node_id=next(self._counter), object_type="JOB", object_id=str(job.job_id), object_name=job_name, @@ -225,9 +223,8 @@ def register_cluster(self, cluster_id: str) -> MaybeMigrationNode: message = f"Could not find cluster: {cluster_id}" return MaybeMigrationNode(None, [DependencyProblem('cluster-not-found', message)]) object_name = details.cluster_name if details and details.cluster_name else cluster_id - self._last_node_id += 1 cluster_node = MigrationNode( - node_id=self._last_node_id, + node_id=next(self._counter), object_type="CLUSTER", object_id=cluster_id, object_name=object_name, From a4ac8faa6c1cabf0cd3c140c6982aca6a572e6c2 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:24:44 +0100 Subject: [PATCH 051/106] Move register workflow job up ---
.../labs/ucx/assessment/sequencing.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 793062912f..d806919e25 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -156,6 +156,30 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) + def register_workflow_job(self, job: jobs.Job) -> MigrationNode: + job_node = self._nodes.get(("JOB", str(job.job_id)), None) + if job_node: + return job_node + job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) + job_node = MigrationNode( + node_id=next(self._counter), + object_type="JOB", + object_id=str(job.job_id), + object_name=job_name, + object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), + ) + self._nodes[job_node.key] = job_node + if job.settings and job.settings.job_clusters: + for job_cluster in job.settings.job_clusters: + maybe_cluster_node = self.register_job_cluster(job_cluster) + if maybe_cluster_node.node: + self._outgoing[job_node.key].add(maybe_cluster_node.node) + for task in job.settings.tasks or []: + maybe_task_node = self._register_workflow_task(task, job) + if maybe_task_node.node: + self._outgoing[job_node.key] = maybe_task_node.node + return job_node + def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: """Register a workflow task. @@ -184,30 +208,6 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May # TODO: register `job_cluster_key return MaybeMigrationNode(task_node, []) - def register_workflow_job(self, job: jobs.Job) -> MigrationNode: - job_node = self._nodes.get(("JOB", str(job.job_id)), None) - if job_node: - return job_node - job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) - job_node = MigrationNode( - node_id=next(self._counter), - object_type="JOB", - object_id=str(job.job_id), - object_name=job_name, - object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), - ) - self._nodes[job_node.key] = job_node - if job.settings and job.settings.job_clusters: - for job_cluster in job.settings.job_clusters: - maybe_cluster_node = self.register_job_cluster(job_cluster) - if maybe_cluster_node.node: - self._outgoing[job_node.key].add(maybe_cluster_node.node) - for task in job.settings.tasks or []: - maybe_task_node = self._register_workflow_task(task, job) - if maybe_task_node.node: - self._outgoing[job_node.key] = maybe_task_node.node - return job_node - def register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: if cluster.new_cluster: return MaybeMigrationNode(None, []) From 0e0755258c7a8486435855bb39623e4820741d84 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:25:10 +0100 Subject: [PATCH 052/106] Rename to register job --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index d806919e25..2870f8e1f6 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -156,7 
+156,7 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - def register_workflow_job(self, job: jobs.Job) -> MigrationNode: + def register_job(self, job: jobs.Job) -> MigrationNode: job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: return job_node From 9b3b2e69d2ce1da9a8e9db687afb0d9fa75662e4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Tue, 29 Oct 2024 17:26:19 +0100 Subject: [PATCH 053/106] Make methods hidden --- src/databricks/labs/ucx/assessment/sequencing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 2870f8e1f6..ba42aebfd8 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -171,7 +171,7 @@ def register_job(self, job: jobs.Job) -> MigrationNode: self._nodes[job_node.key] = job_node if job.settings and job.settings.job_clusters: for job_cluster in job.settings.job_clusters: - maybe_cluster_node = self.register_job_cluster(job_cluster) + maybe_cluster_node = self._register_job_cluster(job_cluster) if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: @@ -202,18 +202,18 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May ) self._nodes[task_node.key] = task_node if task.existing_cluster_id: - maybe_cluster_node = self.register_cluster(task.existing_cluster_id) + maybe_cluster_node = self._register_cluster(task.existing_cluster_id) if maybe_cluster_node.node: self._outgoing[task_node.key].add(maybe_cluster_node.node) # TODO: register `job_cluster_key return MaybeMigrationNode(task_node, []) - def register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: + def _register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: if cluster.new_cluster: return MaybeMigrationNode(None, []) - return self.register_cluster(cluster.job_cluster_key) + return self._register_cluster(cluster.job_cluster_key) - def register_cluster(self, cluster_id: str) -> MaybeMigrationNode: + def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: node_seen = self._nodes.get(("CLUSTER", cluster_id), None) if node_seen: return MaybeMigrationNode(node_seen, []) From 23d158b484d6d017f472b434de2f41ea78193080 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:03:39 +0100 Subject: [PATCH 054/106] Rename protected access methods --- tests/unit/assessment/test_sequencing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 19dbf613b0..f7542414d3 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -32,7 +32,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ws.clusters.get.side_effect = get_cluster sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_cluster("cluster-123") + maybe_node = sequencer._register_cluster("cluster-123") assert maybe_node.node is not None assert not maybe_node.failed @@ -44,7 +44,7 @@ def test_register_non_existing_cluster(ws, admin_locator) -> None: ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown 
cluster") sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_cluster("non-existing-id") + maybe_node = sequencer._register_cluster("non-existing-id") assert maybe_node.node is None assert maybe_node.failed @@ -60,7 +60,7 @@ def test_register_non_existing_job_cluster( job_cluster = jobs.JobCluster(new_cluster=ClusterSpec(), job_cluster_key="non-existing-id") sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job_cluster(job_cluster) + maybe_node = sequencer._register_job_cluster(job_cluster) assert maybe_node.node is None assert maybe_node.failed From 844cc7f133d22b9377323151df3774fa28a2d917 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:04:24 +0100 Subject: [PATCH 055/106] Test job referencing unknown job cluster --- tests/unit/assessment/test_sequencing.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index f7542414d3..e3a749651c 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -67,6 +67,30 @@ def test_register_non_existing_job_cluster( assert maybe_node.problems == ["Could not find cluster: non-existing-id"] +def test_register_workflow_task_with_missing_cluster_dependency( + ws, + simple_dependency_resolver, + mock_path_lookup, + admin_locator, +) -> None: + """Register a workflow task with missing cluster dependency.""" + task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + ws.jobs.get.return_value = job + + ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") + + dependency = WorkflowTask(ws, task, job) + graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer._register_workflow_task(task, job, graph) + + assert maybe_node.node is None + assert maybe_node.failed() + + def test_sequence_steps_from_job_task_with_cluster( ws, simple_dependency_resolver, mock_path_lookup, admin_locator ) -> None: From 8261f0fe44c8142b061f7cec392598ccbadbb372 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:30:57 +0100 Subject: [PATCH 056/106] Test job with existing cluster --- tests/unit/assessment/test_sequencing.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index e3a749651c..f29913a521 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -21,8 +21,11 @@ def admin_locator(ws): return AdministratorLocator(ws, finders=[lambda _ws: admin_finder]) -def test_register_existing_cluster(ws, admin_locator) -> None: +def test_register_job_with_existing_cluster(ws, admin_locator) -> None: """Register a existing cluster.""" + task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) def get_cluster(cluster_id: str) -> ClusterDetails: if cluster_id == "cluster-123": @@ -32,11 +35,9 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ws.clusters.get.side_effect = get_cluster sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = 
sequencer._register_cluster("cluster-123") + maybe_node = sequencer.register_job(job) - assert maybe_node.node is not None assert not maybe_node.failed - assert not maybe_node.problems def test_register_non_existing_cluster(ws, admin_locator) -> None: From 04d7fe54d40ae1e60d0700a428b0c1b188da3b9f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:31:10 +0100 Subject: [PATCH 057/106] Use MaybeMigrationNode when registering job --- src/databricks/labs/ucx/assessment/sequencing.py | 6 +++--- tests/unit/assessment/test_sequencing.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index ba42aebfd8..70da4b44d7 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -156,10 +156,10 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - def register_job(self, job: jobs.Job) -> MigrationNode: + def register_job(self, job: jobs.Job) -> MaybeMigrationNode: job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: - return job_node + return MaybeMigrationNode(job_node, []) job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( node_id=next(self._counter), @@ -178,7 +178,7 @@ def register_job(self, job: jobs.Job) -> MigrationNode: maybe_task_node = self._register_workflow_task(task, job) if maybe_task_node.node: self._outgoing[job_node.key] = maybe_task_node.node - return job_node + return MaybeMigrationNode(job_node, []) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: """Register a workflow task. 
diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index f29913a521..26370213dd 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -22,7 +22,7 @@ def admin_locator(ws): def test_register_job_with_existing_cluster(ws, admin_locator) -> None: - """Register a existing cluster.""" + """Register an existing cluster.""" task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) From 136ee2e2d0a77a24f067bf83460c96ef76ad940f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:43:23 +0100 Subject: [PATCH 058/106] Test register a job with non existing cluster --- tests/unit/assessment/test_sequencing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 26370213dd..796222b6fd 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -40,14 +40,17 @@ def get_cluster(cluster_id: str) -> ClusterDetails: assert not maybe_node.failed -def test_register_non_existing_cluster(ws, admin_locator) -> None: +def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: """Register a non existing cluster.""" + task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer._register_cluster("non-existing-id") + maybe_node = sequencer.register_job(job) - assert maybe_node.node is None assert maybe_node.failed assert maybe_node.problems == ["Could not find cluster: non-existing-id"] From 8fc4437e0ea4687788bdb87d4fc39b4910ac7a02 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:46:29 +0100 Subject: [PATCH 059/106] Propagate cluster problems --- src/databricks/labs/ucx/assessment/sequencing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 70da4b44d7..5fe17b6c8c 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -157,9 +157,10 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) def register_job(self, job: jobs.Job) -> MaybeMigrationNode: + problems = [] job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: - return MaybeMigrationNode(job_node, []) + return MaybeMigrationNode(job_node, problems) job_name = job.settings.name if job.settings and job.settings.name else str(job.job_id) job_node = MigrationNode( node_id=next(self._counter), @@ -176,9 +177,10 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: maybe_task_node = self._register_workflow_task(task, job) + problems.extend(maybe_task_node.problems) if maybe_task_node.node: self._outgoing[job_node.key] = maybe_task_node.node - return MaybeMigrationNode(job_node, []) + return MaybeMigrationNode(job_node, problems) def 
_register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: """Register a workflow task. @@ -189,10 +191,11 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May parent : MigrationNode The migration node for the parent job """ + problems = [] task_id = f"{parent.key}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) if task_node: - return MaybeMigrationNode(task_node, []) + return MaybeMigrationNode(task_node, problems) task_node = MigrationNode( node_id=next(self._counter), object_type="TASK", @@ -203,10 +206,11 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May self._nodes[task_node.key] = task_node if task.existing_cluster_id: maybe_cluster_node = self._register_cluster(task.existing_cluster_id) + problems.extend(maybe_cluster_node.problems) if maybe_cluster_node.node: self._outgoing[task_node.key].add(maybe_cluster_node.node) # TODO: register `job_cluster_key - return MaybeMigrationNode(task_node, []) + return MaybeMigrationNode(task_node, problems) def _register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: if cluster.new_cluster: From ab401440c0332f9ada6c49af584afd146262897b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:46:52 +0100 Subject: [PATCH 060/106] Update condition --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 5fe17b6c8c..fb5034fdf4 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -170,8 +170,8 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), ) self._nodes[job_node.key] = job_node - if job.settings and job.settings.job_clusters: - for job_cluster in job.settings.job_clusters: + if job.settings: + for job_cluster in job.settings.job_clusters or []: maybe_cluster_node = self._register_job_cluster(job_cluster) if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) From 5d93a77a7f6d4131dfdd5552d2e97bfee2ae1f3a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:47:57 +0100 Subject: [PATCH 061/106] Fix passing job node instead of job --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index fb5034fdf4..7d80a5514e 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -176,7 +176,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: - maybe_task_node = self._register_workflow_task(task, job) + maybe_task_node = self._register_workflow_task(task, job_node) problems.extend(maybe_task_node.problems) if maybe_task_node.node: self._outgoing[job_node.key] = maybe_task_node.node From a6cbaad2d8e8de8d9820af3854bd459281e7b97a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:48:38 +0100 Subject: [PATCH 062/106] Remove redundant import --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 7d80a5514e..0a777442c9 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -13,7 +13,7 @@ from databricks.labs.ucx.assessment.clusters import ClusterOwnership, ClusterInfo from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo from databricks.labs.ucx.framework.owners import AdministratorLocator -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem +from databricks.labs.ucx.source_code.graph import DependencyProblem @dataclass From e8c64d35b94e98b6ed046d2f5b92b954e6c69ad0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:49:48 +0100 Subject: [PATCH 063/106] fix testing dependency problem --- tests/unit/assessment/test_sequencing.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 796222b6fd..3127380f1b 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -8,7 +8,7 @@ from databricks.labs.ucx.assessment.sequencing import MigrationSequencer, MigrationStep from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.graph import DependencyGraph +from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem from databricks.labs.ucx.source_code.jobs import WorkflowTask @@ -42,7 +42,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: """Register a non existing cluster.""" - task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) @@ -52,7 +52,12 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: maybe_node = sequencer.register_job(job) assert maybe_node.failed - assert maybe_node.problems == ["Could not find cluster: non-existing-id"] + assert maybe_node.problems == [ + DependencyProblem( + code="cluster-not-found", + message="Could not find cluster: non-existing-id", + ) + ] def test_register_non_existing_job_cluster( From 6a62ee0a321a97670d394b8d2f6dca6495aa4b70 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 14:51:55 +0100 Subject: [PATCH 064/106] Add docstring to registering job --- src/databricks/labs/ucx/assessment/sequencing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 0a777442c9..025fcb0e39 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -157,6 +157,16 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) def register_job(self, job: jobs.Job) -> MaybeMigrationNode: + """Register a job. + + Args: + job (jobs.Job) : The job to register. + + Returns: + MaybeMigrationNode : A maybe migration node, which has the migration node if no problems occurred during + registering. 
Otherwise, the maybe migration node contains the dependency problems occurring during + registering the job. + """ problems = [] job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: From a29d13086189b980e4077c373ca29be4b078ba26 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 15:00:59 +0100 Subject: [PATCH 065/106] Register job cluster --- src/databricks/labs/ucx/assessment/sequencing.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 025fcb0e39..38c46edde3 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -222,10 +222,16 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May # TODO: register `job_cluster_key return MaybeMigrationNode(task_node, problems) - def _register_job_cluster(self, cluster: jobs.JobCluster) -> MaybeMigrationNode: - if cluster.new_cluster: - return MaybeMigrationNode(None, []) - return self._register_cluster(cluster.job_cluster_key) + def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: + cluster_node = MigrationNode( + node_id=next(self._counter), + object_type="CLUSTER", # TODO: Do we want to differentiate between job and normal clusters? + object_id=cluster.job_cluster_key, + object_name=cluster.job_cluster_key, + # Job clusters are ephemeral and exist during a job run for a specific job only + object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), + ) + return MaybeMigrationNode(cluster_node, []) def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: node_seen = self._nodes.get(("CLUSTER", cluster_id), None) From 64443631cccb7950ff89001b2278a3fa056ee58f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 15:56:00 +0100 Subject: [PATCH 066/106] Test register job with non-existing job cluster key --- tests/unit/assessment/test_sequencing.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 3127380f1b..1964fc8d8b 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -60,20 +60,27 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: ] -def test_register_non_existing_job_cluster( +def test_register_job_with_non_existing_job_cluster_key( ws, mock_path_lookup, admin_locator, ) -> None: - """Register a non-existing job cluster.""" - job_cluster = jobs.JobCluster(new_cluster=ClusterSpec(), job_cluster_key="non-existing-id") + """Register a job with non-existing job cluster key.""" + task = jobs.Task(task_key="test-task", job_cluster_key="non-existing-id") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer._register_job_cluster(job_cluster) + maybe_node = sequencer.register_job(job) - assert maybe_node.node is None assert maybe_node.failed - assert maybe_node.problems == ["Could not find cluster: non-existing-id"] + assert maybe_node.problems == [ + DependencyProblem( + code="cluster-not-found", + message="Could not find cluster: non-existing-id", + ) + ] def test_register_workflow_task_with_missing_cluster_dependency( From e93db451f8853a67a5bfa9cb90e30ecd3bf4ad37 Mon Sep 17 
00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 15:58:53 +0100 Subject: [PATCH 067/106] Test register existing job cluster --- tests/unit/assessment/test_sequencing.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 1964fc8d8b..1f88f8c149 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -60,6 +60,23 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: ] +def test_register_job_with_existing_job_cluster_key( + ws, + mock_path_lookup, + admin_locator, +) -> None: + """Register a job with existing job cluster key.""" + job_cluster = jobs.JobCluster("existing-id", ClusterSpec()) + task = jobs.Task(task_key="test-task", job_cluster_key="existing-id") + settings = jobs.JobSettings(name="test-job", tasks=[task], job_clusters=[job_cluster]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer.register_job(job) + + assert not maybe_node.failed + + def test_register_job_with_non_existing_job_cluster_key( ws, mock_path_lookup, From a0b3a678de0a68121f2b0dae5988a53aff633992 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Wed, 30 Oct 2024 16:15:43 +0100 Subject: [PATCH 068/106] Fix missing parameter --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 38c46edde3..d269c1a76b 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -182,7 +182,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: self._nodes[job_node.key] = job_node if job.settings: for job_cluster in job.settings.job_clusters or []: - maybe_cluster_node = self._register_job_cluster(job_cluster) + maybe_cluster_node = self._register_job_cluster(job_cluster, job) if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: From 4869679860e2709193aa4af8a7c9a358a2b7c589 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 08:51:42 +0100 Subject: [PATCH 069/106] Handle non-existing job cluster --- src/databricks/labs/ucx/assessment/sequencing.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index d269c1a76b..5a7ccb82e7 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -219,7 +219,13 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May problems.extend(maybe_cluster_node.problems) if maybe_cluster_node.node: self._outgoing[task_node.key].add(maybe_cluster_node.node) - # TODO: register `job_cluster_key + if task.job_cluster_key: + job_cluster_node = self._nodes.get(("CLUSTER", task.job_cluster_key)) + if job_cluster_node: + self._outgoing[task_node.key].add(job_cluster_node) + else: + problem = DependencyProblem('cluster-not-found', f"Could not find cluster: {task.job_cluster_key}") + problems.append(problem) return MaybeMigrationNode(task_node, problems) def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: From bc6f7f21490d3bbbcaf9eafd1721caf92473265c Mon Sep 17 00:00:00 2001 From: Cor 
Zuurmond Date: Thu, 31 Oct 2024 08:55:17 +0100 Subject: [PATCH 070/106] Test for new cluster --- src/databricks/labs/ucx/assessment/sequencing.py | 1 + tests/unit/assessment/test_sequencing.py | 16 +++++----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 5a7ccb82e7..4789bf0f02 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -226,6 +226,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May else: problem = DependencyProblem('cluster-not-found', f"Could not find cluster: {task.job_cluster_key}") problems.append(problem) + # TODO: Handle `task.new_cluster` return MaybeMigrationNode(task_node, problems) def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 1f88f8c149..aeba3ce9a7 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -100,28 +100,22 @@ def test_register_job_with_non_existing_job_cluster_key( ] -def test_register_workflow_task_with_missing_cluster_dependency( +def test_register_job_with_new_cluster( ws, simple_dependency_resolver, mock_path_lookup, admin_locator, ) -> None: - """Register a workflow task with missing cluster dependency.""" - task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") + """Register a workflow task with a new cluster.""" + task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) ws.jobs.get.return_value = job - - ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") - - dependency = WorkflowTask(ws, task, job) - graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer._register_workflow_task(task, job, graph) + maybe_node = sequencer.register_job(job) - assert maybe_node.node is None - assert maybe_node.failed() + assert not maybe_node.failed def test_sequence_steps_from_job_task_with_cluster( From bea1eb1a39fcc1b9c7d95185d7c97dd3d6bd0393 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 08:57:48 +0100 Subject: [PATCH 071/106] Fix sequencing --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- tests/unit/assessment/test_sequencing.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 4789bf0f02..6551339abb 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -189,7 +189,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: maybe_task_node = self._register_workflow_task(task, job_node) problems.extend(maybe_task_node.problems) if maybe_task_node.node: - self._outgoing[job_node.key] = maybe_task_node.node + self._outgoing[job_node.key].add(maybe_task_node.node) return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index aeba3ce9a7..29d51991a4 100644 
--- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -118,10 +118,10 @@ def test_register_job_with_new_cluster( assert not maybe_node.failed -def test_sequence_steps_from_job_task_with_cluster( +def test_sequence_steps_from_job_task_with_existing_cluster_id( ws, simple_dependency_resolver, mock_path_lookup, admin_locator ) -> None: - """Sequence a job with a task referencing a cluster. + """Sequence a job with a task referencing an existing cluster. Sequence: # TODO: @JCZuurmond: Would expect cluster first. 1. Task @@ -131,7 +131,6 @@ def test_sequence_steps_from_job_task_with_cluster( task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) - ws.jobs.get.return_value = job # Match task cluster above on cluster id admin_user = admin_locator.get_workspace_administrator() @@ -143,10 +142,8 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ws.clusters.get.side_effect = get_cluster - dependency = WorkflowTask(ws, task, job) - graph = DependencyGraph(dependency, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState()) sequencer = MigrationSequencer(ws, admin_locator) - sequencer._register_workflow_task(task, job, graph) + sequencer.register_job(job) steps = list(sequencer.generate_steps()) From 26b48c0af36a177805dbe89669d776da52cdcc9e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:06:17 +0100 Subject: [PATCH 072/106] Fix parent object id --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 6551339abb..e853b97512 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -202,7 +202,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May The migration node for the parent job """ problems = [] - task_id = f"{parent.key}/{task.task_key}" + task_id = f"{parent.object_id}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) if task_node: return MaybeMigrationNode(task_node, problems) From 04e294371299bf432d7f3d6c8aca72417190e782 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:15:37 +0100 Subject: [PATCH 073/106] Update sequencing --- .../labs/ucx/assessment/sequencing.py | 13 ++++---- tests/unit/assessment/test_sequencing.py | 30 +++++++++---------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index e853b97512..11789a808b 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -270,21 +270,20 @@ def generate_steps(self) -> Iterable[MigrationStep]: leaf during processing) - We handle cyclic dependencies (implemented in PR #3009) """ - # pre-compute incoming keys for best performance of self._required_step_ids - incoming = self._invert_outgoing_to_incoming() - queue = self._create_node_queue(incoming) + ordered_steps: list[MigrationStep] = [] + incoming = self._invert_outgoing_to_incoming() # For updating the priority of steps that depend on other steps seen = set[MigrationNode]() + queue = self._create_node_queue(self._outgoing) node = queue.get() - ordered_steps: list[MigrationStep] = [] while node is not None: - 
step = node.as_step(len(ordered_steps), sorted(n.node_id for n in incoming[node.key])) + step = node.as_step(len(ordered_steps), sorted(n.node_id for n in self._outgoing[node.key])) ordered_steps.append(step) seen.add(node) # Update the queue priority as if the migration step was completed - for dependency in self._outgoing[node.key]: + for dependency in incoming[node.key]: if dependency in seen: continue - priority = len(incoming[dependency.key] - seen) + priority = len(self._outgoing[dependency.key] - seen) queue.put(priority, dependency) node = queue.get() return ordered_steps diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 29d51991a4..e97b2a341c 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -123,10 +123,10 @@ def test_sequence_steps_from_job_task_with_existing_cluster_id( ) -> None: """Sequence a job with a task referencing an existing cluster. - Sequence: # TODO: @JCZuurmond: Would expect cluster first. - 1. Task - 2. Job - 3. Cluster + Sequence: + 1. Cluster + 2. Task + 3. Job """ task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -151,28 +151,28 @@ def get_cluster(cluster_id: str) -> ClusterDetails: MigrationStep( step_id=2, step_number=0, - object_type="TASK", - object_id="1234/test-task", - object_name="test-task", + object_type="CLUSTER", + object_id="cluster-123", + object_name="my-cluster", object_owner="John Doe", required_step_ids=[], ), MigrationStep( step_id=1, step_number=1, - object_type="JOB", - object_id="1234", - object_name="test-job", + object_type="TASK", + object_id="1234/test-task", + object_name="test-task", object_owner="John Doe", required_step_ids=[2], ), MigrationStep( - step_id=3, + step_id=0, step_number=2, - object_type="CLUSTER", - object_id="cluster-123", - object_name="my-cluster", + object_type="JOB", + object_id="1234", + object_name="test-job", object_owner="John Doe", - required_step_ids=[1, 2], + required_step_ids=[1], ), ] From 7e577e1dab20aa4d6a9629154a9476a833e9e921 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:35:15 +0100 Subject: [PATCH 074/106] Add type hints for problems --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 11789a808b..687cf3a45a 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -167,7 +167,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: registering. Otherwise, the maybe migration node contains the dependency problems occurring during registering the job. 
""" - problems = [] + problems: list[DependencyProblem] = [] job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: return MaybeMigrationNode(job_node, problems) @@ -201,7 +201,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May parent : MigrationNode The migration node for the parent job """ - problems = [] + problems: list[DependencyProblem] = [] task_id = f"{parent.object_id}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) if task_node: From d431702c1246118ab4d23bb49faa938b597653d4 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:35:41 +0100 Subject: [PATCH 075/106] Explain why not handling new cluster --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 687cf3a45a..206d7ffd51 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -214,6 +214,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May object_owner=parent.object_owner, # No task owner so use parent job owner ) self._nodes[task_node.key] = task_node + # `task.new_cluster` is not handled because it is part of the task and not a separate node if task.existing_cluster_id: maybe_cluster_node = self._register_cluster(task.existing_cluster_id) problems.extend(maybe_cluster_node.problems) @@ -226,7 +227,6 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May else: problem = DependencyProblem('cluster-not-found', f"Could not find cluster: {task.job_cluster_key}") problems.append(problem) - # TODO: Handle `task.new_cluster` return MaybeMigrationNode(task_node, problems) def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: From 5f2d07b850614773b17f39209b571b8a900a7afb Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:35:47 +0100 Subject: [PATCH 076/106] Format --- tests/unit/assessment/test_sequencing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index e97b2a341c..0b642cabdf 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -7,9 +7,7 @@ from databricks.labs.ucx.assessment.sequencing import MigrationSequencer, MigrationStep from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder -from databricks.labs.ucx.source_code.base import CurrentSessionState -from databricks.labs.ucx.source_code.graph import DependencyGraph, DependencyProblem -from databricks.labs.ucx.source_code.jobs import WorkflowTask +from databricks.labs.ucx.source_code.graph import DependencyProblem @pytest.fixture From 8f85d69128233ac7868554eb61b5ab7679f7bddc Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:38:47 +0100 Subject: [PATCH 077/106] Sequence with new cluster --- tests/unit/assessment/test_sequencing.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 0b642cabdf..5b439769e3 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -174,3 +174,42 @@ def get_cluster(cluster_id: str) -> ClusterDetails: required_step_ids=[1], ), ] + + +def 
test_sequence_steps_from_job_task_with_new_cluster( + ws, simple_dependency_resolver, mock_path_lookup, admin_locator +) -> None: + """Sequence a job with a task that has a new cluster definition. + + Sequence: + 1. Cluster + 2. Task # The cluster is part of the task, not a separate step in the sequence + """ + task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + sequencer.register_job(job) + + steps = list(sequencer.generate_steps()) + + assert steps == [ + MigrationStep( + step_id=1, + step_number=0, + object_type="TASK", + object_id="1234/test-task", + object_name="test-task", + object_owner="John Doe", + required_step_ids=[], + ), + MigrationStep( + step_id=0, + step_number=1, + object_type="JOB", + object_id="1234", + object_name="test-job", + object_owner="John Doe", + required_step_ids=[1], + ), + ] From d25664c9c3bff73a5e54cc847e57df492d8cd484 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:41:29 +0100 Subject: [PATCH 078/106] Add job cluster to nodes --- src/databricks/labs/ucx/assessment/sequencing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 206d7ffd51..c53df86533 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -238,6 +238,7 @@ def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> Mayb # Job clusters are ephemeral and exist during a job run for a specific job only object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), ) + self._nodes[cluster_node.key] = cluster_node return MaybeMigrationNode(cluster_node, []) def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: From 698e3389cabc7d2132f8aaeb604bf3d345a6e66a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:44:37 +0100 Subject: [PATCH 079/106] Make job cluster id unique --- src/databricks/labs/ucx/assessment/sequencing.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index c53df86533..1e54ed2336 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -221,7 +221,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May if maybe_cluster_node.node: self._outgoing[task_node.key].add(maybe_cluster_node.node) if task.job_cluster_key: - job_cluster_node = self._nodes.get(("CLUSTER", task.job_cluster_key)) + job_cluster_node = self._nodes.get(("CLUSTER", f"{parent.object_id}/{task.job_cluster_key}")) if job_cluster_node: self._outgoing[task_node.key].add(job_cluster_node) else: @@ -230,10 +230,12 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May return MaybeMigrationNode(task_node, problems) def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: + # Different jobs can have job clusters with the same key, therefore job id is prepended to ensure uniqueness + cluster_id = f"{job.job_id}/{cluster.job_cluster_key}" cluster_node = MigrationNode( node_id=next(self._counter), - object_type="CLUSTER", # TODO: Do we want to differentiate between job and normal clusters? 
- object_id=cluster.job_cluster_key, + object_type="CLUSTER", + object_id=cluster_id, object_name=cluster.job_cluster_key, # Job clusters are ephemeral and exist during a job run for a specific job only object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), From 5551c260eac27a01aa3424defc121847bc47d725 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:47:32 +0100 Subject: [PATCH 080/106] Make register signature similar --- .../labs/ucx/assessment/sequencing.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 1e54ed2336..9aaf029858 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -182,7 +182,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: self._nodes[job_node.key] = job_node if job.settings: for job_cluster in job.settings.job_clusters or []: - maybe_cluster_node = self._register_job_cluster(job_cluster, job) + maybe_cluster_node = self._register_job_cluster(job_cluster, job_node) if maybe_cluster_node.node: self._outgoing[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: @@ -229,16 +229,25 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May problems.append(problem) return MaybeMigrationNode(task_node, problems) - def _register_job_cluster(self, cluster: jobs.JobCluster, job: jobs.Job) -> MaybeMigrationNode: + def _register_job_cluster(self, cluster: jobs.JobCluster, parent: MigrationNode) -> MaybeMigrationNode: + """Register a job cluster. + + A job cluster is defined within a job and therefore is found when defined on the job by definition. 
+ + Args: + cluster : jobs.JobCluster + The job cluster to register + parent : MigrationNode + The migration node of the parent job + """ # Different jobs can have job clusters with the same key, therefore job id is prepended to ensure uniqueness - cluster_id = f"{job.job_id}/{cluster.job_cluster_key}" + cluster_id = f"{parent.object_id}/{cluster.job_cluster_key}" cluster_node = MigrationNode( node_id=next(self._counter), object_type="CLUSTER", object_id=cluster_id, object_name=cluster.job_cluster_key, - # Job clusters are ephemeral and exist during a job run for a specific job only - object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), + object_owner=parent.object_owner, ) self._nodes[cluster_node.key] = cluster_node return MaybeMigrationNode(cluster_node, []) From cb22042ff09bbd8632c3abeeba77778d92e39c53 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 09:51:14 +0100 Subject: [PATCH 081/106] Explain outgoing references --- .../labs/ucx/assessment/sequencing.py | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 9aaf029858..f44b9dbfad 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -154,7 +154,11 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): self._admin_locator = admin_locator self._counter = itertools.count() self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode - self._outgoing: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) + + # Outgoing references contains edges in the graph pointing from a node to a set of nodes that the node + # references. These references follow the API references, e.g. a job contains tasks in the + # `jobs.Job.settings.tasks`, thus a job has an outgoing reference to each of those tasks. + self._outgoing_references: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) def register_job(self, job: jobs.Job) -> MaybeMigrationNode: """Register a job. 
@@ -184,12 +188,12 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: for job_cluster in job.settings.job_clusters or []: maybe_cluster_node = self._register_job_cluster(job_cluster, job_node) if maybe_cluster_node.node: - self._outgoing[job_node.key].add(maybe_cluster_node.node) + self._outgoing_references[job_node.key].add(maybe_cluster_node.node) for task in job.settings.tasks or []: maybe_task_node = self._register_workflow_task(task, job_node) problems.extend(maybe_task_node.problems) if maybe_task_node.node: - self._outgoing[job_node.key].add(maybe_task_node.node) + self._outgoing_references[job_node.key].add(maybe_task_node.node) return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: @@ -219,11 +223,11 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May maybe_cluster_node = self._register_cluster(task.existing_cluster_id) problems.extend(maybe_cluster_node.problems) if maybe_cluster_node.node: - self._outgoing[task_node.key].add(maybe_cluster_node.node) + self._outgoing_references[task_node.key].add(maybe_cluster_node.node) if task.job_cluster_key: job_cluster_node = self._nodes.get(("CLUSTER", f"{parent.object_id}/{task.job_cluster_key}")) if job_cluster_node: - self._outgoing[task_node.key].add(job_cluster_node) + self._outgoing_references[task_node.key].add(job_cluster_node) else: problem = DependencyProblem('cluster-not-found', f"Could not find cluster: {task.job_cluster_key}") problems.append(problem) @@ -285,24 +289,24 @@ def generate_steps(self) -> Iterable[MigrationStep]: ordered_steps: list[MigrationStep] = [] incoming = self._invert_outgoing_to_incoming() # For updating the priority of steps that depend on other steps seen = set[MigrationNode]() - queue = self._create_node_queue(self._outgoing) + queue = self._create_node_queue(self._outgoing_references) node = queue.get() while node is not None: - step = node.as_step(len(ordered_steps), sorted(n.node_id for n in self._outgoing[node.key])) + step = node.as_step(len(ordered_steps), sorted(n.node_id for n in self._outgoing_references[node.key])) ordered_steps.append(step) seen.add(node) # Update the queue priority as if the migration step was completed for dependency in incoming[node.key]: if dependency in seen: continue - priority = len(self._outgoing[dependency.key] - seen) + priority = len(self._outgoing_references[dependency.key] - seen) queue.put(priority, dependency) node = queue.get() return ordered_steps def _invert_outgoing_to_incoming(self) -> dict[MigrationNodeKey, set[MigrationNode]]: result: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - for node_key, outgoing_nodes in self._outgoing.items(): + for node_key, outgoing_nodes in self._outgoing_references.items(): for target in outgoing_nodes: result[target.key].add(self._nodes[node_key]) return result From cfc23c9fe00340a1af2e476d4b147c4424497d5a Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:05:53 +0100 Subject: [PATCH 082/106] Remove TODO --- src/databricks/labs/ucx/assessment/sequencing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index f44b9dbfad..303ecfec9e 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -153,7 +153,7 @@ def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): 
self._ws = ws self._admin_locator = admin_locator self._counter = itertools.count() - self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # TODO: Update to MaybeMigrationNode + self._nodes: dict[MigrationNodeKey, MigrationNode] = {} # Outgoing references contains edges in the graph pointing from a node to a set of nodes that the node # references. These references follow the API references, e.g. a job contains tasks in the From 6feebfa0eb16d3ead63d632c2ca65be91985ae28 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:07:18 +0100 Subject: [PATCH 083/106] Update docs and naming --- .../labs/ucx/assessment/sequencing.py | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 303ecfec9e..2c4c446010 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -197,14 +197,7 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: - """Register a workflow task. - - Args: - task : jobs.Task - The task to register - parent : MigrationNode - The migration node for the parent job - """ + """Register a workflow task.""" problems: list[DependencyProblem] = [] task_id = f"{parent.object_id}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) @@ -237,12 +230,6 @@ def _register_job_cluster(self, cluster: jobs.JobCluster, parent: MigrationNode) """Register a job cluster. A job cluster is defined within a job and therefore is found when defined on the job by definition. - - Args: - cluster : jobs.JobCluster - The job cluster to register - parent : MigrationNode - The migration node of the parent job """ # Different jobs can have job clusters with the same key, therefore job id is prepended to ensure uniqueness cluster_id = f"{parent.object_id}/{cluster.job_cluster_key}" @@ -257,6 +244,7 @@ def _register_job_cluster(self, cluster: jobs.JobCluster, parent: MigrationNode) return MaybeMigrationNode(cluster_node, []) def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: + """Register a cluster.""" node_seen = self._nodes.get(("CLUSTER", cluster_id), None) if node_seen: return MaybeMigrationNode(node_seen, []) @@ -287,7 +275,8 @@ def generate_steps(self) -> Iterable[MigrationStep]: - We handle cyclic dependencies (implemented in PR #3009) """ ordered_steps: list[MigrationStep] = [] - incoming = self._invert_outgoing_to_incoming() # For updating the priority of steps that depend on other steps + # For updating the priority of steps that depend on other steps + incoming_references = self._invert_outgoing_to_incoming_references() seen = set[MigrationNode]() queue = self._create_node_queue(self._outgoing_references) node = queue.get() @@ -296,7 +285,7 @@ def generate_steps(self) -> Iterable[MigrationStep]: ordered_steps.append(step) seen.add(node) # Update the queue priority as if the migration step was completed - for dependency in incoming[node.key]: + for dependency in incoming_references[node.key]: if dependency in seen: continue priority = len(self._outgoing_references[dependency.key] - seen) @@ -304,7 +293,8 @@ def generate_steps(self) -> Iterable[MigrationStep]: node = queue.get() return ordered_steps - def _invert_outgoing_to_incoming(self) -> dict[MigrationNodeKey, set[MigrationNode]]: + def 
_invert_outgoing_to_incoming_references(self) -> dict[MigrationNodeKey, set[MigrationNode]]: + """Invert the outgoing references to incoming references.""" result: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) for node_key, outgoing_nodes in self._outgoing_references.items(): for target in outgoing_nodes: From ae059a05fb68107850ec3a878d1185216b3f483f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:09:52 +0100 Subject: [PATCH 084/106] Update tests docs --- tests/unit/assessment/test_sequencing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 5b439769e3..68d6f115c6 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -20,7 +20,7 @@ def admin_locator(ws): def test_register_job_with_existing_cluster(ws, admin_locator) -> None: - """Register an existing cluster.""" + """Register a job with a task referencing an existing cluster.""" task = jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) @@ -39,7 +39,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: - """Register a non existing cluster.""" + """Register a job with a task referencing a non-existing cluster.""" task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) @@ -63,7 +63,7 @@ def test_register_job_with_existing_job_cluster_key( mock_path_lookup, admin_locator, ) -> None: - """Register a job with existing job cluster key.""" + """Register a job with a task referencing a existing job cluster.""" job_cluster = jobs.JobCluster("existing-id", ClusterSpec()) task = jobs.Task(task_key="test-task", job_cluster_key="existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task], job_clusters=[job_cluster]) @@ -80,7 +80,7 @@ def test_register_job_with_non_existing_job_cluster_key( mock_path_lookup, admin_locator, ) -> None: - """Register a job with non-existing job cluster key.""" + """Register a job with a task referencing a non-existing job cluster.""" task = jobs.Task(task_key="test-task", job_cluster_key="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) @@ -104,7 +104,7 @@ def test_register_job_with_new_cluster( mock_path_lookup, admin_locator, ) -> None: - """Register a workflow task with a new cluster.""" + """Register a job with a task with a new cluster definition.""" task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) From 2b90951b33e13ab8e9cfbd4cf2f81c1cb73d6478 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:13:12 +0100 Subject: [PATCH 085/106] Test sequence job with task referencing job cluster --- tests/unit/assessment/test_sequencing.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 68d6f115c6..1ea0a7f453 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -176,6 +176,56 @@ def get_cluster(cluster_id: str) -> 
ClusterDetails: ] +def test_sequence_steps_from_job_task_with_existing_job_cluster_key( + ws, simple_dependency_resolver, mock_path_lookup, admin_locator +) -> None: + """Sequence a job with a task referencing an existing job cluster. + + Sequence: + 1. Job cluster + 2. Task + 3. Job + """ + job_cluster = jobs.JobCluster("existing-id", ClusterSpec()) + task = jobs.Task(task_key="test-task", job_cluster_key="existing-id") + settings = jobs.JobSettings(name="test-job", tasks=[task], job_clusters=[job_cluster]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + sequencer.register_job(job) + + steps = list(sequencer.generate_steps()) + + assert steps == [ + MigrationStep( + step_id=1, + step_number=0, + object_type="CLUSTER", + object_id="1234/existing-id", + object_name="existing-id", + object_owner="John Doe", + required_step_ids=[], + ), + MigrationStep( + step_id=2, + step_number=1, + object_type="TASK", + object_id="1234/test-task", + object_name="test-task", + object_owner="John Doe", + required_step_ids=[1], + ), + MigrationStep( + step_id=0, + step_number=2, + object_type="JOB", + object_id="1234", + object_name="test-job", + object_owner="John Doe", + required_step_ids=[1, 2], + ), + ] + + def test_sequence_steps_from_job_task_with_new_cluster( ws, simple_dependency_resolver, mock_path_lookup, admin_locator ) -> None: From d453f53046a6beecf6767e3564153dbb4232fdee Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:22:53 +0100 Subject: [PATCH 086/106] Fix test sequence in docstring --- tests/unit/assessment/test_sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 1ea0a7f453..3eeb71b6fc 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -232,8 +232,8 @@ def test_sequence_steps_from_job_task_with_new_cluster( """Sequence a job with a task that has a new cluster definition. Sequence: - 1. Cluster - 2. Task # The cluster is part of the task, not a separate step in the sequence + 1. Task # The cluster is part of the task, not a separate step in the sequence + 2. Job """ task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) settings = jobs.JobSettings(name="test-job", tasks=[task]) From 5524191fdbe2a85872f43583d45a0ad0acb6e16b Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:25:07 +0100 Subject: [PATCH 087/106] Add test when referencing non-existing cluster --- tests/unit/assessment/test_sequencing.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 3eeb71b6fc..d95842db1f 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -263,3 +263,53 @@ def test_sequence_steps_from_job_task_with_new_cluster( required_step_ids=[1], ), ] + + +def test_sequence_steps_from_job_task_with_non_existing_cluster( + ws, simple_dependency_resolver, mock_path_lookup, admin_locator +) -> None: + """Sequence a job with a task that references a non-existing cluster. + + Sequence: + 1. Cluster # TODO: Do we still expect this reference? + 2. Task + 3. 
Job + """ + ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") + task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + sequencer.register_job(job) + + steps = list(sequencer.generate_steps()) + + assert steps == [ + MigrationStep( + step_id=2, + step_number=0, + object_type="CLUSTER", + object_id="non-existing-id", + object_name="non-existing-id", + object_owner="John Doe", + required_step_ids=[], + ), + MigrationStep( + step_id=1, + step_number=0, + object_type="TASK", + object_id="1234/test-task", + object_name="test-task", + object_owner="John Doe", + required_step_ids=[2], + ), + MigrationStep( + step_id=0, + step_number=1, + object_type="JOB", + object_id="1234", + object_name="test-job", + object_owner="John Doe", + required_step_ids=[1], + ), + ] From 8d40f5292ba662261f1aa0cd210a46092b31f395 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:26:07 +0100 Subject: [PATCH 088/106] Remove unused fixtures --- tests/unit/assessment/test_sequencing.py | 35 +++++------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index d95842db1f..2333cdbd0b 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -58,11 +58,7 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: ] -def test_register_job_with_existing_job_cluster_key( - ws, - mock_path_lookup, - admin_locator, -) -> None: +def test_register_job_with_existing_job_cluster_key(ws, admin_locator) -> None: """Register a job with a task referencing a existing job cluster.""" job_cluster = jobs.JobCluster("existing-id", ClusterSpec()) task = jobs.Task(task_key="test-task", job_cluster_key="existing-id") @@ -75,11 +71,7 @@ def test_register_job_with_existing_job_cluster_key( assert not maybe_node.failed -def test_register_job_with_non_existing_job_cluster_key( - ws, - mock_path_lookup, - admin_locator, -) -> None: +def test_register_job_with_non_existing_job_cluster_key(ws, admin_locator) -> None: """Register a job with a task referencing a non-existing job cluster.""" task = jobs.Task(task_key="test-task", job_cluster_key="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -98,12 +90,7 @@ def test_register_job_with_non_existing_job_cluster_key( ] -def test_register_job_with_new_cluster( - ws, - simple_dependency_resolver, - mock_path_lookup, - admin_locator, -) -> None: +def test_register_job_with_new_cluster(ws, admin_locator) -> None: """Register a job with a task with a new cluster definition.""" task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -116,9 +103,7 @@ def test_register_job_with_new_cluster( assert not maybe_node.failed -def test_sequence_steps_from_job_task_with_existing_cluster_id( - ws, simple_dependency_resolver, mock_path_lookup, admin_locator -) -> None: +def test_sequence_steps_from_job_task_with_existing_cluster_id(ws, admin_locator) -> None: """Sequence a job with a task referencing an existing cluster. 
Sequence: @@ -176,9 +161,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ] -def test_sequence_steps_from_job_task_with_existing_job_cluster_key( - ws, simple_dependency_resolver, mock_path_lookup, admin_locator -) -> None: +def test_sequence_steps_from_job_task_with_existing_job_cluster_key(ws, admin_locator) -> None: """Sequence a job with a task referencing an existing job cluster. Sequence: @@ -226,9 +209,7 @@ def test_sequence_steps_from_job_task_with_existing_job_cluster_key( ] -def test_sequence_steps_from_job_task_with_new_cluster( - ws, simple_dependency_resolver, mock_path_lookup, admin_locator -) -> None: +def test_sequence_steps_from_job_task_with_new_cluster(ws, admin_locator) -> None: """Sequence a job with a task that has a new cluster definition. Sequence: @@ -265,9 +246,7 @@ def test_sequence_steps_from_job_task_with_new_cluster( ] -def test_sequence_steps_from_job_task_with_non_existing_cluster( - ws, simple_dependency_resolver, mock_path_lookup, admin_locator -) -> None: +def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locator) -> None: """Sequence a job with a task that references a non-existing cluster. Sequence: From 6d757bb7407628e62bedcf85d2e0c02a4d477d81 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:29:30 +0100 Subject: [PATCH 089/106] Remove redundant comment and fix docstring --- src/databricks/labs/ucx/assessment/sequencing.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 2c4c446010..7e9ff1706e 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -125,14 +125,13 @@ def put(self, priority: int, task: MigrationNode) -> None: heapq.heappush(self._entries, entry) def get(self) -> MigrationNode | None: - """Gets the tasks with lowest priority.""" + """Gets the tasks with the lowest priority.""" while self._entries: _, _, task = heapq.heappop(self._entries) if task == self._REMOVED: continue assert isinstance(task, MigrationNode) self._remove(task) - # Ignore type because heappop returns Any, while we know it is an QueueEntry return task return None From a4ace82cf8c74f57453681ab1882506c3fe676d3 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:38:41 +0100 Subject: [PATCH 090/106] Test task dependency --- tests/unit/assessment/test_sequencing.py | 51 ++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 2333cdbd0b..03a8776e1e 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -292,3 +292,54 @@ def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locato required_step_ids=[1], ), ] + + +def test_sequence_steps_from_job_task_referencing_other_task(ws, admin_locator) -> None: + """Sequence a job with a task that has a new cluster definition. + + Sequence: + 1. Task1 + 2. Task2 + 3. 
Job + """ + task1 = jobs.Task(task_key="task1") + task_dependency = jobs.TaskDependency(task1.task_key) + task2 = jobs.Task(task_key="task2", depends_on=[task_dependency]) + tasks = [task2, task1] # Reverse order on purpose to test if this is handled + settings = jobs.JobSettings(name="job", tasks=tasks) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_job_node = sequencer.register_job(job) + assert not maybe_job_node.failed + + steps = list(sequencer.generate_steps()) + assert steps == [ + MigrationStep( + step_id=1, + step_number=0, + object_type="TASK", + object_id="1234/task1", + object_name="task1", + object_owner="John Doe", + required_step_ids=[], + ), + MigrationStep( + step_id=2, + step_number=0, + object_type="TASK", + object_id="1234/task2", + object_name="task2", + object_owner="John Doe", + required_step_ids=[1], + ), + MigrationStep( + step_id=0, + step_number=1, + object_type="JOB", + object_id="1234", + object_name="job", + object_owner="John Doe", + required_step_ids=[1, 2], + ), + ] From 742694f8431e29277ec917db4707eb75f8dcd3c0 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:46:16 +0100 Subject: [PATCH 091/106] Sequence tasks with dependencies --- src/databricks/labs/ucx/assessment/sequencing.py | 11 +++++++++++ tests/unit/assessment/test_sequencing.py | 10 +++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 7e9ff1706e..271f3e6f89 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -193,6 +193,17 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: problems.extend(maybe_task_node.problems) if maybe_task_node.node: self._outgoing_references[job_node.key].add(maybe_task_node.node) + # Only after registering all tasks, we can resolve the task dependencies + for task in job.settings.tasks or []: + task_key = ("TASK", f"{job.job_id}/{task.task_key}") + for task_dependency in task.depends_on or []: + task_dependency_key = ("TASK", f"{job.job_id}/{task_dependency.task_key}") + maybe_task_dependency = self._nodes.get(task_dependency_key) + if maybe_task_dependency: + self._outgoing_references[task_key].add(maybe_task_dependency) + else: + problem = DependencyProblem('task-dependency-not-found', f"Could not find task: {task_dependency}") + problems.append(problem) return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 03a8776e1e..bd1f521f19 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -316,7 +316,7 @@ def test_sequence_steps_from_job_task_referencing_other_task(ws, admin_locator) steps = list(sequencer.generate_steps()) assert steps == [ MigrationStep( - step_id=1, + step_id=2, step_number=0, object_type="TASK", object_id="1234/task1", @@ -325,17 +325,17 @@ def test_sequence_steps_from_job_task_referencing_other_task(ws, admin_locator) required_step_ids=[], ), MigrationStep( - step_id=2, - step_number=0, + step_id=1, + step_number=1, object_type="TASK", object_id="1234/task2", object_name="task2", object_owner="John Doe", - required_step_ids=[1], + required_step_ids=[2], ), MigrationStep( step_id=0, - step_number=1, + step_number=2, 
object_type="JOB", object_id="1234", object_name="job", From f14a0f56855f62f72b8d1edae6580e453561e437 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:49:26 +0100 Subject: [PATCH 092/106] Add TODOs for register workflow task --- src/databricks/labs/ucx/assessment/sequencing.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 271f3e6f89..6a6621e3a1 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -207,7 +207,21 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: - """Register a workflow task.""" + """Register a workflow task. + + TODO: + Handle following jobs.Task attributes: + - for_each_task + - libraries + - notebook_task + - pipeline_task + - python_wheel_task + - run_job_tasl + - spark_jar_task + - spark_python_taks + - spark_submit_task + - sql_task + """ problems: list[DependencyProblem] = [] task_id = f"{parent.object_id}/{task.task_key}" task_node = self._nodes.get(("TASK", task_id), None) From d093a47cee63e8a8feba137f1dab4bf7cd60e865 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 10:57:24 +0100 Subject: [PATCH 093/106] Add todo for register cluster --- src/databricks/labs/ucx/assessment/sequencing.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 6a6621e3a1..427fa0add1 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -268,7 +268,17 @@ def _register_job_cluster(self, cluster: jobs.JobCluster, parent: MigrationNode) return MaybeMigrationNode(cluster_node, []) def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: - """Register a cluster.""" + """Register a cluster. 
+ + TODO + Handle following jobs.Task attributes: + - init_scripts + - instance_pool_id (maybe_not) + - policy_id + - spec.init_scripts + - spec.instance_pool_id (maybe not) + - spec.policy_id + """ node_seen = self._nodes.get(("CLUSTER", cluster_id), None) if node_seen: return MaybeMigrationNode(node_seen, []) From 26f060b17970a71ec841981e1088870315851451 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:19:17 +0100 Subject: [PATCH 094/106] Add migration sequencer to RuntimeContext --- src/databricks/labs/ucx/contexts/workflow_task.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/databricks/labs/ucx/contexts/workflow_task.py b/src/databricks/labs/ucx/contexts/workflow_task.py index d840a6b087..a3656b290b 100644 --- a/src/databricks/labs/ucx/contexts/workflow_task.py +++ b/src/databricks/labs/ucx/contexts/workflow_task.py @@ -17,6 +17,7 @@ from databricks.labs.ucx.assessment.init_scripts import GlobalInitScriptCrawler from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo, JobsCrawler, SubmitRunsCrawler from databricks.labs.ucx.assessment.pipelines import PipelinesCrawler, PipelineInfo, PipelineOwnership +from databricks.labs.ucx.assessment.sequencing import MigrationSequencer from databricks.labs.ucx.config import WorkspaceConfig from databricks.labs.ucx.contexts.application import GlobalContext from databricks.labs.ucx.hive_metastore import TablesInMounts, TablesCrawler @@ -240,3 +241,7 @@ def udfs_progress(self) -> ProgressEncoder[Udf]: self.workspace_id, self.config.ucx_catalog, ) + + @cached_property + def migration_sequencer(self) -> MigrationSequencer: + return MigrationSequencer(self.workspace_client, self.administrator_locator) From 418718ff49bf1279fa7b3a5462a74adca2c4d471 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:25:08 +0100 Subject: [PATCH 095/106] Add integration test for simple job --- tests/integration/assessment/test_sequencing.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tests/integration/assessment/test_sequencing.py diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py new file mode 100644 index 0000000000..ea133bd817 --- /dev/null +++ b/tests/integration/assessment/test_sequencing.py @@ -0,0 +1,10 @@ +def test_migration_sequencing_simple_job(make_job, runtime_ctx) -> None: + """Sequence a simple job""" + job = make_job() + + maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) + assert not maybe_job_node.failed + + steps = runtime_ctx.migration_sequencer.generate_steps() + step_object_types = [step.object_type for step in steps] + assert step_object_types == ["TASK", "JOB"] From c55c821ac42cdee13c7b77fb58286b5ff1bc9721 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:31:56 +0100 Subject: [PATCH 096/106] Test sequencing job with task referencing existing cluster --- .../integration/assessment/test_sequencing.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py index ea133bd817..0316837077 100644 --- a/tests/integration/assessment/test_sequencing.py +++ b/tests/integration/assessment/test_sequencing.py @@ -1,3 +1,6 @@ +from databricks.sdk.service import jobs + + def test_migration_sequencing_simple_job(make_job, runtime_ctx) -> None: """Sequence a simple job""" job = make_job() @@ -8,3 +11,26 @@ def test_migration_sequencing_simple_job(make_job, runtime_ctx) -> 
None: steps = runtime_ctx.migration_sequencer.generate_steps() step_object_types = [step.object_type for step in steps] assert step_object_types == ["TASK", "JOB"] + + +def test_migration_sequencing_job_with_task_referencing_cluster( + make_job, + make_notebook, + runtime_ctx, + env_or_skip, +) -> None: + """Sequence a job with a task""" + cluster_id = env_or_skip("TEST_DEFAULT_CLUSTER_ID") + task = jobs.Task( + task_key="test-task", + existing_cluster_id=cluster_id, + notebook_task=jobs.NotebookTask(notebook_path=str(make_notebook())), + ) + job = make_job(tasks=[task]) + + maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) + assert not maybe_job_node.failed + + steps = runtime_ctx.migration_sequencer.generate_steps() + step_object_types = [step.object_type for step in steps] + assert step_object_types == ["CLUSTER", "TASK", "JOB"] From 23bec091e2daa15a55a69645150de2393df0b1ca Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:35:07 +0100 Subject: [PATCH 097/106] Test sequencing job with task referencing non existing cluster --- tests/integration/assessment/test_sequencing.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py index 0316837077..5276522805 100644 --- a/tests/integration/assessment/test_sequencing.py +++ b/tests/integration/assessment/test_sequencing.py @@ -34,3 +34,18 @@ def test_migration_sequencing_job_with_task_referencing_cluster( steps = runtime_ctx.migration_sequencer.generate_steps() step_object_types = [step.object_type for step in steps] assert step_object_types == ["CLUSTER", "TASK", "JOB"] + + +def test_migration_sequencing_job_with_task_referencing_non_existing_cluster(runtime_ctx) -> None: + """Sequence a job with a task referencing existing cluster""" + # Cannot make an actual job referencing a non-exsting cluster + task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") + settings = jobs.JobSettings(name="test-job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + + maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) + assert maybe_job_node.failed + + steps = runtime_ctx.migration_sequencer.generate_steps() + step_object_types = [step.object_type for step in steps] + assert step_object_types == ["CLUSTER", "TASK", "JOB"] # TODO: What do we expect? 
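
The ordering the sequencer produces in the tests above falls out of a priority-queue walk over the outgoing-references map: a node surfaces once it has no unprocessed outgoing references, and each completed node lowers the priority of the nodes that reference it, so a cluster comes before the task that uses it and the task before its job; an object that fails to register never enters the map and so cannot appear as a step (the question the TODO above raises, settled in a later commit in this series). Below is a minimal, dependency-free sketch of that idea, assuming an acyclic graph; the names `sequence` and `outgoing` are illustrative and not part of the UCX code, and the real `MigrationSequencer.generate_steps` additionally tracks step ids, owners and cyclic dependencies.

from collections import defaultdict
from heapq import heappush, heappop


def sequence(outgoing: dict[str, set[str]]) -> list[str]:
    """Order nodes so that, for an acyclic graph, every node comes after the nodes it references."""
    nodes = set(outgoing) | {target for targets in outgoing.values() for target in targets}
    # Invert the outgoing references, so we know whose priority to refresh once a node is sequenced
    incoming: dict[str, set[str]] = defaultdict(set)
    for source, targets in outgoing.items():
        for target in targets:
            incoming[target].add(source)
    heap: list[tuple[int, str]] = []  # entries are (number of unprocessed outgoing references, node)
    for node in nodes:
        heappush(heap, (len(outgoing.get(node, set())), node))
    ordered: list[str] = []
    seen: set[str] = set()
    while heap:
        _, node = heappop(heap)
        if node in seen:
            continue  # an outdated entry for a node that was already sequenced
        ordered.append(node)
        seen.add(node)
        # Requeue the nodes referencing this one with their updated number of unprocessed references
        for dependent in incoming[node]:
            if dependent not in seen:
                heappush(heap, (len(outgoing.get(dependent, set()) - seen), dependent))
    return ordered


# A job referencing a task and a job cluster, where the task also references the cluster:
print(sequence({"JOB 1234": {"TASK 1234/test-task", "CLUSTER 1234/existing-id"},
                "TASK 1234/test-task": {"CLUSTER 1234/existing-id"}}))
# ['CLUSTER 1234/existing-id', 'TASK 1234/test-task', 'JOB 1234']

Using a priority queue rather than a one-shot topological sort keeps the walk incremental, which is what lets the real sequencer refresh priorities as steps complete.
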
From b2c16718d33378a7d4b12352dc0e1f189b54ad74 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:35:13 +0100 Subject: [PATCH 098/106] Format --- src/databricks/labs/ucx/assessment/sequencing.py | 4 +++- tests/integration/assessment/test_sequencing.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 427fa0add1..b059f1d563 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -202,7 +202,9 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: if maybe_task_dependency: self._outgoing_references[task_key].add(maybe_task_dependency) else: - problem = DependencyProblem('task-dependency-not-found', f"Could not find task: {task_dependency}") + problem = DependencyProblem( + 'task-dependency-not-found', f"Could not find task: {task_dependency}" + ) problems.append(problem) return MaybeMigrationNode(job_node, problems) diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py index 5276522805..b5fb626544 100644 --- a/tests/integration/assessment/test_sequencing.py +++ b/tests/integration/assessment/test_sequencing.py @@ -38,7 +38,7 @@ def test_migration_sequencing_job_with_task_referencing_cluster( def test_migration_sequencing_job_with_task_referencing_non_existing_cluster(runtime_ctx) -> None: """Sequence a job with a task referencing existing cluster""" - # Cannot make an actual job referencing a non-exsting cluster + # Cannot make an actual job referencing a non-existing cluster task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) From 0392caee31d19b881200316a6a81129084793b2c Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:53:00 +0100 Subject: [PATCH 099/106] Test sequence job with task dependency --- tests/unit/assessment/test_sequencing.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index bd1f521f19..bfe63ed667 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -103,6 +103,24 @@ def test_register_job_with_new_cluster(ws, admin_locator) -> None: assert not maybe_node.failed +def test_register_job_with_task_dependency(ws, admin_locator) -> None: + """Register a job with two tasks having a dependency.""" + task1 = jobs.Task(task_key="task1") + task_dependency = jobs.TaskDependency(task1.task_key) + task2 = jobs.Task(task_key="task2", depends_on=[task_dependency]) + tasks = [task2, task1] # Reverse order on purpose to test if this is handled + settings = jobs.JobSettings(name="job", tasks=tasks) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_job_node = sequencer.register_job(job) + maybe_node = sequencer.register_job(job) + + assert not maybe_node.failed + + assert not maybe_job_node.failed + + def test_sequence_steps_from_job_task_with_existing_cluster_id(ws, admin_locator) -> None: """Sequence a job with a task referencing an existing cluster. 
From 0137e85cb3d807312bcf49a8d98dd75e74354d96 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 11:57:07 +0100 Subject: [PATCH 100/106] Test non-existing task dependency --- .../labs/ucx/assessment/sequencing.py | 4 +++- tests/unit/assessment/test_sequencing.py | 20 +++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index b059f1d563..0d12ea214c 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -202,8 +202,10 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: if maybe_task_dependency: self._outgoing_references[task_key].add(maybe_task_dependency) else: + # Verified that a job with a task having a depends on referring a non-existing task cannot be + # created. However, this code is just in case. problem = DependencyProblem( - 'task-dependency-not-found', f"Could not find task: {task_dependency}" + 'task-dependency-not-found', f"Could not find task: {task_dependency_key[1]}" ) problems.append(problem) return MaybeMigrationNode(job_node, problems) diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index bfe63ed667..fbec57cf1e 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -113,12 +113,28 @@ def test_register_job_with_task_dependency(ws, admin_locator) -> None: job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - maybe_job_node = sequencer.register_job(job) maybe_node = sequencer.register_job(job) assert not maybe_node.failed - assert not maybe_job_node.failed + +def test_register_job_with_non_existing_task_dependency(ws, admin_locator) -> None: + """Register a job with a non-existing task dependency.""" + task_dependency = jobs.TaskDependency("non-existing-id") + task = jobs.Task(task_key="task2", depends_on=[task_dependency]) + settings = jobs.JobSettings(name="job", tasks=[task]) + job = jobs.Job(job_id=1234, settings=settings) + sequencer = MigrationSequencer(ws, admin_locator) + + maybe_node = sequencer.register_job(job) + + assert maybe_node.failed + assert maybe_node.problems == [ + DependencyProblem( + code="task-dependency-not-found", + message="Could not find task: 1234/non-existing-id", + ) + ] def test_sequence_steps_from_job_task_with_existing_cluster_id(ws, admin_locator) -> None: From 5278398b9e93253e4326fee6e7f54b2b4582fc1e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 12:00:50 +0100 Subject: [PATCH 101/106] Return early when no job.settings --- .../labs/ucx/assessment/sequencing.py | 51 ++++++++++--------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 0d12ea214c..b7cc199d11 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -183,31 +183,32 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: object_owner=JobOwnership(self._admin_locator).owner_of(JobInfo.from_job(job)), ) self._nodes[job_node.key] = job_node - if job.settings: - for job_cluster in job.settings.job_clusters or []: - maybe_cluster_node = self._register_job_cluster(job_cluster, job_node) - if maybe_cluster_node.node: - self._outgoing_references[job_node.key].add(maybe_cluster_node.node) - for task in job.settings.tasks 
or []: - maybe_task_node = self._register_workflow_task(task, job_node) - problems.extend(maybe_task_node.problems) - if maybe_task_node.node: - self._outgoing_references[job_node.key].add(maybe_task_node.node) - # Only after registering all tasks, we can resolve the task dependencies - for task in job.settings.tasks or []: - task_key = ("TASK", f"{job.job_id}/{task.task_key}") - for task_dependency in task.depends_on or []: - task_dependency_key = ("TASK", f"{job.job_id}/{task_dependency.task_key}") - maybe_task_dependency = self._nodes.get(task_dependency_key) - if maybe_task_dependency: - self._outgoing_references[task_key].add(maybe_task_dependency) - else: - # Verified that a job with a task having a depends on referring a non-existing task cannot be - # created. However, this code is just in case. - problem = DependencyProblem( - 'task-dependency-not-found', f"Could not find task: {task_dependency_key[1]}" - ) - problems.append(problem) + if not job.settings: + return MaybeMigrationNode(job_node, problems) + for job_cluster in job.settings.job_clusters or []: + maybe_cluster_node = self._register_job_cluster(job_cluster, job_node) + if maybe_cluster_node.node: + self._outgoing_references[job_node.key].add(maybe_cluster_node.node) + for task in job.settings.tasks or []: + maybe_task_node = self._register_workflow_task(task, job_node) + problems.extend(maybe_task_node.problems) + if maybe_task_node.node: + self._outgoing_references[job_node.key].add(maybe_task_node.node) + # Only after registering all tasks, we can resolve the task dependencies + for task in job.settings.tasks or []: + task_key = ("TASK", f"{job.job_id}/{task.task_key}") + for task_dependency in task.depends_on or []: + task_dependency_key = ("TASK", f"{job.job_id}/{task_dependency.task_key}") + maybe_task_dependency = self._nodes.get(task_dependency_key) + if maybe_task_dependency: + self._outgoing_references[task_key].add(maybe_task_dependency) + else: + # Verified that a job with a task having a depends on referring a non-existing task cannot be + # created. However, this code is just in case. 
+ problem = DependencyProblem( + 'task-dependency-not-found', f"Could not find task: {task_dependency_key[1]}" + ) + problems.append(problem) return MaybeMigrationNode(job_node, problems) def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: From df45c536f41803bc06d2f50aba3fbc3f0bb6962f Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Thu, 31 Oct 2024 12:02:16 +0100 Subject: [PATCH 102/106] Fix typo's --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index b7cc199d11..2f2a48d2b6 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -221,9 +221,9 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May - notebook_task - pipeline_task - python_wheel_task - - run_job_tasl + - run_job_task - spark_jar_task - - spark_python_taks + - spark_python_task - spark_submit_task - sql_task """ From 1ea02f72038e3b2c4ebc64fbe84f221ab60a3137 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 1 Nov 2024 08:31:38 +0100 Subject: [PATCH 103/106] Rename administrator locator --- src/databricks/labs/ucx/assessment/sequencing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index 2f2a48d2b6..bc9b4f1260 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -148,9 +148,9 @@ class MigrationSequencer: Analysing the graph in this case means: computing the migration sequence in `meth:generate_steps`. 
""" - def __init__(self, ws: WorkspaceClient, admin_locator: AdministratorLocator): + def __init__(self, ws: WorkspaceClient, administrator_locator: AdministratorLocator): self._ws = ws - self._admin_locator = admin_locator + self._admin_locator = administrator_locator self._counter = itertools.count() self._nodes: dict[MigrationNodeKey, MigrationNode] = {} From d26916b39b834cc238dd6530a1a8550816571ca8 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 1 Nov 2024 08:33:40 +0100 Subject: [PATCH 104/106] Do not expect cluster in sequence steps when not found --- tests/integration/assessment/test_sequencing.py | 14 +++++++++++--- tests/unit/assessment/test_sequencing.py | 16 +++------------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py index b5fb626544..5aa5fc2682 100644 --- a/tests/integration/assessment/test_sequencing.py +++ b/tests/integration/assessment/test_sequencing.py @@ -1,5 +1,7 @@ from databricks.sdk.service import jobs +from databricks.labs.ucx.source_code.graph import DependencyProblem + def test_migration_sequencing_simple_job(make_job, runtime_ctx) -> None: """Sequence a simple job""" @@ -43,9 +45,15 @@ def test_migration_sequencing_job_with_task_referencing_non_existing_cluster(run settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) - maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) - assert maybe_job_node.failed + maybe_node = runtime_ctx.migration_sequencer.register_job(job) + assert maybe_node.failed + assert maybe_node.problems == [ + DependencyProblem( + code="cluster-not-found", + message="Could not find cluster: non-existing-id", + ) + ] steps = runtime_ctx.migration_sequencer.generate_steps() step_object_types = [step.object_type for step in steps] - assert step_object_types == ["CLUSTER", "TASK", "JOB"] # TODO: What do we expect? + assert step_object_types == ["TASK", "JOB"] diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index fbec57cf1e..02d2f478aa 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -284,9 +284,8 @@ def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locato """Sequence a job with a task that references a non-existing cluster. Sequence: - 1. Cluster # TODO: Do we still expect this reference? - 2. Task - 3. Job + 1. Task + 2. 
Job """ ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") @@ -298,15 +297,6 @@ def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locato steps = list(sequencer.generate_steps()) assert steps == [ - MigrationStep( - step_id=2, - step_number=0, - object_type="CLUSTER", - object_id="non-existing-id", - object_name="non-existing-id", - object_owner="John Doe", - required_step_ids=[], - ), MigrationStep( step_id=1, step_number=0, @@ -314,7 +304,7 @@ def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locato object_id="1234/test-task", object_name="test-task", object_owner="John Doe", - required_step_ids=[2], + required_step_ids=[], ), MigrationStep( step_id=0, From ba8b4c6a15c5a31e8475d96703506001339c39a9 Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 1 Nov 2024 08:46:10 +0100 Subject: [PATCH 105/106] Refactor register_job to register_jobs --- .../labs/ucx/assessment/sequencing.py | 31 +++++++++------ .../integration/assessment/test_sequencing.py | 6 +-- tests/unit/assessment/test_sequencing.py | 38 +++++++++---------- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/src/databricks/labs/ucx/assessment/sequencing.py b/src/databricks/labs/ucx/assessment/sequencing.py index bc9b4f1260..4fa660f558 100644 --- a/src/databricks/labs/ucx/assessment/sequencing.py +++ b/src/databricks/labs/ucx/assessment/sequencing.py @@ -8,7 +8,7 @@ from databricks.sdk import WorkspaceClient from databricks.sdk.errors import DatabricksError -from databricks.sdk.service import jobs +from databricks.sdk.service.jobs import Job, JobCluster, Task from databricks.labs.ucx.assessment.clusters import ClusterOwnership, ClusterInfo from databricks.labs.ucx.assessment.jobs import JobOwnership, JobInfo @@ -156,20 +156,29 @@ def __init__(self, ws: WorkspaceClient, administrator_locator: AdministratorLoca # Outgoing references contains edges in the graph pointing from a node to a set of nodes that the node # references. These references follow the API references, e.g. a job contains tasks in the - # `jobs.Job.settings.tasks`, thus a job has an outgoing reference to each of those tasks. + # `Job.settings.tasks`, thus a job has an outgoing reference to each of those tasks. self._outgoing_references: dict[MigrationNodeKey, set[MigrationNode]] = defaultdict(set) - def register_job(self, job: jobs.Job) -> MaybeMigrationNode: + def register_jobs(self, *jobs: Job) -> list[MaybeMigrationNode]: """Register a job. Args: - job (jobs.Job) : The job to register. + jobs (Job) : The jobs to register. Returns: - MaybeMigrationNode : A maybe migration node, which has the migration node if no problems occurred during - registering. Otherwise, the maybe migration node contains the dependency problems occurring during - registering the job. + list[MaybeMigrationNode] : Each element contains a maybe migration node for each job respectively. If no + problems occurred during registering the job, the maybe migration node contains the migration node. + Otherwise, the maybe migration node contains the dependency problems occurring during registering the + job. 
""" + nodes: list[MaybeMigrationNode] = [] + for job in jobs: + node = self._register_job(job) + nodes.append(node) + return nodes + + def _register_job(self, job: Job) -> MaybeMigrationNode: + """Register a single job.""" problems: list[DependencyProblem] = [] job_node = self._nodes.get(("JOB", str(job.job_id)), None) if job_node: @@ -211,11 +220,11 @@ def register_job(self, job: jobs.Job) -> MaybeMigrationNode: problems.append(problem) return MaybeMigrationNode(job_node, problems) - def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> MaybeMigrationNode: + def _register_workflow_task(self, task: Task, parent: MigrationNode) -> MaybeMigrationNode: """Register a workflow task. TODO: - Handle following jobs.Task attributes: + Handle following Task attributes: - for_each_task - libraries - notebook_task @@ -255,7 +264,7 @@ def _register_workflow_task(self, task: jobs.Task, parent: MigrationNode) -> May problems.append(problem) return MaybeMigrationNode(task_node, problems) - def _register_job_cluster(self, cluster: jobs.JobCluster, parent: MigrationNode) -> MaybeMigrationNode: + def _register_job_cluster(self, cluster: JobCluster, parent: MigrationNode) -> MaybeMigrationNode: """Register a job cluster. A job cluster is defined within a job and therefore is found when defined on the job by definition. @@ -276,7 +285,7 @@ def _register_cluster(self, cluster_id: str) -> MaybeMigrationNode: """Register a cluster. TODO - Handle following jobs.Task attributes: + Handle following Task attributes: - init_scripts - instance_pool_id (maybe_not) - policy_id diff --git a/tests/integration/assessment/test_sequencing.py b/tests/integration/assessment/test_sequencing.py index 5aa5fc2682..3b93682dce 100644 --- a/tests/integration/assessment/test_sequencing.py +++ b/tests/integration/assessment/test_sequencing.py @@ -7,7 +7,7 @@ def test_migration_sequencing_simple_job(make_job, runtime_ctx) -> None: """Sequence a simple job""" job = make_job() - maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) + maybe_job_node = runtime_ctx.migration_sequencer.register_jobs(job)[0] assert not maybe_job_node.failed steps = runtime_ctx.migration_sequencer.generate_steps() @@ -30,7 +30,7 @@ def test_migration_sequencing_job_with_task_referencing_cluster( ) job = make_job(tasks=[task]) - maybe_job_node = runtime_ctx.migration_sequencer.register_job(job) + maybe_job_node = runtime_ctx.migration_sequencer.register_jobs(job)[0] assert not maybe_job_node.failed steps = runtime_ctx.migration_sequencer.generate_steps() @@ -45,7 +45,7 @@ def test_migration_sequencing_job_with_task_referencing_non_existing_cluster(run settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) - maybe_node = runtime_ctx.migration_sequencer.register_job(job) + maybe_node = runtime_ctx.migration_sequencer.register_jobs(job)[0] assert maybe_node.failed assert maybe_node.problems == [ DependencyProblem( diff --git a/tests/unit/assessment/test_sequencing.py b/tests/unit/assessment/test_sequencing.py index 02d2f478aa..1708c9a537 100644 --- a/tests/unit/assessment/test_sequencing.py +++ b/tests/unit/assessment/test_sequencing.py @@ -19,7 +19,7 @@ def admin_locator(ws): return AdministratorLocator(ws, finders=[lambda _ws: admin_finder]) -def test_register_job_with_existing_cluster(ws, admin_locator) -> None: +def test_register_jobs_with_existing_cluster(ws, admin_locator) -> None: """Register a job with a task referencing an existing cluster.""" task = 
jobs.Task(task_key="test-task", existing_cluster_id="cluster-123") settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -33,12 +33,12 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ws.clusters.get.side_effect = get_cluster sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert not maybe_node.failed -def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: +def test_register_jobs_with_non_existing_cluster(ws, admin_locator) -> None: """Register a job with a task referencing a non-existing cluster.""" task = jobs.Task(task_key="test-task", existing_cluster_id="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -47,7 +47,7 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: ws.clusters.get.side_effect = ResourceDoesNotExist("Unknown cluster") sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert maybe_node.failed assert maybe_node.problems == [ @@ -58,7 +58,7 @@ def test_register_job_with_non_existing_cluster(ws, admin_locator) -> None: ] -def test_register_job_with_existing_job_cluster_key(ws, admin_locator) -> None: +def test_register_jobs_with_existing_job_cluster_key(ws, admin_locator) -> None: """Register a job with a task referencing a existing job cluster.""" job_cluster = jobs.JobCluster("existing-id", ClusterSpec()) task = jobs.Task(task_key="test-task", job_cluster_key="existing-id") @@ -66,12 +66,12 @@ def test_register_job_with_existing_job_cluster_key(ws, admin_locator) -> None: job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert not maybe_node.failed -def test_register_job_with_non_existing_job_cluster_key(ws, admin_locator) -> None: +def test_register_jobs_with_non_existing_job_cluster_key(ws, admin_locator) -> None: """Register a job with a task referencing a non-existing job cluster.""" task = jobs.Task(task_key="test-task", job_cluster_key="non-existing-id") settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -79,7 +79,7 @@ def test_register_job_with_non_existing_job_cluster_key(ws, admin_locator) -> No sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert maybe_node.failed assert maybe_node.problems == [ @@ -90,7 +90,7 @@ def test_register_job_with_non_existing_job_cluster_key(ws, admin_locator) -> No ] -def test_register_job_with_new_cluster(ws, admin_locator) -> None: +def test_register_jobs_with_new_cluster(ws, admin_locator) -> None: """Register a job with a task with a new cluster definition.""" task = jobs.Task(task_key="test-task", new_cluster=ClusterSpec()) settings = jobs.JobSettings(name="test-job", tasks=[task]) @@ -98,12 +98,12 @@ def test_register_job_with_new_cluster(ws, admin_locator) -> None: ws.jobs.get.return_value = job sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert not maybe_node.failed -def test_register_job_with_task_dependency(ws, admin_locator) -> None: +def test_register_jobs_with_task_dependency(ws, admin_locator) -> None: """Register a job with two tasks having a dependency.""" task1 = jobs.Task(task_key="task1") task_dependency = 
jobs.TaskDependency(task1.task_key) @@ -113,12 +113,12 @@ def test_register_job_with_task_dependency(ws, admin_locator) -> None: job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert not maybe_node.failed -def test_register_job_with_non_existing_task_dependency(ws, admin_locator) -> None: +def test_register_jobs_with_non_existing_task_dependency(ws, admin_locator) -> None: """Register a job with a non-existing task dependency.""" task_dependency = jobs.TaskDependency("non-existing-id") task = jobs.Task(task_key="task2", depends_on=[task_dependency]) @@ -126,7 +126,7 @@ def test_register_job_with_non_existing_task_dependency(ws, admin_locator) -> No job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - maybe_node = sequencer.register_job(job) + maybe_node = sequencer.register_jobs(job)[0] assert maybe_node.failed assert maybe_node.problems == [ @@ -160,7 +160,7 @@ def get_cluster(cluster_id: str) -> ClusterDetails: ws.clusters.get.side_effect = get_cluster sequencer = MigrationSequencer(ws, admin_locator) - sequencer.register_job(job) + sequencer.register_jobs(job) steps = list(sequencer.generate_steps()) @@ -208,7 +208,7 @@ def test_sequence_steps_from_job_task_with_existing_job_cluster_key(ws, admin_lo settings = jobs.JobSettings(name="test-job", tasks=[task], job_clusters=[job_cluster]) job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - sequencer.register_job(job) + sequencer.register_jobs(job) steps = list(sequencer.generate_steps()) @@ -254,7 +254,7 @@ def test_sequence_steps_from_job_task_with_new_cluster(ws, admin_locator) -> Non settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - sequencer.register_job(job) + sequencer.register_jobs(job) steps = list(sequencer.generate_steps()) @@ -292,7 +292,7 @@ def test_sequence_steps_from_job_task_with_non_existing_cluster(ws, admin_locato settings = jobs.JobSettings(name="test-job", tasks=[task]) job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - sequencer.register_job(job) + sequencer.register_jobs(job) steps = list(sequencer.generate_steps()) @@ -334,7 +334,7 @@ def test_sequence_steps_from_job_task_referencing_other_task(ws, admin_locator) job = jobs.Job(job_id=1234, settings=settings) sequencer = MigrationSequencer(ws, admin_locator) - maybe_job_node = sequencer.register_job(job) + maybe_job_node = sequencer.register_jobs(job)[0] assert not maybe_job_node.failed steps = list(sequencer.generate_steps()) From 9842f64c8b97c3bfcf0e6b3086424a7e227cd26e Mon Sep 17 00:00:00 2001 From: Cor Zuurmond Date: Fri, 1 Nov 2024 08:53:18 +0100 Subject: [PATCH 106/106] Remove unused import --- tests/unit/source_code/test_directfs_access.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 5b79914502..2b381df95f 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -11,7 +11,6 @@ DirectFsAccess, DirectFsAccessOwnership, ) -from databricks.labs.ucx.source_code.python.python_ast import DfsaPyCollector def test_crawler_appends_dfsas() -> None:
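
The queue behind that walk (its `get` is touched in PATCH 089 above) follows the standard heapq lazy-removal recipe: entries are never deleted from the heap in place; a superseded entry is marked with a sentinel and skipped when it eventually surfaces, which is what makes re-prioritising a node cheap. A small, generic sketch of the pattern; the class below is illustrative, not the UCX implementation.

from __future__ import annotations

import heapq
import itertools
from typing import Any


class PriorityQueue:
    """Priority queue with lazy removal, so an item's priority can be lowered after insertion."""

    _REMOVED = object()  # sentinel replacing the item of an entry that was superseded

    def __init__(self) -> None:
        self._entries: list[list[Any]] = []  # heap of [priority, insertion count, item]
        self._entry_finder: dict[Any, list[Any]] = {}
        self._counter = itertools.count()  # breaks ties between entries with equal priority

    def put(self, priority: int, item: Any) -> None:
        if item in self._entry_finder:
            self._remove(item)  # mark the old entry instead of re-heapifying
        entry = [priority, next(self._counter), item]
        self._entry_finder[item] = entry
        heapq.heappush(self._entries, entry)

    def get(self) -> Any | None:
        while self._entries:
            _, _, item = heapq.heappop(self._entries)
            if item is self._REMOVED:
                continue  # skip entries superseded by a later put()
            del self._entry_finder[item]
            return item
        return None

    def _remove(self, item: Any) -> None:
        entry = self._entry_finder.pop(item)
        entry[2] = self._REMOVED


queue = PriorityQueue()
queue.put(3, "job")
queue.put(2, "task")
queue.put(1, "task")  # lower the task's priority; the old (2, "task") entry is lazily discarded
print(queue.get(), queue.get(), queue.get())  # task job None

In the sequencer this is what allows `generate_steps` to lower a node's priority each time one of the steps it depends on completes, without rebuilding the heap.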