
Commit ae1c697

Merge branch 'migration-sequencing-phase-1' into migration-sequencing-phase-2
# Conflicts:
#   src/databricks/labs/ucx/assessment/sequencing.py
#   tests/unit/assessment/test_sequencing.py

2 parents: 5eacacc + da0330d

File tree: 4 files changed, +33 -22 lines

src/databricks/labs/ucx/sequencing/sequencing.py renamed to src/databricks/labs/ucx/assessment/sequencing.py

Lines changed: 20 additions & 14 deletions
@@ -62,7 +62,6 @@ def __init__(self, ws: WorkspaceClient, path_lookup: PathLookup, admin_locator:
         self._admin_locator = admin_locator
         self._last_node_id = 0
         self._nodes: dict[tuple[str, str], MigrationNode] = {}
-        self._incoming: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set)
         self._outgoing: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set)
 
     def register_workflow_task(self, task: jobs.Task, job: jobs.Job, graph: DependencyGraph) -> MigrationNode:
@@ -80,15 +79,12 @@ def register_workflow_task(self, task: jobs.Task, job: jobs.Job, graph: Dependen
             object_owner=job_node.object_owner,  # no task owner so use job one
         )
         self._nodes[task_node.key] = task_node
-        self._incoming[job_node.key].add(task_node.key)
         self._outgoing[task_node.key].add(job_node.key)
         if task.existing_cluster_id:
             cluster_node = self.register_cluster(task.existing_cluster_id)
             if cluster_node:
-                self._incoming[cluster_node.key].add(task_node.key)
                 self._outgoing[task_node.key].add(cluster_node.key)
                 # also make the cluster dependent on the job
-                self._incoming[cluster_node.key].add(job_node.key)
                 self._outgoing[job_node.key].add(cluster_node.key)
         graph.visit(self._visit_dependency, None)
         return task_node
@@ -107,7 +103,6 @@ def register_dependency(self, parent_node: MigrationNode, object_type: str, obje
         if not dependency_node:
             dependency_node = self._create_dependency_node(object_type, object_id)
         if parent_node:
-            self._incoming[parent_node.key].add(dependency_node.key)
             self._outgoing[dependency_node.key].add(parent_node.key)
         return dependency_node
 
@@ -153,7 +148,6 @@ def register_workflow_job(self, job: jobs.Job) -> MigrationNode:
         for job_cluster in job.settings.job_clusters:
             cluster_node = self.register_job_cluster(job_cluster)
             if cluster_node:
-                self._incoming[cluster_node.key].add(job_node.key)
                 self._outgoing[job_node.key].add(cluster_node.key)
         return job_node
 
@@ -182,7 +176,7 @@ def register_cluster(self, cluster_id: str) -> MigrationNode:
 
     def generate_steps(self) -> Iterable[MigrationStep]:
         """The below algo is adapted from Kahn's topological sort.
-        The main differences are as follows:
+        The differences are as follows:
         1) we want the same step number for all nodes with same dependency depth
            so instead of pushing 'leaf' nodes to a queue, we fetch them again once all current 'leaf' nodes are processed
            (these are transient 'leaf' nodes i.e. they only become 'leaf' during processing)
@@ -191,14 +185,17 @@ def generate_steps(self) -> Iterable[MigrationStep]:
            to avoid an infinite loop. We also avoid side effects (such as negative counts).
         This algo works correctly for simple cases, but is not tested on large trees.
         """
-        incoming_counts = self._populate_incoming_counts()
+        incoming_keys = self._collect_incoming_keys()
+        incoming_counts = self._compute_incoming_counts(incoming_keys)
         step_number = 1
         sorted_steps: list[MigrationStep] = []
         while len(incoming_counts) > 0:
             leaf_keys = self._get_leaf_keys(incoming_counts)
             for leaf_key in leaf_keys:
                 del incoming_counts[leaf_key]
-                sorted_steps.append(self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(leaf_key))))
+                sorted_steps.append(
+                    self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(incoming_keys[leaf_key])))
+                )
                 self._on_leaf_key_processed(leaf_key, incoming_counts)
             step_number += 1
         return sorted_steps
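
The docstring above describes a layered variant of Kahn's topological sort: instead of pushing newly freed nodes onto a queue, each pass drains every node that currently has no unprocessed prerequisites, and all nodes drained in the same pass share one step number. Below is a minimal, self-contained sketch of that idea, using plain string keys and a hypothetical layered_topological_steps helper rather than the sequencer's MigrationNode machinery.

def layered_topological_steps(outgoing: dict[str, set[str]]) -> dict[str, int]:
    """Sketch only: give the same step number to every node at the same dependency depth.

    `outgoing[a]` lists the nodes that may only be processed after `a`.
    """
    nodes = set(outgoing) | {t for targets in outgoing.values() for t in targets}
    incoming_counts = {node: 0 for node in nodes}
    for targets in outgoing.values():
        for target in targets:
            incoming_counts[target] += 1
    steps: dict[str, int] = {}
    step_number = 1
    while incoming_counts:
        # every node that is currently a 'leaf' (no unprocessed prerequisites) shares this step number
        leaf_nodes = [node for node, count in incoming_counts.items() if count == 0]
        if not leaf_nodes:
            break  # remaining nodes form a cycle; stop rather than loop forever
        for leaf in leaf_nodes:
            steps[leaf] = step_number
            del incoming_counts[leaf]
            for target in outgoing.get(leaf, set()):
                if incoming_counts.get(target, 0) > 0:  # avoid negative counts, as in the docstring
                    incoming_counts[target] -= 1
        step_number += 1
    return steps

For example, layered_topological_steps({"task": {"job", "cluster"}, "job": {"cluster"}}) returns {"task": 1, "job": 2, "cluster": 3}, consistent with the step_number of 3 that the unit test further down expects for the cluster step.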
@@ -212,14 +209,23 @@ def _on_leaf_key_processed(self, leaf_key: tuple[str, str], incoming_counts: dic
             if incoming_counts[dependency_key] > 0:
                 incoming_counts[dependency_key] -= 1
 
-    def _required_step_ids(self, node_key: tuple[str, str]) -> Iterable[int]:
-        for leaf_key in self._incoming[node_key]:
-            yield self._nodes[leaf_key].node_id
+    def _collect_incoming_keys(self) -> dict[tuple[str, str], set[tuple[str, str]]]:
+        result: dict[tuple[str, str], set[tuple[str, str]]] = defaultdict(set)
+        for source, outgoing in self._outgoing.items():
+            for target in outgoing:
+                result[target].add(source)
+        return result
+
+    def _required_step_ids(self, required_step_keys: set[tuple[str, str]]) -> Iterable[int]:
+        for source_key in required_step_keys:
+            yield self._nodes[source_key].node_id
 
-    def _populate_incoming_counts(self) -> dict[tuple[str, str], int]:
+    def _compute_incoming_counts(
+        self, incoming: dict[tuple[str, str], set[tuple[str, str]]]
+    ) -> dict[tuple[str, str], int]:
         result = defaultdict(int)
         for node_key in self._nodes:
-            result[node_key] = len(self._incoming[node_key])
+            result[node_key] = len(incoming[node_key])
         return result
 
     @classmethod
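
This hunk is the core of the refactoring: the separately maintained _incoming map is dropped, and generate_steps now derives incoming edges on demand by inverting _outgoing (_collect_incoming_keys) and counting them (_compute_incoming_counts). A small standalone sketch of that inversion, with plain string keys standing in for the sequencer's (object_type, object_id) tuples:

from collections import defaultdict


def invert_edges(outgoing: dict[str, set[str]]) -> dict[str, set[str]]:
    # incoming[target] collects every source that has an edge pointing at target
    incoming: dict[str, set[str]] = defaultdict(set)
    for source, targets in outgoing.items():
        for target in targets:
            incoming[target].add(source)
    return incoming


outgoing = {"task": {"job", "cluster"}, "job": {"cluster"}}
incoming = invert_edges(outgoing)
assert incoming["cluster"] == {"task", "job"}
assert incoming["job"] == {"task"}
assert incoming["task"] == set()  # defaultdict: a node with no incoming edges yields an empty set

Keeping a single edge direction as the source of truth removes the risk of the two maps drifting apart during registration, at the cost of one O(edges) pass each time steps are generated.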

src/databricks/labs/ucx/sequencing/__init__.py

Whitespace-only changes.

tests/unit/sequencing/test_sequencing.py renamed to tests/unit/assessment/test_sequencing.py

Lines changed: 13 additions & 8 deletions
@@ -1,3 +1,4 @@
+import dataclasses
 from unittest.mock import create_autospec
 
 from pathlib import Path
@@ -7,9 +8,9 @@
 from databricks.sdk.service.compute import ClusterDetails
 from databricks.sdk.service.jobs import NotebookTask
 
+from databricks.labs.ucx.assessment.sequencing import MigrationSequencer, MigrationStep
 from databricks.labs.ucx.framework.owners import AdministratorLocator, AdministratorFinder
 from databricks.labs.ucx.mixins.cached_workspace_path import WorkspaceCache
-from databricks.labs.ucx.sequencing.sequencing import MigrationSequencer
 from databricks.labs.ucx.source_code.base import CurrentSessionState
 from databricks.labs.ucx.source_code.graph import DependencyGraph, Dependency
 from databricks.labs.ucx.source_code.jobs import WorkflowTask
@@ -35,13 +36,17 @@ def test_sequencer_builds_cluster_and_children_from_task(ws, simple_dependency_r
     sequencer.register_workflow_task(task, job, graph)
     steps = list(sequencer.generate_steps())
     step = steps[-1]
-    assert step.step_id
-    assert step.object_type == "CLUSTER"
-    assert step.object_id == "cluster-123"
-    assert step.object_name == "my-cluster"
-    assert step.object_owner == "John Doe"
-    assert step.step_number == 3
-    assert len(step.required_step_ids) == 2
+    # we don't know the ids of the steps, so let's zero them
+    step = dataclasses.replace(step, step_id=0, required_step_ids=[0] * len(step.required_step_ids))
+    assert step == MigrationStep(
+        step_id=0,
+        step_number=3,
+        object_type="CLUSTER",
+        object_id="cluster-123",
+        object_name="my-cluster",
+        object_owner="John Doe",
+        required_step_ids=[0, 0],
+    )
 
 
 def test_sequencer_builds_steps_from_dependency_graph(ws, simple_dependency_resolver, mock_path_lookup):
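
The rewritten test compares one complete MigrationStep instead of seven separate asserts; because the generated ids are not predictable, dataclasses.replace zeroes them out first so the equality check stays deterministic. Here is the same pattern in isolation, using a made-up Record dataclass purely for illustration:

import dataclasses


@dataclasses.dataclass
class Record:
    record_id: int  # assigned from a counter at runtime, so its value is not predictable
    name: str
    required_ids: list[int]


actual = Record(record_id=37, name="my-cluster", required_ids=[12, 25])
# zero out the nondeterministic fields, then compare the whole object in a single assert
normalized = dataclasses.replace(actual, record_id=0, required_ids=[0] * len(actual.required_ids))
assert normalized == Record(record_id=0, name="my-cluster", required_ids=[0, 0])

One advantage of comparing whole dataclasses under pytest is that a failing comparison reports every differing field at once rather than stopping at the first failed assert.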

tests/unit/sequencing/__init__.py

Whitespace-only changes.
