Skip to content

Commit eda74f2

Browse files
ericvergnaudJCZuurmond
authored andcommitted
basic support of cyclic dependencies
1 parent ad1f7cf commit eda74f2

File tree

2 files changed

+65
-9
lines changed

2 files changed

+65
-9
lines changed

src/databricks/labs/ucx/sequencing/sequencing.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -181,20 +181,31 @@ def register_cluster(self, cluster_id: str) -> MigrationNode:
181181
return cluster_node
182182

183183
def generate_steps(self) -> Iterable[MigrationStep]:
184-
# algo adapted from Kahn topological sort. The main differences is that
185-
# we want the same step number for all nodes with same dependency depth
186-
# so instead of pushing to a queue, we rebuild it once all leaf nodes are processed
187-
# (these are transient leaf nodes i.e. they only become leaf during processing)
184+
""" The below algo is adapted from Kahn's topological sort.
185+
The main differences are as follows:
186+
1) we want the same step number for all nodes with same dependency depth
187+
so instead of pushing 'leaf' nodes to a queue, we fetch them again once all current 'leaf' nodes are processed
188+
(these are transient 'leaf' nodes i.e. they only become 'leaf' during processing)
189+
2) Kahn only supports DAGs but python code allows cyclic dependencies i.e. A -> B -> C -> A is not a DAG
190+
so when fetching 'leaf' nodes, we relax the 0-incoming-vertex rule in order
191+
to avoid an infinite loop. We also avoid side effects (such as negative counts).
192+
This algo works correctly for simple cases, but is not tested on large trees.
193+
"""
188194
incoming_counts = self._populate_incoming_counts()
189195
step_number = 1
190196
sorted_steps: list[MigrationStep] = []
191197
while len(incoming_counts) > 0:
192-
leaf_keys = list(self._get_leaf_keys(incoming_counts))
198+
leaf_keys = self._get_leaf_keys(incoming_counts)
193199
for leaf_key in leaf_keys:
194200
del incoming_counts[leaf_key]
195201
sorted_steps.append(self._nodes[leaf_key].as_step(step_number, list(self._required_step_ids(leaf_key))))
196202
for dependency_key in self._outgoing[leaf_key]:
197-
incoming_counts[dependency_key] -= 1
203+
# prevent re-instantiation of already deleted keys
204+
if dependency_key not in incoming_counts:
205+
continue
206+
# prevent negative count with cyclic dependencies
207+
if incoming_counts[dependency_key] > 0:
208+
incoming_counts[dependency_key] -= 1
198209
step_number += 1
199210
return sorted_steps
200211

@@ -208,9 +219,20 @@ def _populate_incoming_counts(self) -> dict[tuple[str, str], int]:
208219
result[node_key] = len(self._incoming[node_key])
209220
return result
210221

211-
@staticmethod
212-
def _get_leaf_keys(incoming_counts: dict[tuple[str, str], int]) -> Iterable[tuple[str, str]]:
222+
@classmethod
223+
def _get_leaf_keys(cls, incoming_counts: dict[tuple[str, str], int]) -> Iterable[tuple[str, str]]:
224+
count = 0
225+
leaf_keys = list(cls._yield_leaf_keys(incoming_counts, count))
226+
# if we're not finding nodes with 0 incoming counts, it's likely caused by cyclic dependencies
227+
# in which case it's safe to process nodes with a higher incoming count
228+
while not leaf_keys:
229+
count += 1
230+
leaf_keys = list(cls._yield_leaf_keys(incoming_counts, count))
231+
return leaf_keys
232+
233+
@classmethod
234+
def _yield_leaf_keys(cls, incoming_counts: dict[tuple[str, str], int], level: int) -> Iterable[tuple[str, str]]:
213235
for node_key, incoming_count in incoming_counts.items():
214-
if incoming_count > 0:
236+
if incoming_count > level:
215237
continue
216238
yield node_key

tests/unit/sequencing/test_sequencing.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,3 +79,37 @@ def test_sequencer_builds_steps_from_dependency_graph(ws, simple_dependency_reso
7979
assert step3
8080
assert step3.step_number < step2.step_number
8181

82+
83+
class _DependencyGraph(DependencyGraph):
84+
85+
def add_dependency(self, graph: DependencyGraph):
86+
self._dependencies[graph.dependency] = graph
87+
88+
89+
class _MigrationSequencer(MigrationSequencer):
90+
91+
def visit_graph(self, graph: DependencyGraph):
92+
graph.visit(self._visit_dependency, None)
93+
94+
95+
def test_sequencer_supports_cyclic_dependencies(ws, simple_dependency_resolver, mock_path_lookup):
96+
root = Dependency(FileLoader(), Path("root.py"))
97+
root_graph = _DependencyGraph(root, None, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
98+
child_a = Dependency(FileLoader(), Path("a.py"))
99+
child_graph_a = _DependencyGraph(child_a, root_graph, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
100+
child_b = Dependency(FileLoader(), Path("b.py"))
101+
child_graph_b = _DependencyGraph(child_b, root_graph, simple_dependency_resolver, mock_path_lookup, CurrentSessionState())
102+
# root imports a and b
103+
root_graph.add_dependency(child_graph_a)
104+
root_graph.add_dependency(child_graph_b)
105+
# a imports b
106+
child_graph_a.add_dependency(child_graph_b)
107+
# b imports a (using local import)
108+
child_graph_b.add_dependency(child_graph_a)
109+
sequencer = _MigrationSequencer(ws, mock_path_lookup, admin_locator(ws, "John Doe"))
110+
sequencer.register_dependency(None, root.lineage[-1].object_type, root.lineage[-1].object_id)
111+
sequencer.visit_graph(root_graph)
112+
steps = list(sequencer.generate_steps())
113+
assert len(steps) == 3
114+
assert steps[2].object_id == "root.py"
115+

0 commit comments

Comments
 (0)