clusterlin: add Linearize function

sipa · sipa · commit 46aad9b09986 · 2024-07-25T10:16:37.000-04:00
This adds a first version of the overall linearization interface, which given
a DepGraph constructs a good linearization, by incrementally including good
candidate sets (found using AncestorCandidateFinder and SearchCandidateFinder).
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
@@ -167,6 +167,22 @@ class DepGraph
         for (auto pos : elems) ret += entries[pos].feerate;
         return ret;
     }
+
+    /** Append the entries of select to list in a topologically valid order.
+     *
+     * Complexity: O(select.Count() * log(select.Count())).
+     */
+    void AppendTopo(std::vector<ClusterIndex>& list, const SetType& select) const noexcept
+    {
+        ClusterIndex old_len = list.size();
+        for (auto i : select) list.push_back(i);
+        std::sort(list.begin() + old_len, list.end(), [&](ClusterIndex a, ClusterIndex b) noexcept {
+            const auto a_anc_count = entries[a].ancestors.Count();
+            const auto b_anc_count = entries[b].ancestors.Count();
+            if (a_anc_count != b_anc_count) return a_anc_count < b_anc_count;
+            return a < b;
+        });
+    }
 };
 
 /** A set of transactions together with their aggregate feerate. */
@@ -486,6 +502,57 @@ class SearchCandidateFinder
     }
 };
 
+/** Find a linearization for a cluster.
+ *
+ * @param[in] depgraph            Dependency graph of the cluster to be linearized.
+ * @param[in] max_iterations      Upper bound on the number of optimization steps that will be done.
+ * @return                        A pair of:
+ *                                - The resulting linearization.
+ *                                - A boolean indicating whether the result is guaranteed to be
+ *                                  optimal.
+ *
+ * Complexity: O(N * min(max_iterations + N, 2^N)) where N=depgraph.TxCount().
+ */
+template<typename SetType>
+std::pair<std::vector<ClusterIndex>, bool> Linearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations) noexcept
+{
+    if (depgraph.TxCount() == 0) return {{}, true};
+
+    uint64_t iterations_left = max_iterations;
+    std::vector<ClusterIndex> linearization;
+
+    AncestorCandidateFinder anc_finder(depgraph);
+    SearchCandidateFinder src_finder(depgraph);
+    linearization.reserve(depgraph.TxCount());
+    bool optimal = true;
+
+    while (true) {
+        // Initialize best as the best remaining ancestor set.
+        auto best = anc_finder.FindCandidateSet();
+
+        // Invoke bounded search to update best, with up to half of our remaining iterations as
+        // limit.
+        uint64_t max_iterations_now = (iterations_left + 1) / 2;
+        uint64_t iterations_done_now = 0;
+        std::tie(best, iterations_done_now) = src_finder.FindCandidateSet(max_iterations_now, best);
+        iterations_left -= iterations_done_now;
+
+        if (iterations_done_now == max_iterations_now) {
+            optimal = false;
+        }
+
+        // Add to output in topological order.
+        depgraph.AppendTopo(linearization, best.transactions);
+
+        // Update state to reflect best is no longer to be linearized.
+        anc_finder.MarkDone(best.transactions);
+        if (anc_finder.AllDone()) break;
+        src_finder.MarkDone(best.transactions);
+    }
+
+    return {std::move(linearization), optimal};
+}
+
 } // namespace cluster_linearize
 
 #endif // BITCOIN_CLUSTER_LINEARIZE_H
diff --git a/src/test/fuzz/cluster_linearize.cpp b/src/test/fuzz/cluster_linearize.cpp
@@ -11,6 +11,7 @@
 #include <util/bitset.h>
 #include <util/feefrac.h>
 
+#include <algorithm>
 #include <stdint.h>
 #include <vector>
 #include <utility>
@@ -140,6 +141,29 @@ class ExhaustiveCandidateFinder
     }
 };
 
+/** A simple linearization algorithm.
+ *
+ * This matches Linearize() in interface and behavior, though with fewer optimizations, and using
+ * just SimpleCandidateFinder rather than AncestorCandidateFinder and SearchCandidateFinder.
+ */
+template<typename SetType>
+std::pair<std::vector<ClusterIndex>, bool> SimpleLinearize(const DepGraph<SetType>& depgraph, uint64_t max_iterations)
+{
+    std::vector<ClusterIndex> linearization;
+    SimpleCandidateFinder finder(depgraph);
+    SetType todo = SetType::Fill(depgraph.TxCount());
+    bool optimal = true;
+    while (todo.Any()) {
+        auto [candidate, iterations_done] = finder.FindCandidateSet(max_iterations);
+        if (iterations_done == max_iterations) optimal = false;
+        depgraph.AppendTopo(linearization, candidate.transactions);
+        todo -= candidate.transactions;
+        finder.MarkDone(candidate.transactions);
+        max_iterations -= iterations_done;
+    }
+    return {std::move(linearization), optimal};
+}
+
 /** Given a dependency graph, and a todo set, read a topological subset of todo from reader. */
 template<typename SetType>
 SetType ReadTopologicalSet(const DepGraph<SetType>& depgraph, const SetType& todo, SpanReader& reader)
@@ -458,3 +482,68 @@ FUZZ_TARGET(clusterlin_search_finder)
     assert(exh_finder.AllDone());
     assert(anc_finder.AllDone());
 }
+
+FUZZ_TARGET(clusterlin_linearize)
+{
+    // Verify the behavior of Linearize().
+
+    // Retrieve an iteration count, and a depgraph from the fuzz input.
+    SpanReader reader(buffer);
+    DepGraph<TestBitSet> depgraph;
+    uint64_t iter_count{0};
+    try {
+        reader >> VARINT(iter_count) >> Using<DepGraphFormatter>(depgraph);
+    } catch (const std::ios_base::failure&) {}
+
+    // Invoke Linearize().
+    iter_count &= 0x7ffff;
+    auto [linearization, optimal] = Linearize(depgraph, iter_count);
+    SanityCheck(depgraph, linearization);
+    auto chunking = ChunkLinearization(depgraph, linearization);
+
+    // If the iteration count is sufficiently high, an optimal linearization must be found.
+    // Each linearization step can use up to 2^k iterations, with steps k=1..n. That sum is
+    // 2 * (2^n - 1)
+    const uint64_t n = depgraph.TxCount();
+    if (n <= 18 && iter_count > 2U * ((uint64_t{1} << n) - 1U)) {
+        assert(optimal);
+    }
+
+    // If Linearize claims optimal result, run quality tests.
+    if (optimal) {
+        // It must be as good as SimpleLinearize.
+        auto [simple_linearization, simple_optimal] = SimpleLinearize(depgraph, MAX_SIMPLE_ITERATIONS);
+        SanityCheck(depgraph, simple_linearization);
+        auto simple_chunking = ChunkLinearization(depgraph, simple_linearization);
+        auto cmp = CompareChunks(chunking, simple_chunking);
+        assert(cmp >= 0);
+        // If SimpleLinearize finds the optimal result too, they must be equal (if not,
+        // SimpleLinearize is broken).
+        if (simple_optimal) assert(cmp == 0);
+
+        // Only for very small clusters, test every topologically-valid permutation.
+        if (depgraph.TxCount() <= 7) {
+            std::vector<ClusterIndex> perm_linearization(depgraph.TxCount());
+            for (ClusterIndex i = 0; i < depgraph.TxCount(); ++i) perm_linearization[i] = i;
+            // Iterate over all valid permutations.
+            do {
+                // Determine whether perm_linearization is topological.
+                TestBitSet perm_done;
+                bool perm_is_topo{true};
+                for (auto i : perm_linearization) {
+                    perm_done.Set(i);
+                    if (!depgraph.Ancestors(i).IsSubsetOf(perm_done)) {
+                        perm_is_topo = false;
+                        break;
+                    }
+                }
+                // If so, verify that the obtained linearization is as good as the permutation.
+                if (perm_is_topo) {
+                    auto perm_chunking = ChunkLinearization(depgraph, perm_linearization);
+                    auto cmp = CompareChunks(chunking, perm_chunking);
+                    assert(cmp >= 0);
+                }
+            } while(std::next_permutation(perm_linearization.begin(), perm_linearization.end()));
+        }
+    }
+}
diff --git a/src/test/util/cluster_linearize.h b/src/test/util/cluster_linearize.h
@@ -7,6 +7,7 @@
 
 #include <cluster_linearize.h>
 #include <serialize.h>
+#include <span.h>
 #include <streams.h>
 #include <util/bitset.h>
 #include <util/feefrac.h>
@@ -331,6 +332,22 @@ void VerifyDepGraphFromCluster(const Cluster<SetType>& cluster, const DepGraph<S
     }
 }
 
+/** Perform a sanity check on a linearization. */
+template<typename SetType>
+void SanityCheck(const DepGraph<SetType>& depgraph, Span<const ClusterIndex> linearization)
+{
+    // Check completeness.
+    assert(linearization.size() == depgraph.TxCount());
+    TestBitSet done;
+    for (auto i : linearization) {
+        // Check transaction position is in range.
+        assert(i < depgraph.TxCount());
+        // Check topology and lack of duplicates.
+        assert((depgraph.Ancestors(i) - done) == TestBitSet::Singleton(i));
+        done.Set(i);
+    }
+}
+
 } // namespace
 
 #endif // BITCOIN_TEST_UTIL_CLUSTER_LINEARIZE_H