clusterlin: use bounded BFS exploration (optimization)

sipa · sipa · commit 991ff9a9a4f2 · 2024-07-25T10:16:40.000-04:00
Switch to BFS exploration of the search tree in SearchCandidateFinder
instead of DFS exploration. This appears to behave better for real
world clusters.

As BFS has the downside of needing far larger search queues, switch
back to DFS temporarily when the queue grows too large.
diff --git a/src/cluster_linearize.h b/src/cluster_linearize.h
@@ -13,6 +13,7 @@
 #include <utility>
 
 #include <util/feefrac.h>
+#include <util/vecdeque.h>
 
 namespace cluster_linearize {
 
@@ -415,7 +416,8 @@ class SearchCandidateFinder
         };
 
         /** The queue of work items. */
-        std::vector<WorkItem> queue;
+        VecDeque<WorkItem> queue;
+        queue.reserve(std::max<size_t>(256, 2 * m_todo.Count()));
 
         // Create an initial entry with m_todo as undecided. Also use it as best if not provided,
         // so that during the work processing loop below, and during the add_fn/split_fn calls, we
@@ -445,7 +447,10 @@ class SearchCandidateFinder
             // Make sure there are undecided transactions left to split on.
             if (und.None()) return;
 
-            // Actually construct a new work item on the queue.
+            // Actually construct a new work item on the queue. Due to the switch to DFS when queue
+            // space runs out (see below), we know that no reallocation of the queue should ever
+            // occur.
+            Assume(queue.size() < queue.capacity());
             queue.emplace_back(std::move(inc), std::move(und));
         };
 
@@ -479,10 +484,33 @@ class SearchCandidateFinder
         };
 
         // Work processing loop.
+        //
+        // New work items are always added at the back of the queue, but items to process use a
+        // hybrid approach where they can be taken from the front or the back.
+        //
+        // Depth-first search (DFS) corresponds to always taking from the back of the queue. This
+        // is very memory-efficient (linear in the number of transactions). Breadth-first search
+        // (BFS) corresponds to always taking from the front, which potentially uses more memory
+        // (up to exponential in the transaction count), but seems to work better in practice.
+        //
+        // The approach here combines the two: use BFS until the queue grows too large, at which
+        // point we temporarily switch to DFS until the size shrinks again.
         while (!queue.empty()) {
+            // Processing the first queue item, and then using DFS for everything it gives rise to,
+            // may increase the queue size by the number of undecided elements in there, minus 1
+            // for the first queue item being removed. Thus, only when that pushes the queue over
+            // its capacity can we not process from the front (BFS), and should we use DFS.
+            while (queue.size() - 1 + queue.front().und.Count() > queue.capacity()) {
+                if (!iterations_left) break;
+                auto elem = queue.back();
+                queue.pop_back();
+                split_fn(std::move(elem));
+            }
+
+            // Process one entry from the front of the queue (BFS exploration)
             if (!iterations_left) break;
-            auto elem = queue.back();
-            queue.pop_back();
+            auto elem = queue.front();
+            queue.pop_front();
             split_fn(std::move(elem));
         }