Commit e8f8ace
Auto merge of #569 - Mark-Simulacrum:avoid-parking, r=pietroalbini
Adjust some of the code around the worker deadlock.

This switches to a Condvar associated with the graph lock to maintain the blocked worker pool. In and of itself this is just a simplification, but it eases fixes for these two cases:

* mark_as_failed is called with a try!/? operator, which means that even if progress was made on some parts of a task, we may never reach the unparking code. The notification is now moved up to just after the graph lock is re-acquired; this ensures that, regardless of what happens, other threads get a chance to run.
* Finished did not unpark any blocked threads.

In practice, I suspect the second of these is the cause of our bug. The following is an excerpt of the log before worker-7 stalls out in thread park (in the original version of this code). worker-7 blocks on the root node; the other workers all reach the root as well and exit via Finished without waking worker-7. With the new code, worker-7 would be woken each time another worker finishes, letting it also notice that the root is finished and exit.

```
worker-7 | NodeIndex(40): this is blocked
worker-7 | NodeIndex(0): this is blocked
marking node running: cleanup of crate kivo360/rusty_web_app as complete
worker-4 | NodeIndex(0): walked to node root
worker-4 | NodeIndex(0): neighbors: [NodeIndex(40)]
worker-4 | NodeIndex(40): walked to node crate completed
worker-4 | NodeIndex(40): neighbors: []
worker-4 | NodeIndex(40): marked as complete
marking node crate completed as complete
worker-6 | NodeIndex(0): walked to node root
worker-6 | NodeIndex(0): neighbors: []
worker-8 | NodeIndex(0): walked to node root
worker-8 | NodeIndex(0): neighbors: []
worker-9 | NodeIndex(0): walked to node root
worker-9 | NodeIndex(0): neighbors: []
worker-3 | NodeIndex(0): walked to node root
worker-3 | NodeIndex(0): neighbors: []
worker-2 | NodeIndex(0): walked to node root
worker-2 | NodeIndex(0): neighbors: []
worker-0 | NodeIndex(0): walked to node root
worker-0 | NodeIndex(0): neighbors: []
worker-5 | NodeIndex(0): walked to node root
worker-5 | NodeIndex(0): neighbors: []
worker-1 | NodeIndex(0): walked to node root
worker-1 | NodeIndex(0): neighbors: []
```

r? `@pietroalbini`
2 parents c05fd09 + be4e65b
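Not part of the commit itself, but a minimal, self-contained sketch of the pattern the commit message describes: workers share a Mutex-guarded state and a Condvar. The names (`State`, `queued`, `running`, `parked`) are illustrative, not crater's. The point is that `Condvar::wait` atomically releases the lock and re-acquires it on wakeup, and a `notify_all` issued while holding that lock cannot be lost, unlike a `thread::unpark` racing against registration in a parked-threads map.

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;
use std::time::Duration;

// Illustrative stand-in for the task graph: tasks still to hand out, plus how
// many are currently running, so "nothing to do but not finished" is observable.
struct State {
    queued: usize,
    running: usize,
}

fn main() {
    let shared = Arc::new((Mutex::new(State { queued: 8, running: 0 }), Condvar::new()));
    let handles: Vec<_> = (0..4)
        .map(|id| {
            let shared = Arc::clone(&shared);
            thread::spawn(move || {
                let (state, parked) = &*shared;
                let mut guard = state.lock().unwrap();
                loop {
                    if guard.queued == 0 && guard.running == 0 {
                        // Finished: wake any blocked worker so it can also exit.
                        parked.notify_all();
                        break;
                    }
                    if guard.queued == 0 {
                        // Blocked: wait() releases the lock and re-acquires it on
                        // wakeup; a spurious wakeup just loops around harmlessly.
                        guard = parked.wait(guard).unwrap();
                        continue;
                    }
                    guard.queued -= 1;
                    guard.running += 1;
                    drop(guard); // don't hold the lock while running the task
                    println!("worker-{id} running a task");
                    thread::sleep(Duration::from_millis(10));
                    guard = state.lock().unwrap();
                    guard.running -= 1;
                    // Regardless of how the task ended, give waiters a chance to run.
                    parked.notify_all();
                }
            })
        })
        .collect();
    for handle in handles {
        handle.join().unwrap();
    }
}
```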

File tree

2 files changed: +20 / -32 lines


src/runner/mod.rs

Lines changed: 2 additions & 5 deletions
```diff
@@ -16,8 +16,7 @@ use rustwide::logging::LogStorage;
 use rustwide::Workspace;
 use std::collections::HashMap;
 use std::path::Path;
-use std::sync::Mutex;
-use std::thread;
+use std::sync::{Condvar, Mutex};
 use std::time::Duration;
 
 const DISK_SPACE_WATCHER_INTERVAL: Duration = Duration::from_secs(300);
@@ -63,6 +62,7 @@ pub fn run_ex<DB: WriteResults + Sync>(
 
     info!("computing the tasks graph...");
     let graph = Mutex::new(build_graph(ex, crates, config));
+    let parked_threads = Condvar::new();
 
     info!("preparing the execution...");
     for tc in &ex.toolchains {
@@ -74,9 +74,6 @@ pub fn run_ex<DB: WriteResults + Sync>(
 
     info!("running tasks in {} threads...", threads_count);
 
-    // An HashMap is used instead of an HashSet because Thread is not Eq+Hash
-    let parked_threads: Mutex<HashMap<thread::ThreadId, thread::Thread>> =
-        Mutex::new(HashMap::new());
     let state = RunnerState::new();
 
     let workers = (0..threads_count)
```
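The hunks above only show the Condvar being created next to the graph lock; the code that actually lends both to the worker threads is not part of this diff. As a hedged sketch (not crater's spawning code, which may use a different scoped-threads mechanism), here is how a borrowed `&Mutex<_>` / `&Condvar` pair of the kind `Worker` stores can be handed to threads with `std::thread::scope` (Rust 1.63+):

```rust
use std::sync::{Condvar, Mutex};
use std::thread;

fn main() {
    // Illustrative stand-ins for the task graph and the blocked-worker Condvar.
    let graph = Mutex::new(vec!["task-a", "task-b", "task-c"]);
    let parked_threads = Condvar::new();

    thread::scope(|scope| {
        for id in 0..2 {
            // Reborrow so each spawned closure captures plain `&` references;
            // the scope guarantees the workers exit before these are dropped.
            let (graph, parked_threads) = (&graph, &parked_threads);
            scope.spawn(move || {
                let mut guard = graph.lock().unwrap();
                while let Some(task) = guard.pop() {
                    drop(guard); // run the task without holding the lock
                    println!("worker-{id} running {task}");
                    guard = graph.lock().unwrap();
                    // Same shape as the worker loop: notify after re-locking.
                    parked_threads.notify_all();
                }
            });
        }
    });
}
```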

src/runner/worker.rs

Lines changed: 18 additions & 27 deletions
```diff
@@ -6,13 +6,11 @@ use crate::runner::graph::{TasksGraph, WalkResult};
 use crate::runner::{OverrideResult, RunnerState};
 use crate::utils;
 use rustwide::{BuildDirectory, Workspace};
-use std::collections::HashMap;
 use std::sync::Condvar;
 use std::sync::{
     atomic::{AtomicBool, Ordering},
     Mutex,
 };
-use std::thread;
 use std::time::Duration;
 
 pub(super) struct Worker<'a, DB: WriteResults + Sync> {
@@ -24,7 +22,7 @@ pub(super) struct Worker<'a, DB: WriteResults + Sync> {
     graph: &'a Mutex<TasksGraph>,
     state: &'a RunnerState,
     db: &'a DB,
-    parked_threads: &'a Mutex<HashMap<thread::ThreadId, thread::Thread>>,
+    parked_threads: &'a Condvar,
     target_dir_cleanup: AtomicBool,
 }
 
@@ -37,7 +35,7 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
         graph: &'a Mutex<TasksGraph>,
         state: &'a RunnerState,
         db: &'a DB,
-        parked_threads: &'a Mutex<HashMap<thread::ThreadId, thread::Thread>>,
+        parked_threads: &'a Condvar,
     ) -> Self {
         Worker {
             build_dir: Mutex::new(workspace.build_dir(&name)),
@@ -59,15 +57,13 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
 
     pub(super) fn run(&self) -> Fallible<()> {
         // This uses a `loop` instead of a `while let` to avoid locking the graph too much
+        let mut guard = self.graph.lock().unwrap();
         loop {
             self.maybe_cleanup_target_dir()?;
-            let walk_result = self
-                .graph
-                .lock()
-                .unwrap()
-                .next_task(self.ex, self.db, &self.name);
+            let walk_result = guard.next_task(self.ex, self.db, &self.name);
             match walk_result {
                 WalkResult::Task(id, task) => {
+                    drop(guard);
                     info!("running task: {:?}", task);
                     let res = task.run(
                         self.config,
@@ -77,6 +73,9 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
                         self.db,
                         self.state,
                     );
+                    guard = self.graph.lock().unwrap();
+                    // Regardless of how this ends, they should get woken up.
+                    self.parked_threads.notify_all();
                     if let Err(e) = res {
                         error!("task failed, marking childs as failed too: {:?}", task);
                         utils::report_failure(&e);
@@ -94,7 +93,7 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
                             }
                         }
 
-                        self.graph.lock().unwrap().mark_as_failed(
+                        guard.mark_as_failed(
                             id,
                             self.ex,
                             self.db,
@@ -105,28 +104,20 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
                             &self.name,
                         )?;
                     } else {
-                        self.graph.lock().unwrap().mark_as_completed(id);
-                    }
-
-                    // Unpark all the threads
-                    let mut parked = self.parked_threads.lock().unwrap();
-                    for (_id, thread) in parked.drain() {
-                        thread.unpark();
+                        guard.mark_as_completed(id);
                     }
                 }
                 WalkResult::Blocked => {
-                    // Wait until another thread finished before looking for tasks again
-                    // If the thread spuriously wake up (parking does not guarantee no
-                    // spurious wakeups) it's not a big deal, it will just get parked again
-                    {
-                        let mut parked_threads = self.parked_threads.lock().unwrap();
-                        let current = thread::current();
-                        parked_threads.insert(current.id(), current);
-                    }
-                    thread::park();
+                    guard = self.parked_threads.wait(guard).unwrap();
                 }
                 WalkResult::NotBlocked => unreachable!("NotBlocked leaked from the run"),
-                WalkResult::Finished => break,
+                WalkResult::Finished => {
+                    // A blocked thread may be waiting on the root node, in
+                    // which case this is crucial to avoiding a deadlock.
+                    self.parked_threads.notify_all();
+                    drop(guard);
+                    break;
+                }
             }
         }
 
```
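A side note, not something this commit does: the new Blocked arm relies on re-running `next_task` after every wakeup, so a bare `wait()` is enough there. `std::sync::Condvar` also provides `wait_while` (Rust 1.42+), which keeps a thread asleep through spurious wakeups until a predicate over the guarded data turns false. A tiny self-contained illustration of the same notify-on-finish idea, with illustrative names (`finished`, `parked`):

```rust
use std::sync::{Arc, Condvar, Mutex};
use std::thread;

fn main() {
    // A `finished` flag plus the Condvar a blocked worker sleeps on.
    let pair = Arc::new((Mutex::new(false), Condvar::new()));

    let waiter = {
        let pair = Arc::clone(&pair);
        thread::spawn(move || {
            let (finished, parked) = &*pair;
            // Sleeps (absorbing spurious wakeups) until `finished` becomes true.
            let _guard = parked
                .wait_while(finished.lock().unwrap(), |done| !*done)
                .unwrap();
            println!("woken: everything is finished, exiting");
        })
    };

    let (finished, parked) = &*pair;
    *finished.lock().unwrap() = true;
    // Mirrors the new WalkResult::Finished arm: notify before exiting so a
    // blocked thread cannot be left waiting forever.
    parked.notify_all();
    waiter.join().unwrap();
}
```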
