
Commit 94160d1

Auto merge of #573 - Mark-Simulacrum:exp-delay-on-load, r=pietroalbini
Back off from running jobs on high load

This is realistically just an experiment - I have no statistics to show this will actually be better. But it is relatively low cost for us to deploy and see what our metrics look like; if there is an improvement, great, and if not, we can revert it pretty quickly (or experiment with other values).

We definitely see on all crater machines that there is usually quite a bit of traffic toward the CPU right now. It is also likely that this might just lead to all the threads backing off and not much work getting done; if so, we'd likely want to reconsider. But this is at least an attempt at reducing the scheduled load.
2 parents 11335c1 + 05fe93b commit 94160d1
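The core idea of the patch is a simple polling backoff: before starting a task, check the machine's 1-minute load average and sleep while it exceeds the worker thread count. The patch itself uses the `systemstat` crate; the sketch below is a std-only approximation that reads `/proc/loadavg` directly (Linux only), with the hypothetical helpers `one_minute_load` and `back_off_on_high_load` standing in for the real code in `worker.rs`.

```rust
use std::fs;
use std::thread;
use std::time::Duration;

/// Read the 1-minute load average from /proc/loadavg (Linux only).
/// The actual patch uses `systemstat::System::load_average()` instead.
fn one_minute_load() -> std::io::Result<f32> {
    let contents = fs::read_to_string("/proc/loadavg")?;
    let first = contents.split_whitespace().next().unwrap_or("0.0");
    Ok(first.parse().unwrap_or(0.0))
}

/// Block until the 1-minute load average drops to or below `threads_count`,
/// sleeping 15 seconds between checks, mirroring the loop added in worker.rs.
fn back_off_on_high_load(threads_count: usize) -> std::io::Result<()> {
    loop {
        if one_minute_load()? > threads_count as f32 {
            thread::sleep(Duration::new(15, 0));
        } else {
            break;
        }
    }
    Ok(())
}

fn main() -> std::io::Result<()> {
    // With a generous threshold the call returns without sleeping.
    back_off_on_high_load(1_000_000)?;
    println!("load below threshold, proceeding");
    Ok(())
}
```

Note this is per-worker polling with a fixed 15-second interval; the thresholds and interval are exactly what the commit describes as values to experiment with.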

File tree

2 files changed (+27 −12 lines)


src/runner/mod.rs

Lines changed: 9 additions & 11 deletions

```diff
@@ -112,17 +112,15 @@ pub fn run_ex<DB: WriteResults + Sync>(
     let mut threads = Vec::new();

     for worker in &workers {
-        let join =
-            scope
-                .builder()
-                .name(worker.name().into())
-                .spawn(move || match worker.run() {
-                    Ok(()) => Ok(()),
-                    Err(r) => {
-                        log::warn!("worker {} failed: {:?}", worker.name(), r);
-                        Err(r)
-                    }
-                })?;
+        let join = scope.builder().name(worker.name().into()).spawn(move || {
+            match worker.run(threads_count) {
+                Ok(()) => Ok(()),
+                Err(r) => {
+                    log::warn!("worker {} failed: {:?}", worker.name(), r);
+                    Err(r)
+                }
+            }
+        })?;
         threads.push(join);
     }
     let disk_watcher_thread =
```

src/runner/worker.rs

Lines changed: 18 additions & 1 deletion

```diff
@@ -12,6 +12,7 @@ use std::sync::{
     Mutex,
 };
 use std::time::Duration;
+use systemstat::{Platform, System};

 pub(super) struct Worker<'a, DB: WriteResults + Sync> {
     name: String,
@@ -55,16 +56,32 @@ impl<'a, DB: WriteResults + Sync> Worker<'a, DB> {
         &self.name
     }

-    pub(super) fn run(&self) -> Fallible<()> {
+    pub(super) fn run(&self, threads_count: usize) -> Fallible<()> {
         // This uses a `loop` instead of a `while let` to avoid locking the graph too much
         let mut guard = self.graph.lock().unwrap();
+        let system = System::new();
         loop {
             self.maybe_cleanup_target_dir()?;
             let walk_result = guard.next_task(self.ex, self.db, &self.name);
             match walk_result {
                 WalkResult::Task(id, task) => {
                     drop(guard);
                     info!("running task: {:?}", task);
+
+                    // Wait for 15 seconds before running if the 1 minute load
+                    // average exceeds the thread count. This tries to back off
+                    // from spawning too many jobs on the server, hopefully
+                    // improving performance.
+                    loop {
+                        let avg = system.load_average()?;
+
+                        if avg.one > threads_count as f32 {
+                            std::thread::sleep(std::time::Duration::new(15, 0));
+                        } else {
+                            break;
+                        }
+                    }
+
                     let res = task.run(
                         self.config,
                         self.workspace,
```

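The commit message itself flags one risk: since every worker samples the same load average and sleeps a fixed 15 seconds, all threads may back off (and wake) in lockstep. A common mitigation, not part of this patch, is to add jitter to the backoff interval. A minimal std-only sketch, with the hypothetical helper `jittered_backoff` deriving pseudo-random jitter from the clock:

```rust
use std::time::{Duration, SystemTime, UNIX_EPOCH};

/// Compute a backoff interval of `base` plus up to `max_jitter_ms`
/// milliseconds of clock-derived pseudo-random jitter, so workers that
/// all saw high load at the same moment re-check at staggered times.
fn jittered_backoff(base: Duration, max_jitter_ms: u64) -> Duration {
    let nanos = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| d.subsec_nanos() as u64)
        .unwrap_or(0);
    base + Duration::from_millis(nanos % (max_jitter_ms + 1))
}

fn main() {
    // In the worker loop, this would replace the fixed 15-second sleep:
    // std::thread::sleep(jittered_backoff(Duration::from_secs(15), 5_000));
    let d = jittered_backoff(Duration::from_secs(15), 5_000);
    println!("next load check in {:?}", d);
}
```

Using the sub-second clock bits avoids pulling in a dependency such as `rand` for what is only a scheduling perturbation.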