@@ -15,8 +15,9 @@ use rustwide::Workspace;
15
15
use std:: collections:: BTreeSet ;
16
16
use std:: iter:: FromIterator ;
17
17
use std:: ops;
18
+ use std:: sync:: atomic:: { AtomicBool , Ordering } ;
18
19
use std:: thread;
19
- use std:: time:: Duration ;
20
+ use std:: time:: { Duration , Instant } ;
20
21
21
22
// Disk-usage fraction above which all the caches are purged: 0.5 means the
// caches are purged once the disk is more than 50% full.
22
23
const PURGE_CACHES_THRESHOLD : f32 = 0.5 ;
@@ -95,6 +96,42 @@ impl Agent {
95
96
}
96
97
}
97
98
99
/// Liveness flag shared between the agent's work loop and the health-check
/// thread: `set_healthy` raises it, and `health_thread` consumes (clears) it
/// at most once per 15-minute window.
static HEALTH_CHECK: AtomicBool = AtomicBool::new(false);

/// Records that the agent is still making progress.
///
/// Must be called at least once every 15 minutes, which is the window
/// enforced by `health_thread`. If a whole window passes without a call, the
/// health-check listener is shut down, the instance stops reporting healthy,
/// and it is expected to be replaced.
pub fn set_healthy() {
    HEALTH_CHECK.store(true, Ordering::SeqCst);
}
106
+
107
+ fn health_thread ( ) {
108
+ std:: thread:: spawn ( move || {
109
+ let mut last_check = Instant :: now ( ) ;
110
+
111
+ let listener = std:: net:: TcpListener :: bind ( "0.0.0.0:4343" ) . unwrap ( ) ;
112
+ loop {
113
+ // Accept a connection...
114
+ drop ( listener. accept ( ) ) ;
115
+
116
+ // Then check whether we should still be healthy. If not, we simply
117
+ // drop the listening socket by breaking out of the loop, meaning
118
+ // that we'll stop responding as healthy to future connects.
119
+ //
120
+ // A build has a maximum timeout of 15 minutes in rustwide, so we
121
+ // currently expect checkpoints at least that often. It likely makes
122
+ // sense for us to be more eager, but ultimately crater runtimes are
123
+ // long enough that 15 minutes on one builder hopefully won't matter
124
+ // too much.
125
+ if last_check. elapsed ( ) > Duration :: from_secs ( 15 * 60 ) {
126
+ last_check = Instant :: now ( ) ;
127
+ if !HEALTH_CHECK . swap ( false , Ordering :: SeqCst ) {
128
+ break ;
129
+ }
130
+ }
131
+ }
132
+ } ) ;
133
+ }
134
+
98
135
fn run_heartbeat ( url : & str , token : & str ) {
99
136
let api = AgentApi :: new ( url, token) ;
100
137
@@ -149,6 +186,7 @@ pub fn run(
149
186
let db = results:: ResultsUploader :: new ( & agent. api ) ;
150
187
151
188
run_heartbeat ( url, token) ;
189
+ health_thread ( ) ;
152
190
153
191
let mut past_experiment = None ;
154
192
loop {
0 commit comments