Skip to content

Commit 096a224

Browse files
committed
Auto merge of #604 - Mark-Simulacrum:background-reports, r=Mark-Simulacrum
Discover necessity of reporting in the background This will increase the latency to starting a report by ~10 minutes in the worst case, but those take long enough to generate (and experiments are slow enough) that this doesn't seem like a big deal. In practice, my hope is that it will: * Avoid needing retry-report quite as frequently, as OOMs and other abnormal crashes will be gracefully recovered from (albeit likely sending the server into a death spiral if we *really* can't generate the report; in practice this shouldn't happen). It was already somewhat the case that a server restart would start the generation again, just that a thread *panic* didn't since the report generation thread died in that case. So this should be essentially as-before. * Avoid fairly heavy piece of work done on *every* new result, which hopefully improved result endpoint performance and reduces normal server load. See also commit messages for some additional details.
2 parents c899b6f + ff1fe90 commit 096a224

File tree

4 files changed

+45
-43
lines changed

4 files changed

+45
-43
lines changed

src/cli.rs

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -506,8 +506,6 @@ impl Crater {
506506
);
507507
workspace.purge_all_build_dirs()?;
508508
res?;
509-
510-
experiment.set_status(&db, Status::NeedsReport)?;
511509
} else {
512510
bail!("missing experiment {}", ex.0);
513511
}
@@ -522,18 +520,18 @@ impl Crater {
522520
let db = Database::open()?;
523521

524522
if let Some(mut experiment) = Experiment::get(&db, &ex.0)? {
525-
// Update the status
526-
match (experiment.status, force) {
527-
(Status::NeedsReport, _) | (Status::ReportFailed, _) | (_, true) => {
528-
experiment.set_status(&db, Status::GeneratingReport)?;
529-
}
530-
(other, false) => bail!(
531-
"can't generate the report of an experiment with status {} \
523+
let (completed, all) = experiment.raw_progress(&db)?;
524+
if !force && completed != all {
525+
bail!(
526+
"can't generate the report of an incomplete experiment: {}/{} results \
532527
(use --force to override)",
533-
other
534-
),
528+
completed,
529+
all,
530+
);
535531
}
536532

533+
experiment.set_status(&db, Status::GeneratingReport)?;
534+
537535
let result_db = DatabaseDB::new(&db);
538536
let res = report::gen(
539537
&result_db,
@@ -564,18 +562,18 @@ impl Crater {
564562
let db = Database::open()?;
565563

566564
if let Some(mut experiment) = Experiment::get(&db, &ex.0)? {
567-
// Update the status
568-
match (experiment.status, force) {
569-
(Status::NeedsReport, _) | (Status::ReportFailed, _) | (_, true) => {
570-
experiment.set_status(&db, Status::GeneratingReport)?;
571-
}
572-
(other, false) => bail!(
573-
"can't publish the report of an experiment with status {} \
565+
let (completed, all) = experiment.raw_progress(&db)?;
566+
if !force && completed != all {
567+
bail!(
568+
"can't publish the report of an incomplete experiment: {}/{} results \
574569
(use --force to override)",
575-
other
576-
),
570+
completed,
571+
all,
572+
);
577573
}
578574

575+
experiment.set_status(&db, Status::GeneratingReport)?;
576+
579577
let result_db = DatabaseDB::new(&db);
580578
let client = report::get_client_for_bucket(&s3_prefix.bucket)?;
581579

src/experiments.rs

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -298,20 +298,31 @@ impl Experiment {
298298
}
299299
}
300300

301-
pub fn first_by_status(db: &Database, status: Status) -> Fallible<Option<Experiment>> {
302-
let record = db.get_row(
303-
"SELECT * FROM experiments \
304-
WHERE status = ?1 \
305-
ORDER BY priority DESC, created_at;",
306-
&[&status.to_str()],
307-
|r| ExperimentDBRecord::from_row(r),
308-
)?;
309-
310-
if let Some(record) = record {
311-
Ok(Some(record.into_experiment()?))
312-
} else {
313-
Ok(None)
301+
// Returns the first experiment which has all results ready (and so can
302+
// produce a complete report). However, the experiment should not be
303+
// *completed* yet. Note that this may return an experiment which has had
304+
// report generation already start.
305+
pub fn ready_for_report(db: &Database) -> Fallible<Option<Experiment>> {
306+
let unfinished = Self::unfinished(db)?;
307+
for ex in unfinished {
308+
if ex.status == Status::ReportFailed {
309+
// Skip experiments whose report failed to generate. This avoids
310+
// constantly retrying reports (and posting a message each time
311+
// about the attempt); the retry-report command can override the
312+
// failure state. In practice we rarely *fail* to generate
313+
// reports in a clean way (instead OOMing or panicking, in which
314+
// case it is fine to automatically retry the report, as we've
315+
// not posted anything on GitHub -- it may be a problem from a
316+
// performance perspective but no more than that).
317+
continue;
318+
}
319+
let (completed, all) = ex.raw_progress(db)?;
320+
if completed == all {
321+
return Ok(Some(ex));
322+
}
314323
}
324+
325+
Ok(None)
315326
}
316327

317328
pub fn find_next(db: &Database, assignee: &Assignee) -> Fallible<Option<Experiment>> {

src/server/reports.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ fn reports_thread(data: &Data, github_data: Option<&GithubData>) -> Fallible<()>
3737
let results = DatabaseDB::new(&data.db);
3838

3939
loop {
40-
let mut ex = match Experiment::first_by_status(&data.db, Status::NeedsReport)? {
40+
let mut ex = match Experiment::ready_for_report(&data.db)? {
4141
Some(ex) => ex,
4242
None => {
4343
// This will sleep AUTOMATIC_THREAD_WAKEUP seconds *or* until a wake is received

src/server/routes/agent.rs

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use crate::agent::Capabilities;
2-
use crate::experiments::{Assignee, Experiment, Status};
2+
use crate::experiments::{Assignee, Experiment};
33
use crate::prelude::*;
44
use crate::results::{DatabaseDB, EncodingType, ProgressData};
55
use crate::server::api_types::{AgentConfig, ApiResponse};
@@ -160,7 +160,7 @@ fn endpoint_record_progress(
160160
auth: AuthDetails,
161161
) -> Fallible<Response<Body>> {
162162
let data = mutex.lock().unwrap();
163-
let mut ex = Experiment::get(&data.db, &result.experiment_name)?
163+
let ex = Experiment::get(&data.db, &result.experiment_name)?
164164
.ok_or_else(|| err_msg("no experiment run by this agent"))?;
165165

166166
data.metrics
@@ -169,13 +169,6 @@ fn endpoint_record_progress(
169169
let db = DatabaseDB::new(&data.db);
170170
db.store(&ex, &result.data, EncodingType::Gzip)?;
171171

172-
let (completed, all) = ex.raw_progress(&data.db)?;
173-
if completed == all {
174-
ex.set_status(&data.db, Status::NeedsReport)?;
175-
info!("experiment {} completed, marked as needs-report", ex.name);
176-
data.reports_worker.wake(); // Ensure the reports worker is awake
177-
}
178-
179172
Ok(ApiResponse::Success { result: true }.into_response()?)
180173
}
181174

0 commit comments

Comments
 (0)