Skip to content

Commit d27b130

Browse files
authored
[sled agent] Retry loop around loading services (#3793)
Fixes #3790
1 parent f1756cc commit d27b130

File tree

2 files changed

+28
-5
lines changed

2 files changed

+28
-5
lines changed

sled-agent/src/services.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -527,9 +527,13 @@ impl ServiceManager {
527527
}
528528

529529
// TODO(https://github.com/oxidecomputer/omicron/issues/2973):
530-
// These will fail if the disks aren't attached.
531-
// Should we have a retry loop here? Kinda like we have with the switch
532-
// / NTP zone?
530+
//
531+
// The sled agent retries this function indefinitely at the call-site, but
532+
// we could be smarter.
533+
//
534+
// - If we know that disks are missing, we could wait for them
535+
// - We could permanently fail if we are able to distinguish other errors
536+
// more clearly.
533537
pub async fn load_services(&self) -> Result<(), Error> {
534538
let log = &self.inner.log;
535539
let ledger_paths = self.all_service_ledgers().await;

sled-agent/src/sled_agent.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ use omicron_common::api::{
3030
internal::nexus::UpdateArtifactId,
3131
};
3232
use omicron_common::backoff::{
33-
retry_notify_ext, retry_policy_internal_service_aggressive, BackoffError,
33+
retry_notify, retry_notify_ext, retry_policy_internal_service_aggressive,
34+
BackoffError,
3435
};
3536
use sled_hardware::underlay;
3637
use sled_hardware::HardwareManager;
@@ -372,7 +373,25 @@ impl SledAgent {
372373
//
373374
// Do this *after* monitoring for hardware, to enable the switch zone to
374375
// establish an underlay address before proceeding.
375-
sled_agent.inner.services.load_services().await?;
376+
retry_notify(
377+
retry_policy_internal_service_aggressive(),
378+
|| async {
379+
sled_agent
380+
.inner
381+
.services
382+
.load_services()
383+
.await
384+
.map_err(|err| BackoffError::transient(err))
385+
},
386+
|err, delay| {
387+
warn!(
388+
log,
389+
"Failed to load services, will retry in {:?}", delay;
390+
"error" => %err,
391+
);
392+
},
393+
)
394+
.await?;
376395

377396
// Now that we've initialized the sled services, notify nexus again
378397
// at which point it'll plumb any necessary firewall rules back to us.

0 commit comments

Comments
 (0)