File tree Expand file tree Collapse file tree 2 files changed +28
-5
lines changed Expand file tree Collapse file tree 2 files changed +28
-5
lines changed Original file line number Diff line number Diff line change @@ -527,9 +527,13 @@ impl ServiceManager {
527
527
}
528
528
529
529
// TODO(https://github.com/oxidecomputer/omicron/issues/2973):
530
- // These will fail if the disks aren't attached.
531
- // Should we have a retry loop here? Kinda like we have with the switch
532
- // / NTP zone?
530
+ //
531
+ // The sled agent retries this function indefinitely at the call-site, but
532
+ // we could be smarter.
533
+ //
534
+ // - If we know that disks are missing, we could wait for them
535
+ // - We could permanently fail if we are able to distinguish other errors
536
+ // more clearly.
533
537
pub async fn load_services ( & self ) -> Result < ( ) , Error > {
534
538
let log = & self . inner . log ;
535
539
let ledger_paths = self . all_service_ledgers ( ) . await ;
Original file line number Diff line number Diff line change @@ -30,7 +30,8 @@ use omicron_common::api::{
30
30
internal:: nexus:: UpdateArtifactId ,
31
31
} ;
32
32
use omicron_common:: backoff:: {
33
- retry_notify_ext, retry_policy_internal_service_aggressive, BackoffError ,
33
+ retry_notify, retry_notify_ext, retry_policy_internal_service_aggressive,
34
+ BackoffError ,
34
35
} ;
35
36
use sled_hardware:: underlay;
36
37
use sled_hardware:: HardwareManager ;
@@ -372,7 +373,25 @@ impl SledAgent {
372
373
//
373
374
// Do this *after* monitoring for hardware, to enable the switch zone to
374
375
// establish an underlay address before proceeding.
375
- sled_agent. inner . services . load_services ( ) . await ?;
376
+ retry_notify (
377
+ retry_policy_internal_service_aggressive ( ) ,
378
+ || async {
379
+ sled_agent
380
+ . inner
381
+ . services
382
+ . load_services ( )
383
+ . await
384
+ . map_err ( |err| BackoffError :: transient ( err) )
385
+ } ,
386
+ |err, delay| {
387
+ warn ! (
388
+ log,
389
+ "Failed to load services, will retry in {:?}" , delay;
390
+ "error" => %err,
391
+ ) ;
392
+ } ,
393
+ )
394
+ . await ?;
376
395
377
396
// Now that we've initialized the sled services, notify nexus again
378
397
// at which point it'll plumb any necessary firewall rules back to us.
You can’t perform that action at this time.
0 commit comments