Skip to content

Commit 8fa788b

Browse files
authored
Fix potential deadlocks in the statemachine (#2346)
1 parent c452bb8 commit 8fa788b

File tree

4 files changed

+57
-51
lines changed

4 files changed

+57
-51
lines changed

api/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ Contains the business logic for different API endpoints. Each controller package
1616
Contains shared utilities and helper packages that provide common functionality used across different parts of the API. This includes both general-purpose utilities and domain-specific helpers.
1717

1818
#### `/internal/managers`
19-
Each manager is responsible for a specific subdomain of functionality and provides a clean, thread-safe interface for controllers to interact with. For example, the Preflight Manager manages system requirement checks and validation.
19+
Each manager is responsible for a specific subdomain of functionality and provides a clean interface for controllers to interact with. For example, the Preflight Manager manages system requirement checks and validation.
2020

2121
#### `/internal/statemachine`
2222
The statemachine is used by controllers to capture workflow state and enforce valid transitions.

api/controllers/install/hostpreflight.go

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,21 @@ import (
1111
"github.com/replicatedhq/embedded-cluster/pkg/netutils"
1212
)
1313

14-
func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostPreflightsOptions) error {
14+
func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostPreflightsOptions) (finalErr error) {
1515
lock, err := c.stateMachine.AcquireLock()
1616
if err != nil {
1717
return types.NewConflictError(err)
1818
}
1919

20+
defer func() {
21+
if r := recover(); r != nil {
22+
finalErr = fmt.Errorf("panic: %v: %s", r, string(debug.Stack()))
23+
}
24+
if finalErr != nil {
25+
lock.Release()
26+
}
27+
}()
28+
2029
if err := c.stateMachine.ValidateTransition(lock, StatePreflightsRunning); err != nil {
2130
return types.NewConflictError(err)
2231
}
@@ -34,7 +43,6 @@ func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostP
3443
IsUI: opts.IsUI,
3544
})
3645
if err != nil {
37-
lock.Release()
3846
return fmt.Errorf("failed to prepare host preflights: %w", err)
3947
}
4048

@@ -43,18 +51,24 @@ func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostP
4351
return fmt.Errorf("failed to transition states: %w", err)
4452
}
4553

46-
go func() {
54+
go func() (finalErr error) {
4755
// Background context is used to avoid canceling the operation if the context is canceled
4856
ctx := context.Background()
4957

5058
defer lock.Release()
5159

5260
defer func() {
5361
if r := recover(); r != nil {
54-
c.logger.Errorf("panic running host preflights: %v: %s", r, string(debug.Stack()))
62+
finalErr = fmt.Errorf("panic running host preflights: %v: %s", r, string(debug.Stack()))
63+
}
64+
if finalErr != nil {
65+
c.logger.Error(finalErr)
5566

56-
err := c.stateMachine.Transition(lock, StatePreflightsFailed)
57-
if err != nil {
67+
if err := c.stateMachine.Transition(lock, StatePreflightsFailed); err != nil {
68+
c.logger.Errorf("failed to transition states: %w", err)
69+
}
70+
} else {
71+
if err := c.stateMachine.Transition(lock, StatePreflightsSucceeded); err != nil {
5872
c.logger.Errorf("failed to transition states: %w", err)
5973
}
6074
}
@@ -63,20 +77,11 @@ func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostP
6377
err := c.hostPreflightManager.RunHostPreflights(ctx, c.rc, preflight.RunHostPreflightOptions{
6478
HostPreflightSpec: hpf,
6579
})
66-
6780
if err != nil {
68-
c.logger.Errorf("failed to run host preflights: %w", err)
69-
70-
err = c.stateMachine.Transition(lock, StatePreflightsFailed)
71-
if err != nil {
72-
c.logger.Errorf("failed to transition states: %w", err)
73-
}
74-
} else {
75-
err = c.stateMachine.Transition(lock, StatePreflightsSucceeded)
76-
if err != nil {
77-
c.logger.Errorf("failed to transition states: %w", err)
78-
}
81+
return fmt.Errorf("failed to run host preflights: %w", err)
7982
}
83+
84+
return nil
8085
}()
8186

8287
return nil

api/controllers/install/infra.go

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88
"github.com/replicatedhq/embedded-cluster/api/types"
99
)
1010

11-
func (c *InstallController) SetupInfra(ctx context.Context) error {
11+
func (c *InstallController) SetupInfra(ctx context.Context) (finalErr error) {
1212
if c.stateMachine.CurrentState() == StatePreflightsFailed {
1313
err := c.bypassPreflights(ctx)
1414
if err != nil {
@@ -21,44 +21,48 @@ func (c *InstallController) SetupInfra(ctx context.Context) error {
2121
return types.NewConflictError(err)
2222
}
2323

24+
defer func() {
25+
if r := recover(); r != nil {
26+
finalErr = fmt.Errorf("panic: %v: %s", r, string(debug.Stack()))
27+
}
28+
if finalErr != nil {
29+
lock.Release()
30+
}
31+
}()
32+
2433
err = c.stateMachine.Transition(lock, StateInfrastructureInstalling)
2534
if err != nil {
26-
lock.Release()
2735
return types.NewConflictError(err)
2836
}
2937

30-
go func() {
38+
go func() (finalErr error) {
3139
// Background context is used to avoid canceling the operation if the context is canceled
3240
ctx := context.Background()
3341

3442
defer lock.Release()
3543

3644
defer func() {
3745
if r := recover(); r != nil {
38-
c.logger.Errorf("panic installing infrastructure: %v: %s", r, string(debug.Stack()))
46+
finalErr = fmt.Errorf("panic: %v: %s", r, string(debug.Stack()))
47+
}
48+
if finalErr != nil {
49+
c.logger.Error(finalErr)
3950

40-
err := c.stateMachine.Transition(lock, StateFailed)
41-
if err != nil {
51+
if err := c.stateMachine.Transition(lock, StateFailed); err != nil {
52+
c.logger.Errorf("failed to transition states: %w", err)
53+
}
54+
} else {
55+
if err := c.stateMachine.Transition(lock, StateSucceeded); err != nil {
4256
c.logger.Errorf("failed to transition states: %w", err)
4357
}
4458
}
4559
}()
4660

47-
err := c.infraManager.Install(ctx, c.rc)
48-
49-
if err != nil {
50-
c.logger.Errorf("failed to install infrastructure: %w", err)
51-
52-
err := c.stateMachine.Transition(lock, StateFailed)
53-
if err != nil {
54-
c.logger.Errorf("failed to transition states: %w", err)
55-
}
56-
} else {
57-
err = c.stateMachine.Transition(lock, StateSucceeded)
58-
if err != nil {
59-
c.logger.Errorf("failed to transition states: %w", err)
60-
}
61+
if err := c.infraManager.Install(ctx, c.rc); err != nil {
62+
return fmt.Errorf("failed to install infrastructure: %w", err)
6163
}
64+
65+
return nil
6266
}()
6367

6468
return nil

api/internal/managers/infra/install.go

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,23 +52,20 @@ func (m *infraManager) Install(ctx context.Context, rc runtimeconfig.RuntimeConf
5252
defer func() {
5353
if r := recover(); r != nil {
5454
finalErr = fmt.Errorf("panic: %v: %s", r, string(debug.Stack()))
55-
56-
if err := m.setStatus(types.StateFailed, "Installation failed to run: panic"); err != nil {
55+
}
56+
if finalErr != nil {
57+
if err := m.setStatus(types.StateFailed, finalErr.Error()); err != nil {
5758
m.logger.WithField("error", err).Error("set failed status")
5859
}
60+
} else {
61+
if err := m.setStatus(types.StateSucceeded, "Installation complete"); err != nil {
62+
m.logger.WithField("error", err).Error("set succeeded status")
63+
}
5964
}
6065
}()
6166

62-
err = m.install(ctx, rc)
63-
64-
if err != nil {
65-
if err := m.setStatus(types.StateFailed, err.Error()); err != nil {
66-
m.logger.WithField("error", err).Error("set failed status")
67-
}
68-
} else {
69-
if err := m.setStatus(types.StateSucceeded, "Installation complete"); err != nil {
70-
m.logger.WithField("error", err).Error("set succeeded status")
71-
}
67+
if err := m.install(ctx, rc); err != nil {
68+
return err
7269
}
7370

7471
return nil

0 commit comments

Comments
 (0)