Skip to content

Commit 82e4dd9

Browse files
authored
feat(reporting): add metrics reporting to the new manager experience (#2337)
* chore: initialize the reporter * feat: use the new state machine event handlers * chore: rework the valid state transitions * chore: more tests * chore: tests for configuration storage failure * chore: add new states for the host preflights execution * fix: correctly handle state machine transitions for host preflight execution * chore: tests for the multiple preflight execution transitions * fix: infra installation flow and more tests * chore: more tests for the infra reporting * chore: installation tests * chore: gofmt forgot to do its things * fix: don't assert on current state before the final move
1 parent 2dfd927 commit 82e4dd9

File tree

15 files changed

+777
-253
lines changed

15 files changed

+777
-253
lines changed

api/controllers/linux/install/controller.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,18 +168,27 @@ func WithStateMachine(stateMachine statemachine.Interface) InstallControllerOpti
168168
}
169169
}
170170

171+
func WithStore(store store.Store) InstallControllerOption {
172+
return func(c *InstallController) {
173+
c.store = store
174+
}
175+
}
176+
171177
func NewInstallController(opts ...InstallControllerOption) (*InstallController, error) {
172178
controller := &InstallController{
173-
store: store.NewMemoryStore(),
174-
rc: runtimeconfig.New(nil),
175-
logger: logger.NewDiscardLogger(),
176-
stateMachine: NewStateMachine(),
179+
store: store.NewMemoryStore(),
180+
rc: runtimeconfig.New(nil),
181+
logger: logger.NewDiscardLogger(),
177182
}
178183

179184
for _, opt := range opts {
180185
opt(controller)
181186
}
182187

188+
if controller.stateMachine == nil {
189+
controller.stateMachine = NewStateMachine(WithStateMachineLogger(controller.logger))
190+
}
191+
183192
if controller.hostUtils == nil {
184193
controller.hostUtils = hostutils.New(
185194
hostutils.WithLogger(controller.logger),
@@ -204,7 +213,6 @@ func NewInstallController(opts ...InstallControllerOption) (*InstallController,
204213
if controller.hostPreflightManager == nil {
205214
controller.hostPreflightManager = preflight.NewHostPreflightManager(
206215
preflight.WithLogger(controller.logger),
207-
preflight.WithMetricsReporter(controller.metricsReporter),
208216
preflight.WithHostPreflightStore(controller.store.PreflightStore()),
209217
preflight.WithNetUtils(controller.netUtils),
210218
)
@@ -224,5 +232,7 @@ func NewInstallController(opts ...InstallControllerOption) (*InstallController,
224232
)
225233
}
226234

235+
controller.registerReportingHandlers()
236+
227237
return controller, nil
228238
}

api/controllers/linux/install/controller_test.go

Lines changed: 435 additions & 73 deletions
Large diffs are not rendered by default.

api/controllers/linux/install/hostpreflight.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"runtime/debug"
77

88
"github.com/replicatedhq/embedded-cluster/api/internal/managers/preflight"
9+
"github.com/replicatedhq/embedded-cluster/api/internal/statemachine"
910
"github.com/replicatedhq/embedded-cluster/api/internal/utils"
1011
"github.com/replicatedhq/embedded-cluster/api/types"
1112
"github.com/replicatedhq/embedded-cluster/pkg/netutils"
@@ -61,16 +62,21 @@ func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostP
6162
if r := recover(); r != nil {
6263
finalErr = fmt.Errorf("panic running host preflights: %v: %s", r, string(debug.Stack()))
6364
}
65+
// Handle errors from preflight execution
6466
if finalErr != nil {
6567
c.logger.Error(finalErr)
6668

67-
if err := c.stateMachine.Transition(lock, StatePreflightsFailed); err != nil {
68-
c.logger.Errorf("failed to transition states: %w", err)
69-
}
70-
} else {
71-
if err := c.stateMachine.Transition(lock, StatePreflightsSucceeded); err != nil {
69+
if err := c.stateMachine.Transition(lock, StatePreflightsExecutionFailed); err != nil {
7270
c.logger.Errorf("failed to transition states: %w", err)
7371
}
72+
return
73+
}
74+
75+
// Get the state from the preflights output
76+
state := c.getStateFromPreflightsOutput(ctx)
77+
// Transition to the appropriate state based on preflight results
78+
if err := c.stateMachine.Transition(lock, state); err != nil {
79+
c.logger.Errorf("failed to transition states: %w", err)
7480
}
7581
}()
7682

@@ -87,6 +93,20 @@ func (c *InstallController) RunHostPreflights(ctx context.Context, opts RunHostP
8793
return nil
8894
}
8995

96+
func (c *InstallController) getStateFromPreflightsOutput(ctx context.Context) statemachine.State {
97+
output, err := c.GetHostPreflightOutput(ctx)
98+
// If there was an error getting the state we assume preflight execution failed
99+
if err != nil {
100+
c.logger.WithError(err).Error("error getting preflight output")
101+
return StatePreflightsExecutionFailed
102+
}
103+
// If there is no output, we assume preflights succeeded
104+
if output == nil || !output.HasFail() {
105+
return StatePreflightsSucceeded
106+
}
107+
return StatePreflightsFailed
108+
}
109+
90110
func (c *InstallController) GetHostPreflightStatus(ctx context.Context) (types.Status, error) {
91111
return c.hostPreflightManager.GetHostPreflightStatus(ctx)
92112
}

api/controllers/linux/install/infra.go

Lines changed: 13 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,10 @@ import (
1010
)
1111

1212
var (
13-
ErrPreflightChecksFailed = errors.New("preflight checks failed")
14-
ErrPreflightChecksNotComplete = errors.New("preflight checks not complete")
13+
ErrPreflightChecksFailed = errors.New("preflight checks failed")
1514
)
1615

1716
func (c *InstallController) SetupInfra(ctx context.Context, ignoreHostPreflights bool) (finalErr error) {
18-
if c.stateMachine.CurrentState() == StatePreflightsFailed {
19-
err := c.bypassPreflights(ctx, ignoreHostPreflights)
20-
if err != nil {
21-
return fmt.Errorf("bypass preflights: %w", err)
22-
}
23-
}
24-
2517
lock, err := c.stateMachine.AcquireLock()
2618
if err != nil {
2719
return types.NewConflictError(err)
@@ -36,6 +28,17 @@ func (c *InstallController) SetupInfra(ctx context.Context, ignoreHostPreflights
3628
}
3729
}()
3830

31+
// Check if preflights have failed and if we should ignore them
32+
if c.stateMachine.CurrentState() == StatePreflightsFailed {
33+
if !ignoreHostPreflights || !c.allowIgnoreHostPreflights {
34+
return types.NewBadRequestError(ErrPreflightChecksFailed)
35+
}
36+
err = c.stateMachine.Transition(lock, StatePreflightsFailedBypassed)
37+
if err != nil {
38+
return fmt.Errorf("failed to transition states: %w", err)
39+
}
40+
}
41+
3942
err = c.stateMachine.Transition(lock, StateInfrastructureInstalling)
4043
if err != nil {
4144
return types.NewConflictError(err)
@@ -54,7 +57,7 @@ func (c *InstallController) SetupInfra(ctx context.Context, ignoreHostPreflights
5457
if finalErr != nil {
5558
c.logger.Error(finalErr)
5659

57-
if err := c.stateMachine.Transition(lock, StateFailed); err != nil {
60+
if err := c.stateMachine.Transition(lock, StateInfrastructureInstallFailed); err != nil {
5861
c.logger.Errorf("failed to transition states: %w", err)
5962
}
6063
} else {
@@ -74,41 +77,6 @@ func (c *InstallController) SetupInfra(ctx context.Context, ignoreHostPreflights
7477
return nil
7578
}
7679

77-
func (c *InstallController) bypassPreflights(ctx context.Context, ignoreHostPreflights bool) error {
78-
if !ignoreHostPreflights || !c.allowIgnoreHostPreflights {
79-
return types.NewBadRequestError(ErrPreflightChecksFailed)
80-
}
81-
82-
lock, err := c.stateMachine.AcquireLock()
83-
if err != nil {
84-
return types.NewConflictError(err)
85-
}
86-
defer lock.Release()
87-
88-
if err := c.stateMachine.ValidateTransition(lock, StatePreflightsFailedBypassed); err != nil {
89-
return types.NewConflictError(err)
90-
}
91-
92-
// TODO (@ethan): we have already sent the preflight output when we sent the failed event.
93-
// We should evaluate if we should send it again.
94-
preflightOutput, err := c.GetHostPreflightOutput(ctx)
95-
if err != nil {
96-
return fmt.Errorf("get install host preflight output: %w", err)
97-
}
98-
99-
// Report that preflights were bypassed
100-
if preflightOutput != nil {
101-
c.metricsReporter.ReportPreflightsBypassed(ctx, preflightOutput)
102-
}
103-
104-
err = c.stateMachine.Transition(lock, StatePreflightsFailedBypassed)
105-
if err != nil {
106-
return types.NewConflictError(err)
107-
}
108-
109-
return nil
110-
}
111-
11280
func (c *InstallController) GetInfra(ctx context.Context) (types.Infra, error) {
11381
return c.infraManager.Get()
11482
}

api/controllers/linux/install/installation.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package install
33
import (
44
"context"
55
"fmt"
6+
"time"
67

78
"github.com/replicatedhq/embedded-cluster/api/types"
89
ecv1beta1 "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1"
@@ -48,6 +49,10 @@ func (c *InstallController) ConfigureInstallation(ctx context.Context, config ty
4849

4950
if err != nil {
5051
c.logger.Error("failed to configure host", "error", err)
52+
err = c.stateMachine.Transition(lock, StateHostConfigurationFailed)
53+
if err != nil {
54+
c.logger.Error("failed to transition states", "error", err)
55+
}
5156
} else {
5257
err = c.stateMachine.Transition(lock, StateHostConfigured)
5358
if err != nil {
@@ -59,7 +64,7 @@ func (c *InstallController) ConfigureInstallation(ctx context.Context, config ty
5964
return nil
6065
}
6166

62-
func (c *InstallController) configureInstallation(ctx context.Context, config types.InstallationConfig) error {
67+
func (c *InstallController) configureInstallation(ctx context.Context, config types.InstallationConfig) (finalErr error) {
6368
lock, err := c.stateMachine.AcquireLock()
6469
if err != nil {
6570
return types.NewConflictError(err)
@@ -70,6 +75,24 @@ func (c *InstallController) configureInstallation(ctx context.Context, config ty
7075
return types.NewConflictError(err)
7176
}
7277

78+
defer func() {
79+
if finalErr != nil {
80+
failureStatus := types.Status{
81+
State: types.StateFailed,
82+
Description: finalErr.Error(),
83+
LastUpdated: time.Now(),
84+
}
85+
86+
if err = c.store.InstallationStore().SetStatus(failureStatus); err != nil {
87+
c.logger.Errorf("failed to update status: %w", err)
88+
}
89+
90+
if err := c.stateMachine.Transition(lock, StateInstallationConfigurationFailed); err != nil {
91+
c.logger.Errorf("failed to transition states: %w", err)
92+
}
93+
}
94+
}()
95+
7396
if err := c.installationManager.ValidateConfig(config, c.rc.ManagerPort()); err != nil {
7497
return fmt.Errorf("validate: %w", err)
7598
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package install
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
8+
"github.com/replicatedhq/embedded-cluster/api/internal/statemachine"
9+
"github.com/replicatedhq/embedded-cluster/api/types"
10+
)
11+
12+
func (c *InstallController) registerReportingHandlers() {
13+
c.stateMachine.RegisterEventHandler(StateSucceeded, c.reportInstallSucceeded)
14+
c.stateMachine.RegisterEventHandler(StateInfrastructureInstallFailed, c.reportInstallFailed)
15+
c.stateMachine.RegisterEventHandler(StateHostConfigurationFailed, c.reportInstallFailed)
16+
c.stateMachine.RegisterEventHandler(StateInstallationConfigurationFailed, c.reportInstallFailed)
17+
c.stateMachine.RegisterEventHandler(StatePreflightsFailed, c.reportPreflightsFailed)
18+
c.stateMachine.RegisterEventHandler(StatePreflightsFailedBypassed, c.reportPreflightsBypassed)
19+
}
20+
21+
func (c *InstallController) reportInstallSucceeded(ctx context.Context, _, _ statemachine.State) {
22+
c.metricsReporter.ReportInstallationSucceeded(ctx)
23+
}
24+
25+
func (c *InstallController) reportInstallFailed(ctx context.Context, _, toState statemachine.State) {
26+
var status types.Status
27+
var err error
28+
29+
switch toState {
30+
case StateInstallationConfigurationFailed:
31+
status, err = c.store.InstallationStore().GetStatus()
32+
if err != nil {
33+
err = fmt.Errorf("failed to get status from installation store: %w", err)
34+
}
35+
case StateHostConfigurationFailed:
36+
status, err = c.store.InstallationStore().GetStatus()
37+
if err != nil {
38+
err = fmt.Errorf("failed to get status from installation store: %w", err)
39+
}
40+
case StateInfrastructureInstallFailed:
41+
status, err = c.store.InfraStore().GetStatus()
42+
if err != nil {
43+
err = fmt.Errorf("failed to get status from infra store: %w", err)
44+
}
45+
}
46+
if err != nil {
47+
c.logger.WithError(err).Error("failed to report failled install")
48+
return
49+
}
50+
c.metricsReporter.ReportInstallationFailed(ctx, errors.New(status.Description))
51+
}
52+
53+
func (c *InstallController) reportPreflightsFailed(ctx context.Context, _, _ statemachine.State) {
54+
output, err := c.store.PreflightStore().GetOutput()
55+
if err != nil {
56+
c.logger.WithError(fmt.Errorf("failed to get output from preflight store: %w", err)).Error("failed to report preflights failed")
57+
return
58+
}
59+
c.metricsReporter.ReportPreflightsFailed(ctx, output)
60+
}
61+
62+
func (c *InstallController) reportPreflightsBypassed(ctx context.Context, _, _ statemachine.State) {
63+
output, err := c.store.PreflightStore().GetOutput()
64+
if err != nil {
65+
c.logger.WithError(fmt.Errorf("failed to get output from preflight store: %w", err)).Error("failed to report preflights bypassed")
66+
return
67+
}
68+
c.metricsReporter.ReportPreflightsBypassed(ctx, output)
69+
}

0 commit comments

Comments
 (0)