Skip to content

Commit fe0aa63

Browse files
authored
🌱 Implement MS remediating conditions (#11382)
* Implement MS remediating conditions Signed-off-by: Stefan Büringer buringerst@vmware.com * Fix review findings * Fix review findings Signed-off-by: Stefan Büringer buringerst@vmware.com --------- Signed-off-by: Stefan Büringer buringerst@vmware.com
1 parent 2bb2099 commit fe0aa63

File tree

13 files changed

+726
-92
lines changed

13 files changed

+726
-92
lines changed

api/v1beta1/machineset_types.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,10 +153,37 @@ const (
153153
MachineSetMachinesUpToDateInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
154154
)
155155

156-
// Conditions that will be used for the MachineSet object in v1Beta2 API version.
156+
// MachineSet's Remediating condition and corresponding reasons that will be used in v1Beta2 API version.
157157
const (
158158
// MachineSetRemediatingV1Beta2Condition surfaces details about ongoing remediation of the controlled machines, if any.
159159
MachineSetRemediatingV1Beta2Condition = RemediatingV1Beta2Condition
160+
161+
// MachineSetRemediatingV1Beta2Reason surfaces when the MachineSet has at least one machine with HealthCheckSucceeded set to false
162+
// and with the OwnerRemediated condition set to false.
163+
MachineSetRemediatingV1Beta2Reason = RemediatingV1Beta2Reason
164+
165+
// MachineSetNotRemediatingV1Beta2Reason surfaces when the MachineSet does not have any machine with HealthCheckSucceeded set to false
166+
// and with the OwnerRemediated condition set to false.
167+
MachineSetNotRemediatingV1Beta2Reason = NotRemediatingV1Beta2Reason
168+
169+
// MachineSetRemediatingInternalErrorV1Beta2Reason surfaces unexpected failures when computing the Remediating condition.
170+
MachineSetRemediatingInternalErrorV1Beta2Reason = InternalErrorV1Beta2Reason
171+
)
172+
173+
// Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on MachineSet controlled machines
174+
// being remediated in v1Beta2 API version.
175+
const (
176+
// MachineSetMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a MachineSet machine can't be started.
177+
MachineSetMachineCannotBeRemediatedV1Beta2Reason = "CannotBeRemediated"
178+
179+
// MachineSetMachineRemediationDeferredV1Beta2Reason surfaces when remediation of a MachineSet machine must be deferred.
180+
MachineSetMachineRemediationDeferredV1Beta2Reason = "RemediationDeferred"
181+
182+
// MachineSetMachineRemediationMachineDeletedV1Beta2Reason surfaces when remediation of a MachineSet machine
183+
// has been completed by deleting the unhealthy machine.
184+
// Note: After an unhealthy machine is deleted, a new one is created by the MachineSet as part of the
185+
// regular reconcile loop that ensures the correct number of replicas exist.
186+
MachineSetMachineRemediationMachineDeletedV1Beta2Reason = "MachineDeleted"
160187
)
161188

162189
// MachineSet's Deleting condition and corresponding reasons that will be used in v1Beta2 API version.

controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ const (
209209
// Reasons that will be used for the OwnerRemediated condition set by MachineHealthCheck on KubeadmControlPlane controlled machines
210210
// being remediated in v1Beta2 API version.
211211
const (
212-
// KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediation a control plane machine.
212+
// KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason surfaces unexpected failures while remediating a control plane machine.
213213
KubeadmControlPlaneMachineRemediationInternalErrorV1Beta2Reason = clusterv1.InternalErrorV1Beta2Reason
214214

215215
// KubeadmControlPlaneMachineCannotBeRemediatedV1Beta2Reason surfaces when remediation of a control plane machine can't be started.

controlplane/kubeadm/internal/controllers/status.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,10 +616,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
616616
return ""
617617
}
618618

619-
machineNames := []string{}
620-
for _, machine := range machines {
621-
machineNames = append(machineNames, machine.GetName())
622-
}
619+
machineNames := machines.Names()
623620

624621
if len(machineNames) == 0 {
625622
return ""

internal/controllers/cluster/cluster_controller_status.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1009,10 +1009,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
10091009
return ""
10101010
}
10111011

1012-
machineNames := []string{}
1013-
for _, machine := range machines {
1014-
machineNames = append(machineNames, machine.GetName())
1015-
}
1012+
machineNames := machines.Names()
10161013

10171014
if len(machineNames) == 0 {
10181015
return ""

internal/controllers/machine/machine_controller.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,8 +309,6 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
309309
clusterv1.BootstrapReadyCondition,
310310
clusterv1.InfrastructureReadyCondition,
311311
clusterv1.DrainingSucceededCondition,
312-
clusterv1.MachineHealthCheckSucceededCondition,
313-
clusterv1.MachineOwnerRemediatedCondition,
314312
}},
315313
patch.WithOwnedV1Beta2Conditions{Conditions: []string{
316314
clusterv1.MachineAvailableV1Beta2Condition,

internal/controllers/machinedeployment/machinedeployment_status.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -487,10 +487,7 @@ func aggregateUnhealthyMachines(machines collections.Machines) string {
487487
return ""
488488
}
489489

490-
machineNames := []string{}
491-
for _, machine := range machines {
492-
machineNames = append(machineNames, machine.GetName())
493-
}
490+
machineNames := machines.Names()
494491

495492
if len(machineNames) == 0 {
496493
return ""

internal/controllers/machineset/machineset_controller.go

Lines changed: 151 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package machineset
1919
import (
2020
"context"
2121
"fmt"
22+
"math"
2223
"sort"
2324
"strings"
2425
"time"
@@ -54,6 +55,7 @@ import (
5455
"sigs.k8s.io/cluster-api/util"
5556
"sigs.k8s.io/cluster-api/util/collections"
5657
"sigs.k8s.io/cluster-api/util/conditions"
58+
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
5759
utilconversion "sigs.k8s.io/cluster-api/util/conversion"
5860
"sigs.k8s.io/cluster-api/util/finalizers"
5961
"sigs.k8s.io/cluster-api/util/labels/format"
@@ -1195,19 +1197,77 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
11951197

11961198
cluster := s.cluster
11971199
ms := s.machineSet
1198-
filteredMachines := s.machines
1200+
machines := s.machines
11991201
owner := s.owningMachineDeployment
12001202
log := ctrl.LoggerFrom(ctx)
12011203

1204+
// Remove OwnerRemediated condition from Machines that have HealthCheckSucceeded condition true
1205+
// and OwnerRemediated condition false
1206+
errList := []error{}
1207+
for _, m := range machines {
1208+
if !m.DeletionTimestamp.IsZero() {
1209+
continue
1210+
}
1211+
1212+
shouldCleanup := conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition)
1213+
shouldCleanupV1Beta2 := v1beta2conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededV1Beta2Condition) && v1beta2conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)
1214+
1215+
if !(shouldCleanup || shouldCleanupV1Beta2) {
1216+
continue
1217+
}
1218+
1219+
patchHelper, err := patch.NewHelper(m, r.Client)
1220+
if err != nil {
1221+
errList = append(errList, err)
1222+
continue
1223+
}
1224+
1225+
if shouldCleanup {
1226+
conditions.Delete(m, clusterv1.MachineOwnerRemediatedCondition)
1227+
}
1228+
1229+
if shouldCleanupV1Beta2 {
1230+
v1beta2conditions.Delete(m, clusterv1.MachineOwnerRemediatedV1Beta2Condition)
1231+
}
1232+
1233+
if err := patchHelper.Patch(ctx, m, patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
1234+
clusterv1.MachineOwnerRemediatedCondition,
1235+
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
1236+
clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1237+
}}); err != nil {
1238+
errList = append(errList, err)
1239+
}
1240+
}
1241+
if len(errList) > 0 {
1242+
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to remove OwnerRemediated condition from healhty Machines")
1243+
}
1244+
1245+
// Calculates the Machines to be remediated.
1246+
// Note: Machines already deleting are not included, there is no need to trigger remediation for them again.
1247+
machinesToRemediate := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList()
1248+
1249+
// If there are no machines to remediate return early.
1250+
if len(machinesToRemediate) == 0 {
1251+
return ctrl.Result{}, nil
1252+
}
1253+
12021254
// Calculate how many in flight machines we should remediate.
12031255
// By default, we allow all machines to be remediated at the same time.
1204-
maxInFlight := len(filteredMachines)
1256+
maxInFlight := math.MaxInt
12051257

12061258
// If the MachineSet is part of a MachineDeployment, only allow remediations if
12071259
// it's the desired revision.
12081260
if isDeploymentChild(ms) {
12091261
if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] {
12101262
// MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed.
1263+
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
1264+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1265+
Status: metav1.ConditionFalse,
1266+
Reason: clusterv1.MachineSetMachineCannotBeRemediatedV1Beta2Reason,
1267+
Message: "Machine won't be remediated because it is pending removal due to rollout",
1268+
}, nil); err != nil {
1269+
return ctrl.Result{}, err
1270+
}
12111271
return ctrl.Result{}, nil
12121272
}
12131273

@@ -1224,31 +1284,33 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
12241284
}
12251285
}
12261286

1227-
// List all unhealthy machines.
1228-
machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines))
1229-
for _, m := range filteredMachines {
1230-
// filteredMachines contains machines in deleting status to calculate correct status.
1231-
// skip remediation for those in deleting status.
1287+
// Update maxInFlight based on remediations that are in flight.
1288+
// A Machine has a remediation in flight when Machine's OwnerRemediated condition
1289+
// reports that remediation has been completed and the Machine has been deleted.
1290+
for _, m := range machines {
12321291
if !m.DeletionTimestamp.IsZero() {
1292+
// TODO: Check for Status: False and Reason: MachineSetMachineRemediationMachineDeletedV1Beta2Reason
1293+
// instead when starting to use v1beta2 conditions for control flow.
12331294
if conditions.IsTrue(m, clusterv1.MachineOwnerRemediatedCondition) {
1234-
// Machine has been remediated by this controller and still in flight.
1295+
// Remediation for this Machine has been triggered by this controller but it is still in flight,
1296+
// i.e. it still goes through the deletion workflow and exists in etcd.
12351297
maxInFlight--
12361298
}
1237-
continue
1238-
}
1239-
if conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) {
1240-
machinesToRemediate = append(machinesToRemediate, m)
12411299
}
12421300
}
12431301

1244-
// If there are no machines to remediate return early.
1245-
if len(machinesToRemediate) == 0 {
1246-
return ctrl.Result{}, nil
1247-
}
12481302
// Check if we can remediate any machines.
12491303
if maxInFlight <= 0 {
12501304
// No tokens available to remediate machines.
12511305
log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate))
1306+
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
1307+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1308+
Status: metav1.ConditionFalse,
1309+
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
1310+
Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight),
1311+
}, nil); err != nil {
1312+
return ctrl.Result{}, err
1313+
}
12521314
return ctrl.Result{}, nil
12531315
}
12541316

@@ -1263,11 +1325,22 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
12631325
if len(machinesToRemediate) > maxInFlight {
12641326
log.V(5).Info("Remediation strategy is set, limiting in flight operations", "machinesToBeRemediated", len(machinesToRemediate))
12651327
// We have more machines to remediate than tokens available.
1266-
machinesToRemediate = machinesToRemediate[:maxInFlight]
1328+
allMachinesToRemediate := machinesToRemediate
1329+
machinesToRemediate = allMachinesToRemediate[:maxInFlight]
1330+
machinesToDeferRemediation := allMachinesToRemediate[maxInFlight:]
1331+
1332+
if err := patchMachineConditions(ctx, r.Client, machinesToDeferRemediation, metav1.Condition{
1333+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1334+
Status: metav1.ConditionFalse,
1335+
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
1336+
Message: fmt.Sprintf("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)", owner.Spec.Strategy.Remediation.MaxInFlight),
1337+
}, nil); err != nil {
1338+
return ctrl.Result{}, err
1339+
}
12671340
}
12681341

12691342
// Run preflight checks.
1270-
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine Remediation")
1343+
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine remediation")
12711344
if err != nil {
12721345
// If err is not nil use that as the preflightCheckErrMessage
12731346
preflightCheckErrMessage = err.Error()
@@ -1277,48 +1350,84 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
12771350
if preflightChecksFailed {
12781351
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
12791352
// WaitingForRemediationReason reason.
1280-
var errs []error
1281-
for _, m := range machinesToRemediate {
1282-
patchHelper, err := patch.NewHelper(m, r.Client)
1283-
if err != nil {
1284-
errs = append(errs, err)
1285-
continue
1286-
}
1287-
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, preflightCheckErrMessage)
1288-
if err := patchHelper.Patch(ctx, m); err != nil {
1289-
errs = append(errs, err)
1290-
}
1291-
}
1292-
1293-
if len(errs) > 0 {
1294-
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch unhealthy Machines")
1353+
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
1354+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1355+
Status: metav1.ConditionFalse,
1356+
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
1357+
Message: preflightCheckErrMessage,
1358+
}, &clusterv1.Condition{
1359+
Type: clusterv1.MachineOwnerRemediatedCondition,
1360+
Status: corev1.ConditionFalse,
1361+
Reason: clusterv1.WaitingForRemediationReason,
1362+
Severity: clusterv1.ConditionSeverityWarning,
1363+
Message: preflightCheckErrMessage,
1364+
}); err != nil {
1365+
return ctrl.Result{}, err
12951366
}
12961367
return preflightChecksResult, nil
12971368
}
12981369

1299-
// PreflightChecks passed, so it is safe to remediate unhealthy machines.
1300-
// Remediate unhealthy machines by deleting them.
1370+
// PreflightChecks passed, so it is safe to remediate unhealthy machines by deleting them.
1371+
1372+
// Note: We intentionally patch the Machines before we delete them to make this code reentrant.
1373+
// If we delete the Machine first, the Machine would be filtered out on next reconcile because
1374+
// it has a deletionTimestamp so it would never get the condition.
1375+
// Instead if we set the condition but the deletion does not go through on next reconcile either the
1376+
// condition will be fixed/updated or the Machine deletion will be retried.
1377+
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
1378+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1379+
Status: metav1.ConditionFalse,
1380+
Reason: clusterv1.MachineSetMachineRemediationMachineDeletedV1Beta2Reason,
1381+
}, &clusterv1.Condition{
1382+
Type: clusterv1.MachineOwnerRemediatedCondition,
1383+
Status: corev1.ConditionTrue,
1384+
}); err != nil {
1385+
return ctrl.Result{}, err
1386+
}
13011387
var errs []error
13021388
for _, m := range machinesToRemediate {
13031389
log.Info("Deleting unhealthy Machine", "Machine", klog.KObj(m))
1304-
patch := client.MergeFrom(m.DeepCopy())
13051390
if err := r.Client.Delete(ctx, m); err != nil && !apierrors.IsNotFound(err) {
13061391
errs = append(errs, errors.Wrapf(err, "failed to delete Machine %s", klog.KObj(m)))
1307-
continue
1308-
}
1309-
conditions.MarkTrue(m, clusterv1.MachineOwnerRemediatedCondition)
1310-
if err := r.Client.Status().Patch(ctx, m, patch); err != nil && !apierrors.IsNotFound(err) {
1311-
errs = append(errs, errors.Wrapf(err, "failed to update status of Machine %s", klog.KObj(m)))
13121392
}
13131393
}
1314-
13151394
if len(errs) > 0 {
13161395
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errs), "failed to delete unhealthy Machines")
13171396
}
13181397

13191398
return ctrl.Result{}, nil
13201399
}
13211400

1401+
func patchMachineConditions(ctx context.Context, c client.Client, machines []*clusterv1.Machine, v1beta2Condition metav1.Condition, condition *clusterv1.Condition) error {
1402+
var errs []error
1403+
for _, m := range machines {
1404+
patchHelper, err := patch.NewHelper(m, c)
1405+
if err != nil {
1406+
errs = append(errs, err)
1407+
continue
1408+
}
1409+
1410+
if condition != nil {
1411+
conditions.Set(m, condition)
1412+
}
1413+
v1beta2conditions.Set(m, v1beta2Condition)
1414+
1415+
if err := patchHelper.Patch(ctx, m,
1416+
patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
1417+
clusterv1.MachineOwnerRemediatedCondition,
1418+
}}, patch.WithOwnedV1Beta2Conditions{Conditions: []string{
1419+
clusterv1.MachineOwnerRemediatedV1Beta2Condition,
1420+
}}); err != nil {
1421+
errs = append(errs, err)
1422+
}
1423+
}
1424+
if len(errs) > 0 {
1425+
return errors.Wrapf(kerrors.NewAggregate(errs), "failed to patch Machines")
1426+
}
1427+
1428+
return nil
1429+
}
1430+
13221431
func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, owner *clusterv1.MachineDeployment, ref *corev1.ObjectReference) (objectNotFound bool, err error) {
13231432
if !strings.HasSuffix(ref.Kind, clusterv1.TemplateSuffix) {
13241433
return false, nil

0 commit comments

Comments
 (0)