Skip to content

Commit a090145

Browse files
authored
🌱 Prioritize Machine with remediate-machine anotation when selecting the next machine to be remediated (#11495)
* Prioritise machine with remidiation anotation for remidiation * Changes to prioritise remediation for machine set controller * Update RemediateMachineAnnotation description * Address review comments
1 parent d7bb304 commit a090145

File tree

5 files changed

+163
-4
lines changed

5 files changed

+163
-4
lines changed

api/v1beta1/common_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ const (
128128
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
129129
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"
130130

131-
// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
131+
// RemediateMachineAnnotation request the MachineHealthCheck reconciler to mark a Machine as unhealthy. CAPI builtin remediation will prioritize Machines with the annotation to be remediated.
132132
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"
133133

134134
// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of

controlplane/kubeadm/internal/controllers/remediation.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
347347
// Gets the machine to be remediated, which is the "most broken" among the unhealthy machines, determined as the machine
348348
// having the highest priority issue that other machines have not.
349349
// The following issues are considered (from highest to lowest priority):
350+
// - machine with RemediateMachineAnnotation annotation
350351
// - machine without .status.nodeRef
351352
// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
352353
// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
@@ -373,6 +374,14 @@ func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdMana
373374

374375
// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
375376
func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
377+
// If one machine has the RemediateMachineAnnotation annotation, remediate first.
378+
if annotations.HasRemediateMachine(i) && !annotations.HasRemediateMachine(j) {
379+
return true
380+
}
381+
if !annotations.HasRemediateMachine(i) && annotations.HasRemediateMachine(j) {
382+
return false
383+
}
384+
376385
// if one machine does not have a node ref, we assume that provisioning failed and there is no CP components at all,
377386
// so remediate first; also without a node, it is not possible to get further info about status.
378387
if i.Status.NodeRef == nil && j.Status.NodeRef != nil {

controlplane/kubeadm/internal/controllers/remediation_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,25 @@ import (
4242
)
4343

4444
func TestGetMachineToBeRemediated(t *testing.T) {
45+
t.Run("returns the machine with RemediateMachineAnnotation first", func(t *testing.T) {
46+
g := NewWithT(t)
47+
48+
ns, err := env.CreateNamespace(ctx, "ns1")
49+
g.Expect(err).ToNot(HaveOccurred())
50+
defer func() {
51+
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
52+
}()
53+
54+
m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
55+
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod())
56+
57+
m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})
58+
59+
unhealthyMachines := collections.FromMachines(m2, m1)
60+
61+
g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-"))
62+
})
63+
4564
t.Run("returns provisioning machines first", func(t *testing.T) {
4665
g := NewWithT(t)
4766

internal/controllers/machineset/machineset_controller.go

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ import (
5454
topologynames "sigs.k8s.io/cluster-api/internal/topology/names"
5555
"sigs.k8s.io/cluster-api/internal/util/ssa"
5656
"sigs.k8s.io/cluster-api/util"
57+
"sigs.k8s.io/cluster-api/util/annotations"
5758
"sigs.k8s.io/cluster-api/util/collections"
5859
"sigs.k8s.io/cluster-api/util/conditions"
5960
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
@@ -1438,9 +1439,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
14381439
// Sort the machines from newest to oldest.
14391440
// We are trying to remediate machines failing to come up first because
14401441
// there is a chance that they are not hosting any workloads (minimize disruption).
1441-
sort.SliceStable(machinesToRemediate, func(i, j int) bool {
1442-
return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time)
1443-
})
1442+
sortMachinesToRemediate(machinesToRemediate)
14441443

14451444
// Check if we should limit the in flight operations.
14461445
if len(machinesToRemediate) > maxInFlight {
@@ -1617,3 +1616,23 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu
16171616

16181617
return false, patchHelper.Patch(ctx, obj)
16191618
}
1619+
1620+
// Returns the machines to be remediated in the following order
1621+
// - Machines with RemediateMachineAnnotation annotation if any,
1622+
// - Machines failing to come up first because
1623+
// there is a chance that they are not hosting any workloads (minimize disruption).
1624+
func sortMachinesToRemediate(machines []*clusterv1.Machine) {
1625+
sort.SliceStable(machines, func(i, j int) bool {
1626+
if annotations.HasRemediateMachine(machines[i]) && !annotations.HasRemediateMachine(machines[j]) {
1627+
return true
1628+
}
1629+
if !annotations.HasRemediateMachine(machines[i]) && annotations.HasRemediateMachine(machines[j]) {
1630+
return false
1631+
}
1632+
// Use newest (and Name) as a tie-breaker criteria.
1633+
if machines[i].CreationTimestamp.Equal(&machines[j].CreationTimestamp) {
1634+
return machines[i].Name < machines[j].Name
1635+
}
1636+
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
1637+
})
1638+
}

internal/controllers/machineset/machineset_controller_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package machineset
1919
import (
2020
"context"
2121
"fmt"
22+
"sort"
2223
"strings"
2324
"testing"
2425
"time"
@@ -3138,3 +3139,114 @@ func TestNewMachineUpToDateCondition(t *testing.T) {
31383139
})
31393140
}
31403141
}
3142+
3143+
func TestSortMachinesToRemediate(t *testing.T) {
3144+
unhealthyMachinesWithAnnotations := []*clusterv1.Machine{}
3145+
for i := range 4 {
3146+
unhealthyMachinesWithAnnotations = append(unhealthyMachinesWithAnnotations, &clusterv1.Machine{
3147+
ObjectMeta: metav1.ObjectMeta{
3148+
Name: fmt.Sprintf("unhealthy-annotated-machine-%d", i),
3149+
Namespace: "default",
3150+
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
3151+
Annotations: map[string]string{
3152+
clusterv1.RemediateMachineAnnotation: "",
3153+
},
3154+
},
3155+
Status: clusterv1.MachineStatus{
3156+
Conditions: []clusterv1.Condition{
3157+
{
3158+
Type: clusterv1.MachineOwnerRemediatedCondition,
3159+
Status: corev1.ConditionFalse,
3160+
},
3161+
{
3162+
Type: clusterv1.MachineHealthCheckSucceededCondition,
3163+
Status: corev1.ConditionFalse,
3164+
},
3165+
},
3166+
V1Beta2: &clusterv1.MachineV1Beta2Status{
3167+
Conditions: []metav1.Condition{
3168+
{
3169+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
3170+
Status: metav1.ConditionFalse,
3171+
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
3172+
Message: "Waiting for remediation",
3173+
},
3174+
{
3175+
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
3176+
Status: metav1.ConditionFalse,
3177+
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
3178+
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
3179+
},
3180+
},
3181+
},
3182+
},
3183+
})
3184+
}
3185+
3186+
unhealthyMachines := []*clusterv1.Machine{}
3187+
for i := range 4 {
3188+
unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{
3189+
ObjectMeta: metav1.ObjectMeta{
3190+
Name: fmt.Sprintf("unhealthy-machine-%d", i),
3191+
Namespace: "default",
3192+
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
3193+
},
3194+
Status: clusterv1.MachineStatus{
3195+
Conditions: []clusterv1.Condition{
3196+
{
3197+
Type: clusterv1.MachineOwnerRemediatedCondition,
3198+
Status: corev1.ConditionFalse,
3199+
},
3200+
{
3201+
Type: clusterv1.MachineHealthCheckSucceededCondition,
3202+
Status: corev1.ConditionFalse,
3203+
},
3204+
},
3205+
V1Beta2: &clusterv1.MachineV1Beta2Status{
3206+
Conditions: []metav1.Condition{
3207+
{
3208+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
3209+
Status: metav1.ConditionFalse,
3210+
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
3211+
Message: "Waiting for remediation",
3212+
},
3213+
{
3214+
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
3215+
Status: metav1.ConditionFalse,
3216+
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
3217+
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
3218+
},
3219+
},
3220+
},
3221+
},
3222+
})
3223+
}
3224+
3225+
t.Run("remediation machines should be sorted with newest first", func(t *testing.T) {
3226+
g := NewWithT(t)
3227+
machines := make([]*clusterv1.Machine, len(unhealthyMachines))
3228+
copy(machines, unhealthyMachines)
3229+
sortMachinesToRemediate(machines)
3230+
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
3231+
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
3232+
})
3233+
g.Expect(unhealthyMachines).To(Equal(machines))
3234+
})
3235+
3236+
t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) {
3237+
g := NewWithT(t)
3238+
3239+
machines := make([]*clusterv1.Machine, len(unhealthyMachines))
3240+
copy(machines, unhealthyMachines)
3241+
machines = append(machines, unhealthyMachinesWithAnnotations...)
3242+
sortMachinesToRemediate(machines)
3243+
3244+
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
3245+
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
3246+
})
3247+
sort.SliceStable(unhealthyMachinesWithAnnotations, func(i, j int) bool {
3248+
return unhealthyMachinesWithAnnotations[i].CreationTimestamp.After(unhealthyMachinesWithAnnotations[j].CreationTimestamp.Time)
3249+
})
3250+
g.Expect(machines).To(Equal(append(unhealthyMachinesWithAnnotations, unhealthyMachines...)))
3251+
})
3252+
}

0 commit comments

Comments
 (0)