Commit 2d86cc0

🌱 Improve KCP remediation of multiple failures (#11716)
* Improve KCP remediation of multiple failures
* Address comments
1 parent 5400038 commit 2d86cc0

2 files changed: +134 −17 lines

controlplane/kubeadm/internal/controllers/remediation.go

Lines changed: 87 additions & 8 deletions
@@ -20,13 +20,15 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "sort"
     "time"

     "github.com/go-logr/logr"
     "github.com/pkg/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     kerrors "k8s.io/apimachinery/pkg/util/errors"
     "k8s.io/klog/v2"
+    "k8s.io/utils/ptr"
     ctrl "sigs.k8s.io/controller-runtime"

     clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
@@ -101,7 +103,10 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
     // NOTE: The current solution is considered acceptable for the most frequent use case (only one machine to be remediated),
     // however, in the future this could potentially be improved for the scenario where more than one machine to be remediated exists
     // by considering which machine has lower impact on etcd quorum.
-    machineToBeRemediated := getMachineToBeRemediated(machinesToBeRemediated)
+    machineToBeRemediated := getMachineToBeRemediated(machinesToBeRemediated, controlPlane.IsEtcdManaged())
+    if machineToBeRemediated == nil {
+        return ctrl.Result{}, errors.New("failed to find a Machine to remediate within unhealthy Machines")
+    }

     // Returns if the machine is in the process of being deleted.
     if !machineToBeRemediated.ObjectMeta.DeletionTimestamp.IsZero() {
@@ -339,14 +344,88 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
     return ctrl.Result{Requeue: true}, nil
 }

-// Gets the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
-// or the oldest machine marked as unhealthy.
-func getMachineToBeRemediated(unhealthyMachines collections.Machines) *clusterv1.Machine {
-    machineToBeRemediated := unhealthyMachines.Filter(collections.Not(collections.HasNode())).Oldest()
-    if machineToBeRemediated == nil {
-        machineToBeRemediated = unhealthyMachines.Oldest()
+// Gets the machine to be remediated, which is the "most broken" among the unhealthy machines, determined as the machine
+// with the highest-priority issue that the other machines do not have.
+// The following issues are considered (from highest to lowest priority):
+// - machine without .status.nodeRef
+// - machine with etcd issue or etcd status unknown (etcd member, etcd pod)
+// - machine with control plane component issue or status unknown (API server, controller manager, scheduler)
+//
+// Note: In case of more than one faulty machine the chance to recover mostly depends on the control plane being able to
+// successfully create a replacement Machine, because due to scale up preflight checks, this cannot happen if there are
+// still issues on the control plane after the first remediation.
+// This func tries to maximize the chances of a successful remediation by picking the "most broken" machine first.
+func getMachineToBeRemediated(unhealthyMachines collections.Machines, isEtcdManaged bool) *clusterv1.Machine {
+    if unhealthyMachines.Len() == 0 {
+        return nil
+    }
+
+    machinesToBeRemediated := unhealthyMachines.UnsortedList()
+    if len(machinesToBeRemediated) == 1 {
+        return machinesToBeRemediated[0]
+    }
+
+    sort.Slice(machinesToBeRemediated, func(i, j int) bool {
+        return pickMachineToBeRemediated(machinesToBeRemediated[i], machinesToBeRemediated[j], isEtcdManaged)
+    })
+    return machinesToBeRemediated[0]
+}
+
+// pickMachineToBeRemediated returns true if machine i should be remediated before machine j.
+func pickMachineToBeRemediated(i, j *clusterv1.Machine, isEtcdManaged bool) bool {
+    // If one machine does not have a node ref, we assume that provisioning failed and there are no CP components at all,
+    // so remediate it first; also, without a node it is not possible to get further info about the status.
+    if i.Status.NodeRef == nil && j.Status.NodeRef != nil {
+        return true
+    }
+    if i.Status.NodeRef != nil && j.Status.NodeRef == nil {
+        return false
+    }
+
+    // If one machine has an unhealthy etcd member or pod, remediate it first.
+    if isEtcdManaged {
+        if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineEtcdMemberHealthyCondition); p != nil {
+            return *p
+        }
+        if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineEtcdPodHealthyCondition); p != nil {
+            return *p
+        }
+
+        // Note: in the future we might consider etcd leadership and kubelet status to prevent being stuck when it is not possible
+        // to forward leadership, but this requires further investigation and most probably also surfacing some additional info in the controlPlane object.
+    }
+
+    // If one machine has an unhealthy control plane component, remediate it first.
+    if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineAPIServerPodHealthyCondition); p != nil {
+        return *p
+    }
+    if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineControllerManagerPodHealthyCondition); p != nil {
+        return *p
+    }
+    if p := pickMachineToBeRemediatedByConditionState(i, j, controlplanev1.MachineSchedulerPodHealthyCondition); p != nil {
+        return *p
+    }
+
+    // Use oldest (and Name) as a tie-breaker criterion.
+    if i.CreationTimestamp.Equal(&j.CreationTimestamp) {
+        return i.Name < j.Name
+    }
+    return i.CreationTimestamp.Before(&j.CreationTimestamp)
+}
+
+// pickMachineToBeRemediatedByConditionState returns true if condition t reports an issue on machine i and not on machine j,
+// false if the vice versa applies, or nil if condition t doesn't provide a discriminating criterion for picking one machine or the other for remediation.
+func pickMachineToBeRemediatedByConditionState(i, j *clusterv1.Machine, t clusterv1.ConditionType) *bool {
+    iCondition := conditions.IsTrue(i, t)
+    jCondition := conditions.IsTrue(j, t)
+
+    if !iCondition && jCondition {
+        return ptr.To(true)
+    }
+    if iCondition && !jCondition {
+        return ptr.To(false)
     }
-    return machineToBeRemediated
+    return nil
 }

 // checkRetryLimits checks if KCP is allowed to remediate considering retry limits:
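For readers following the change: the core of the new strategy is a sort over the unhealthy machines with a pairwise comparator that walks a fixed priority ladder (missing node, then etcd issues, then control plane component issues, then age). The sketch below is a minimal, self-contained illustration of that pattern; fakeMachine, its fields, and the pick helper are invented stand-ins for illustration only, not the real cluster-api types or the commit's code.

package main

import (
    "fmt"
    "sort"
)

// fakeMachine is a hypothetical stand-in for clusterv1.Machine, keeping only
// the signals the comparator looks at.
type fakeMachine struct {
    name             string
    hasNodeRef       bool
    etcdHealthy      bool
    apiServerHealthy bool
    age              int // stand-in for CreationTimestamp; higher = older
}

// pick returns true if i should be remediated before j, walking the same
// priority ladder as the commit's pickMachineToBeRemediated.
func pick(i, j fakeMachine) bool {
    if i.hasNodeRef != j.hasNodeRef {
        return !i.hasNodeRef // a machine that never got a node goes first
    }
    if i.etcdHealthy != j.etcdHealthy {
        return !i.etcdHealthy // etcd issues outrank component issues
    }
    if i.apiServerHealthy != j.apiServerHealthy {
        return !i.apiServerHealthy
    }
    if i.age == j.age {
        return i.name < j.name // name as final tie-breaker
    }
    return i.age > j.age // oldest first
}

func main() {
    machines := []fakeMachine{
        {name: "m1", hasNodeRef: true, etcdHealthy: false, apiServerHealthy: false, age: 3},
        {name: "m2", hasNodeRef: false, etcdHealthy: true, apiServerHealthy: true, age: 1},
        {name: "m3", hasNodeRef: true, etcdHealthy: true, apiServerHealthy: false, age: 2},
    }
    sort.Slice(machines, func(a, b int) bool { return pick(machines[a], machines[b]) })
    fmt.Println(machines[0].name) // m2: no node ref outranks etcd/API server issues
}

Running it prints m2: the machine that never got a node outranks the machines with etcd or API server issues, mirroring how getMachineToBeRemediated picks the "most broken" machine first.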

controlplane/kubeadm/internal/controllers/remediation_test.go

Lines changed: 47 additions & 9 deletions
@@ -42,7 +42,7 @@ import (
 )

 func TestGetMachineToBeRemediated(t *testing.T) {
-    t.Run("returns the oldest machine if there are no provisioning machines", func(t *testing.T) {
+    t.Run("returns provisioning machines first", func(t *testing.T) {
         g := NewWithT(t)

         ns, err := env.CreateNamespace(ctx, "ns1")
@@ -51,15 +51,48 @@ func TestGetMachineToBeRemediated(t *testing.T) {
             g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
         }()

-        m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
-        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
+        m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withUnhealthyEtcdMember(), withUnhealthyAPIServerPod()) // Note: issues on etcd / API server have lower priority than the lack of a node.
+        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
+
+        unhealthyMachines := collections.FromMachines(m1, m2)
+
+        g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m2-unhealthy-"))
+    })
+
+    t.Run("returns the machines with etcd errors first (if there are no provisioning machines)", func(t *testing.T) {
+        g := NewWithT(t)
+
+        ns, err := env.CreateNamespace(ctx, "ns1")
+        g.Expect(err).ToNot(HaveOccurred())
+        defer func() {
+            g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
+        }()
+
+        m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember(), withUnhealthyAPIServerPod()) // Note: issues on the API server have lower priority than issues on etcd.
+        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withUnhealthyEtcdMember())

         unhealthyMachines := collections.FromMachines(m1, m2)

-        g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
+        g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m2-unhealthy-"))
     })

-    t.Run("returns the oldest of the provisioning machines", func(t *testing.T) {
+    t.Run("returns the machines with API server errors first (if there are no provisioning machines and no etcd issues)", func(t *testing.T) {
+        g := NewWithT(t)
+
+        ns, err := env.CreateNamespace(ctx, "ns1")
+        g.Expect(err).ToNot(HaveOccurred())
+        defer func() {
+            g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
+        }()
+
+        m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed(), withHealthyAPIServerPod())
+        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withUnhealthyAPIServerPod())
+
+        unhealthyMachines := collections.FromMachines(m1, m2)
+
+        g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m2-unhealthy-"))
+    })
+    t.Run("returns the oldest machine if there are no provisioning machines/no other elements affecting priority", func(t *testing.T) {
         g := NewWithT(t)

         ns, err := env.CreateNamespace(ctx, "ns1")
@@ -69,12 +102,11 @@ func TestGetMachineToBeRemediated(t *testing.T) {
         }()

         m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
-        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
-        m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
+        m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())

-        unhealthyMachines := collections.FromMachines(m1, m2, m3)
+        unhealthyMachines := collections.FromMachines(m1, m2)

-        g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m2-unhealthy-"))
+        g.Expect(getMachineToBeRemediated(unhealthyMachines, true).Name).To(HavePrefix("m1-unhealthy-"))
     })
 }

@@ -1973,6 +2005,12 @@ func withUnhealthyEtcdMember() machineOption {
     }
 }

+func withHealthyAPIServerPod() machineOption {
+    return func(machine *clusterv1.Machine) {
+        conditions.MarkTrue(machine, controlplanev1.MachineAPIServerPodHealthyCondition)
+    }
+}
+
 func withUnhealthyAPIServerPod() machineOption {
     return func(machine *clusterv1.Machine) {
         conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
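The new withHealthyAPIServerPod helper follows the functional-options pattern the test file already uses for createMachine. Below is a minimal, self-contained sketch of that pattern; the machine struct, newMachine, and the option helpers here are simplified stand-ins for illustration, not the real test helpers or cluster-api types.

package main

import "fmt"

// machine is a hypothetical stand-in for the object under construction.
type machine struct {
    name       string
    hasNodeRef bool
    mhcFailed  bool
}

// machineOption mutates a machine under construction, mirroring the
// machineOption type used in remediation_test.go.
type machineOption func(*machine)

func withoutNodeRef() machineOption {
    return func(m *machine) { m.hasNodeRef = false }
}

func withMachineHealthCheckFailed() machineOption {
    return func(m *machine) { m.mhcFailed = true }
}

// newMachine builds a machine with healthy defaults, then applies each option.
func newMachine(name string, opts ...machineOption) *machine {
    m := &machine{name: name, hasNodeRef: true}
    for _, opt := range opts {
        opt(m)
    }
    return m
}

func main() {
    m := newMachine("m2-unhealthy", withMachineHealthCheckFailed(), withoutNodeRef())
    fmt.Printf("%+v\n", *m) // {name:m2-unhealthy hasNodeRef:false mhcFailed:true}
}

The pattern keeps each test's setup declarative: defaults live in the constructor, and every option states exactly one deviation from a healthy machine.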
