Commit 9d01020

Retry in case of etcd errors in KCP
1 parent 48d23cd · commit 9d01020

2 files changed: +73 additions, −15 deletions


controlplane/kubeadm/internal/workload_cluster_conditions.go

Lines changed: 39 additions & 2 deletions
@@ -47,8 +47,38 @@ import (
 // This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
 // the condition to Unknown state without returning any error.
 func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
+    shouldRetry := func() bool {
+        // if CP is scaling up or down.
+        if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
+            return true
+        }
+        // if CP machines are provisioning or deleting.
+        for _, m := range controlPlane.Machines {
+            if m.Status.NodeRef == nil {
+                return true
+            }
+            if !m.DeletionTimestamp.IsZero() {
+                return true
+            }
+        }
+        return false
+    }
+
     if controlPlane.IsEtcdManaged() {
-        w.updateManagedEtcdConditions(ctx, controlPlane)
+        // Update etcd conditions.
+        // In case of well known temporary errors + control plane scaling up/down or rolling out, retry a few times.
+        // Note: this is required because there isn't a watch mechanism on etcd.
+        maxRetry := 3
+        for i := range maxRetry {
+            retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
+            if !retryableError {
+                break
+            }
+            // if we should retry and there is a retry left, wait a bit.
+            if shouldRetry() && i < maxRetry-1 {
+                time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
+            }
+        }
         return
     }
     w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -64,7 +94,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
     // As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
 }
 
-func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
+func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
     // NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd
     // as ultimate source of truth for the list of members and for their health.
     controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -154,6 +184,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
         if err != nil {
             // Note. even if we fail reading the member list from one node/etcd members we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
             // (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).
+
+            // While scaling up/down or rolling out new CP machines this error might happen.
+            retryableError = true
             continue
         }
 
@@ -176,6 +209,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
                 Reason:  controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
                 Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)),
             })
+
+            // While scaling up/down or rolling out new CP machines this error might happen because we are reading the list from different nodes at different time.
+            retryableError = true
             continue
         }
 
@@ -277,6 +313,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
         trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
         note:       "etcd member",
     })
+    return
 }
 
 func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
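
The retry loop introduced above uses a simple linear backoff: after a retryable failure it sleeps 250ms, then 500ms, and never sleeps after the last attempt (`for i := range maxRetry` is Go 1.22+ syntax for iterating 0..maxRetry-1). Below is a minimal, self-contained sketch of the same pattern; `doUpdate` is an illustrative stand-in for `updateManagedEtcdConditions`, not part of the actual KCP code:

package main

import (
    "fmt"
    "time"
)

// doUpdate stands in for a call that may hit a temporary error; it reports
// whether the failure is worth retrying.
func doUpdate(attempt int) (retryable bool) {
    fmt.Printf("attempt %d\n", attempt+1)
    return true // simulate a persistently retryable error
}

func main() {
    maxRetry := 3
    for i := 0; i < maxRetry; i++ { // equivalent to `for i := range maxRetry`
        if !doUpdate(i) {
            break // a non-retryable outcome stops the loop immediately
        }
        // Sleep only if another attempt is left: 250ms after the first
        // failure, 500ms after the second.
        if i < maxRetry-1 {
            time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
        }
    }
}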

controlplane/kubeadm/internal/workload_cluster_conditions_test.go

Lines changed: 34 additions & 13 deletions
@@ -49,6 +49,7 @@ func TestUpdateEtcdConditions(t *testing.T) {
         machines                    []*clusterv1.Machine
         injectClient                client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error.
         injectEtcdClientGenerator   etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error.
+        expectedRetryableError      bool
         expectedKCPCondition        *clusterv1.Condition
         expectedKCPV1Beta2Condition *metav1.Condition
         expectedMachineConditions   map[string]clusterv1.Conditions
@@ -66,7 +67,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
             injectClient: &fakeClient{
                 listErr: errors.New("failed to list Nodes"),
             },
-            expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
+            expectedRetryableError: false,
+            expectedKCPCondition:   conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the Node which is hosting the etcd member"),
@@ -97,7 +99,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     Items: []corev1.Node{*fakeNode("n1")},
                 },
             },
-            expectedKCPCondition: nil,
+            expectedRetryableError: false,
+            expectedKCPCondition:   nil,
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {},
             },
@@ -127,7 +130,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     Items: []corev1.Node{*fakeNode("n1")},
                 },
             },
-            expectedKCPCondition: nil,
+            expectedRetryableError: false,
+            expectedKCPCondition:   nil,
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {},
             },
@@ -155,7 +159,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     Items: []corev1.Node{*fakeNode("n1")},
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
+            expectedRetryableError: false,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
             expectedKCPV1Beta2Condition: &metav1.Condition{
                 Type:   controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition,
                 Status: metav1.ConditionFalse,
@@ -179,7 +184,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
             injectEtcdClientGenerator: &fakeEtcdClientGenerator{
                 forNodesErr: errors.New("failed to get client for node"),
             },
-            expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
+            expectedRetryableError: true,
+            expectedKCPCondition:   conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd Pod on the %s Node: failed to get client for node", "n1"),
@@ -219,7 +225,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     Errors: []string{"some errors"},
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+            expectedRetryableError: true,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", "some errors"),
@@ -259,7 +266,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     },
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+            expectedRetryableError: true,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd member on the %s Node", "n1"),
@@ -308,7 +316,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     },
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+            expectedRetryableError: false,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", "NOSPACE"),
@@ -389,7 +398,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     }
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+            expectedRetryableError: false,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -476,7 +486,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     }
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+            expectedRetryableError: true,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -545,7 +556,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     }
                 },
             },
-            expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+            expectedRetryableError: true,
+            expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -632,7 +644,8 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     }
                 },
             },
-            expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
+            expectedRetryableError: false,
+            expectedKCPCondition:   conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
             expectedMachineConditions: map[string]clusterv1.Conditions{
                 "m1": {
                     *conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -672,6 +685,7 @@ func TestUpdateEtcdConditions(t *testing.T) {
                     },
                 },
             },
+            expectedRetryableError:               false,
             expectedKCPCondition:                 conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
             expectedKCPV1Beta2Condition:          nil,
             expectedEtcdMembersAgreeOnMemberList: false,
@@ -694,7 +708,14 @@ func TestUpdateEtcdConditions(t *testing.T) {
                 KCP:      tt.kcp,
                 Machines: collections.FromMachines(tt.machines...),
             }
-            w.UpdateEtcdConditions(ctx, controlPane)
+
+            retryableError := false
+            if tt.kcp.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || tt.kcp.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil {
+                retryableError = w.updateManagedEtcdConditions(ctx, controlPane)
+            } else {
+                w.updateExternalEtcdConditions(ctx, controlPane)
+            }
+            g.Expect(retryableError).To(Equal(tt.expectedRetryableError))
 
             if tt.expectedKCPCondition != nil {
                 g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
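
The test now calls `updateManagedEtcdConditions` directly (or `updateExternalEtcdConditions` for external etcd) and asserts the returned `retryableError` flag per case. Here is a minimal, self-contained sketch of the contract this return value gives the retry loop, using a hypothetical stub rather than the project's fakes: retryable results consume retries, and the loop stops as soon as a non-retryable result arrives.

package main

import "fmt"

func main() {
    calls := 0
    // update is a hypothetical stub: it reports a retryable error on the
    // first two calls and a non-retryable (final) result on the third.
    update := func() (retryable bool) {
        calls++
        return calls <= 2
    }

    maxRetry := 3
    for i := 0; i < maxRetry; i++ {
        if !update() {
            break
        }
    }
    fmt.Println(calls) // 3: two retryable results, then the loop stopped
}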
