
Commit f6aec64

Merge pull request #11797 from fabriziopandini/drop-retry-when-reading-KCP-conditions
🌱 Drop retry when reading KCP conditions
2 parents 043509c + 4f4f4ab commit f6aec64

2 files changed: +16 −66 lines changed

controlplane/kubeadm/internal/workload_cluster_conditions.go

Lines changed: 3 additions & 39 deletions
@@ -47,41 +47,8 @@ import (
 // This operation is best effort, in the sense that in case of problems in retrieving member status, it sets
 // the condition to Unknown state without returning any error.
 func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
-	shouldRetry := func() bool {
-		// if CP is scaling up or down.
-		if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
-			return true
-		}
-		// if CP machines are provisioning or deleting.
-		for _, m := range controlPlane.Machines {
-			if m.Status.NodeRef == nil {
-				return true
-			}
-			if !m.DeletionTimestamp.IsZero() {
-				return true
-			}
-		}
-		return false
-	}
-
 	if controlPlane.IsEtcdManaged() {
-		// Update etcd conditions.
-		// In case of well known temporary errors + control plane scaling up/down or rolling out, retry a few times.
-		// Note: it seems that reducing the number of them during every reconciles also improves stability,
-		// thus we are stopping doing retries (we only try once).
-		// However, we keep the code implementing retry support so we can easily revert this decision in a patch
-		// release if we need to.
-		maxRetry := 1
-		for i := range maxRetry {
-			retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
-			// if we should retry and there is a retry left, wait a bit.
-			if !retryableError || !shouldRetry() {
-				break
-			}
-			if i < maxRetry-1 {
-				time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
-			}
-		}
+		w.updateManagedEtcdConditions(ctx, controlPlane)
 		return
 	}
 	w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -97,7 +64,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
 	// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
 }

-func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
+func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
 	// NOTE: This methods uses control plane nodes only to get in contact with etcd but then it relies on etcd
 	// as ultimate source of truth for the list of members and for their health.
 	controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -121,7 +88,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 			Reason:  controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
 			Message: "Failed to get Nodes hosting the etcd cluster",
 		})
-		return retryableError
+		return
 	}

 	// Update conditions for etcd members on the nodes.
@@ -189,7 +156,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 			// (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).

 			// While scaling up/down or rolling out new CP machines this error might happen.
-			retryableError = true
 			continue
 		}

@@ -214,7 +180,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 			})

 			// While scaling up/down or rolling out new CP machines this error might happen because we are reading the list from different nodes at different time.
-			retryableError = true
 			continue
 		}

@@ -316,7 +281,6 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
 		trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
 		note:       "etcd member",
 	})
-	return retryableError
 }

 func unwrapAll(err error) error {
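
For orientation, this is roughly how UpdateEtcdConditions reads once the hunks above are applied. It is a sketch assembled only from the added and context lines visible in this diff; the imports, the Workload receiver, and the helper methods are assumed to be as in the rest of the file, and updateManagedEtcdConditions no longer returns a retryableError value.

// Sketch: UpdateEtcdConditions after dropping the retry loop (assembled from the diff above).
// The operation stays best effort: problems retrieving member status surface as Unknown conditions.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
	if controlPlane.IsEtcdManaged() {
		// Managed etcd: update member conditions exactly once per reconcile, no retries.
		w.updateManagedEtcdConditions(ctx, controlPlane)
		return
	}
	// External etcd: delegate to the dedicated helper.
	w.updateExternalEtcdConditions(ctx, controlPlane)
}
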

controlplane/kubeadm/internal/workload_cluster_conditions_test.go

Lines changed: 13 additions & 27 deletions
@@ -170,7 +170,6 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 		machines                    []*clusterv1.Machine
 		injectClient                client.Client // This test is injecting a fake client because it is required to create nodes with a controlled Status or to fail with a specific error.
 		injectEtcdClientGenerator   etcdClientFor // This test is injecting a fake etcdClientGenerator because it is required to nodes with a controlled Status or to fail with a specific error.
-		expectedRetryableError      bool
 		expectedKCPCondition        *clusterv1.Condition
 		expectedKCPV1Beta2Condition *metav1.Condition
 		expectedMachineConditions   map[string]clusterv1.Conditions
@@ -188,8 +187,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 			injectClient: &fakeClient{
 				listErr: errors.New("failed to list Nodes"),
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
+			expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the Node which is hosting the etcd member"),
@@ -220,8 +218,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					Items: []corev1.Node{*fakeNode("n1")},
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   nil,
+			expectedKCPCondition: nil,
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {},
 			},
@@ -251,8 +248,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					Items: []corev1.Node{*fakeNode("n1")},
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   nil,
+			expectedKCPCondition: nil,
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {},
 			},
@@ -280,8 +276,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					Items: []corev1.Node{*fakeNode("n1")},
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Control plane Node %s does not have a corresponding Machine", "n1"),
 			expectedKCPV1Beta2Condition: &metav1.Condition{
 				Type:   controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Condition,
 				Status: metav1.ConditionFalse,
@@ -305,8 +300,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 			injectEtcdClientGenerator: &fakeEtcdClientGenerator{
 				forNodesErr: errors.New("failed to get client for node"),
 			},
-			expectedRetryableError: true,
-			expectedKCPCondition:   conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
+			expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnknownReason, "Following Machines are reporting unknown etcd member status: m1"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to connect to the etcd Pod on the %s Node: failed to get client for node", "n1"),
@@ -346,8 +340,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					Errors: []string{"some errors"},
 				},
 			},
-			expectedRetryableError: true,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member status reports errors: %s", "some errors"),
@@ -387,8 +380,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					},
 				},
 			},
-			expectedRetryableError: true,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd member on the %s Node", "n1"),
@@ -437,8 +429,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					},
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m1"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Etcd member reports alarms: %s", "NOSPACE"),
@@ -519,8 +510,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					}
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -607,8 +597,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					}
 				},
 			},
-			expectedRetryableError: true,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -677,8 +666,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					}
 				},
 			},
-			expectedRetryableError: true,
-			expectedKCPCondition:   conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
+			expectedKCPCondition: conditions.FalseCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterUnhealthyReason, clusterv1.ConditionSeverityError, "Following Machines are reporting etcd member errors: %s", "m2"),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -765,8 +753,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 					}
 				},
 			},
-			expectedRetryableError: false,
-			expectedKCPCondition:   conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
+			expectedKCPCondition: conditions.TrueCondition(controlplanev1.EtcdClusterHealthyCondition),
 			expectedMachineConditions: map[string]clusterv1.Conditions{
 				"m1": {
 					*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
@@ -810,8 +797,7 @@ func TestUpdateManagedEtcdConditions(t *testing.T) {
 				Machines: collections.FromMachines(tt.machines...),
 			}

-			retryableError := w.updateManagedEtcdConditions(ctx, controlPane)
-			g.Expect(retryableError).To(Equal(tt.expectedRetryableError))
+			w.updateManagedEtcdConditions(ctx, controlPane)

 			if tt.expectedKCPCondition != nil {
 				g.Expect(*conditions.Get(tt.kcp, controlplanev1.EtcdClusterHealthyCondition)).To(conditions.MatchCondition(*tt.expectedKCPCondition))
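
To make the effect on the test table concrete, here is a sketch of how one case entry reads after the change, assembled from the struct fields and the first case shown in the hunks above. The case name and the kcp/machines fields are elided, and the helpers (fakeClient, conditions, controlplanev1) are assumed from the existing test file; the expectedRetryableError field is gone, so each case now only describes the expected conditions.

{
	// Hypothetical trimmed-down case: only fields visible in the hunks above are shown.
	injectClient: &fakeClient{
		listErr: errors.New("failed to list Nodes"),
	},
	expectedKCPCondition: conditions.UnknownCondition(controlplanev1.EtcdClusterHealthyCondition, controlplanev1.EtcdClusterInspectionFailedReason, "Failed to list Nodes which are hosting the etcd members"),
	expectedMachineConditions: map[string]clusterv1.Conditions{
		"m1": {
			*conditions.UnknownCondition(controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberInspectionFailedReason, "Failed to get the Node which is hosting the etcd member"),
		},
	},
},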
