
🌱 Retry in case of etcd errors in KCP #11450


Merged
43 changes: 40 additions & 3 deletions controlplane/kubeadm/internal/workload_cluster_conditions.go
@@ -47,8 +47,38 @@ import (
// This operation is best effort, in the sense that in case of problems retrieving member status, it sets
// the condition to Unknown state without returning any error.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
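// shouldRetry reports whether transient etcd errors are expected, i.e. while the control plane is
// scaling up/down or Machines are still provisioning or being deleted.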
shouldRetry := func() bool {
// if CP is scaling up or down.
if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
return true
}
// if CP machines are provisioning or deleting.
for _, m := range controlPlane.Machines {
if m.Status.NodeRef == nil {
return true
}
if !m.DeletionTimestamp.IsZero() {
return true
}
}
return false
}

if controlPlane.IsEtcdManaged() {
w.updateManagedEtcdConditions(ctx, controlPlane)
// Update etcd conditions.
// In case of well-known temporary errors while the control plane is scaling up/down or rolling out, retry a few times.
// Note: this is required because there isn't a watch mechanism on etcd.
maxRetry := 3
for i := range maxRetry {
retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
// Stop as soon as the error is not retryable, or the control plane is no longer scaling up/down or rolling out.
if !retryableError || !shouldRetry() {
break
}
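// Linear backoff: wait 250ms after the first failed attempt, 500ms after the second; no sleep after the last attempt.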
if i < maxRetry-1 {
time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
}
}
return
}
w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -64,7 +94,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
}

func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
// NOTE: This method uses control plane nodes only to get in contact with etcd, but then it relies on etcd
// as ultimate source of truth for the list of members and for their health.
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -88,7 +118,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
Message: "Failed to get Nodes hosting the etcd cluster",
})
return
return retryableError
}

// Update conditions for etcd members on the nodes.
@@ -154,6 +184,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
if err != nil {
// Note: even if we fail to read the member list from one node/etcd member, we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
// (this info is computed from what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).

// While scaling up/down or rolling out new CP machines, this error might happen.
retryableError = true
continue
}

@@ -176,6 +209,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)),
})

// While scaling up/down or rolling out new CP machines, this error might happen because we are reading the list from different nodes at different times.
retryableError = true
continue
}

@@ -277,6 +313,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
note: "etcd member",
})
return retryableError
}

func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
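
For reference, the retry flow added above can be read as the following minimal, self-contained sketch. This is an illustration under assumptions, not code from the PR: checkEtcd stands in for updateManagedEtcdConditions and isConverging for the shouldRetry closure; both names are hypothetical, and ranging over an int requires Go 1.22+.

package main

import (
	"fmt"
	"time"
)

// checkEtcd is a hypothetical stand-in for updateManagedEtcdConditions:
// it reports whether it hit a well-known transient error worth retrying.
func checkEtcd(attempt int) (retryableError bool) {
	// Simulate transient failures on the first two attempts.
	return attempt < 2
}

// isConverging is a hypothetical stand-in for the shouldRetry closure:
// it reports whether the control plane is still scaling or rolling out.
func isConverging() bool {
	return true
}

func main() {
	maxRetry := 3
	for i := range maxRetry {
		retryableError := checkEtcd(i)
		// Stop as soon as the result is final or a retry cannot help.
		if !retryableError || !isConverging() {
			break
		}
		// Linear backoff (250ms, then 500ms); skip the sleep after the last attempt.
		if i < maxRetry-1 {
			time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
		}
	}
	fmt.Println("etcd conditions updated (best effort)")
}

With maxRetry = 3, the loop performs at most three inspection passes and sleeps at most 750ms in total (250ms + 500ms), which bounds the extra latency a retry can add to a single reconcile.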