
🌱 Retry in case of etcd errors in KCP #11450


Merged
43 changes: 40 additions & 3 deletions controlplane/kubeadm/internal/workload_cluster_conditions.go
@@ -47,8 +47,38 @@ import (
// This operation is best effort, in the sense that in case of problems retrieving member status, it sets
// the condition to Unknown state without returning any error.
func (w *Workload) UpdateEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
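// shouldRetry reports whether transient etcd errors are expected, i.e. while the control plane is
// scaling up/down or Machines are still provisioning or being deleted.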
shouldRetry := func() bool {
// if CP is scaling up or down.
if ptr.Deref(controlPlane.KCP.Spec.Replicas, 0) != int32(len(controlPlane.Machines)) {
return true
}
// if CP machines are provisioning or deleting.
for _, m := range controlPlane.Machines {
if m.Status.NodeRef == nil {
return true
}
if !m.DeletionTimestamp.IsZero() {
return true
}
}
return false
}

if controlPlane.IsEtcdManaged() {
w.updateManagedEtcdConditions(ctx, controlPlane)
// Update etcd conditions.
// In case of well-known temporary errors while the control plane is scaling up/down or rolling out, retry a few times.
// Note: this is required because there isn't a watch mechanism on etcd.
maxRetry := 3
for i := range maxRetry {
retryableError := w.updateManagedEtcdConditions(ctx, controlPlane)
// Stop as soon as the error is not retryable, or the control plane is no longer scaling up/down or rolling out.
if !retryableError || !shouldRetry() {
break
}
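// Linear backoff: wait 250ms after the first failed attempt, 500ms after the second; no sleep after the last attempt.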
if i < maxRetry-1 {
time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
}
}
return
}
w.updateExternalEtcdConditions(ctx, controlPlane)
@@ -64,7 +94,7 @@ func (w *Workload) updateExternalEtcdConditions(_ context.Context, controlPlane
// As soon as the v1beta1 condition above will be removed, we should drop this func entirely.
}

func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) {
func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane *ControlPlane) (retryableError bool) {
// NOTE: This method uses control plane nodes only to get in contact with etcd, but then it relies on etcd
// as ultimate source of truth for the list of members and for their health.
controlPlaneNodes, err := w.getControlPlaneNodes(ctx)
@@ -88,7 +118,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneEtcdClusterInspectionFailedV1Beta2Reason,
Message: "Failed to get Nodes hosting the etcd cluster",
})
return
return retryableError
}

// Update conditions for etcd members on the nodes.
@@ -154,6 +184,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
if err != nil {
// Note: even if we fail to read the member list from one node/etcd member, we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
// (this info is computed from what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).

// While scaling up/down or rolling out new CP machines, this error might happen.
retryableError = true
continue
}

@@ -176,6 +209,9 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
Message: fmt.Sprintf("The etcd member hosted on this Machine reports the cluster is composed by %s, but all previously seen etcd members are reporting %s", etcdutil.MemberNames(currentMembers), etcdutil.MemberNames(controlPlane.EtcdMembers)),
})

// While scaling up/down or rolling out new CP machines, this error might happen because we are reading the list from different nodes at different times.
retryableError = true
continue
}

@@ -277,6 +313,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
trueReason: controlplanev1.KubeadmControlPlaneEtcdClusterHealthyV1Beta2Reason,
note: "etcd member",
})
return retryableError
}

func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
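
For reference, the retry flow added above can be read as the following minimal, self-contained sketch. This is an illustration under assumptions, not code from the PR: checkEtcd stands in for updateManagedEtcdConditions and isConverging for the shouldRetry closure; both names are hypothetical, and ranging over an int requires Go 1.22+.

package main

import (
	"fmt"
	"time"
)

// checkEtcd is a hypothetical stand-in for updateManagedEtcdConditions:
// it reports whether it hit a well-known transient error worth retrying.
func checkEtcd(attempt int) (retryableError bool) {
	// Simulate transient failures on the first two attempts.
	return attempt < 2
}

// isConverging is a hypothetical stand-in for the shouldRetry closure:
// it reports whether the control plane is still scaling or rolling out.
func isConverging() bool {
	return true
}

func main() {
	maxRetry := 3
	for i := range maxRetry {
		retryableError := checkEtcd(i)
		// Stop as soon as the result is final or a retry cannot help.
		if !retryableError || !isConverging() {
			break
		}
		// Linear backoff (250ms, then 500ms); skip the sleep after the last attempt.
		if i < maxRetry-1 {
			time.Sleep(time.Duration(250*(i+1)) * time.Millisecond)
		}
	}
	fmt.Println("etcd conditions updated (best effort)")
}

With maxRetry = 3, the loop performs at most three inspection passes and sleeps at most 750ms in total (250ms + 500ms), which bounds the extra latency a retry can add to a single reconcile.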