Skip to content

Commit 518fce7

Browse files
authored
🐛 KCP: remove etcd member in pre-terminate hook (#11137)
* KCP: remove etcd member in pre-terminate hook * fix review findings & add unit tests * Add unit tests for reconcilePreTerminateHook * Add unit tests for reconcilePreTerminateHook - fixups * fixup
1 parent 1f5bc7a commit 518fce7

16 files changed

+684
-102
lines changed

controlplane/kubeadm/api/v1beta1/kubeadm_control_plane_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ const (
6565
// failures in updating remediation retry (the counter restarts from zero).
6666
RemediationForAnnotation = "controlplane.cluster.x-k8s.io/remediation-for"
6767

68+
// PreTerminateHookCleanupAnnotation is the annotation KCP sets on Machines to ensure it can later remove the
69+
// etcd member right before Machine termination (i.e. before InfraMachine deletion).
70+
// Note: Starting with Kubernetes v1.31 this hook will wait for all other pre-terminate hooks to finish to
71+
// ensure it runs last (thus ensuring that kubelet is still working while other pre-terminate hooks run).
72+
PreTerminateHookCleanupAnnotation = clusterv1.PreTerminateDeleteHookAnnotationPrefix + "/kcp-cleanup"
73+
6874
// DefaultMinHealthyPeriod defines the default minimum period before we consider a remediation on a
6975
// machine unrelated from the previous remediation.
7076
DefaultMinHealthyPeriod = 1 * time.Hour

controlplane/kubeadm/internal/control_plane.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,11 @@ func (c *ControlPlane) HasDeletingMachine() bool {
164164
return len(c.Machines.Filter(collections.HasDeletionTimestamp)) > 0
165165
}
166166

167+
// DeletingMachines returns machines in the control plane that are in the process of being deleted.
168+
func (c *ControlPlane) DeletingMachines() collections.Machines {
169+
return c.Machines.Filter(collections.HasDeletionTimestamp)
170+
}
171+
167172
// GetKubeadmConfig returns the KubeadmConfig of a given machine.
168173
func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.KubeadmConfig, bool) {
169174
kubeadmConfig, ok := c.KubeadmConfigs[machineName]

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 122 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,10 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
397397
return ctrl.Result{}, err
398398
}
399399

400+
if result, err := r.reconcilePreTerminateHook(ctx, controlPlane); err != nil || !result.IsZero() {
401+
return result, err
402+
}
403+
400404
// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
401405
// otherwise continue with the other KCP operations.
402406
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
@@ -565,12 +569,24 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con
565569
// Delete control plane machines in parallel
566570
machinesToDelete := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp))
567571
var errs []error
568-
for i := range machinesToDelete {
569-
m := machinesToDelete[i]
570-
logger := log.WithValues("Machine", klog.KObj(m))
571-
if err := r.Client.Delete(ctx, machinesToDelete[i]); err != nil && !apierrors.IsNotFound(err) {
572-
logger.Error(err, "Failed to cleanup owned machine")
572+
for _, machineToDelete := range machinesToDelete {
573+
log := log.WithValues("Machine", klog.KObj(machineToDelete))
574+
ctx := ctrl.LoggerInto(ctx, log)
575+
576+
// During KCP deletion we don't care about forwarding etcd leadership or removing etcd members.
577+
// So we are removing the pre-terminate hook.
578+
// This is important because when deleting KCP we will delete all members of etcd and it's not possible
579+
// to forward etcd leadership without any member left after we went through the Machine deletion.
580+
// Also in this case the reconcileDelete code of the Machine controller won't execute Node drain
581+
// and wait for volume detach.
582+
if err := r.removePreTerminateHookAnnotationFromMachine(ctx, machineToDelete); err != nil {
573583
errs = append(errs, err)
584+
continue
585+
}
586+
587+
log.Info("Deleting control plane Machine")
588+
if err := r.Client.Delete(ctx, machineToDelete); err != nil && !apierrors.IsNotFound(err) {
589+
errs = append(errs, errors.Wrapf(err, "failed to delete control plane Machine %s", klog.KObj(machineToDelete)))
574590
}
575591
}
576592
if len(errs) > 0 {
@@ -583,6 +599,18 @@ func (r *KubeadmControlPlaneReconciler) reconcileDelete(ctx context.Context, con
583599
return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
584600
}
585601

602+
func (r *KubeadmControlPlaneReconciler) removePreTerminateHookAnnotationFromMachine(ctx context.Context, machine *clusterv1.Machine) error {
603+
log := ctrl.LoggerFrom(ctx)
604+
log.Info("Removing pre-terminate hook from control plane Machine")
605+
606+
machineOriginal := machine.DeepCopy()
607+
delete(machine.Annotations, controlplanev1.PreTerminateHookCleanupAnnotation)
608+
if err := r.Client.Patch(ctx, machine, client.MergeFrom(machineOriginal)); err != nil {
609+
return errors.Wrapf(err, "failed to remove pre-terminate hook from control plane Machine %s", klog.KObj(machine))
610+
}
611+
return nil
612+
}
613+
586614
// ClusterToKubeadmControlPlane is a handler.ToRequestsFunc to be used to enqueue requests for reconciliation
587615
// for KubeadmControlPlane based on updates to a Cluster.
588616
func (r *KubeadmControlPlaneReconciler) ClusterToKubeadmControlPlane(_ context.Context, o client.Object) []ctrl.Request {
@@ -791,6 +819,95 @@ func (r *KubeadmControlPlaneReconciler) reconcileEtcdMembers(ctx context.Context
791819
return nil
792820
}
793821

822+
func (r *KubeadmControlPlaneReconciler) reconcilePreTerminateHook(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
823+
if !controlPlane.HasDeletingMachine() {
824+
return ctrl.Result{}, nil
825+
}
826+
827+
log := ctrl.LoggerFrom(ctx)
828+
829+
// Return early, if there is already a deleting Machine without the pre-terminate hook.
830+
// We are going to wait until this Machine goes away before running the pre-terminate hook on other Machines.
831+
for _, deletingMachine := range controlPlane.DeletingMachines() {
832+
if _, exists := deletingMachine.Annotations[controlplanev1.PreTerminateHookCleanupAnnotation]; !exists {
833+
return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
834+
}
835+
}
836+
837+
// Pick the Machine with the oldest deletionTimestamp to keep this function deterministic / reentrant
838+
// so we only remove the pre-terminate hook from one Machine at a time.
839+
deletingMachine := controlPlane.DeletingMachines().OldestDeletionTimestamp()
840+
log = log.WithValues("Machine", klog.KObj(deletingMachine))
841+
ctx = ctrl.LoggerInto(ctx, log)
842+
843+
parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
844+
if err != nil {
845+
return ctrl.Result{}, errors.Wrapf(err, "failed to parse Kubernetes version %q", controlPlane.KCP.Spec.Version)
846+
}
847+
848+
// Return early if there are other pre-terminate hooks for the Machine.
849+
// The KCP pre-terminate hook should be the one executed last, so that kubelet
850+
// is still working while other pre-terminate hooks are run.
851+
// Note: This is done only for Kubernetes >= v1.31 to reduce the blast radius of this check.
852+
if version.Compare(parsedVersion, semver.MustParse("1.31.0"), version.WithoutPreReleases()) >= 0 {
853+
if machineHasOtherPreTerminateHooks(deletingMachine) {
854+
return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
855+
}
856+
}
857+
858+
// Return early because the Machine controller is not yet waiting for the pre-terminate hook.
859+
c := conditions.Get(deletingMachine, clusterv1.PreTerminateDeleteHookSucceededCondition)
860+
if c == nil || c.Status != corev1.ConditionFalse || c.Reason != clusterv1.WaitingExternalHookReason {
861+
return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
862+
}
863+
864+
// The following will execute and remove the pre-terminate hook from the Machine.
865+
866+
// If we have more than 1 Machine and etcd is managed we forward etcd leadership and remove the member
867+
// to keep the etcd cluster healthy.
868+
if controlPlane.Machines.Len() > 1 && controlPlane.IsEtcdManaged() {
869+
workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
870+
if err != nil {
871+
return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s: failed to create client to workload cluster", klog.KObj(deletingMachine))
872+
}
873+
874+
// Note: In regular deletion cases (remediation, scale down) the leader should have been already moved.
875+
// We're doing this again here in case the Machine became leader again or the Machine deletion was
876+
// triggered in another way (e.g. a user running kubectl delete machine)
877+
etcdLeaderCandidate := controlPlane.Machines.Filter(collections.Not(collections.HasDeletionTimestamp)).Newest()
878+
if etcdLeaderCandidate != nil {
879+
if err := workloadCluster.ForwardEtcdLeadership(ctx, deletingMachine, etcdLeaderCandidate); err != nil {
880+
return ctrl.Result{}, errors.Wrapf(err, "failed to move leadership to candidate Machine %s", etcdLeaderCandidate.Name)
881+
}
882+
} else {
883+
log.Info("Skip forwarding etcd leadership, because there is no other control plane Machine without a deletionTimestamp")
884+
}
885+
886+
// Note: Removing the etcd member will lead to the etcd and the kube-apiserver Pod on the Machine shutting down.
887+
// If ControlPlaneKubeletLocalMode is used, the kubelet is communicating with the local apiserver and thus now
888+
// won't be able to see any updates to e.g. Pods anymore.
889+
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, deletingMachine); err != nil {
890+
return ctrl.Result{}, errors.Wrapf(err, "failed to remove etcd member for deleting Machine %s", klog.KObj(deletingMachine))
891+
}
892+
}
893+
894+
if err := r.removePreTerminateHookAnnotationFromMachine(ctx, deletingMachine); err != nil {
895+
return ctrl.Result{}, err
896+
}
897+
898+
log.Info("Waiting for Machines to be deleted", "machines", strings.Join(controlPlane.Machines.Filter(collections.HasDeletionTimestamp).Names(), ", "))
899+
return ctrl.Result{RequeueAfter: deleteRequeueAfter}, nil
900+
}
901+
902+
func machineHasOtherPreTerminateHooks(machine *clusterv1.Machine) bool {
903+
for k := range machine.Annotations {
904+
if strings.HasPrefix(k, clusterv1.PreTerminateDeleteHookAnnotationPrefix) && k != controlplanev1.PreTerminateHookCleanupAnnotation {
905+
return true
906+
}
907+
}
908+
return false
909+
}
910+
794911
func (r *KubeadmControlPlaneReconciler) reconcileCertificateExpiries(ctx context.Context, controlPlane *internal.ControlPlane) error {
795912
log := ctrl.LoggerFrom(ctx)
796913

0 commit comments

Comments
 (0)