Skip to content

Commit ed90dd9

Browse files
authored
Fix Race conditions in Targeted Deletion of machines by CA (#965)
* read TriggerDeletionByMCM on MCD and set MachinePriority=1 annotation on machines * fixed dup import issue * improved logging+annotation clear * addressed final review comments * removed todo, added unit-test binary files to .gitignore * changed to better var names * reverted code review change - nil/empty check is necessary
1 parent 97fa4ee commit ed90dd9

File tree

4 files changed

+96
-2
lines changed

4 files changed

+96
-2
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,6 @@ hack/tools/bin/
3434

3535
# gosec
3636
gosec-report.sarif
37+
38+
# ignore test binary fines
39+
**/*.test

pkg/controller/deployment.go

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,16 @@ package controller
2525
import (
2626
"context"
2727
"fmt"
28+
"github.com/gardener/machine-controller-manager/pkg/util/annotations"
29+
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
30+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2831
"reflect"
2932
"sync"
3033
"time"
3134

3235
"k8s.io/klog/v2"
3336

3437
v1 "k8s.io/api/core/v1"
35-
"k8s.io/apimachinery/pkg/api/errors"
3638
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3739
"k8s.io/apimachinery/pkg/labels"
3840
"k8s.io/apimachinery/pkg/types"
@@ -422,7 +424,7 @@ func (dc *controller) reconcileClusterMachineDeployment(key string) error {
422424
return err
423425
}
424426
deployment, err := dc.controlMachineClient.MachineDeployments(dc.namespace).Get(ctx, name, metav1.GetOptions{})
425-
if errors.IsNotFound(err) {
427+
if apierrors.IsNotFound(err) {
426428
klog.V(4).Infof("Deployment %v has been deleted", key)
427429
return nil
428430
}
@@ -512,6 +514,11 @@ func (dc *controller) reconcileClusterMachineDeployment(key string) error {
512514
return err
513515
}
514516

517+
err = dc.setMachinePriorityAnnotationAndUpdateTriggeredForDeletion(ctx, d)
518+
if err != nil {
519+
return err
520+
}
521+
515522
if d.Spec.Paused {
516523
klog.V(3).Infof("Scaling detected for machineDeployment %s which is paused", d.Name)
517524
return dc.sync(ctx, d, machineSets, machineMap)
@@ -604,3 +611,58 @@ func (dc *controller) updateMachineDeploymentFinalizers(ctx context.Context, mac
604611
dc.updateMachineDeploymentFinalizers(ctx, machineDeployment, finalizers)
605612
}
606613
}
614+
615+
func (dc *controller) setMachinePriorityAnnotationAndUpdateTriggeredForDeletion(ctx context.Context, mcd *v1alpha1.MachineDeployment) error {
616+
var triggerForDeletionMachineNames, skipTriggerForDeletionMachineNames []string
617+
triggerForDeletionMachineNames = annotations.GetMachineNamesTriggeredForDeletion(mcd)
618+
if len(triggerForDeletionMachineNames) == 0 {
619+
return nil
620+
}
621+
klog.V(3).Infof("MachineDeployment %q has #%d machine(s) marked for deletion, triggerForDeletionMachineNames=%v", mcd.Name, len(triggerForDeletionMachineNames), triggerForDeletionMachineNames)
622+
for _, machineName := range triggerForDeletionMachineNames {
623+
mc, err := dc.machineLister.Machines(dc.namespace).Get(machineName)
624+
if apierrors.IsNotFound(err) {
625+
klog.V(4).Infof("Machine %q is not found in MachineDeployment %q - skip setting MachinePriority=1 annotation", machineName, mcd.Name)
626+
skipTriggerForDeletionMachineNames = append(skipTriggerForDeletionMachineNames, machineName)
627+
continue
628+
}
629+
if machineutils.IsMachineFailedOrTerminating(mc) {
630+
klog.V(4).Infof("Machine %q of MachineDeployment %q is in Failed/Terminating state - skip setting MachinePriority=1 annotation", machineName, mcd.Name)
631+
skipTriggerForDeletionMachineNames = append(skipTriggerForDeletionMachineNames, machineName)
632+
continue
633+
}
634+
if mc.Annotations[machineutils.MachinePriority] == "1" {
635+
klog.V(4).Infof("Machine %q of MachineDeployment %q already marked with MachinePriority=1 annotation", machineName, mcd.Name)
636+
continue
637+
}
638+
mcAdjust := mc.DeepCopy()
639+
mcAdjust.Annotations[machineutils.MachinePriority] = "1"
640+
_, err = dc.controlMachineClient.Machines(mcAdjust.Namespace).Update(ctx, mcAdjust, metav1.UpdateOptions{})
641+
if err != nil {
642+
klog.Errorf("Failed to set MachinePriority=1 annotation on Machine %q of MachineDeployment %q: %v", machineName, mcd.Name, err)
643+
return err
644+
}
645+
klog.V(3).Infof("Machine %q of MachineDeployment %q marked with MachinePriority=1 annotation successfully", machineName, mcd.Name)
646+
}
647+
648+
if len(skipTriggerForDeletionMachineNames) == 0 {
649+
return nil
650+
}
651+
652+
triggerForDeletionMachineNames = sets.NewString(triggerForDeletionMachineNames...).Delete(skipTriggerForDeletionMachineNames...).List()
653+
triggerDeletionAnnotValue := annotations.CreateMachinesTriggeredForDeletionAnnotValue(triggerForDeletionMachineNames)
654+
655+
mcdAdjust := mcd.DeepCopy()
656+
if triggerDeletionAnnotValue != "" {
657+
mcdAdjust.Annotations[machineutils.TriggerDeletionByMCM] = triggerDeletionAnnotValue
658+
} else {
659+
delete(mcdAdjust.Annotations, machineutils.TriggerDeletionByMCM)
660+
}
661+
_, err := dc.controlMachineClient.MachineDeployments(mcd.Namespace).Update(ctx, mcdAdjust, metav1.UpdateOptions{})
662+
if err != nil {
663+
klog.Errorf("Failed to update MachineDeployment %q with #%d machine names still pending deletion, triggerDeletionAnnotValue=%q", mcd.Name, len(triggerForDeletionMachineNames), triggerDeletionAnnotValue)
664+
return err
665+
}
666+
klog.V(3).Infof("Updated MachineDeployment %q with #%d machine names still pending deletion, triggerDeletionAnnotValue=%q", mcd.Name, len(triggerForDeletionMachineNames), triggerDeletionAnnotValue)
667+
return nil
668+
}

pkg/util/annotations/annotations.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
package annotations
77

88
import (
9+
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
10+
"github.com/gardener/machine-controller-manager/pkg/util/provider/machineutils"
911
v1 "k8s.io/api/core/v1"
12+
"slices"
13+
"strings"
1014
)
1115

1216
// AddOrUpdateAnnotation tries to add an annotation. Returns a new copy of updated Node and true if something was updated
@@ -68,3 +72,17 @@ func DeleteAnnotation(nodeAnnotations map[string]string, annotations map[string]
6872
}
6973
return newAnnotations, deleted
7074
}
75+
76+
// GetMachineNamesTriggeredForDeletion returns the set of machine names contained within the machineutils.TriggerDeletionByMCM annotation on the given MachineDeployment
77+
func GetMachineNamesTriggeredForDeletion(mcd *v1alpha1.MachineDeployment) []string {
78+
if mcd.Annotations == nil || mcd.Annotations[machineutils.TriggerDeletionByMCM] == "" {
79+
return nil
80+
}
81+
return strings.Split(mcd.Annotations[machineutils.TriggerDeletionByMCM], ",")
82+
}
83+
84+
// CreateMachinesTriggeredForDeletionAnnotValue constructs the annotation value for machineutils.TriggerDeletionByMCM from the given machine names.
85+
func CreateMachinesTriggeredForDeletionAnnotValue(machineNames []string) string {
86+
slices.Sort(machineNames)
87+
return strings.Join(machineNames, ",")
88+
}

pkg/util/provider/machineutils/utils.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package machineutils
77

88
import (
9+
"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
910
"time"
1011

1112
v1 "k8s.io/api/core/v1"
@@ -49,6 +50,8 @@ const (
4950
NotManagedByMCM = "node.machine.sapcloud.io/not-managed-by-mcm"
5051

5152
// TriggerDeletionByMCM annotation on the node would trigger the deletion of the corresponding machine object in the control cluster
53+
// This annotation can also be set on the MachineDeployment and contains the machine names for which deletion should be triggered.
54+
// This feature is leveraged by the CA-MCM cloud provider.
5255
TriggerDeletionByMCM = "node.machine.sapcloud.io/trigger-deletion-by-mcm"
5356

5457
// NodeUnhealthy is a node termination reason for failed machines
@@ -86,3 +89,11 @@ const (
8689
// EssentialTaints are taints on node object which if added/removed, require an immediate reconcile by machine controller
8790
// TODO: update this when taints for ALT updation and PostCreate operations is introduced.
8891
var EssentialTaints = []string{TaintNodeCriticalComponentsNotReady}
92+
93+
// IsMachineFailedOrTerminating returns true if machine is Failed or already being Terminated.
94+
func IsMachineFailedOrTerminating(machine *v1alpha1.Machine) bool {
95+
if !machine.GetDeletionTimestamp().IsZero() || machine.Status.CurrentStatus.Phase == v1alpha1.MachineFailed {
96+
return true
97+
}
98+
return false
99+
}

0 commit comments

Comments
 (0)