
Commit c333db6

Merge pull request #11434 from fabriziopandini/refine-v1beta2-stale-deletion-messages
🌱 Refine v1beta2 stale deletion messages
2 parents 66b6a64 + 14430e0 commit c333db6

8 files changed: +341 −141 lines changed

controlplane/kubeadm/internal/controllers/status.go

Lines changed: 29 additions & 2 deletions
@@ -25,6 +25,7 @@ import (
 
 	"github.com/pkg/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 
@@ -702,9 +703,26 @@ func aggregateStaleMachines(machines collections.Machines) string {
 	}
 
 	machineNames := []string{}
+	delayReasons := sets.Set[string]{}
 	for _, machine := range machines {
-		if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 {
+		if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
 			machineNames = append(machineNames, machine.GetName())
+
+			deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)
+			if deletingCondition != nil &&
+				deletingCondition.Status == metav1.ConditionTrue &&
+				deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
+				machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+				if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+					delayReasons.Insert("PodDisruptionBudgets")
+				}
+				if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+					delayReasons.Insert("Pods not terminating")
+				}
+				if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+					delayReasons.Insert("Pod eviction errors")
+				}
+			}
 		}
 	}
 
@@ -725,7 +743,16 @@ func aggregateStaleMachines(machines collections.Machines) string {
 	} else {
 		message += " are "
 	}
-	message += "in deletion since more than 30m"
+	message += "in deletion since more than 15m"
+	if len(delayReasons) > 0 {
+		reasonList := []string{}
+		for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
+			if delayReasons.Has(r) {
+				reasonList = append(reasonList, r)
+			}
+		}
+		message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", "))
+	}
 
 	return message
 }
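For readers skimming the diff, here is a minimal standalone sketch of the classification this hunk adds to aggregateStaleMachines: the Deleting condition message of each stale Machine is matched against three fixed substrings and mapped to short, stable delay reasons. The helper name and the sample drain message below are illustrative assumptions, not code from the commit.

package main

import (
	"fmt"
	"strings"
)

// delayReasonsFor mirrors the substring checks that aggregateStaleMachines now
// performs on a Machine's Deleting condition message. Helper name and sample
// input are illustrative only.
func delayReasonsFor(deletingMessage string) []string {
	reasons := []string{}
	if strings.Contains(deletingMessage, "cannot evict pod as it would violate the pod's disruption budget.") {
		reasons = append(reasons, "PodDisruptionBudgets")
	}
	if strings.Contains(deletingMessage, "deletionTimestamp set, but still not removed from the Node") {
		reasons = append(reasons, "Pods not terminating")
	}
	if strings.Contains(deletingMessage, "failed to evict Pod") {
		reasons = append(reasons, "Pod eviction errors")
	}
	return reasons
}

func main() {
	msg := "Drain not completed yet (started at 2024-10-09T16:13:59Z):\n" +
		"* Pod pod-a: cannot evict pod as it would violate the pod's disruption budget.\n" +
		"* Pod pod-b: failed to evict Pod, some other error"
	fmt.Println(strings.Join(delayReasonsFor(msg), ", "))
	// Prints: PodDisruptionBudgets, Pod eviction errors
}

Because the reasons are collected into a set and then emitted in a fixed order, many stale Machines sharing the same root cause still collapse into one stable message, which keeps the aggregated KubeadmControlPlane conditions readable.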

controlplane/kubeadm/internal/controllers/status_test.go

Lines changed: 2 additions & 2 deletions
@@ -383,7 +383,7 @@ func Test_setScalingDownCondition(t *testing.T) {
 				Status: metav1.ConditionTrue,
 				Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason,
 				Message: "Scaling down from 3 to 1 replicas is blocked because:\n" +
-					"* Machine m1 is in deletion since more than 30m",
+					"* Machine m1 is in deletion since more than 15m",
 			},
 		},
 		{
@@ -404,7 +404,7 @@ func Test_setScalingDownCondition(t *testing.T) {
 				Status: metav1.ConditionTrue,
 				Reason: controlplanev1.KubeadmControlPlaneScalingDownV1Beta2Reason,
 				Message: "Scaling down from 3 to 1 replicas is blocked because:\n" +
-					"* Machines m1, m2 are in deletion since more than 30m",
+					"* Machines m1, m2 are in deletion since more than 15m",
 			},
 		},
 		{

internal/controllers/machine/machine_controller_status.go

Lines changed: 22 additions & 17 deletions
@@ -678,28 +678,33 @@ func setReadyCondition(ctx context.Context, machine *clusterv1.Machine) {
 // message in the summary.
 // This is also important to ensure we have a limited amount of unique messages across Machines thus allowing to
 // nicely aggregate Ready conditions from many Machines into the MachinesReady condition of e.g. the MachineSet.
-// For the same reason we are only surfacing messages with "more than 30m" instead of using the exact durations.
-// 30 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume
+// For the same reason we are only surfacing messages with "more than 15m" instead of using the exact durations.
+// 15 minutes is a duration after which we assume it makes sense to emphasize that Node drains and waiting for volume
 // detach are still in progress.
 func calculateDeletingConditionForSummary(machine *clusterv1.Machine) v1beta2conditions.ConditionWithOwnerInfo {
 	deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)
 
-	var msg string
-	switch {
-	case deletingCondition == nil:
-		// NOTE: this should never happen given that setDeletingCondition is called before this method and
-		// it always adds a Deleting condition.
-		msg = "Machine deletion in progress"
-	case deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
-		machine.Status.Deletion != nil && machine.Status.Deletion.NodeDrainStartTime != nil &&
-		time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 30*time.Minute:
-		msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason)
-	case deletingCondition.Reason == clusterv1.MachineDeletingWaitingForVolumeDetachV1Beta2Reason &&
-		machine.Status.Deletion != nil && machine.Status.Deletion.WaitForNodeVolumeDetachStartTime != nil &&
-		time.Since(machine.Status.Deletion.WaitForNodeVolumeDetachStartTime.Time) > 30*time.Minute:
-		msg = fmt.Sprintf("Machine deletion in progress, stage: %s (since more than 30m)", deletingCondition.Reason)
-	default:
+	msg := "Machine deletion in progress"
+	if deletingCondition != nil {
 		msg = fmt.Sprintf("Machine deletion in progress, stage: %s", deletingCondition.Reason)
+		if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
+			msg = fmt.Sprintf("Machine deletion in progress since more than 15m, stage: %s", deletingCondition.Reason)
+			if deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+				delayReasons := []string{}
+				if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+					delayReasons = append(delayReasons, "PodDisruptionBudgets")
+				}
+				if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+					delayReasons = append(delayReasons, "Pods not terminating")
+				}
+				if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+					delayReasons = append(delayReasons, "Pod eviction errors")
+				}
+				if len(delayReasons) > 0 {
+					msg += fmt.Sprintf(", delay likely due to %s", strings.Join(delayReasons, ", "))
+				}
+			}
+		}
 	}
 
 	return v1beta2conditions.ConditionWithOwnerInfo{
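The new control flow can be hard to follow in diff form; the sketch below (function name and parameters are assumptions, not part of the commit) condenses how the reworked calculateDeletingConditionForSummary escalates its message: the stage is always surfaced, the "since more than 15m" qualifier appears once the Machine's deletionTimestamp is older than 15 minutes, and drain delay reasons are appended only after the Node drain has been running for more than 5 minutes.

package main

import (
	"fmt"
	"strings"
	"time"
)

// buildDeletingSummary is a simplified, illustrative sketch of the message
// escalation performed by the reworked calculateDeletingConditionForSummary.
func buildDeletingSummary(stage string, deletedFor, drainingFor time.Duration, delayReasons []string) string {
	msg := fmt.Sprintf("Machine deletion in progress, stage: %s", stage)
	if deletedFor > 15*time.Minute {
		msg = fmt.Sprintf("Machine deletion in progress since more than 15m, stage: %s", stage)
		if stage == "DrainingNode" && drainingFor > 5*time.Minute && len(delayReasons) > 0 {
			msg += fmt.Sprintf(", delay likely due to %s", strings.Join(delayReasons, ", "))
		}
	}
	return msg
}

func main() {
	fmt.Println(buildDeletingSummary("DrainingNode", 16*time.Minute, 6*time.Minute,
		[]string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"}))
	// Machine deletion in progress since more than 15m, stage: DrainingNode,
	// delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors
}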

internal/controllers/machine/machine_controller_status_test.go

Lines changed: 16 additions & 18 deletions
@@ -1700,11 +1700,12 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
 			},
 		},
 		{
-			name: "Deleting condition with DrainingNode since more than 30m",
+			name: "Deleting condition with DrainingNode since more than 15m",
 			machine: &clusterv1.Machine{
 				ObjectMeta: metav1.ObjectMeta{
-					Name:      "machine-test",
-					Namespace: metav1.NamespaceDefault,
+					Name:              "machine-test",
+					Namespace:         metav1.NamespaceDefault,
+					DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)},
 				},
 				Status: clusterv1.MachineStatus{
 					V1Beta2: &clusterv1.MachineV1Beta2Status{
@@ -1714,19 +1715,15 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
 								Status: metav1.ConditionTrue,
 								Reason: clusterv1.MachineDeletingDrainingNodeV1Beta2Reason,
 								Message: `Drain not completed yet (started at 2024-10-09T16:13:59Z):
-* Pods with deletionTimestamp that still exist: pod-2-deletionTimestamp-set-1, pod-2-deletionTimestamp-set-2, pod-2-deletionTimestamp-set-3, pod-3-to-trigger-eviction-successfully-1, pod-3-to-trigger-eviction-successfully-2, ... (2 more)
-* Pods with eviction failed:
-  * Cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently: pod-5-to-trigger-eviction-pdb-violated-1, pod-5-to-trigger-eviction-pdb-violated-2, pod-5-to-trigger-eviction-pdb-violated-3, ... (3 more)
-  * some other error 1: pod-6-to-trigger-eviction-some-other-error
-  * some other error 2: pod-7-to-trigger-eviction-some-other-error
-  * some other error 3: pod-8-to-trigger-eviction-some-other-error
-  * some other error 4: pod-9-to-trigger-eviction-some-other-error
-  * ... (1 more error applying to 1 Pod)`,
+* Pods pod-2-deletionTimestamp-set-1, pod-3-to-trigger-eviction-successfully-1: deletionTimestamp set, but still not removed from the Node
+* Pod pod-5-to-trigger-eviction-pdb-violated-1: cannot evict pod as it would violate the pod's disruption budget. The disruption budget pod-5-pdb needs 20 healthy pods and has 20 currently
+* Pod pod-6-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 1
+After above Pods have been removed from the Node, the following Pods will be evicted: pod-7-eviction-later, pod-8-eviction-later`,
 							},
 						},
 					},
 					Deletion: &clusterv1.MachineDeletionStatus{
-						NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+						NodeDrainStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
 					},
 				},
 			},
@@ -1739,16 +1736,17 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
 					Type:   clusterv1.MachineDeletingV1Beta2Condition,
 					Status: metav1.ConditionTrue,
 					Reason: clusterv1.MachineDeletingV1Beta2Reason,
-					Message: "Machine deletion in progress, stage: DrainingNode (since more than 30m)",
+					Message: "Machine deletion in progress since more than 15m, stage: DrainingNode, delay likely due to PodDisruptionBudgets, Pods not terminating, Pod eviction errors",
 				},
 			},
 		},
 		{
-			name: "Deleting condition with WaitingForVolumeDetach since more than 30m",
+			name: "Deleting condition with WaitingForVolumeDetach since more than 15m",
 			machine: &clusterv1.Machine{
 				ObjectMeta: metav1.ObjectMeta{
-					Name:      "machine-test",
-					Namespace: metav1.NamespaceDefault,
+					Name:              "machine-test",
+					Namespace:         metav1.NamespaceDefault,
+					DeletionTimestamp: &metav1.Time{Time: time.Now().Add(-16 * time.Minute)},
 				},
 				Status: clusterv1.MachineStatus{
 					V1Beta2: &clusterv1.MachineV1Beta2Status{
@@ -1762,7 +1760,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
 						},
 					},
 					Deletion: &clusterv1.MachineDeletionStatus{
-						WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-31 * time.Minute)},
+						WaitForNodeVolumeDetachStartTime: &metav1.Time{Time: time.Now().Add(-6 * time.Minute)},
 					},
 				},
 			},
@@ -1775,7 +1773,7 @@ func TestCalculateDeletingConditionForSummary(t *testing.T) {
 					Type:   clusterv1.MachineDeletingV1Beta2Condition,
 					Status: metav1.ConditionTrue,
 					Reason: clusterv1.MachineDeletingV1Beta2Reason,
-					Message: "Machine deletion in progress, stage: WaitingForVolumeDetach (since more than 30m)",
+					Message: "Machine deletion in progress since more than 15m, stage: WaitingForVolumeDetach",
 				},
 			},
 		},

internal/controllers/machinedeployment/machinedeployment_status.go

Lines changed: 31 additions & 4 deletions
@@ -25,6 +25,7 @@ import (
 
 	"github.com/pkg/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/sets"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
@@ -241,7 +242,7 @@ func setScalingDownCondition(_ context.Context, machineDeployment *clusterv1.Mac
 	if getMachinesSucceeded {
 		staleMessage := aggregateStaleMachines(machines)
 		if staleMessage != "" {
-			message += fmt.Sprintf(" and %s", staleMessage)
+			message += fmt.Sprintf("\n* %s", staleMessage)
 		}
 	}
 	v1beta2conditions.Set(machineDeployment, metav1.Condition{
@@ -447,7 +448,7 @@ func setDeletingCondition(_ context.Context, machineDeployment *clusterv1.Machin
 		}
 		staleMessage := aggregateStaleMachines(machines)
 		if staleMessage != "" {
-			message += fmt.Sprintf(" and %s", staleMessage)
+			message += fmt.Sprintf("\n* %s", staleMessage)
 		}
 	}
 	if len(machines) == 0 && len(machineSets) > 0 {
@@ -491,9 +492,26 @@ func aggregateStaleMachines(machines collections.Machines) string {
 	}
 
 	machineNames := []string{}
+	delayReasons := sets.Set[string]{}
 	for _, machine := range machines {
-		if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*30 {
+		if !machine.GetDeletionTimestamp().IsZero() && time.Since(machine.GetDeletionTimestamp().Time) > time.Minute*15 {
 			machineNames = append(machineNames, machine.GetName())
+
+			deletingCondition := v1beta2conditions.Get(machine, clusterv1.MachineDeletingV1Beta2Condition)
+			if deletingCondition != nil &&
+				deletingCondition.Status == metav1.ConditionTrue &&
+				deletingCondition.Reason == clusterv1.MachineDeletingDrainingNodeV1Beta2Reason &&
+				machine.Status.Deletion != nil && time.Since(machine.Status.Deletion.NodeDrainStartTime.Time) > 5*time.Minute {
+				if strings.Contains(deletingCondition.Message, "cannot evict pod as it would violate the pod's disruption budget.") {
+					delayReasons.Insert("PodDisruptionBudgets")
+				}
+				if strings.Contains(deletingCondition.Message, "deletionTimestamp set, but still not removed from the Node") {
+					delayReasons.Insert("Pods not terminating")
+				}
+				if strings.Contains(deletingCondition.Message, "failed to evict Pod") {
+					delayReasons.Insert("Pod eviction errors")
+				}
+			}
 		}
 	}
 
@@ -514,7 +532,16 @@ func aggregateStaleMachines(machines collections.Machines) string {
 	} else {
 		message += " are "
 	}
-	message += "in deletion since more than 30m"
+	message += "in deletion since more than 15m"
+	if len(delayReasons) > 0 {
+		reasonList := []string{}
+		for _, r := range []string{"PodDisruptionBudgets", "Pods not terminating", "Pod eviction errors"} {
+			if delayReasons.Has(r) {
+				reasonList = append(reasonList, r)
+			}
+		}
+		message += fmt.Sprintf(", delay likely due to %s", strings.Join(reasonList, ", "))
+	}
 
 	return message
 }
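Alongside the new delay reasons, the MachineDeployment conditions stop gluing the stale-Machine summary onto the message with " and " and instead emit it as its own bullet line. A tiny, illustrative comparison of the two formats (the variable contents below are example values only):

package main

import "fmt"

// A rough sketch of the formatting change in setScalingDownCondition and
// setDeletingCondition: the stale-Machine summary becomes a separate bullet line.
func main() {
	message := "Scaling down from 2 to 1 replicas"
	staleMessage := "Machine stale-machine-1 is in deletion since more than 15m"

	before := message + fmt.Sprintf(" and %s", staleMessage)
	after := message + fmt.Sprintf("\n* %s", staleMessage)

	fmt.Println(before)
	fmt.Println(after)
}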

internal/controllers/machinedeployment/machinedeployment_status_test.go

Lines changed: 20 additions & 16 deletions
@@ -522,10 +522,11 @@ func Test_setScalingDownCondition(t *testing.T) {
 			},
 			getAndAdoptMachineSetsForDeploymentSucceeded: true,
 			expectCondition: metav1.Condition{
-				Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
-				Status:  metav1.ConditionTrue,
-				Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
-				Message: "Scaling down from 2 to 1 replicas and Machine stale-machine-1 is in deletion since more than 30m",
+				Type:   clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+				Status: metav1.ConditionTrue,
+				Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+				Message: "Scaling down from 2 to 1 replicas\n" +
+					"* Machine stale-machine-1 is in deletion since more than 15m",
 			},
 		},
 		{
@@ -543,10 +544,11 @@ func Test_setScalingDownCondition(t *testing.T) {
 			},
 			getAndAdoptMachineSetsForDeploymentSucceeded: true,
 			expectCondition: metav1.Condition{
-				Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
-				Status:  metav1.ConditionTrue,
-				Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
-				Message: "Scaling down from 4 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 30m",
+				Type:   clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+				Status: metav1.ConditionTrue,
+				Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+				Message: "Scaling down from 4 to 1 replicas\n" +
+					"* Machines stale-machine-1, stale-machine-2, stale-machine-3 are in deletion since more than 15m",
 			},
 		},
 		{
@@ -566,10 +568,11 @@ func Test_setScalingDownCondition(t *testing.T) {
 			},
 			getAndAdoptMachineSetsForDeploymentSucceeded: true,
 			expectCondition: metav1.Condition{
-				Type:    clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
-				Status:  metav1.ConditionTrue,
-				Reason:  clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
-				Message: "Scaling down from 6 to 1 replicas and Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 30m",
+				Type:   clusterv1.MachineDeploymentScalingDownV1Beta2Condition,
+				Status: metav1.ConditionTrue,
+				Reason: clusterv1.MachineDeploymentScalingDownV1Beta2Reason,
+				Message: "Scaling down from 6 to 1 replicas\n" +
+					"* Machines stale-machine-1, stale-machine-2, stale-machine-3, ... (2 more) are in deletion since more than 15m",
 			},
 		},
 		{
@@ -1076,10 +1079,11 @@ func Test_setDeletingCondition(t *testing.T) {
 			},
 			getMachinesSucceeded: true,
 			expectCondition: metav1.Condition{
-				Type:    clusterv1.MachineDeploymentDeletingV1Beta2Condition,
-				Status:  metav1.ConditionTrue,
-				Reason:  clusterv1.MachineDeploymentDeletingV1Beta2Reason,
-				Message: "Deleting 1 Machine and Machine m1 is in deletion since more than 30m",
+				Type:   clusterv1.MachineDeploymentDeletingV1Beta2Condition,
+				Status: metav1.ConditionTrue,
+				Reason: clusterv1.MachineDeploymentDeletingV1Beta2Reason,
+				Message: "Deleting 1 Machine\n" +
+					"* Machine m1 is in deletion since more than 15m",
 			},
 		},
 		{
