Skip to content

Commit 0b36b9a

Browse files
authored
Running Chaos Pods should not be evaluated as StuckOnRemoval (#781)
* restore info log when deleting chaos pods * Chaos pods that are not yet terminated should not be considered Stuck
1 parent 36f1883 commit 0b36b9a

File tree

4 files changed

+104
-108
lines changed

4 files changed

+104
-108
lines changed

cloudservice/cloud_provider_ip_range_manager_mock.go

Lines changed: 21 additions & 21 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

controllers/disruption_controller.go

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -660,27 +660,25 @@ func (r *DisruptionReconciler) handleChaosPodTermination(ctx context.Context, in
660660
return
661661
}
662662

663-
isFinalizerRemoved, err := r.ChaosPodService.HandleChaosPodTermination(ctx, instance, &chaosPod)
663+
isStuckOnRemoval, err := r.ChaosPodService.HandleChaosPodTermination(ctx, instance, &chaosPod)
664664
if err != nil {
665665
r.log.Errorw("could not handle the chaos pod termination", "error", err, "chaosPod", chaosPod.Name)
666666

667667
return
668668
}
669669

670-
if isFinalizerRemoved {
671-
return
672-
}
670+
if isStuckOnRemoval {
671+
target := chaosPod.Labels[chaostypes.TargetLabel]
673672

674-
target := chaosPod.Labels[chaostypes.TargetLabel]
673+
// if the chaos pod finalizer must not be removed and the chaos pod must not be deleted
674+
// and the cleanup status must not be ignored, we are stuck and won't be able to remove the disruption
675+
r.log.Infow("instance seems stuck on removal for this target, please check manually", "target", target, "chaosPod", chaosPod.Name)
676+
r.recordEventOnDisruption(instance, chaosv1beta1.EventDisruptionStuckOnRemoval, "", target)
675677

676-
// if the chaos pod finalizer must not be removed and the chaos pod must not be deleted
677-
// and the cleanup status must not be ignored, we are stuck and won't be able to remove the disruption
678-
r.log.Infow("instance seems stuck on removal for this target, please check manually", "target", target, "chaosPod", chaosPod.Name)
679-
r.recordEventOnDisruption(instance, chaosv1beta1.EventDisruptionStuckOnRemoval, "", target)
678+
instance.Status.IsStuckOnRemoval = true
680679

681-
instance.Status.IsStuckOnRemoval = true
682-
683-
r.updateTargetInjectionStatus(instance, chaosPod, chaostypes.DisruptionTargetInjectionStatusStatusIsStuckOnRemoval, *chaosPod.DeletionTimestamp)
680+
r.updateTargetInjectionStatus(instance, chaosPod, chaostypes.DisruptionTargetInjectionStatusStatusIsStuckOnRemoval, *chaosPod.DeletionTimestamp)
681+
}
684682
}
685683

686684
func (r *DisruptionReconciler) updateTargetInjectionStatus(instance *chaosv1beta1.Disruption, chaosPod corev1.Pod, status chaostypes.DisruptionTargetInjectionStatus, since metav1.Time) {

services/chaospod.go

Lines changed: 47 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ type ChaosPodService interface {
3838
GetChaosPodsOfDisruption(ctx context.Context, instance *chaosv1beta1.Disruption, ls labels.Set) ([]corev1.Pod, error)
3939

4040
// HandleChaosPodTermination handles the termination of a chaos pod during a disruption event.
41-
HandleChaosPodTermination(ctx context.Context, disruption *chaosv1beta1.Disruption, pod *corev1.Pod) (bool, error)
41+
HandleChaosPodTermination(ctx context.Context, disruption *chaosv1beta1.Disruption, pod *corev1.Pod) (stuckOnRemoval bool, err error)
4242

4343
// DeletePod deletes a pod from the Kubernetes cluster.
4444
DeletePod(ctx context.Context, pod corev1.Pod) bool
@@ -125,10 +125,10 @@ func (m *chaosPodService) GetChaosPodsOfDisruption(ctx context.Context, instance
125125
}
126126

127127
// HandleChaosPodTermination handles the termination of a chaos-related pod during a disruption event.
128-
func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disruption *chaosv1beta1.Disruption, chaosPod *corev1.Pod) (bool, error) {
128+
func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disruption *chaosv1beta1.Disruption, chaosPod *corev1.Pod) (stuckOnRemoval bool, err error) {
129129
// Ignore chaos pods not having the finalizer anymore
130130
if !controllerutil.ContainsFinalizer(chaosPod, chaostypes.ChaosPodFinalizer) {
131-
return true, nil
131+
return false, nil
132132
}
133133

134134
// Ignore chaos pods that are not being deleted
@@ -149,30 +149,60 @@ func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disrupt
149149
m.config.Log.Infow("Target is not likely to be cleaned (either it does not exist anymore or it is not ready), the injector will TRY to clean it but will not take care about any failures", "target", target)
150150

151151
// Remove the finalizer for the chaos pod since cleanup won't be fully reliable.
152-
if err := m.removeFinalizerForChaosPod(ctx, chaosPod); err != nil {
153-
return false, err
154-
}
152+
err = m.removeFinalizerForChaosPod(ctx, chaosPod)
155153

156-
return true, nil
154+
return false, err
157155
}
158156

159157
// It is always safe to remove some chaos pods. It is usually hard to tell if these chaos pods have
160158
// succeeded or not, but they have no possibility of leaving side effects, so we choose to always remove the finalizer.
161159
if chaosv1beta1.DisruptionHasNoSideEffects(chaosPod.Labels[chaostypes.DisruptionKindLabel]) {
162-
if err := m.removeFinalizerForChaosPod(ctx, chaosPod); err != nil {
163-
return false, err
160+
err = m.removeFinalizerForChaosPod(ctx, chaosPod)
161+
return false, err
162+
}
163+
164+
shouldRemoveFinalizer := false
165+
166+
switch chaosPod.Status.Phase {
167+
case corev1.PodSucceeded, corev1.PodPending:
168+
// we can remove the pod and the finalizer, so that it'll be garbage collected
169+
shouldRemoveFinalizer = true
170+
case corev1.PodFailed:
171+
// we need to determine if we can remove it safely or if we need to block disruption deletion
172+
// check if a container has been created (if not, the disruption was not injected)
173+
if len(chaosPod.Status.ContainerStatuses) == 0 {
174+
shouldRemoveFinalizer = true
164175
}
165176

166-
return true, nil
167-
}
177+
// if the pod died only because it exceeded its activeDeadlineSeconds, we can remove the finalizer
178+
if chaosPod.Status.Reason == "DeadlineExceeded" {
179+
shouldRemoveFinalizer = true
180+
}
181+
182+
// check if the container was able to start or not
183+
// if not, we can safely delete the pod since the disruption was not injected
184+
for _, cs := range chaosPod.Status.ContainerStatuses {
185+
if cs.Name != "injector" {
186+
continue
187+
}
188+
189+
if cs.State.Terminated != nil && cs.State.Terminated.Reason == "StartError" {
190+
shouldRemoveFinalizer = true
191+
}
168192

169-
// If the finalizer cannot be removed yet, return without removing it.
170-
if m.isFinalizerNotRemovableForChaosPod(chaosPod) {
193+
break
194+
}
195+
default:
196+
// If we're in this default case, then the chaos pod is not yet in a "terminated" state
197+
// It's likely still cleaning up the relevant disruption, in which case we aren't ready
198+
// to try to remove the finalizer, or to mark it as StuckOnRemoval, so we should just return here.
199+
// And check again next time we reconcile.
171200
return false, nil
172201
}
173202

174-
// Remove the finalizer for the chaos pod since cleanup was successful.
175-
if err := m.removeFinalizerForChaosPod(ctx, chaosPod); err != nil {
203+
if shouldRemoveFinalizer {
204+
// Remove the finalizer for the chaos pod since cleanup was successful.
205+
err = m.removeFinalizerForChaosPod(ctx, chaosPod)
176206
return false, err
177207
}
178208

@@ -182,6 +212,8 @@ func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disrupt
182212
// DeletePod attempts to delete the specified pod from the Kubernetes cluster.
183213
// Returns true if deletion was successful, otherwise returns false.
184214
func (m *chaosPodService) DeletePod(ctx context.Context, pod corev1.Pod) bool {
215+
m.config.Log.Infow("terminating chaos pod to trigger cleanup", "chaosPod", pod.Name)
216+
185217
if err := m.deletePod(ctx, pod); err != nil {
186218
m.config.Log.Errorw("Error terminating chaos pod", "error", err, "chaosPod", pod.Name)
187219

@@ -590,41 +622,6 @@ func (m *chaosPodService) removeFinalizerForChaosPod(ctx context.Context, chaosP
590622
return nil
591623
}
592624

593-
func (m *chaosPodService) isFinalizerNotRemovableForChaosPod(chaosPod *corev1.Pod) bool {
594-
switch chaosPod.Status.Phase {
595-
case corev1.PodSucceeded, corev1.PodPending:
596-
// we can remove the pod and the finalizer, so that it'll be garbage collected
597-
return false
598-
case corev1.PodFailed:
599-
// we need to determine if we can remove it safely or if we need to block disruption deletion
600-
// check if a container has been created (if not, the disruption was not injected)
601-
if len(chaosPod.Status.ContainerStatuses) == 0 {
602-
return false
603-
}
604-
605-
// if the pod died only because it exceeded its activeDeadlineSeconds, we can remove the finalizer
606-
if chaosPod.Status.Reason == "DeadlineExceeded" {
607-
return false
608-
}
609-
610-
// check if the container was able to start or not
611-
// if not, we can safely delete the pod since the disruption was not injected
612-
for _, cs := range chaosPod.Status.ContainerStatuses {
613-
if cs.Name != "injector" {
614-
continue
615-
}
616-
617-
if cs.State.Terminated != nil && cs.State.Terminated.Reason == "StartError" {
618-
return false
619-
}
620-
621-
break
622-
}
623-
}
624-
625-
return true
626-
}
627-
628625
func (m *chaosPodService) handleMetricSinkError(err error) {
629626
if err != nil {
630627
m.config.Log.Errorw("error sending a metric", "error", err)

0 commit comments

Comments
 (0)