@@ -38,7 +38,7 @@ type ChaosPodService interface {
38
38
GetChaosPodsOfDisruption (ctx context.Context , instance * chaosv1beta1.Disruption , ls labels.Set ) ([]corev1.Pod , error )
39
39
40
40
// HandleChaosPodTermination handles the termination of a chaos pod during a disruption event.
41
- HandleChaosPodTermination (ctx context.Context , disruption * chaosv1beta1.Disruption , pod * corev1.Pod ) (bool , error )
41
+ HandleChaosPodTermination (ctx context.Context , disruption * chaosv1beta1.Disruption , pod * corev1.Pod ) (stuckOnRemoval bool , err error )
42
42
43
43
// DeletePod deletes a pod from the Kubernetes cluster.
44
44
DeletePod (ctx context.Context , pod corev1.Pod ) bool
@@ -125,10 +125,10 @@ func (m *chaosPodService) GetChaosPodsOfDisruption(ctx context.Context, instance
125
125
}
126
126
127
127
// HandleChaosPodTermination handles the termination of a chaos-related pod during a disruption event.
128
- func (m * chaosPodService ) HandleChaosPodTermination (ctx context.Context , disruption * chaosv1beta1.Disruption , chaosPod * corev1.Pod ) (bool , error ) {
128
+ func (m * chaosPodService ) HandleChaosPodTermination (ctx context.Context , disruption * chaosv1beta1.Disruption , chaosPod * corev1.Pod ) (stuckOnRemoval bool , err error ) {
129
129
// Ignore chaos pods not having the finalizer anymore
130
130
if ! controllerutil .ContainsFinalizer (chaosPod , chaostypes .ChaosPodFinalizer ) {
131
- return true , nil
131
+ return false , nil
132
132
}
133
133
134
134
// Ignore chaos pods that are not being deleted
@@ -149,30 +149,60 @@ func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disrupt
149
149
m .config .Log .Infow ("Target is not likely to be cleaned (either it does not exist anymore or it is not ready), the injector will TRY to clean it but will not take care about any failures" , "target" , target )
150
150
151
151
// Remove the finalizer for the chaos pod since cleanup won't be fully reliable.
152
- if err := m .removeFinalizerForChaosPod (ctx , chaosPod ); err != nil {
153
- return false , err
154
- }
152
+ err = m .removeFinalizerForChaosPod (ctx , chaosPod )
155
153
156
- return true , nil
154
+ return false , err
157
155
}
158
156
159
157
// It is always safe to remove some chaos pods. It is usually hard to tell if these chaos pods have
160
158
// succeeded or not, but they have no possibility of leaving side effects, so we choose to always remove the finalizer.
161
159
if chaosv1beta1 .DisruptionHasNoSideEffects (chaosPod .Labels [chaostypes .DisruptionKindLabel ]) {
162
- if err := m .removeFinalizerForChaosPod (ctx , chaosPod ); err != nil {
163
- return false , err
160
+ err = m .removeFinalizerForChaosPod (ctx , chaosPod )
161
+ return false , err
162
+ }
163
+
164
+ shouldRemoveFinalizer := false
165
+
166
+ switch chaosPod .Status .Phase {
167
+ case corev1 .PodSucceeded , corev1 .PodPending :
168
+ // we can remove the pod and the finalizer, so that it'll be garbage collected
169
+ shouldRemoveFinalizer = true
170
+ case corev1 .PodFailed :
171
+ // we need to determine if we can remove it safely or if we need to block disruption deletion
172
+ // check if a container has been created (if not, the disruption was not injected)
173
+ if len (chaosPod .Status .ContainerStatuses ) == 0 {
174
+ shouldRemoveFinalizer = true
164
175
}
165
176
166
- return true , nil
167
- }
177
+ // if the pod died only because it exceeded its activeDeadlineSeconds, we can remove the finalizer
178
+ if chaosPod .Status .Reason == "DeadlineExceeded" {
179
+ shouldRemoveFinalizer = true
180
+ }
181
+
182
+ // check if the container was able to start or not
183
+ // if not, we can safely delete the pod since the disruption was not injected
184
+ for _ , cs := range chaosPod .Status .ContainerStatuses {
185
+ if cs .Name != "injector" {
186
+ continue
187
+ }
188
+
189
+ if cs .State .Terminated != nil && cs .State .Terminated .Reason == "StartError" {
190
+ shouldRemoveFinalizer = true
191
+ }
168
192
169
- // If the finalizer cannot be removed yet, return without removing it.
170
- if m .isFinalizerNotRemovableForChaosPod (chaosPod ) {
193
+ break
194
+ }
195
+ default :
196
+ // If we're in this default case, then the chaos pod is not yet in a "terminated" state
197
+ // It's likely still cleaning up the relevant disruption, in which case we aren't ready
198
+ // to try to remove the finalizer, or to mark it as StuckOnRemoval, so we should just return here.
199
+ // And check again next time we reconcile.
171
200
return false , nil
172
201
}
173
202
174
- // Remove the finalizer for the chaos pod since cleanup was successful.
175
- if err := m .removeFinalizerForChaosPod (ctx , chaosPod ); err != nil {
203
+ if shouldRemoveFinalizer {
204
+ // Remove the finalizer for the chaos pod since cleanup was successful.
205
+ err = m .removeFinalizerForChaosPod (ctx , chaosPod )
176
206
return false , err
177
207
}
178
208
@@ -182,6 +212,8 @@ func (m *chaosPodService) HandleChaosPodTermination(ctx context.Context, disrupt
182
212
// DeletePod attempts to delete the specified pod from the Kubernetes cluster.
183
213
// Returns true if deletion was successful, otherwise returns false.
184
214
func (m * chaosPodService ) DeletePod (ctx context.Context , pod corev1.Pod ) bool {
215
+ m .config .Log .Infow ("terminating chaos pod to trigger cleanup" , "chaosPod" , pod .Name )
216
+
185
217
if err := m .deletePod (ctx , pod ); err != nil {
186
218
m .config .Log .Errorw ("Error terminating chaos pod" , "error" , err , "chaosPod" , pod .Name )
187
219
@@ -590,41 +622,6 @@ func (m *chaosPodService) removeFinalizerForChaosPod(ctx context.Context, chaosP
590
622
return nil
591
623
}
592
624
593
- func (m * chaosPodService ) isFinalizerNotRemovableForChaosPod (chaosPod * corev1.Pod ) bool {
594
- switch chaosPod .Status .Phase {
595
- case corev1 .PodSucceeded , corev1 .PodPending :
596
- // we can remove the pod and the finalizer, so that it'll be garbage collected
597
- return false
598
- case corev1 .PodFailed :
599
- // we need to determine if we can remove it safely or if we need to block disruption deletion
600
- // check if a container has been created (if not, the disruption was not injected)
601
- if len (chaosPod .Status .ContainerStatuses ) == 0 {
602
- return false
603
- }
604
-
605
- // if the pod died only because it exceeded its activeDeadlineSeconds, we can remove the finalizer
606
- if chaosPod .Status .Reason == "DeadlineExceeded" {
607
- return false
608
- }
609
-
610
- // check if the container was able to start or not
611
- // if not, we can safely delete the pod since the disruption was not injected
612
- for _ , cs := range chaosPod .Status .ContainerStatuses {
613
- if cs .Name != "injector" {
614
- continue
615
- }
616
-
617
- if cs .State .Terminated != nil && cs .State .Terminated .Reason == "StartError" {
618
- return false
619
- }
620
-
621
- break
622
- }
623
- }
624
-
625
- return true
626
- }
627
-
628
625
func (m * chaosPodService ) handleMetricSinkError (err error ) {
629
626
if err != nil {
630
627
m .config .Log .Errorw ("error sending a metric" , "error" , err )
0 commit comments