
Commit 42854e9

Author: Theo Barber-Bany
Merge pull request #264 from gocardless/CI-1233/abort-consoles-with-multiple-pods
Abort consoles with more than one pod
2 parents db2a96f + a7adac9 commit 42854e9

2 files changed: +82 -24 lines

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-3.8.0
+3.8.1

controllers/workloads/console/controller.go

Lines changed: 81 additions & 23 deletions
@@ -267,7 +267,13 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}
 
-	job, err := r.getJob(ctx, req.NamespacedName)
+	var (
+		job     *batchv1.Job
+		pod     *corev1.Pod
+		podList corev1.PodList
+	)
+
+	job, err = r.getJob(ctx, req.NamespacedName)
 	if err != nil {
 		job = nil
 	}
@@ -293,6 +299,36 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}
 
+	if job != nil {
+		matchLabels := client.MatchingLabels(map[string]string{"job-name": job.ObjectMeta.Name})
+		if err := r.List(ctx, &podList, client.InNamespace(csl.Namespace), matchLabels); err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "failed to list pods for console job")
+		}
+
+		// Check if more than one pod belongs to the console or if the current pod is not the same
+		// as the original pod (which also means that more than one pod was launched). A console's
+		// job object has `restartPolicy=Never` and `backoffLimit=0`. These two settings together
+		// prevent the job from launching a second pod when a failure occurs on the original pod.
+		// However, these settings don't cover situations where a console pod is deleted
+		// (e.g. manual deletion, eviction, preemption). We want consoles to never launch more than
+		// one pod. Launching a subsequent pod is problematic even if there is only one running pod
+		// at any given time. It causes the controller to enter its state machine logic in a way
+		// that it wasn't designed to handle. It also causes the console to remain in a running
+		// state for far longer than users expect.
+		if len(podList.Items) > 1 || (len(podList.Items) == 1 && csl.Status.PodName != "" && csl.Status.PodName != podList.Items[0].Name) {
+			logger.Info("More than one pod observed for console; deleting job and stopping console")
+			if err := r.abort(ctx, logger, csl, job, &podList); err != nil {
+				return ctrl.Result{}, errors.Wrap(err, "failed to abort console")
+			}
+			// No need to requeue after an abort because the deleted job will trigger us again.
+			return ctrl.Result{Requeue: false}, nil
+		}
+
+		if len(podList.Items) == 1 {
+			pod = &podList.Items[0]
+		}
+	}
+
 	// Update the status fields in case they're out of sync, or the console spec
 	// has been updated
 	statusCtx := consoleStatusContext{
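
The long comment in the hunk above leans on two Job settings, restartPolicy=Never and backoffLimit=0, to stop the Job controller from retrying a failed pod. The operator's actual job builder is not part of this diff; the sketch below is only a hypothetical, minimal illustration of those two fields on a batchv1.Job (the names and image are placeholders, not the real builder):

package sketch

import (
	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// consoleJobSketch is a hypothetical builder showing only the two settings the
// comment relies on. BackoffLimit=0 stops the Job controller from retrying a
// failed pod, and RestartPolicy=Never stops the kubelet from restarting the
// container in place; neither prevents a replacement pod when the original is
// deleted, evicted or preempted, which is what the new abort path handles.
func consoleJobSketch(name, namespace, image string) *batchv1.Job {
	backoffLimit := int32(0)
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
		Spec: batchv1.JobSpec{
			BackoffLimit: &backoffLimit,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers:    []corev1.Container{{Name: "console", Image: image}},
				},
			},
		},
	}
}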
@@ -301,9 +337,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		Authorisation:     authorisation,
 		AuthorisationRule: authRule,
 		Job:               job,
+		Pod:               pod,
 	}
 
-	csl, err = r.generateStatusAndAuditEvents(ctx, logger, req.NamespacedName, csl, statusCtx)
+	csl, err = r.generateStatusAndAuditEvents(ctx, logger, csl, statusCtx)
 	if err != nil {
 		return ctrl.Result{}, errors.Wrap(err, "failed to generate console status or audit events")
 	}
@@ -339,6 +376,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 				return ctrl.Result{}, err
 			}
 		}
+		// Retrigger reconciliation periodically to catch situations where a console pod is deleted
+		// and re-spawned by the console job. Note that this isn't strictly necessary as Kubernetes
+		// will periodically refresh caches and queue reconciliation events anyway.
+		res = requeueAfterInterval(logger, 30*time.Second)
 	case csl.PostRunning():
 		// Requeue for when the console has reached its after finished TTL so it can be deleted
 		res = requeueAfterInterval(logger, time.Until(*csl.GetGCTime()))
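
requeueAfterInterval is an existing helper that isn't shown in this diff. Judging only from its call sites here, it presumably logs the interval and returns a ctrl.Result that asks controller-runtime to reconcile again after that delay; a hypothetical sketch:

package sketch

import (
	"time"

	"github.com/go-logr/logr"
	ctrl "sigs.k8s.io/controller-runtime"
)

// requeueAfterInterval is a guess at the helper's shape, inferred from how it
// is called above: it records the requeue and schedules another reconcile.
func requeueAfterInterval(logger logr.Logger, interval time.Duration) ctrl.Result {
	logger.Info("requeueing reconciliation", "requeue_after", interval.String())
	return ctrl.Result{RequeueAfter: interval}
}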
@@ -517,27 +558,7 @@ type consoleStatusContext struct {
 	Job *batchv1.Job
 }
 
-func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, name types.NamespacedName, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
-	var (
-		pod     *corev1.Pod
-		podList corev1.PodList
-	)
-
-	if statusCtx.Job != nil {
-		inNamespace := client.InNamespace(name.Namespace)
-		matchLabels := client.MatchingLabels(map[string]string{"job-name": statusCtx.Job.ObjectMeta.Name})
-		if err := r.List(ctx, &podList, inNamespace, matchLabels); err != nil {
-			return nil, errors.Wrap(err, "failed to list pods for console job")
-		}
-	}
-	if len(podList.Items) > 0 {
-		pod = &podList.Items[0]
-	} else {
-		pod = nil
-	}
-
-	statusCtx.Pod = pod
-
+func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
 	logger = getAuditLogger(logger, csl, statusCtx)
 	newStatus := calculateStatus(csl, statusCtx)

@@ -635,6 +656,43 @@ func calculateStatus(csl *workloadsv1alpha1.Console, statusCtx consoleStatusCont
 	return newStatus
 }
 
+func (r *ConsoleReconciler) abort(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, job *batchv1.Job, podList *corev1.PodList) error {
+	// Delete job
+	if err := r.Client.Delete(ctx, job); err != nil {
+		return errors.Wrap(err, "failed to delete job")
+	}
+
+	// Delete pods. In theory we shouldn't have to do this. All console pods are owned by
+	// the console job. A delete operation should cascade. However, in our testing we saw
+	// that the second pod launched by the job consistently lingers on after the job is gone.
+	var podDeleteError error
+	for _, pod := range podList.Items {
+		if err := r.Client.Delete(ctx, &pod); err != nil {
+			podDeleteError = err
+		}
+	}
+	if podDeleteError != nil {
+		return errors.Wrap(podDeleteError, "failed to delete pod(s)")
+	}
+
+	// Update console status
+	newStatus := csl.DeepCopy().Status
+	newStatus.Phase = workloadsv1alpha1.ConsoleStopped
+	updatedCsl := csl.DeepCopy()
+	updatedCsl.Status = newStatus
+
+	if err := r.createOrUpdate(ctx, logger, csl, updatedCsl, Console, consoleDiff); err != nil {
+		return err
+	}
+
+	// Publish termination event
+	if err := r.LifecycleRecorder.ConsoleTerminate(ctx, csl, false, nil); err != nil {
+		logging.WithNoRecord(logger).Error(err, "failed to record event", "event", "console.terminate")
+	}
+
+	return nil
+}
+
 func calculatePhase(statusCtx consoleStatusContext) workloadsv1alpha1.ConsolePhase {
 	if !statusCtx.IsAuthorised {
 		return workloadsv1alpha1.ConsolePendingAuthorisation
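
The explicit per-pod deletion in abort works around cascading deletion not reliably removing the second pod in time. For comparison only (this is not what the commit does), controller-runtime also lets a delete request carry an explicit propagation policy, so the API server garbage-collects the job's pods in the foreground; a hypothetical alternative sketch:

package sketch

import (
	"context"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteJobWithCascade sketches a hypothetical alternative to deleting each pod
// by hand: foreground propagation blocks the job's deletion until its dependent
// pods have been removed by the garbage collector.
func deleteJobWithCascade(ctx context.Context, c client.Client, job *batchv1.Job) error {
	return c.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationForeground))
}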
