@@ -267,7 +267,13 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}
 
-	job, err := r.getJob(ctx, req.NamespacedName)
+	var (
+		job     *batchv1.Job
+		pod     *corev1.Pod
+		podList corev1.PodList
+	)
+
+	job, err = r.getJob(ctx, req.NamespacedName)
 	if err != nil {
 		job = nil
 	}
@@ -293,6 +299,36 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		}
 	}
 
+	if job != nil {
+		matchLabels := client.MatchingLabels(map[string]string{"job-name": job.ObjectMeta.Name})
+		if err := r.List(ctx, &podList, client.InNamespace(csl.Namespace), matchLabels); err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "failed to list pods for console job")
+		}
+
+		// Check whether more than one pod belongs to the console, or whether the current pod is
+		// not the same as the original pod (which also means more than one pod was launched). A
+		// console's job object has `restartPolicy=Never` and `backoffLimit=0`. Together these
+		// settings prevent the job from launching a second pod when the original pod fails.
+		// However, they don't cover situations where a console pod is deleted (e.g. manual
+		// deletion, eviction, preemption). We want consoles to never launch more than one pod:
+		// a subsequent pod is problematic even if there is only one running pod at any given
+		// time, because it causes the controller to enter its state machine logic in a way it
+		// wasn't designed to handle, and it leaves the console in a running state for far
+		// longer than users expect.
+		if len(podList.Items) > 1 || (len(podList.Items) == 1 && csl.Status.PodName != "" && csl.Status.PodName != podList.Items[0].Name) {
+			logger.Info("More than one pod observed for console; deleting job and stopping console")
+			if err := r.abort(ctx, logger, csl, job, &podList); err != nil {
+				return ctrl.Result{}, errors.Wrap(err, "failed to abort console")
+			}
+			// No need to requeue after an abort because the deleted job will trigger us again.
+			return ctrl.Result{Requeue: false}, nil
+		}
+
+		if len(podList.Items) == 1 {
+			pod = &podList.Items[0]
+		}
+	}
+
 	// Update the status fields in case they're out of sync, or the console spec
 	// has been updated
 	statusCtx := consoleStatusContext{
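The guard added above leans on the two job settings called out in the comment: restartPolicy=Never plus backoffLimit=0 stops the Job controller from retrying a pod that fails, but a pod that is deleted outright still gets replaced. A minimal sketch of a Job carrying those two settings, as an illustration only (the consoleJobSketch name and the container image are hypothetical, not the operator's real job template):

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// consoleJobSketch builds a Job whose failed pods are never retried: containers are not
// restarted in place, and the Job gives up after zero retries.
func consoleJobSketch() *batchv1.Job {
	backoffLimit := int32(0)
	return &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{Name: "console-example"},
		Spec: batchv1.JobSpec{
			BackoffLimit: &backoffLimit,
			Template: corev1.PodTemplateSpec{
				Spec: corev1.PodSpec{
					RestartPolicy: corev1.RestartPolicyNever,
					Containers: []corev1.Container{
						{Name: "console", Image: "alpine:3.19", Command: []string{"/bin/sh", "-c", "sleep 3600"}},
					},
				},
			},
		},
	}
}

func main() {
	fmt.Printf("%+v\n", consoleJobSketch().Spec)
}

Even with these settings, deleting the running pod (manually, or via eviction or preemption) does not count as a pod failure, so the Job controller creates a replacement; that is the gap the new check closes by aborting the console instead.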
@@ -301,9 +337,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 		Authorisation:     authorisation,
 		AuthorisationRule: authRule,
 		Job:               job,
+		Pod:               pod,
 	}
 
-	csl, err = r.generateStatusAndAuditEvents(ctx, logger, req.NamespacedName, csl, statusCtx)
+	csl, err = r.generateStatusAndAuditEvents(ctx, logger, csl, statusCtx)
 	if err != nil {
 		return ctrl.Result{}, errors.Wrap(err, "failed to generate console status or audit events")
 	}
@@ -339,6 +376,10 @@ func (r *ConsoleReconciler) Reconcile(logger logr.Logger, ctx context.Context, r
 				return ctrl.Result{}, err
 			}
 		}
+		// Retrigger reconciliation periodically to catch situations where a console pod is deleted
+		// and re-spawned by the console job. Note that this isn't strictly necessary as Kubernetes
+		// will periodically refresh caches and queue reconciliation events anyway.
+		res = requeueAfterInterval(logger, 30*time.Second)
 	case csl.PostRunning():
 		// Requeue for when the console has reached its after finished TTL so it can be deleted
 		res = requeueAfterInterval(logger, time.Until(*csl.GetGCTime()))
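requeueAfterInterval itself is not shown in this diff. Assuming it is a thin wrapper around controller-runtime's result type, a sketch of the mechanics the periodic retrigger relies on might look like the following; the body is a guess for illustration, not the repository's implementation:

package controllers

import (
	"time"

	"github.com/go-logr/logr"
	ctrl "sigs.k8s.io/controller-runtime"
)

// requeueAfterInterval asks controller-runtime to run Reconcile again for the same
// object once the interval has elapsed, independent of any watch events.
func requeueAfterInterval(logger logr.Logger, interval time.Duration) ctrl.Result {
	logger.Info("Requeueing reconciliation", "requeue_after", interval)
	return ctrl.Result{RequeueAfter: interval}
}

Returning a Result with RequeueAfter set is enough for controller-runtime to re-enqueue the request after the delay.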
@@ -517,27 +558,7 @@ type consoleStatusContext struct {
 	Job *batchv1.Job
 }
 
-func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, name types.NamespacedName, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
-	var (
-		pod     *corev1.Pod
-		podList corev1.PodList
-	)
-
-	if statusCtx.Job != nil {
-		inNamespace := client.InNamespace(name.Namespace)
-		matchLabels := client.MatchingLabels(map[string]string{"job-name": statusCtx.Job.ObjectMeta.Name})
-		if err := r.List(ctx, &podList, inNamespace, matchLabels); err != nil {
-			return nil, errors.Wrap(err, "failed to list pods for console job")
-		}
-	}
-	if len(podList.Items) > 0 {
-		pod = &podList.Items[0]
-	} else {
-		pod = nil
-	}
-
-	statusCtx.Pod = pod
-
+func (r *ConsoleReconciler) generateStatusAndAuditEvents(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, statusCtx consoleStatusContext) (*workloadsv1alpha1.Console, error) {
 	logger = getAuditLogger(logger, csl, statusCtx)
 	newStatus := calculateStatus(csl, statusCtx)
 
@@ -635,6 +656,43 @@ func calculateStatus(csl *workloadsv1alpha1.Console, statusCtx consoleStatusCont
 	return newStatus
 }
 
+func (r *ConsoleReconciler) abort(ctx context.Context, logger logr.Logger, csl *workloadsv1alpha1.Console, job *batchv1.Job, podList *corev1.PodList) error {
+	// Delete job
+	if err := r.Client.Delete(ctx, job); err != nil {
+		return errors.Wrap(err, "failed to delete job")
+	}
+
+	// Delete pods. In theory we shouldn't have to do this. All console pods are owned by
+	// the console job. A delete operation should cascade. However, in our testing we saw
+	// that the second pod launched by the job consistently lingers on after the job is gone.
+	var podDeleteError error
+	for _, pod := range podList.Items {
+		if err := r.Client.Delete(ctx, &pod); err != nil {
+			podDeleteError = err
+		}
+	}
+	if podDeleteError != nil {
+		return errors.Wrap(podDeleteError, "failed to delete pod(s)")
+	}
+
+	// Update console status
+	newStatus := csl.DeepCopy().Status
+	newStatus.Phase = workloadsv1alpha1.ConsoleStopped
+	updatedCsl := csl.DeepCopy()
+	updatedCsl.Status = newStatus
+
+	if err := r.createOrUpdate(ctx, logger, csl, updatedCsl, Console, consoleDiff); err != nil {
+		return err
+	}
+
+	// Publish termination event
+	if err := r.LifecycleRecorder.ConsoleTerminate(ctx, csl, false, nil); err != nil {
+		logging.WithNoRecord(logger).Error(err, "failed to record event", "event", "console.terminate")
+	}
+
+	return nil
+}
+
 func calculatePhase(statusCtx consoleStatusContext) workloadsv1alpha1.ConsolePhase {
 	if !statusCtx.IsAuthorised {
 		return workloadsv1alpha1.ConsolePendingAuthorisation
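The abort helper above deletes the pods explicitly because, as its comment notes, the cascade from deleting the job was not reliable in testing. A hedged alternative (not what this change does) is to ask the API server for foreground cascading deletion when deleting the job; the helper below is a hypothetical sketch using controller-runtime's standard delete option:

package controllers

import (
	"context"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteJobWithCascade deletes the job and asks the API server to remove its dependent
// pods before the job itself (foreground propagation), instead of relying on the
// default cascade behaviour.
func deleteJobWithCascade(ctx context.Context, c client.Client, job *batchv1.Job) error {
	return c.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationForeground))
}

Either approach would still need the status update and termination event afterwards; the explicit per-pod deletion in the diff keeps the behaviour deterministic regardless of the job's deletion propagation defaults.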