Skip to content

Commit d1ed32c

Browse files
committed
add support for diagnosing crashing pod
Signed-off-by: Yeh-lei Wu <rayingecho@gmail.com>
1 parent 2ff8469 commit d1ed32c

File tree

1 file changed

+90
-18
lines changed

1 file changed

+90
-18
lines changed

pkg/plugin/cmd.go

Lines changed: 90 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package plugin
22

33
import (
4+
"context"
45
"encoding/json"
56
"fmt"
67
"github.com/aylei/kubectl-debug/pkg/util"
@@ -9,14 +10,19 @@ import (
910
"io"
1011
corev1 "k8s.io/api/core/v1"
1112
"k8s.io/apimachinery/pkg/apis/meta/v1"
13+
"k8s.io/apimachinery/pkg/util/uuid"
1214
"k8s.io/cli-runtime/pkg/genericclioptions"
1315
"k8s.io/client-go/kubernetes"
16+
"k8s.io/client-go/tools/watch"
1417
coreclient "k8s.io/client-go/kubernetes/typed/core/v1"
1518
restclient "k8s.io/client-go/rest"
1619
"k8s.io/client-go/tools/remotecommand"
20+
"k8s.io/kubernetes/pkg/client/conditions"
21+
"k8s.io/kubernetes/pkg/util/interrupt"
1722
"log"
1823
"net/url"
1924
"os/user"
25+
"time"
2026
)
2127

2228
const (
@@ -54,17 +60,17 @@ type DebugOptions struct {
5460
PodName string
5561

5662
// Debug options
57-
RetainContainer bool
58-
Image string
59-
ContainerName string
60-
Command []string
61-
AgentPort int
62-
ConfigLocation string
63-
64-
Flags *genericclioptions.ConfigFlags
65-
PodClient coreclient.PodsGetter
66-
Args []string
67-
Config *restclient.Config
63+
Image string
64+
ContainerName string
65+
Command []string
66+
AgentPort int
67+
ConfigLocation string
68+
Fork bool
69+
70+
Flags *genericclioptions.ConfigFlags
71+
CoreClient coreclient.CoreV1Interface
72+
Args []string
73+
Config *restclient.Config
6874

6975
genericclioptions.IOStreams
7076
}
@@ -106,6 +112,8 @@ func NewDebugCmd(streams genericclioptions.IOStreams) *cobra.Command {
106112
fmt.Sprintf("Agent port for debug cli to connect, default to %d", defaultAgentPort))
107113
cmd.Flags().StringVar(&opts.ConfigLocation, "debug-config", "",
108114
fmt.Sprintf("Debug config file, default to ~%s", defaultConfigLocation))
115+
cmd.Flags().BoolVar(&opts.Fork, "fork", false,
116+
"Fork a new pod for debugging (useful if the pod status is CrashLoopBackoff)")
109117
opts.Flags.AddFlags(cmd.Flags())
110118

111119
return cmd
@@ -173,7 +181,7 @@ func (o *DebugOptions) Complete(cmd *cobra.Command, args []string, argsLenAtDash
173181
if err != nil {
174182
return err
175183
}
176-
o.PodClient = clientset.CoreV1()
184+
o.CoreClient = clientset.CoreV1()
177185

178186
return nil
179187
}
@@ -190,14 +198,10 @@ func (o *DebugOptions) Validate() error {
190198

191199
func (o *DebugOptions) Run() error {
192200

193-
pod, err := o.PodClient.Pods(o.Namespace).Get(o.PodName, v1.GetOptions{})
201+
pod, err := o.CoreClient.Pods(o.Namespace).Get(o.PodName, v1.GetOptions{})
194202
if err != nil {
195203
return err
196204
}
197-
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
198-
return fmt.Errorf("cannot debug in a completed pod; current phase is %s", pod.Status.Phase)
199-
}
200-
hostIP := pod.Status.HostIP
201205

202206
containerName := o.ContainerName
203207
if len(containerName) == 0 {
@@ -208,6 +212,32 @@ func (o *DebugOptions) Run() error {
208212
containerName = pod.Spec.Containers[0].Name
209213
}
210214

215+
// in fork mode, we launch a new pod as a copy of the target pod
216+
// and hack the entry point of the target container with sleep command
217+
// which keeps the container running.
218+
if o.Fork {
219+
pod = copyAndStripPod(pod, containerName)
220+
pod, err = o.CoreClient.Pods(pod.Namespace).Create(pod)
221+
if err != nil {
222+
return err
223+
}
224+
watcher, err := o.CoreClient.Pods(pod.Namespace).Watch(v1.SingleObject(pod.ObjectMeta))
225+
// FIXME: hard code -> config
226+
ctx, cancel := context.WithTimeout(context.Background(), 5 * time.Minute)
227+
defer cancel()
228+
log.Println("waiting for forked container running...")
229+
event, err := watch.UntilWithoutRetry(ctx, watcher, conditions.PodRunning)
230+
if err != nil {
231+
return err
232+
}
233+
pod = event.Object.(*corev1.Pod)
234+
}
235+
236+
if pod.Status.Phase == corev1.PodSucceeded || pod.Status.Phase == corev1.PodFailed {
237+
return fmt.Errorf("cannot debug in a completed pod; current phase is %s", pod.Status.Phase)
238+
}
239+
hostIP := pod.Status.HostIP
240+
211241
containerId, err := o.getContainerIdByName(pod, containerName)
212242
if err != nil {
213243
return err
@@ -244,7 +274,20 @@ func (o *DebugOptions) Run() error {
244274
return o.remoteExecute("POST", uri, o.Config, o.In, o.Out, o.ErrOut, t.Raw, sizeQueue)
245275
}
246276

247-
if err := t.Safe(fn); err != nil {
277+
// ensure forked pod is deleted on cancelation
278+
withCleanUp := func() error {
279+
return interrupt.Chain(nil, func() {
280+
if o.Fork {
281+
err := o.CoreClient.Pods(pod.Namespace).Delete(pod.Name, v1.NewDeleteOptions(0))
282+
if err != nil {
283+
// we may leak the pod here, but there is nothing we can do except notify the user
284+
log.Printf("failed to delete pod %s, consider manual deletion.", pod.Name)
285+
}
286+
}
287+
}).Run(fn);
288+
}
289+
290+
if err := t.Safe(withCleanUp); err != nil {
248291
fmt.Printf("error execute remote, %v\n", err)
249292
return err
250293
}
@@ -308,3 +351,32 @@ func (o *DebugOptions) setupTTY() term.TTY {
308351
}
309352
return t
310353
}
354+
355+
// copyAndStripPod copy the given pod template, strip the probes and labels,
356+
// and replace the entry point
357+
func copyAndStripPod(pod *corev1.Pod, targetContainer string) *corev1.Pod {
358+
copied := &corev1.Pod{
359+
ObjectMeta: *pod.ObjectMeta.DeepCopy(),
360+
Spec: *pod.Spec.DeepCopy(),
361+
}
362+
copied.Name = fmt.Sprintf("%s-%s-debug", pod.Name, uuid.NewUUID())
363+
copied.Labels = nil
364+
copied.Spec.RestartPolicy = corev1.RestartPolicyNever
365+
for i, c := range copied.Spec.Containers {
366+
copied.Spec.Containers[i].LivenessProbe = nil
367+
copied.Spec.Containers[i].ReadinessProbe = nil
368+
if c.Name == targetContainer {
369+
// Hack, infinite sleep command to keep the container running
370+
copied.Spec.Containers[i].Command = []string{"sh", "-c", "--"}
371+
copied.Spec.Containers[i].Args = []string{"while true; do sleep 30; done;"}
372+
}
373+
}
374+
copied.ResourceVersion = ""
375+
copied.UID = ""
376+
copied.SelfLink = ""
377+
copied.CreationTimestamp = v1.Time{}
378+
copied.OwnerReferences = []v1.OwnerReference{}
379+
380+
return copied
381+
}
382+

0 commit comments

Comments
 (0)