Feature: add dynamic interval support. #652

Open · wants to merge 15 commits into master

33 changes: 32 additions & 1 deletion README.md
@@ -67,6 +67,35 @@ Remember that `chaoskube` by default kills any pod in all your namespaces, inclu

`chaoskube` provides a simple HTTP endpoint that can be used to check that it is running. This can be used for [Kubernetes liveness and readiness probes](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-probes/). By default, this listens on port 8080. To disable, pass `--metrics-address=""` to `chaoskube`.
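
A minimal sketch of such a check in Go, assuming the default port 8080 mentioned above; the `/healthz` path is an assumption and may differ in your deployment:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	// Probe chaoskube's HTTP endpoint to confirm the process is up.
	// Port 8080 is the documented default; the /healthz path is assumed.
	resp, err := http.Get("http://localhost:8080/healthz")
	if err != nil {
		log.Fatalf("chaoskube health endpoint unreachable: %v", err)
	}
	defer resp.Body.Close()
	fmt.Printf("health endpoint returned %s\n", resp.Status)
}
```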

## Dynamic Interval

`chaoskube` supports a dynamic interval feature that automatically adjusts the time between pod terminations based on the number of candidate pods in your cluster. This keeps the level of chaos appropriate for both small and large environments.

### How it works

With dynamic interval enabled, `chaoskube` calculates the interval between pod terminations using the following formula:

```
interval = totalWorkingMinutes / (podCount * factor)
```

Where:
- `totalWorkingMinutes` = 10 days * 8 hours * 60 minutes = 4800 minutes (we assume that every pod should be killed once over two working weeks)
- `podCount` is the number of candidate pods that match the configured filters
- `factor` is the configurable dynamic interval factor

The dynamic interval factor lets you control the aggressiveness of the terminations:

- With `factor = 1.0`: Standard interval calculation
- With `factor > 1.0`: More aggressive terminations (shorter intervals)
- With `factor < 1.0`: Less aggressive terminations (longer intervals)

### Example scenarios

- Small cluster (100 pods, factor 1.0): interval = 48 minutes
- Small cluster (100 pods, factor 1.5): interval = 32 minutes
- Small cluster (100 pods, factor 2.0): interval = 24 minutes
- Large cluster (1500 pods, factor 1.0): interval = 3 minutes
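
These numbers follow directly from the formula above. For reference, here is a minimal Go sketch of the calculation, mirroring the 4800-minute working window and the one-minute lower bound used by the implementation in this pull request:

```go
package main

import (
	"fmt"
	"math"
	"time"
)

// dynamicInterval reproduces the README formula:
// interval = totalWorkingMinutes / (podCount * factor),
// rounded to the nearest minute with a one-minute floor.
func dynamicInterval(podCount int, factor float64) time.Duration {
	const totalWorkingMinutes = 10 * 8 * 60 // two working weeks of 8-hour days = 4800 minutes
	raw := float64(totalWorkingMinutes) / (float64(podCount) * factor)
	minutes := int(math.Max(1, math.Round(raw)))
	return time.Duration(minutes) * time.Minute
}

func main() {
	fmt.Println(dynamicInterval(100, 1.0))  // 48m0s
	fmt.Println(dynamicInterval(100, 1.5))  // 32m0s
	fmt.Println(dynamicInterval(1500, 1.0)) // 3m0s
}
```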

## Filtering targets

However, you can limit the search space of `chaoskube` by providing label, annotation, and namespace selectors, pod name include/exclude patterns, as well as a minimum age setting.
@@ -200,6 +229,8 @@ Use `UTC`, `Local` or pick a timezone name from the [(IANA) tz database](https:/
| Option | Environment | Description | Default |
| -------------------------- | ---------------------------------- | -------------------------------------------------------------------- | -------------------------- |
| `--interval` | `CHAOSKUBE_INTERVAL` | interval between pod terminations | 10m |
| `--dynamic-interval` | `CHAOSKUBE_DYNAMIC_INTERVAL` | enable dynamic interval calculation | false |
| `--dynamic-factor` | `CHAOSKUBE_DYNAMIC_FACTOR` | factor to adjust the dynamic interval | 1.0 |
| `--labels` | `CHAOSKUBE_LABELS` | label selector to filter pods by | (matches everything) |
| `--annotations` | `CHAOSKUBE_ANNOTATIONS` | annotation selector to filter pods by | (matches everything) |
| `--kinds` | `CHAOSKUBE_KINDS` | owner's kind selector to filter pods by | (all kinds) |
@@ -244,4 +275,4 @@ This project wouldn't be where it is with the ideas and help of several awesome

## Contributing

Feel free to create issues or submit pull requests.
202 changes: 176 additions & 26 deletions chaoskube/chaoskube.go
@@ -4,6 +4,7 @@ import (
"context"
"errors"
"fmt"
"math"
"regexp"
"time"

@@ -74,6 +75,11 @@ type Chaoskube struct {
Notifier notifier.Notifier
// namespace scope for the Kubernetes client
ClientNamespaceScope string

// Dynamic interval configuration
DynamicInterval bool
DynamicIntervalFactor float64
BaseInterval time.Duration
}

var (
@@ -97,51 +103,160 @@
// * a logger implementing logrus.FieldLogger to send log output to
// * what specific terminator to use to imbue chaos on victim pods
// * whether to enable/disable dry-run mode
func New(client kubernetes.Interface, labels, annotations, kinds, namespaces, namespaceLabels labels.Selector, includedPodNames, excludedPodNames *regexp.Regexp, excludedWeekdays []time.Weekday, excludedTimesOfDay []util.TimePeriod, excludedDaysOfYear []time.Time, timezone *time.Location, minimumAge time.Duration, logger log.FieldLogger, dryRun bool, terminator terminator.Terminator, maxKill int, notifier notifier.Notifier, clientNamespaceScope string, dynamicInterval bool, dynamicIntervalFactor float64, baseInterval time.Duration) *Chaoskube {
broadcaster := record.NewBroadcaster()
broadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: client.CoreV1().Events(clientNamespaceScope)})
recorder := broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "chaoskube"})

return &Chaoskube{
Client: client,
Labels: labels,
Annotations: annotations,
Kinds: kinds,
Namespaces: namespaces,
NamespaceLabels: namespaceLabels,
IncludedPodNames: includedPodNames,
ExcludedPodNames: excludedPodNames,
ExcludedWeekdays: excludedWeekdays,
ExcludedTimesOfDay: excludedTimesOfDay,
ExcludedDaysOfYear: excludedDaysOfYear,
Timezone: timezone,
MinimumAge: minimumAge,
Logger: logger,
DryRun: dryRun,
Terminator: terminator,
EventRecorder: recorder,
Now: time.Now,
MaxKill: maxKill,
Notifier: notifier,
ClientNamespaceScope: clientNamespaceScope,
DynamicInterval: dynamicInterval,
DynamicIntervalFactor: dynamicIntervalFactor,
BaseInterval: baseInterval,
}
}

// CalculateDynamicInterval calculates a dynamic interval based on current pod count
func (c *Chaoskube) CalculateDynamicInterval(ctx context.Context) time.Duration {

// Get total number of pods
listOptions := metav1.ListOptions{LabelSelector: c.Labels.String()}
podList, err := c.Client.CoreV1().Pods(c.ClientNamespaceScope).List(ctx, listOptions)

if err != nil {
c.Logger.WithField("err", err).Error("failed to get list of pods, using base interval")
return c.BaseInterval
}

pods, err := filterByNamespaces(podList.Items, c.Namespaces)
if err != nil {
c.Logger.WithField("err", err).Error("failed to filterByNamespaces, using base interval")
return c.BaseInterval
}

pods, err = filterPodsByNamespaceLabels(ctx, pods, c.NamespaceLabels, c.Client)
if err != nil {
c.Logger.WithField("err", err).Error("failed to filterPodsByNamespaceLabels, using base interval")
return c.BaseInterval
}

pods, err = filterByKinds(pods, c.Kinds)
if err != nil {
c.Logger.WithField("err", err).Error("failed to filterByKinds, using base interval")
return c.BaseInterval
}

pods = filterByAnnotations(pods, c.Annotations)

podCount := len(pods)

// Add debug logging for pod details
logger := c.Logger
// Check if debug logging is enabled
// We need to handle both *log.Logger and *log.Entry types that implement FieldLogger
debugEnabled := false
switch l := logger.(type) {
case *log.Logger:
debugEnabled = l.Level >= log.DebugLevel
case *log.Entry:
debugEnabled = l.Logger.Level >= log.DebugLevel
}

if debugEnabled {
c.Logger.Debug("Listing candidate pods for dynamic interval calculation:")
for i, pod := range pods {
c.Logger.WithFields(log.Fields{
"index": i,
"name": pod.Name,
"namespace": pod.Namespace,
"labels": pod.Labels,
"phase": pod.Status.Phase,
}).Debug("candidate pod")
}
}

// Guard against division by zero: all pods could have been filtered out
if podCount == 0 {
c.Logger.WithField("podCount", 0).Info("no pods found, using base interval")
return c.BaseInterval
}
// As a simple reference, we assume that every pod should be killed once during 10 working days (9-17h)
totalWorkingMinutes := 10 * 8 * 60

// Calculate raw interval in minutes
// Higher pod counts = shorter intervals, lower pod counts = longer intervals
rawIntervalMinutes := float64(totalWorkingMinutes) / (float64(podCount) * c.DynamicIntervalFactor)

// Round to nearest minute and ensure minimum of 1 minute
minutes := int(math.Max(1, math.Round(rawIntervalMinutes)))
roundedInterval := time.Duration(minutes) * time.Minute

// Provide detailed logging about the calculation
c.Logger.WithFields(log.Fields{
"podCount": podCount,
"totalWorkMinutes": totalWorkingMinutes,
"factor": c.DynamicIntervalFactor,
"rawIntervalMins": rawIntervalMinutes,
"roundedInterval": roundedInterval,
}).Info("calculated dynamic interval")

return roundedInterval
}

// Run continuously picks and terminates a victim pod at a given interval
// described by channel next. It returns when the given context is canceled.
func (c *Chaoskube) Run(ctx context.Context, next <-chan time.Time) {
for {
// If dynamic interval is enabled, calculate new interval before terminating victims
var waitDuration time.Duration
if c.DynamicInterval {
waitDuration = c.CalculateDynamicInterval(ctx)
metrics.CurrentIntervalSeconds.Set(waitDuration.Seconds())
}

if err := c.TerminateVictims(ctx); err != nil {
c.Logger.WithField("err", err).Error("failed to terminate victim")
metrics.ErrorsTotal.Inc()
}

c.Logger.Debug("sleeping...")
metrics.IntervalsTotal.Inc()

// Use the appropriate waiting mechanism
if c.DynamicInterval {
select {
case <-time.After(waitDuration):
// Continue to next iteration
case <-ctx.Done():
return
}
} else {
// Use original fixed interval from ticker
select {
case <-next:
case <-ctx.Done():
return
}
}
}
}
@@ -218,28 +333,63 @@ func (c *Chaoskube) Candidates(ctx context.Context) ([]v1.Pod, error) {
if err != nil {
return nil, err
}
c.Logger.WithFields(log.Fields{
"count": len(podList.Items),
}).Debug("Initial pod count after API list")

pods, err := filterByNamespaces(podList.Items, c.Namespaces)
if err != nil {
return nil, err
}
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after namespace filtering")

pods, err = filterPodsByNamespaceLabels(ctx, pods, c.NamespaceLabels, c.Client)
if err != nil {
return nil, err
}
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after namespace labels filtering")

pods, err = filterByKinds(pods, c.Kinds)
if err != nil {
return nil, err
}
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after kinds filtering")

pods = filterByAnnotations(pods, c.Annotations)
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after annotations filtering")

pods = filterByPhase(pods, v1.PodRunning)
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after phase filtering")

pods = filterTerminatingPods(pods)
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after terminating pods filtering")

pods = filterByMinimumAge(pods, c.MinimumAge, c.Now())
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after minimum age filtering")

pods = filterByPodName(pods, c.IncludedPodNames, c.ExcludedPodNames)
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Pod count after pod name filtering")

pods = filterByOwnerReference(pods)
c.Logger.WithFields(log.Fields{
"count": len(pods),
}).Debug("Final pod count after owner reference filtering")

return pods, nil
}