Skip to content

Commit 0ad973a

Browse files
authored
Move the DisruptionSpec Explain feature into the api package (#959)
1 parent 193679a commit 0ad973a

10 files changed

+465
-335
lines changed

api/v1beta1/container_failure.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,16 @@ func (s *ContainerFailureSpec) GenerateArgs() []string {
2727

2828
return args
2929
}
30+
31+
func (s *ContainerFailureSpec) Explain() []string {
32+
var explanation string
33+
if s.Forced {
34+
explanation = "spec.containerFailure.forced injects a container failure which sends the SIGKILL signal to the pod's container(s). " +
35+
"If you'd prefer a SIGTERM, remove containerFailure.forced."
36+
} else {
37+
explanation = "spec.containerFailure injects a container failure which sends the SIGTERM signal to the pod's container(s). " +
38+
"If you'd prefer a SIGKILL, set containerFailure.forced."
39+
}
40+
41+
return []string{"", explanation}
42+
}

api/v1beta1/cpu_pressure.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
package v1beta1
77

88
import (
9+
"fmt"
10+
911
"github.com/hashicorp/go-multierror"
1012
"k8s.io/apimachinery/pkg/util/intstr"
1113
)
@@ -49,3 +51,16 @@ func (s *CPUPressureSpec) GenerateArgs() []string {
4951

5052
return args
5153
}
54+
55+
func (s *CPUPressureSpec) Explain() []string {
56+
explanation := "spec.cpuPressure will cause cpu pressure on the target, by joining its cgroup and creating threads " +
57+
"intended to consume as much cpu as possible"
58+
59+
if s.Count != nil {
60+
explanation += fmt.Sprintf("on %s of the target's cores.", s.Count.String())
61+
} else {
62+
explanation += " on all of the target's cores."
63+
}
64+
65+
return []string{"", explanation}
66+
}

api/v1beta1/disk_failure.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,27 @@ func (s *DiskFailureSpec) GenerateArgs() (args []string) {
110110
return args
111111
}
112112

113+
func (s *DiskFailureSpec) Explain() []string {
114+
explanation := "spec.diskFailure will cause io syscalls, "
115+
116+
if s.Probability != "" {
117+
explanation += fmt.Sprintf("%s of the time, ", s.Probability)
118+
} else {
119+
explanation += "100% of the time, "
120+
}
121+
122+
if s.OpenatSyscall != nil {
123+
explanation += fmt.Sprintf("to return the exit code %s ", s.OpenatSyscall.ExitCode)
124+
} else {
125+
explanation += "to return the exit code -ENOENT "
126+
}
127+
128+
explanation += " on any path with a prefix specified in spec.diskFailure.paths, e.g., choosing the path \"/\" would lead " +
129+
"to all io syscalls being affected."
130+
131+
return []string{"", explanation}
132+
}
133+
113134
// GetExitCodeInt return the integer value of a linux exit code.
114135
func (oss *OpenatSyscallSpec) GetExitCodeInt() int {
115136
switch oss.ExitCode {

api/v1beta1/disk_pressure.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
package v1beta1
77

88
import (
9+
"fmt"
910
"strconv"
1011
)
1112

@@ -48,3 +49,17 @@ func (s *DiskPressureSpec) GenerateArgs() []string {
4849

4950
return args
5051
}
52+
53+
func (s *DiskPressureSpec) Explain() []string {
54+
explanation := fmt.Sprintf("spec.diskPressure will throttle io on the device mounted to the path %s, limiting it to ", s.Path)
55+
56+
if s.Throttling.ReadBytesPerSec != nil {
57+
explanation += fmt.Sprintf("%d read bytes per second ", *s.Throttling.ReadBytesPerSec)
58+
}
59+
60+
if s.Throttling.WriteBytesPerSec != nil {
61+
explanation += fmt.Sprintf("%d write bytes per second.", *s.Throttling.WriteBytesPerSec)
62+
}
63+
64+
return []string{"", explanation}
65+
}

api/v1beta1/disruption_types.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,14 @@ type Reporting struct {
137137
MinNotificationType eventtypes.NotificationType `json:"minNotificationType,omitempty"`
138138
}
139139

140+
func (r *Reporting) Explain() string {
141+
return fmt.Sprintf("While the disruption is ongoing, it will send slack messages for every event of severity %s or higher, "+
142+
"to the slack channel with the ID (not name) %s, mentioning the purpose \"%s\"",
143+
r.MinNotificationType,
144+
r.SlackChannel,
145+
r.Purpose)
146+
}
147+
140148
// EmbeddedChaosAPI includes the library so it can be statically exported to chaosli
141149
//
142150
//go:embed *.go
@@ -875,6 +883,149 @@ func (s DisruptionSpec) DisruptionCount() int {
875883
return count
876884
}
877885

886+
// Explain returns a string explanation of this disruption spec
887+
func (s DisruptionSpec) Explain() []string {
888+
var explanation []string
889+
explanation = append(explanation, "Here's our best explanation of what this spec will do when run:")
890+
891+
durationExpl := s.Duration.Duration().String()
892+
if s.Duration.Duration() == 0 {
893+
durationExpl = "not set, so the default duration for your cluster will be used"
894+
}
895+
896+
explanation = append(explanation, fmt.Sprintf("spec.duration is %s. After that amount of time, the disruption "+
897+
"will stop and clean itself up. If it fails to clean up, an alert will be sent. If you want the disruption to stop early, "+
898+
"just try to delete the disruption. All chaos-injector pods will immediately try to stop the failure.",
899+
durationExpl,
900+
))
901+
902+
if s.DryRun {
903+
explanation = append(explanation, "spec.dryRun is set to true, meaning we will simulate a real disruption "+
904+
"as best as possible, by creating the resource, picking targets, and creating chaos-injector pods, "+
905+
"but we will not inject any actual failure.")
906+
}
907+
908+
// s.Level can be "", which defaults to Pod
909+
if s.Level != chaostypes.DisruptionLevelNode {
910+
explanation = append(explanation, "spec.level is pod. We will pick pods as targets based on your selector, and inject the failure into the pods' containers.")
911+
} else {
912+
explanation = append(explanation, "spec.level is node. We will pick nodes as targets based on your selector, and inject the failure into the nodes, affecting all pods on those nodes.")
913+
}
914+
915+
if s.Selector != nil {
916+
explanation = append(explanation, fmt.Sprintf("This spec has the following selectors which will be used to target %ss with these labels:\n\t%s", s.Level, s.Selector.String()))
917+
}
918+
919+
if s.AdvancedSelector != nil {
920+
advancedSelectorExplanation := fmt.Sprintf("This spec has the following advanced selectors which will be used to target %ss based on their labels:\n", s.Level)
921+
922+
for _, selector := range s.AdvancedSelector {
923+
advancedSelectorExplanation += fmt.Sprintf("\t%s\n", selector.String())
924+
}
925+
926+
explanation = append(explanation, advancedSelectorExplanation)
927+
}
928+
929+
if s.Filter != nil && s.Filter.Annotations != nil {
930+
explanation = append(explanation, fmt.Sprintf("This spec has the following annotation filters which will be used to target %ss with these annotations.\n\t%s\n", s.Level, s.Filter.Annotations.String()))
931+
}
932+
933+
if s.Containers != nil {
934+
explanation = append(explanation, fmt.Sprintf("spec.containers is set, so this disruption will only inject the failure the following containers on the target pods\n\t%s\n", strings.Join(s.Containers, ",")))
935+
}
936+
937+
if s.Pulse != nil {
938+
explanation = append(explanation,
939+
fmt.Sprintf("spec.pulse is set, so rather than a constant failure injection, after an initial delay of %s"+
940+
" the disruption will alternate between an active injected state with a duration of %s,"+
941+
" and an inactive dormant state with a duration of %s.\n",
942+
s.Pulse.InitialDelay.Duration().String(),
943+
s.Pulse.ActiveDuration.Duration().String(),
944+
s.Pulse.DormantDuration.Duration().String()))
945+
}
946+
947+
if s.OnInit {
948+
explanation = append(explanation, fmt.Sprintf("spec.onInit is true. "+
949+
"The disruptions will be launched during the initialization of the targeted pods."+
950+
"This requires some extra setup on your end, please [read the full documentation](https://github.com/DataDog/chaos-controller/blob/main/docs/features.md#applying-a-disruption-on-pod-initialization)"))
951+
}
952+
953+
countSuffix := ""
954+
if s.Count.Type == intstr.Int {
955+
countSuffix = fmt.Sprintf("exactly %d %ss. If it can't find that many targets, it will inject into as many as it discovers. "+
956+
"If there are more than %d eligible targets, a random %d will be chosen.",
957+
s.Count.IntValue(),
958+
s.Level,
959+
s.Count.IntValue(),
960+
s.Count.IntValue(),
961+
)
962+
if s.Count.IntValue() == 100 {
963+
countSuffix += " Your count is \"100\", but you almost certainly meant to specify \"100%\". The former means to find exactly 100 targets, the latter means to inject into all available targets."
964+
} else {
965+
countSuffix += " If it's more convenient, you can set spec.count to a % instead (just append the '%' character)."
966+
}
967+
} else {
968+
countSuffix = fmt.Sprintf("%s percent of all eligible %ss found. "+
969+
"If it's more convenient, you can set spec.count to an int intead of a percentage.",
970+
s.Count.String(),
971+
s.Level,
972+
)
973+
}
974+
975+
explanation = append(explanation, fmt.Sprintf("spec.count is %s, so the disruption will try to target %s",
976+
s.Count.String(),
977+
countSuffix,
978+
))
979+
980+
if s.StaticTargeting {
981+
explanation = append(explanation, fmt.Sprintf("spec.staticTargeting is true, so after we pick an initial set of targets and inject, "+
982+
"we will not attempt to inject into any new targets that appear while the disruption is ongoing."))
983+
} else {
984+
explanation = append(explanation, "By default we will continually compare the injected target count "+
985+
"to your defined spec.count, and add/remove targets as needed, e.g., with a count of \"100%\", if new targets "+
986+
"are scheduled, we will inject into them as well. "+
987+
"If you want a different behavior, trying setting spec.staticTargeting to true.")
988+
}
989+
990+
if s.Reporting != nil {
991+
explanation = append(explanation, s.Reporting.Explain())
992+
}
993+
994+
if s.NodeFailure != nil {
995+
explanation = append(explanation, s.NodeFailure.Explain()...)
996+
}
997+
998+
if s.ContainerFailure != nil {
999+
explanation = append(explanation, s.ContainerFailure.Explain()...)
1000+
}
1001+
1002+
if s.Network != nil {
1003+
explanation = append(explanation, s.Network.Explain()...)
1004+
}
1005+
1006+
if s.CPUPressure != nil {
1007+
explanation = append(explanation, s.CPUPressure.Explain()...)
1008+
}
1009+
1010+
if s.DiskPressure != nil {
1011+
explanation = append(explanation, s.DiskPressure.Explain()...)
1012+
}
1013+
1014+
if s.DiskFailure != nil {
1015+
explanation = append(explanation, s.DiskFailure.Explain()...)
1016+
}
1017+
1018+
if s.DNS != nil {
1019+
explanation = append(explanation, s.DNS.Explain()...)
1020+
}
1021+
1022+
if s.GRPC != nil {
1023+
explanation = append(explanation, s.GRPC.Explain()...)
1024+
}
1025+
1026+
return explanation
1027+
}
1028+
8781029
// RemoveDeadTargets removes targets not found in matchingTargets from the targets list
8791030
func (status *DisruptionStatus) RemoveDeadTargets(matchingTargets []string) {
8801031
desiredTargets := TargetInjections{}

api/v1beta1/dns_disruption.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,13 @@ func (s DNSDisruptionSpec) GenerateArgs() []string {
7474

7575
return args
7676
}
77+
78+
func (s DNSDisruptionSpec) Explain() []string {
79+
explanation := []string{"", "spec.dns will intercept DNS requests from the target, replacing the specified records:"}
80+
81+
for _, pair := range s {
82+
explanation = append(explanation, fmt.Sprintf("DNS queries for %s will return an %s record with the value %s", pair.Hostname, pair.Record.Type, pair.Record.Value))
83+
}
84+
85+
return explanation
86+
}

api/v1beta1/grpc_disruption.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,32 @@ func (s GRPCDisruptionSpec) GenerateArgs() []string {
148148

149149
return args
150150
}
151+
152+
func (s GRPCDisruptionSpec) Explain() []string {
153+
explanation := []string{"",
154+
"spec.grpc will activate the chaos interceptor on the targeted grpc server, replacing responses with specified errors.",
155+
"This disruption type can only work on grpc servers, not on the clients, " +
156+
"and requires [the server to cooperate by installing the chaos interceptor]" +
157+
"(https://github.com/DataDog/chaos-controller/blob/main/docs/grpc_disruption/instructions.md)",
158+
"The following endpoints will be intercepted:",
159+
}
160+
161+
for _, endpt := range s.Endpoints {
162+
var spoof string
163+
164+
if endpt.ErrorToReturn != "" {
165+
spoof = endpt.ErrorToReturn
166+
} else {
167+
spoof = endpt.OverrideToReturn
168+
}
169+
170+
queryPercentExpl := fmt.Sprintf("%d%%", endpt.QueryPercent)
171+
if endpt.QueryPercent == 0 {
172+
queryPercentExpl = "up to 100% (evenly divided across all alterations on this endpoint)"
173+
}
174+
175+
explanation = append(explanation, fmt.Sprintf("\t\tThe endpoint %s will return %s %s of the time", endpt.TargetEndpoint, spoof, queryPercentExpl))
176+
}
177+
178+
return explanation
179+
}

0 commit comments

Comments
 (0)