Skip to content

Commit b81a033

Browse files
[chaosplt-161] fix bug on cloud disruptions adding too much hosts (#805)
* fix bug on cloud disruptions adding too much hosts * receiver name change * move to disruption controller cloud host computing * remove useless var * remove wrong tests in chaos pod service + update disruption controller tests * remove human mistake :oops: * remove human mistake :oops: * fix tests * Update api/v1beta1/network_disruption.go Co-authored-by: Philip Thompson <philip.thompson@datadoghq.com> * Update controllers/disruption_controller.go Co-authored-by: Philip Thompson <philip.thompson@datadoghq.com> * add another test * complexify test * Update api/v1beta1/network_disruption.go Co-authored-by: Philip Thompson <philip.thompson@datadoghq.com> --------- Co-authored-by: Philip Thompson <philip.thompson@datadoghq.com>
1 parent defb431 commit b81a033

File tree

7 files changed

+134
-131
lines changed

7 files changed

+134
-131
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ Once you have installed the above requirements, run the `make lima-all` command
4545
- `make lima-start` to create the lima vm with containerd and Kubernetes (backed by k3s)
4646
- `make lima-kubectx` to add the lima Kubernetes cluster config to your local configs and switch to the lima context
4747
- `make lima-install-cert-manager` to install cert-manager
48-
- `make lima-build` to build the chaos-controller images
48+
- `make lima-push-all` to build and push the chaos-controller images
4949
- `make lima-install` to render and apply the chaos-controller helm chart
5050

5151
Once the instance is started, you can log into it using either the `lima` or its longer form `limactl shell <$LIMA_INSTANCE>` commands.

api/v1beta1/network_disruption.go

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -708,11 +708,17 @@ func (s NetworkDisruptionServiceSpec) ExtractAffectedPortsInServicePorts(k8sServ
708708
return goodPorts, notFoundPorts
709709
}
710710

711-
// TransformCloudSpecToHostsSpec from a cloud spec disruption, get all ip ranges of services provided and transform them into a list of hosts spec
712-
func TransformCloudSpecToHostsSpec(cloudManager cloudservice.CloudServicesProvidersManager, cloudSpec *NetworkDisruptionCloudSpec) ([]NetworkDisruptionHostSpec, error) {
713-
var hosts []NetworkDisruptionHostSpec
711+
// UpdateHostsOnCloudDisruption from a cloud spec disruption, get all ip ranges of services provided and appends them into the s.Hosts slice
712+
func (s *NetworkDisruptionSpec) UpdateHostsOnCloudDisruption(cloudManager cloudservice.CloudServicesProvidersManager) error {
713+
if s == nil || s.Cloud == nil {
714+
return nil
715+
}
716+
717+
if s.Hosts == nil {
718+
s.Hosts = []NetworkDisruptionHostSpec{}
719+
}
714720

715-
clouds := cloudSpec.TransformToCloudMap()
721+
clouds := s.Cloud.TransformToCloudMap()
716722

717723
for cloudName, serviceList := range clouds {
718724
var serviceListNames []string
@@ -723,20 +729,22 @@ func TransformCloudSpecToHostsSpec(cloudManager cloudservice.CloudServicesProvid
723729

724730
ipRangesPerService, err := cloudManager.GetServicesIPRanges(types.CloudProviderName(cloudName), serviceListNames)
725731
if err != nil {
726-
return nil, err
732+
return err
727733
}
728734

729735
for _, serviceSpec := range serviceList {
730736
for _, ipRange := range ipRangesPerService[serviceSpec.ServiceName] {
731-
hosts = append(hosts, NetworkDisruptionHostSpec{
737+
host := NetworkDisruptionHostSpec{
732738
Host: ipRange,
733739
Protocol: serviceSpec.Protocol,
734740
Flow: serviceSpec.Flow,
735741
ConnState: serviceSpec.ConnState,
736-
})
742+
}
743+
744+
s.Hosts = append(s.Hosts, host)
737745
}
738746
}
739747
}
740748

741-
return hosts, nil
749+
return nil
742750
}

controllers/disruption_controller.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"time"
2626

2727
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
28+
"github.com/DataDog/chaos-controller/cloudservice"
2829
"github.com/DataDog/chaos-controller/o11y/metrics"
2930
"github.com/DataDog/chaos-controller/o11y/tracer"
3031
"github.com/DataDog/chaos-controller/safemode"
@@ -70,6 +71,7 @@ type DisruptionReconciler struct {
7071
CacheContextStore map[string]CtxTuple
7172
DisruptionsWatchersManager watchers.DisruptionsWatchersManager
7273
ChaosPodService services.ChaosPodService
74+
CloudService cloudservice.CloudServicesProvidersManager
7375
DisruptionsDeletionTimeout time.Duration
7476
}
7577

@@ -484,6 +486,14 @@ func (r *DisruptionReconciler) startInjection(ctx context.Context, instance *cha
484486
r.log.Infow("starting targets injection", "targets", instance.Status.TargetInjections)
485487
}
486488

489+
// on cloud disruption, update hosts
490+
subspec := instance.Spec.DisruptionKindPicker(chaostypes.DisruptionKindNetworkDisruption)
491+
if reflect.ValueOf(subspec).IsValid() && !reflect.ValueOf(subspec).IsNil() {
492+
if err = instance.Spec.Network.UpdateHostsOnCloudDisruption(r.CloudService); err != nil {
493+
return err
494+
}
495+
}
496+
487497
// iterate through target + existing disruption kind -- to ensure all chaos pods exist
488498
for targetName, injections := range instance.Status.TargetInjections {
489499
for _, disKind := range chaostypes.DisruptionKindNames {

controllers/disruption_controller_test.go

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -512,10 +512,47 @@ var _ = Describe("Disruption Controller", func() {
512512
})
513513

514514
Context("Cloud disruption is a host disruption disguised", func() {
515+
VerifyHosts := func(ctx SpecContext) (error, int) {
516+
// get chaos pods
517+
l, err := listChaosPods(ctx, disruption)
518+
if err != nil {
519+
return err, 0
520+
}
521+
522+
hosts := make([]int, len(l.Items))
523+
524+
// sum up injectors
525+
for i, p := range l.Items {
526+
hosts[i] = 0
527+
args := p.Spec.Containers[0].Args
528+
for _, arg := range args {
529+
if arg == "--hosts" {
530+
hosts[i]++
531+
}
532+
}
533+
}
534+
535+
for i, hostsForItem := range hosts {
536+
if hostsForItem == 0 {
537+
return fmt.Errorf("should have multiple hosts parameters."), 0
538+
}
539+
540+
// verify that all chaos pods have the same list of hosts
541+
if i > 0 {
542+
if hosts[i] != hosts[i-1] {
543+
return fmt.Errorf("should have the same list of hosts for all chaos pods"), hosts[i]
544+
}
545+
}
546+
}
547+
548+
return nil, hosts[0]
549+
}
550+
515551
BeforeEach(func() {
552+
skipSecondPod = false
516553
disruption.Spec = chaosv1beta1.DisruptionSpec{
517554
DryRun: false,
518-
Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1},
555+
Count: &intstr.IntOrString{Type: intstr.String, StrVal: "100%"},
519556
Unsafemode: &chaosv1beta1.UnsafemodeSpec{
520557
DisableAll: true,
521558
},
@@ -538,33 +575,65 @@ var _ = Describe("Disruption Controller", func() {
538575
})
539576

540577
It("should create a cloud disruption but apply a host disruption with the list of cloud managed service ip ranges", func(ctx SpecContext) {
578+
totalNbOfHosts := 0
541579
By("Ensuring that the chaos pod have been created")
542-
ExpectChaosPods(ctx, disruption, 1)
580+
ExpectChaosPods(ctx, disruption, 2)
543581

544-
By("Ensuring that the chaos pod has the list of AWS hosts")
582+
By("Ensuring that the chaos pods have the list of AWS hosts")
545583
Eventually(func(ctx SpecContext) error {
546-
hosts := 0
584+
err, nbOfHosts := VerifyHosts(ctx)
585+
if err != nil {
586+
return err
587+
}
588+
589+
if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
590+
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
591+
}
547592

548-
// get chaos pods
549-
l, err := listChaosPods(ctx, disruption)
593+
totalNbOfHosts = nbOfHosts
594+
595+
return nil
596+
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())
597+
598+
By("Ensuring adding another chaos pod will result in the same number of hosts")
599+
By("creating extra target one")
600+
extraOneCreated := CreateRunningPod(ctx, *targetPod.DeepCopy())
601+
602+
By("waiting extra targets to be created and running")
603+
extraTargetPod := <-extraOneCreated
604+
ExpectChaosPods(ctx, disruption, 3)
605+
606+
By("Ensuring that the chaos pods have the same nb of AWS hosts")
607+
Eventually(func(ctx SpecContext) error {
608+
err, nbOfHosts := VerifyHosts(ctx)
550609
if err != nil {
551610
return err
552611
}
553612

554-
// sum up injectors
555-
for _, p := range l.Items {
556-
args := p.Spec.Containers[0].Args
557-
for _, arg := range args {
558-
if arg == "--hosts" {
559-
hosts++
560-
}
561-
}
613+
if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
614+
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
562615
}
563616

564-
if hosts == 0 {
565-
return fmt.Errorf("should have multiple hosts parameters.")
617+
totalNbOfHosts = nbOfHosts
618+
619+
return nil
620+
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())
621+
622+
By("Deleting extra target and ensuring the number of hosts is the same")
623+
DeleteRunningPod(ctx, extraTargetPod)
624+
ExpectChaosPods(ctx, disruption, 2)
625+
Eventually(func(ctx SpecContext) error {
626+
err, nbOfHosts := VerifyHosts(ctx)
627+
if err != nil {
628+
return err
566629
}
567630

631+
if totalNbOfHosts != 0 && totalNbOfHosts != nbOfHosts {
632+
return fmt.Errorf("should have the same number of hosts for all chaos pods in all iterations")
633+
}
634+
635+
totalNbOfHosts = nbOfHosts
636+
568637
return nil
569638
}).WithContext(ctx).ProbeEvery(disruptionPotentialChangesEvery).Within(calcDisruptionGoneTimeout(disruption)).Should(Succeed())
570639
})

main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,9 +171,8 @@ func main() {
171171
DNSDisruptionKubeDNS: cfg.Injector.DNSDisruption.KubeDNS,
172172
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
173173
},
174-
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
175-
MetricsSink: metricsSink,
176-
CloudServicesProvidersManager: cloudProviderManager,
174+
ImagePullSecrets: cfg.Injector.ImagePullSecrets,
175+
MetricsSink: metricsSink,
177176
})
178177

179178
if err != nil {
@@ -192,6 +191,7 @@ func main() {
192191
ExpiredDisruptionGCDelay: gcPtr,
193192
CacheContextStore: make(map[string]controllers.CtxTuple),
194193
ChaosPodService: chaosPodService,
194+
CloudService: cloudProviderManager,
195195
DisruptionsDeletionTimeout: cfg.Controller.DisruptionDeletionTimeout,
196196
}
197197

services/chaospod.go

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414

1515
chaosapi "github.com/DataDog/chaos-controller/api"
1616
chaosv1beta1 "github.com/DataDog/chaos-controller/api/v1beta1"
17-
"github.com/DataDog/chaos-controller/cloudservice"
1817
"github.com/DataDog/chaos-controller/env"
1918
"github.com/DataDog/chaos-controller/o11y/metrics"
2019
"github.com/DataDog/chaos-controller/targetselector"
@@ -75,14 +74,13 @@ type ChaosPodServiceInjectorConfig struct {
7574

7675
// ChaosPodServiceConfig contains configuration options for the chaosPodService.
7776
type ChaosPodServiceConfig struct {
78-
Client client.Client // Kubernetes client for interacting with the API server.
79-
Log *zap.SugaredLogger // Logger for logging.
80-
ChaosNamespace string // Namespace where chaos-related resources are located.
81-
TargetSelector targetselector.TargetSelector // Target selector for selecting target pods.
82-
Injector ChaosPodServiceInjectorConfig // Configuration options for the injector.
83-
ImagePullSecrets string // Image pull secrets for the chaosPodService.
84-
MetricsSink metrics.Sink // Sink for exporting metrics.
85-
CloudServicesProvidersManager cloudservice.CloudServicesProvidersManager // Manager for cloud service providers.
77+
Client client.Client // Kubernetes client for interacting with the API server.
78+
Log *zap.SugaredLogger // Logger for logging.
79+
ChaosNamespace string // Namespace where chaos-related resources are located.
80+
TargetSelector targetselector.TargetSelector // Target selector for selecting target pods.
81+
Injector ChaosPodServiceInjectorConfig // Configuration options for the injector.
82+
ImagePullSecrets string // Image pull secrets for the chaosPodService.
83+
MetricsSink metrics.Sink // Sink for exporting metrics.
8684
}
8785

8886
type chaosPodService struct {
@@ -243,24 +241,11 @@ func (m *chaosPodService) GenerateChaosPodsOfDisruption(instance *chaosv1beta1.D
243241
}
244242

245243
notInjectedBefore := instance.TimeToInject()
246-
247244
allowedHosts := m.config.Injector.NetworkDisruptionAllowedHosts
248245

249-
// get the ip ranges of cloud provider services
250-
if instance.Spec.Network != nil {
251-
if instance.Spec.Network.Cloud != nil {
252-
hosts, err := chaosv1beta1.TransformCloudSpecToHostsSpec(m.config.CloudServicesProvidersManager, instance.Spec.Network.Cloud)
253-
if err != nil {
254-
return nil, err
255-
}
256-
257-
instance.Spec.Network.Hosts = append(instance.Spec.Network.Hosts, hosts...)
258-
}
259-
260-
// remove default allowed hosts if disabled
261-
if instance.Spec.Network.DisableDefaultAllowedHosts {
262-
allowedHosts = make([]string, 0)
263-
}
246+
// remove default allowed hosts if disabled
247+
if instance.Spec.Network != nil && instance.Spec.Network.DisableDefaultAllowedHosts {
248+
allowedHosts = make([]string, 0)
264249
}
265250

266251
xargs := chaosapi.DisruptionArgs{

0 commit comments

Comments
 (0)