19
19
import webbrowser
20
20
from datetime import datetime , timezone
21
21
from pathlib import Path
22
+ from threading import Thread
22
23
from typing import (
23
24
Any ,
24
25
Callable ,
@@ -946,27 +947,110 @@ def run(self, comp: Composition, workflow: Workflow) -> None:
946
947
raise Failed (f"Unable to kill container { container_id } : { e } " )
947
948
948
949
949
- @Steps .register ("chaos-delay-docker" )
950
- class ChaosDelayDockerStep (WorkflowStep ):
951
- """Delay the incoming and outgoing network traffic for a Docker service.
950
class ChaosNetemStep(WorkflowStep):
    """Base class for running network chaos tests against a Docker container.

    We use pumba (a chaos testing tool for Docker) to run the various netem tests.
    pumba only supports Docker container names (not container ids), meaning that
    we have to pass the exact container name to these steps. We should fix this in
    the future.

    Subclasses must override `get_cmd` to build the command line to execute.

    Params:
        duration: how long the netem command should run, in minutes. A value
            of -1 means "not provided" and is replaced by a 7 day default.
    """

    # Fallback used when duration is -1: 7 days, expressed in minutes.
    _DEFAULT_DURATION_MINUTES = 7 * 24 * 60

    def __init__(self, duration: int) -> None:
        self._duration = duration

    def run(self, comp: Composition, workflow: Workflow) -> None:
        """Start the netem command on a background thread and return immediately.

        The command is always run in a non-blocking daemon thread, whether or
        not an explicit duration was provided.
        """
        if self._duration == -1:
            # If duration isn't provided, run for 7 days.
            duration = self._DEFAULT_DURATION_MINUTES
        else:
            duration = self._duration

        cmd = self.get_cmd(duration).split()
        # daemon=True so a still-running netem command never blocks interpreter exit.
        netem_thread = Thread(target=self.threaded_netem, args=(cmd,), daemon=True)
        netem_thread.start()

    def get_cmd(self, duration: int) -> str:
        """Return the whitespace-separated command line to run.

        Params:
            duration: resolved duration, in minutes

        Raises:
            NotImplementedError: always, in the base class; subclasses must
                override this method.
        """
        # Fail loudly here instead of returning None, which would surface in
        # run() as an opaque AttributeError on `.split()`.
        raise NotImplementedError(
            f"{type(self).__name__} must implement get_cmd()"
        )

    def threaded_netem(self, cmd: List[str]) -> None:
        """Run `cmd` to completion; this is the target of the background thread.

        Raises:
            Failed: if the command exits unsuccessfully. NOTE: this is raised
                on the daemon thread started by `run`, so it is not propagated
                to the caller of `run`.
        """
        try:
            spawn.runv(cmd)
        except subprocess.CalledProcessError as e:
            raise Failed(f"Unable to run netem chaos command: {e}") from e
981
+
982
+
983
@Steps.register("chaos-delay-docker")
class ChaosDelayDockerStep(ChaosNetemStep):
    """Delay the egress network traffic for a Docker service.

    Params:
        container: name of the Docker container, passed to pumba as the target
        duration: how long to apply the delay; rendered with an `m` suffix in
            pumba's `--duration` flag. The default of -1 is forwarded to the
            base class unchanged.
        delay: value for pumba's `--time` option (default: 250)
        jitter: value for pumba's `--jitter` option (default: 250)
    """

    def __init__(
        self, container: str, duration: int = -1, delay: int = 250, jitter: int = 250,
    ) -> None:
        super().__init__(duration=duration)
        self._container = container
        self._delay = delay
        self._jitter = jitter

    def get_cmd(self, duration: int) -> str:
        """Return the pumba `netem delay` command line for this container."""
        # Adjacent f-strings are concatenated by the parser; unlike a backslash
        # continuation inside the literal, this keeps source indentation out of
        # the command string.
        return (
            f"pumba --random netem --duration {duration}m delay --time {self._delay} "
            f"--jitter {self._jitter} --distribution normal {self._container}"
        )
999
+
1000
+
1001
@Steps.register("chaos-rate-docker")
class ChaosRateDockerStep(ChaosNetemStep):
    """Limit the egress network traffic for a Docker service.

    Params:
        container: name of the Docker container, passed to pumba as the target
        duration: how long to apply the limit; rendered with an `m` suffix in
            pumba's `--duration` flag (default: -1, forwarded to the base class)
    """

    def __init__(self, container: str, duration: int = -1) -> None:
        super().__init__(duration=duration)
        self._container = container

    def get_cmd(self, duration: int) -> str:
        """Return the pumba `netem rate` command line for this container."""
        target = self._container
        return f"pumba netem --duration {duration}m rate {target}"
1012
+
1013
+
1014
@Steps.register("chaos-loss-docker")
class ChaosLossDockerStep(ChaosNetemStep):
    """Lose a percent of a Docker container's network packets.

    Params:
        container: name of the Docker container, passed to pumba as the target
        percent: value for pumba's `--percent` option
        duration: how long to apply the loss; rendered with an `m` suffix in
            pumba's `--duration` flag (default: -1, forwarded to the base class)
    """

    def __init__(self, container: str, percent: int, duration: int = -1) -> None:
        super().__init__(duration=duration)
        self._container = container
        self._percent = percent

    def get_cmd(self, duration: int) -> str:
        """Return the pumba `netem loss` command line for this container."""
        target = self._container
        pct = self._percent
        return f"pumba netem --duration {duration}m loss --percent {pct} {target}"
1026
+
1027
+
1028
@Steps.register("chaos-duplicate-docker")
class ChaosDuplicateDockerStep(ChaosNetemStep):
    """Duplicate a percent of a Docker container's network packets.

    Params:
        container: name of the Docker container, passed to pumba as the target
        percent: value for pumba's `--percent` option
        duration: how long to apply the duplication; rendered with an `m` suffix
            in pumba's `--duration` flag (default: -1, forwarded to the base class)
    """

    def __init__(self, container: str, percent: int, duration: int = -1) -> None:
        super().__init__(duration=duration)
        self._container = container
        self._percent = percent

    def get_cmd(self, duration: int) -> str:
        """Return the pumba `netem duplicate` command line for this container."""
        target = self._container
        pct = self._percent
        return f"pumba netem --duration {duration}m duplicate --percent {pct} {target}"
1040
+
1041
+
1042
@Steps.register("chaos-corrupt-docker")
class ChaosCorruptDockerStep(ChaosNetemStep):
    """Corrupt a percent of a Docker container's network packets.

    Params:
        container: name of the Docker container, passed to pumba as the target
        percent: value for pumba's `--percent` option
        duration: how long to apply the corruption; rendered with an `m` suffix
            in pumba's `--duration` flag (default: -1, forwarded to the base class)
    """

    def __init__(self, container: str, percent: int, duration: int = -1) -> None:
        super().__init__(duration=duration)
        self._container = container
        self._percent = percent

    def get_cmd(self, duration: int) -> str:
        """Return the pumba `netem corrupt` command line for this container."""
        target = self._container
        pct = self._percent
        return f"pumba netem --duration {duration}m corrupt --percent {pct} {target}"
970
1054
971
1055
972
1056
@Steps .register ("chaos-confirm" )
@@ -991,6 +1075,10 @@ def run(self, comp: Composition, workflow: Workflow) -> None:
991
1075
if not comp .docker_container_is_running (container_id ):
992
1076
raise Failed (f"chaos-confirm: container { container_id } is not running" )
993
1077
else :
1078
+ if comp .docker_container_is_running (container_id ):
1079
+ raise Failed (
1080
+ f"chaos-confirm: expected { container_id } to have exited, is running"
1081
+ )
994
1082
actual_exit_code = comp .docker_inspect ("{{.State.ExitCode}}" , container_id )
995
1083
if actual_exit_code != f"'{ self ._exit_code } '" :
996
1084
raise Failed (
0 commit comments