Skip to content

Commit 81356af

Browse files
authored
K8SPS-455: Improve self-healing tests (#941)
* K8SPS-455: Improve gr-self-healing * fix operator-self-healing
1 parent d61c487 commit 81356af

14 files changed

+183
-55
lines changed

e2e-tests/conf/client.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ metadata:
55
labels:
66
name: mysql-client
77
spec:
8+
terminationGracePeriodSeconds: 10
89
containers:
910
- name: mysql-client
1011
image: percona/percona-server:8.0.33

e2e-tests/functions

Lines changed: 157 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ deploy_tls_cluster_secrets() {
100100
}
101101

102102
deploy_client() {
	# Deploy the mysql-client helper pod, overriding its container image
	# with the one under test (${IMAGE_MYSQL}).
	yq eval '.spec.containers[0].image="'"${IMAGE_MYSQL}"'"' "${TESTS_CONFIG_DIR}/client.yaml" \
		| kubectl -n "${NAMESPACE}" apply -f -
}
105106

106107
apply_s3_storage_secrets() {
@@ -385,10 +386,29 @@ run_mysqlsh() {
385386
wait_pod $client_pod 1>&2
386387

387388
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- \
388-
bash -c "printf '%s\n' \"${command}\" | mysqlsh --sql --quiet-start=2 $uri" 2>&1 \
389+
bash -c "printf '%s\n' \"${command}\" | mysqlsh --sql --quiet-start=2 $uri" 2>/dev/null \
389390
| tail -n +2
390391
}
391392

393+
get_innodb_cluster_status() {
	# Print the InnoDB cluster status report for the given mysqlsh URI,
	# running `cluster status` through the JS AdminAPI in the client pod.
	local uri="$1"

	client_pod=$(get_client_pod)
	wait_pod $client_pod 1>&2

	kubectl -n "${NAMESPACE}" exec "${client_pod}" -- \
		mysqlsh --js --quiet-start=2 --uri ${uri} -- cluster status
}
401+
402+
wait_until_innodb_ok() {
	# Block until the InnoDB cluster reachable at the given mysqlsh URI
	# reports .defaultReplicaSet.status == "OK".
	#
	# Bug fix: `retry` was incremented but never checked, so a cluster that
	# never recovered made this loop forever. Bound it at 60 attempts
	# (~300s, matching the wait_until_chaos_* helpers) and fail the test.
	local uri="$1"

	local retry=0
	until [[ $(get_innodb_cluster_status ${uri} | jq -r .defaultReplicaSet.status) == "OK" ]]; do
		if [[ ${retry} -ge 60 ]]; then
			echo "Timeout (300s) exceeded while waiting for InnoDB cluster status to be OK"
			exit 1
		fi
		sleep 5
		retry=$((retry + 1))
	done
}
411+
392412
run_curl() {
393413
kubectl -n "${NAMESPACE}" exec mysql-client -- bash -c "curl -s -k $*"
394414
}
@@ -397,6 +417,13 @@ get_innodb_cluster_name() {
397417
echo $(get_cluster_name) | tr -cd '[^a-zA-Z0-9_]+'
398418
}
399419

420+
get_mysqlsh_uri_for_pod() {
	# Build a mysqlsh connection URI addressing one specific mysql pod
	# through the cluster's headless service DNS name.
	local pod="$1"

	printf '%s\n' "root:root_password@${pod}.$(get_cluster_name)-mysql.${NAMESPACE}"
}
426+
400427
get_mysqlsh_uri() {
401428
local idx=${1:-0}
402429

@@ -409,7 +436,7 @@ get_gr_status() {
409436

410437
client_pod=$(get_client_pod)
411438

412-
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- mysqlsh --uri $uri --cluster --result-format json -- cluster status \
439+
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- mysqlsh --js --uri $uri --cluster --result-format json -- cluster status \
413440
| sed -e 's/mysql: //' \
414441
| (grep -v 'Using a password on the command line interface can be insecure.' || :)
415442
}
@@ -525,7 +552,7 @@ get_router_pods() {
525552
get_mysql_users() {
526553
local args=$1
527554

528-
run_mysql "SELECT user FROM mysql.user" "${args}" | grep -vE "mysql|root"
555+
run_mysql "SELECT user FROM mysql.user" "${args}" | grep -vE "mysql|root|percona.telemetry"
529556
}
530557

531558
get_service_ip() {
@@ -780,19 +807,14 @@ deploy_chaos_mesh() {
780807

781808
helm repo add chaos-mesh https://charts.chaos-mesh.org
782809
if [ -n "${MINIKUBE}" ]; then
783-
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=docker --set dashboard.create=false --version ${CHAOS_MESH_VER} --wait
810+
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=docker --set dashboard.create=false --version ${CHAOS_MESH_VER}
784811
else
785812
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --set dashboard.create=false --version ${CHAOS_MESH_VER}
786813
fi
787814
if [[ -n $OPENSHIFT ]]; then
788815
oc adm policy add-scc-to-user privileged -z chaos-daemon --namespace=${NAMESPACE}
789816
fi
790-
791-
echo "Waiting for chaos-mesh DaemonSet to be ready..."
792-
until [ "$(kubectl get daemonset chaos-daemon -n ${NAMESPACE} -o jsonpath='{.status.numberReady}')" = "$(kubectl get daemonset chaos-daemon -n ${NAMESPACE} -o jsonpath='{.status.desiredNumberScheduled}')" ]; do
793-
echo "Waiting for DaemonSet chaos-daemon..."
794-
sleep 5
795-
done
817+
sleep 10
796818
}
797819

798820
destroy_chaos_mesh() {
@@ -824,17 +846,17 @@ kill_pods() {
824846
local selector=$2
825847
local pod_label=$3
826848
local label_value=$4
827-
local chaos_suffix=$5
849+
local chaos_name=$5
828850

829851
if [ "${selector}" == "pod" ]; then
830852
yq eval '
831-
.metadata.name = "chaos-pod-kill-'${chaos_suffix}'" |
853+
.metadata.name = "'${chaos_name}'" |
832854
del(.spec.selector.pods.test-namespace) |
833855
.spec.selector.pods.'${ns}'[0] = "'${pod_label}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
834856
| kubectl apply --namespace ${ns} -f -
835857
elif [ "${selector}" == "label" ]; then
836858
yq eval '
837-
.metadata.name = "chaos-kill-label-'${chaos_suffix}'" |
859+
.metadata.name = "'${chaos_name}'" |
838860
.spec.mode = "all" |
839861
del(.spec.selector.pods) |
840862
.spec.selector.labelSelectors."'${pod_label}'" = "'${label_value}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
@@ -846,10 +868,10 @@ kill_pods() {
846868
failure_pod() {
847869
local ns=$1
848870
local pod=$2
849-
local chaos_suffix=$3
871+
local chaos_name=$3
850872

851873
yq eval '
852-
.metadata.name = "chaos-pod-failure-'${chaos_suffix}'" |
874+
.metadata.name = "'${chaos_name}'" |
853875
del(.spec.selector.pods.test-namespace) |
854876
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-pod-failure.yml \
855877
| kubectl apply --namespace ${ns} -f -
@@ -859,16 +881,133 @@ failure_pod() {
859881
network_loss() {
	# Apply a chaos-mesh NetworkChaos experiment that drops traffic to a pod.
	#   $1 - namespace, $2 - target pod name, $3 - NetworkChaos resource name
	local ns="$1"
	local pod="$2"
	local chaos_name="$3"

	# Rewrite the template: set the resource name and retarget the pod
	# selector from the placeholder namespace to ours.
	local expr='
		.metadata.name = "'${chaos_name}'" |
		del(.spec.selector.pods.test-namespace) |
		.spec.selector.pods.'${ns}'[0] = "'${pod}'"'
	yq eval "${expr}" ${TESTS_CONFIG_DIR}/chaos-network-loss.yml \
		| kubectl apply --namespace ${ns} -f -
	sleep 5
}
871893

894+
wait_until_chaos_applied() {
	# Wait until the chaos-mesh experiment records a successful "Apply" event,
	# polling every 10s for up to 30 tries (300s).
	#   $1 - chaos type: kill | failure | full-cluster-crash | network
	#   $2 - name of the chaos resource
	local chaos_type=$1
	local chaos_name=$2

	local resource
	case ${chaos_type} in
		"kill" | "failure" | "full-cluster-crash")
			resource=podchaos/${chaos_name}
			;;
		"network")
			resource=networkchaos/${chaos_name}
			;;
		*)
			# Fail fast: without this, an unknown type left ${resource} empty
			# and kubectl was polled with a bogus resource for 300s.
			echo "unknown chaos type: ${chaos_type}"
			exit 1
			;;
	esac

	local retry=0
	local succeeded
	until [[ ${retry} == 30 ]]; do
		sleep 10
		retry=$((retry + 1))

		succeeded=$(kubectl -n ${NAMESPACE} get ${resource} -o yaml \
			| yq '.status.experiment.containerRecords[].events[]
				| select(.operation == "Apply" and .type == "Succeeded")')

		if [[ -n ${succeeded} ]]; then
			return
		fi
	done

	echo "Timeout (300s) exceeded while waiting for chaos to be applied"
	exit 1
}
925+
926+
wait_until_chaos_recovered() {
	# Wait until the chaos-mesh experiment records a successful "Recover" event,
	# polling every 10s for up to 30 tries (300s). Only "failure" and "network"
	# experiments have a duration and therefore a recovery phase.
	#   $1 - chaos type: kill | failure | network
	#   $2 - name of the chaos resource
	local chaos_type=$1
	local chaos_name=$2

	local resource
	case ${chaos_type} in
		"kill" | "failure")
			resource=podchaos/${chaos_name}
			;;
		"network")
			resource=networkchaos/${chaos_name}
			;;
		*)
			# Fail fast: without this, an unknown type left ${resource} empty
			# and kubectl was polled with a bogus resource for 300s.
			echo "unknown chaos type: ${chaos_type}"
			exit 1
			;;
	esac

	local retry=0
	local succeeded
	until [[ ${retry} == 30 ]]; do
		sleep 10
		retry=$((retry + 1))

		succeeded=$(kubectl -n ${NAMESPACE} get ${resource} -o yaml \
			| yq '.status.experiment.containerRecords[].events[]
				| select(.operation == "Recover" and .type == "Succeeded")')

		if [[ -n ${succeeded} ]]; then
			return
		fi
	done

	echo "Timeout (300s) exceeded while waiting for chaos to be recovered"
	exit 1
}
957+
958+
check_primary_chaos() {
	# Run a chaos experiment against the current primary and verify the cluster
	# self-heals: the chaos is applied (and recovered, where applicable), a new
	# primary gets elected, and all 3 members return to ONLINE.
	#   $1 - chaos type: kill | full-cluster-crash | failure | network
	#   $2 - namespace
	#   $3 - primary pod name captured before the chaos is applied
	local chaos_type=$1
	local ns=$2
	local primary_before_failure=$3

	local chaos_name
	case ${chaos_type} in
		"kill")
			chaos_name="chaos-pod-kill-primary"
			kill_pods "${ns}" "pod" "${primary_before_failure}" "" "${chaos_name}"
			;;
		"full-cluster-crash")
			chaos_name="chaos-kill-label-cluster-crash"
			# Target every pod of the cluster through its instance label.
			# Derived from the CR name instead of the previously hard-coded
			# "gr-self-healing" so other test suites can reuse this helper.
			kill_pods "${ns}" "label" "app.kubernetes.io/instance" "$(get_cluster_name)" "${chaos_name}"
			;;
		"failure")
			chaos_name="chaos-pod-failure-primary"
			failure_pod "${ns}" "${primary_before_failure}" "${chaos_name}"
			;;
		"network")
			chaos_name="chaos-pod-network-loss-primary"
			network_loss "${ns}" "${primary_before_failure}" "${chaos_name}"
			;;
	esac

	wait_until_chaos_applied ${chaos_type} ${chaos_name}
	# "failure" and "network" experiments have a duration; wait for chaos-mesh
	# to restore the pod before checking cluster state
	if [[ ${chaos_type} == "failure" || ${chaos_type} == "network" ]]; then
		wait_until_chaos_recovered ${chaos_type} ${chaos_name}
	fi

	wait_cluster_consistency_gr "$(get_cluster_name)" 3 3

	local primary_after_failure
	local uri
	primary_after_failure=$(get_primary_from_group_replication)
	uri=$(get_mysqlsh_uri_for_pod ${primary_after_failure})
	wait_until_innodb_ok ${uri}

	# a healthy cluster must have failed over to a different primary
	if [[ "${primary_before_failure}" == "${primary_after_failure}" ]]; then
		echo "primary pod was not killed! something went wrong."
		exit 1
	fi

	# count members the (re-fetched) primary reports as ONLINE
	local online_members
	uri=$(get_mysqlsh_uri_for_pod $(get_primary_from_group_replication))
	online_members=$(get_innodb_cluster_status ${uri} \
		| jq .defaultReplicaSet.topology[].status \
		| grep ONLINE \
		| wc -l)

	if [[ ${online_members} != 3 ]]; then
		echo "expected 3 online members, got ${online_members}"
		exit 1
	fi
}
1010+
8721011
renew_certificate() {
8731012
certificate="$1"
8741013

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,12 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 30
3+
44
commands:
5-
- script: |-
5+
- timeout: 720
6+
script: |-
67
set -o errexit
78
set -o xtrace
89
910
source ../../functions
1011
11-
init_pod="$(get_primary_from_group_replication)"
12-
kill_pods "${NAMESPACE}" "pod" "$init_pod" "" "primary"
13-
sleep 20 # wait a bit for pod to be killed
14-
15-
if [ "$init_pod" == "$(get_primary_from_group_replication)" ]; then
16-
echo "primary pod was not killed! something went wrong."
17-
exit 1
18-
fi
12+
check_primary_chaos "kill" ${NAMESPACE} $(get_primary_from_group_replication)
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 30
43
commands:
5-
- script: |-
4+
- timeout: 720
5+
script: |-
66
set -o errexit
77
set -o xtrace
88
99
source ../../functions
1010
11-
failure_pod "${NAMESPACE}" "$(get_primary_from_group_replication)" "primary"
12-
sleep 10 # wait a bit for pod to be killed
11+
check_primary_chaos "failure" ${NAMESPACE} $(get_primary_from_group_replication)
Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 90
43
commands:
5-
- script: |-
4+
- timeout: 720
5+
script: |-
66
set -o errexit
77
set -o xtrace
88
99
source ../../functions
1010
11-
network_loss "${NAMESPACE}" "$(get_primary_from_group_replication)" "primary"
12-
sleep 30 # wait for new master to get elected
13-
timeout: 90
11+
check_primary_chaos "network" ${NAMESPACE} $(get_primary_from_group_replication)

e2e-tests/tests/gr-self-healing/12-write-data.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,3 @@ commands:
1010
run_mysql \
1111
"INSERT myDB.myTable (id) VALUES (100503)" \
1212
"-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password"
13-
sleep 10

e2e-tests/tests/gr-self-healing/13-assert.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestAssert
3-
timeout: 240
3+
timeout: 30
44
---
55
apiVersion: ps.percona.com/v1alpha1
66
kind: PerconaServerMySQL

e2e-tests/tests/gr-self-healing/13-read-from-replicas.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 120
43
commands:
54
- script: |-
65
set -o errexit
@@ -13,4 +12,3 @@ commands:
1312
data=$(run_mysql "SELECT * FROM myDB.myTable" "-h ${host} -uroot -proot_password")
1413
kubectl create configmap -n "${NAMESPACE}" 13-read-from-replicas-${i} --from-literal=data="${data}"
1514
done
16-
sleep 20
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
33
commands:
4-
- script: |-
4+
- timeout: 720
5+
script: |-
56
set -o errexit
67
set -o xtrace
78
89
source ../../functions
910
10-
kill_pods "${NAMESPACE}" "label" "app.kubernetes.io/instance" "gr-self-healing" "cluster-crash"
11-
sleep 30 # wait for crash
12-
timeout: 100
11+
check_primary_chaos "full-cluster-crash" ${NAMESPACE} $(get_primary_from_group_replication)

e2e-tests/tests/gr-self-healing/17-quorum-loss.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 480
3+
timeout: 30
44
commands:
55
- script: |-
66
set -o errexit

0 commit comments

Comments
 (0)