Skip to content

Commit 81356af

Browse files
authored
K8SPS-455: Improve self-healing tests (#941)
* K8SPS-455: Improve gr-self-healing * fix operator-self-healing
1 parent d61c487 commit 81356af

14 files changed

+183
-55
lines changed

e2e-tests/conf/client.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ metadata:
55
labels:
66
name: mysql-client
77
spec:
8+
terminationGracePeriodSeconds: 10
89
containers:
910
- name: mysql-client
1011
image: percona/percona-server:8.0.33

e2e-tests/functions

Lines changed: 157 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,8 @@ deploy_tls_cluster_secrets() {
100100
}
101101

102102
deploy_client() {
	# Deploy the mysql-client helper pod, overriding its container image
	# with the one under test (${IMAGE_MYSQL}).
	yq eval '.spec.containers[0].image="'"${IMAGE_MYSQL}"'"' "${TESTS_CONFIG_DIR}/client.yaml" \
		| kubectl -n "${NAMESPACE}" apply -f -
}
105106

106107
apply_s3_storage_secrets() {
@@ -385,10 +386,29 @@ run_mysqlsh() {
385386
wait_pod $client_pod 1>&2
386387

387388
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- \
388-
bash -c "printf '%s\n' \"${command}\" | mysqlsh --sql --quiet-start=2 $uri" 2>&1 \
389+
bash -c "printf '%s\n' \"${command}\" | mysqlsh --sql --quiet-start=2 $uri" 2>/dev/null \
389390
| tail -n +2
390391
}
391392

393+
get_innodb_cluster_status() {
	# Print the InnoDB cluster status report for the given mysqlsh URI,
	# running `cluster status` through the JS AdminAPI in the client pod.
	local uri="$1"

	client_pod=$(get_client_pod)
	wait_pod $client_pod 1>&2

	kubectl -n "${NAMESPACE}" exec "${client_pod}" -- \
		mysqlsh --js --quiet-start=2 --uri ${uri} -- cluster status
}
401+
402+
wait_until_innodb_ok() {
	# Block until the InnoDB cluster reachable at the given mysqlsh URI
	# reports .defaultReplicaSet.status == "OK".
	#
	# Bug fix: `retry` was incremented but never checked, so a cluster that
	# never recovered made this loop forever. Bound it at 60 attempts
	# (~300s, matching the wait_until_chaos_* helpers) and fail the test.
	local uri="$1"

	local retry=0
	until [[ $(get_innodb_cluster_status ${uri} | jq -r .defaultReplicaSet.status) == "OK" ]]; do
		if [[ ${retry} -ge 60 ]]; then
			echo "Timeout (300s) exceeded while waiting for InnoDB cluster status to be OK"
			exit 1
		fi
		sleep 5
		retry=$((retry + 1))
	done
}
411+
392412
run_curl() {
393413
kubectl -n "${NAMESPACE}" exec mysql-client -- bash -c "curl -s -k $*"
394414
}
@@ -397,6 +417,13 @@ get_innodb_cluster_name() {
397417
echo $(get_cluster_name) | tr -cd '[^a-zA-Z0-9_]+'
398418
}
399419

420+
get_mysqlsh_uri_for_pod() {
	# Build a mysqlsh connection URI addressing one specific mysql pod
	# through the cluster's headless service DNS name.
	local pod="$1"

	printf '%s\n' "root:root_password@${pod}.$(get_cluster_name)-mysql.${NAMESPACE}"
}
426+
400427
get_mysqlsh_uri() {
401428
local idx=${1:-0}
402429

@@ -409,7 +436,7 @@ get_gr_status() {
409436

410437
client_pod=$(get_client_pod)
411438

412-
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- mysqlsh --uri $uri --cluster --result-format json -- cluster status \
439+
kubectl -n "${NAMESPACE}" exec "${pod:-mysql-client}" -- mysqlsh --js --uri $uri --cluster --result-format json -- cluster status \
413440
| sed -e 's/mysql: //' \
414441
| (grep -v 'Using a password on the command line interface can be insecure.' || :)
415442
}
@@ -525,7 +552,7 @@ get_router_pods() {
525552
get_mysql_users() {
526553
local args=$1
527554

528-
run_mysql "SELECT user FROM mysql.user" "${args}" | grep -vE "mysql|root"
555+
run_mysql "SELECT user FROM mysql.user" "${args}" | grep -vE "mysql|root|percona.telemetry"
529556
}
530557

531558
get_service_ip() {
@@ -780,19 +807,14 @@ deploy_chaos_mesh() {
780807

781808
helm repo add chaos-mesh https://charts.chaos-mesh.org
782809
if [ -n "${MINIKUBE}" ]; then
783-
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=docker --set dashboard.create=false --version ${CHAOS_MESH_VER} --wait
810+
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=docker --set dashboard.create=false --version ${CHAOS_MESH_VER}
784811
else
785812
helm install chaos-mesh chaos-mesh/chaos-mesh --namespace=${NAMESPACE} --set chaosDaemon.runtime=containerd --set chaosDaemon.socketPath=/run/containerd/containerd.sock --set dashboard.create=false --version ${CHAOS_MESH_VER}
786813
fi
787814
if [[ -n $OPENSHIFT ]]; then
788815
oc adm policy add-scc-to-user privileged -z chaos-daemon --namespace=${NAMESPACE}
789816
fi
790-
791-
echo "Waiting for chaos-mesh DaemonSet to be ready..."
792-
until [ "$(kubectl get daemonset chaos-daemon -n ${NAMESPACE} -o jsonpath='{.status.numberReady}')" = "$(kubectl get daemonset chaos-daemon -n ${NAMESPACE} -o jsonpath='{.status.desiredNumberScheduled}')" ]; do
793-
echo "Waiting for DaemonSet chaos-daemon..."
794-
sleep 5
795-
done
817+
sleep 10
796818
}
797819

798820
destroy_chaos_mesh() {
@@ -824,17 +846,17 @@ kill_pods() {
824846
local selector=$2
825847
local pod_label=$3
826848
local label_value=$4
827-
local chaos_suffix=$5
849+
local chaos_name=$5
828850

829851
if [ "${selector}" == "pod" ]; then
830852
yq eval '
831-
.metadata.name = "chaos-pod-kill-'${chaos_suffix}'" |
853+
.metadata.name = "'${chaos_name}'" |
832854
del(.spec.selector.pods.test-namespace) |
833855
.spec.selector.pods.'${ns}'[0] = "'${pod_label}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
834856
| kubectl apply --namespace ${ns} -f -
835857
elif [ "${selector}" == "label" ]; then
836858
yq eval '
837-
.metadata.name = "chaos-kill-label-'${chaos_suffix}'" |
859+
.metadata.name = "'${chaos_name}'" |
838860
.spec.mode = "all" |
839861
del(.spec.selector.pods) |
840862
.spec.selector.labelSelectors."'${pod_label}'" = "'${label_value}'"' ${TESTS_CONFIG_DIR}/chaos-pod-kill.yml \
@@ -846,10 +868,10 @@ kill_pods() {
846868
failure_pod() {
847869
local ns=$1
848870
local pod=$2
849-
local chaos_suffix=$3
871+
local chaos_name=$3
850872

851873
yq eval '
852-
.metadata.name = "chaos-pod-failure-'${chaos_suffix}'" |
874+
.metadata.name = "'${chaos_name}'" |
853875
del(.spec.selector.pods.test-namespace) |
854876
.spec.selector.pods.'${ns}'[0] = "'${pod}'"' ${TESTS_CONFIG_DIR}/chaos-pod-failure.yml \
855877
| kubectl apply --namespace ${ns} -f -
@@ -859,16 +881,133 @@ failure_pod() {
859881
network_loss() {
	# Apply a chaos-mesh NetworkChaos experiment that drops traffic to a pod.
	#   $1 - namespace, $2 - target pod name, $3 - NetworkChaos resource name
	local ns="$1"
	local pod="$2"
	local chaos_name="$3"

	# Rewrite the template: set the resource name and retarget the pod
	# selector from the placeholder namespace to ours.
	local expr='
		.metadata.name = "'${chaos_name}'" |
		del(.spec.selector.pods.test-namespace) |
		.spec.selector.pods.'${ns}'[0] = "'${pod}'"'
	yq eval "${expr}" ${TESTS_CONFIG_DIR}/chaos-network-loss.yml \
		| kubectl apply --namespace ${ns} -f -
	sleep 5
}
871893

894+
wait_until_chaos_applied() {
	# Wait until the chaos-mesh experiment records a successful "Apply" event,
	# polling every 10s for up to 30 tries (300s).
	#   $1 - chaos type: kill | failure | full-cluster-crash | network
	#   $2 - name of the chaos resource
	local chaos_type=$1
	local chaos_name=$2

	local resource
	case ${chaos_type} in
		"kill" | "failure" | "full-cluster-crash")
			resource=podchaos/${chaos_name}
			;;
		"network")
			resource=networkchaos/${chaos_name}
			;;
		*)
			# Fail fast: without this, an unknown type left ${resource} empty
			# and kubectl was polled with a bogus resource for 300s.
			echo "unknown chaos type: ${chaos_type}"
			exit 1
			;;
	esac

	local retry=0
	local succeeded
	until [[ ${retry} == 30 ]]; do
		sleep 10
		retry=$((retry + 1))

		succeeded=$(kubectl -n ${NAMESPACE} get ${resource} -o yaml \
			| yq '.status.experiment.containerRecords[].events[]
				| select(.operation == "Apply" and .type == "Succeeded")')

		if [[ -n ${succeeded} ]]; then
			return
		fi
	done

	echo "Timeout (300s) exceeded while waiting for chaos to be applied"
	exit 1
}
925+
926+
wait_until_chaos_recovered() {
	# Wait until the chaos-mesh experiment records a successful "Recover" event,
	# polling every 10s for up to 30 tries (300s). Only "failure" and "network"
	# experiments have a duration and therefore a recovery phase.
	#   $1 - chaos type: kill | failure | network
	#   $2 - name of the chaos resource
	local chaos_type=$1
	local chaos_name=$2

	local resource
	case ${chaos_type} in
		"kill" | "failure")
			resource=podchaos/${chaos_name}
			;;
		"network")
			resource=networkchaos/${chaos_name}
			;;
		*)
			# Fail fast: without this, an unknown type left ${resource} empty
			# and kubectl was polled with a bogus resource for 300s.
			echo "unknown chaos type: ${chaos_type}"
			exit 1
			;;
	esac

	local retry=0
	local succeeded
	until [[ ${retry} == 30 ]]; do
		sleep 10
		retry=$((retry + 1))

		succeeded=$(kubectl -n ${NAMESPACE} get ${resource} -o yaml \
			| yq '.status.experiment.containerRecords[].events[]
				| select(.operation == "Recover" and .type == "Succeeded")')

		if [[ -n ${succeeded} ]]; then
			return
		fi
	done

	echo "Timeout (300s) exceeded while waiting for chaos to be recovered"
	exit 1
}
957+
958+
check_primary_chaos() {
	# Run a chaos experiment against the current primary and verify the cluster
	# self-heals: the chaos is applied (and recovered, where applicable), a new
	# primary gets elected, and all 3 members return to ONLINE.
	#   $1 - chaos type: kill | full-cluster-crash | failure | network
	#   $2 - namespace
	#   $3 - primary pod name captured before the chaos is applied
	local chaos_type=$1
	local ns=$2
	local primary_before_failure=$3

	local chaos_name
	case ${chaos_type} in
		"kill")
			chaos_name="chaos-pod-kill-primary"
			kill_pods "${ns}" "pod" "${primary_before_failure}" "" "${chaos_name}"
			;;
		"full-cluster-crash")
			chaos_name="chaos-kill-label-cluster-crash"
			# Target every pod of the cluster through its instance label.
			# Derived from the CR name instead of the previously hard-coded
			# "gr-self-healing" so other test suites can reuse this helper.
			kill_pods "${ns}" "label" "app.kubernetes.io/instance" "$(get_cluster_name)" "${chaos_name}"
			;;
		"failure")
			chaos_name="chaos-pod-failure-primary"
			failure_pod "${ns}" "${primary_before_failure}" "${chaos_name}"
			;;
		"network")
			chaos_name="chaos-pod-network-loss-primary"
			network_loss "${ns}" "${primary_before_failure}" "${chaos_name}"
			;;
	esac

	wait_until_chaos_applied ${chaos_type} ${chaos_name}
	# "failure" and "network" experiments have a duration; wait for chaos-mesh
	# to restore the pod before checking cluster state
	if [[ ${chaos_type} == "failure" || ${chaos_type} == "network" ]]; then
		wait_until_chaos_recovered ${chaos_type} ${chaos_name}
	fi

	wait_cluster_consistency_gr "$(get_cluster_name)" 3 3

	local primary_after_failure
	local uri
	primary_after_failure=$(get_primary_from_group_replication)
	uri=$(get_mysqlsh_uri_for_pod ${primary_after_failure})
	wait_until_innodb_ok ${uri}

	# a healthy cluster must have failed over to a different primary
	if [[ "${primary_before_failure}" == "${primary_after_failure}" ]]; then
		echo "primary pod was not killed! something went wrong."
		exit 1
	fi

	# count members the (re-fetched) primary reports as ONLINE
	local online_members
	uri=$(get_mysqlsh_uri_for_pod $(get_primary_from_group_replication))
	online_members=$(get_innodb_cluster_status ${uri} \
		| jq .defaultReplicaSet.topology[].status \
		| grep ONLINE \
		| wc -l)

	if [[ ${online_members} != 3 ]]; then
		echo "expected 3 online members, got ${online_members}"
		exit 1
	fi
}
1010+
8721011
renew_certificate() {
8731012
certificate="$1"
8741013

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,12 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 30
3+
44
commands:
5-
- script: |-
5+
- timeout: 720
6+
script: |-
67
set -o errexit
78
set -o xtrace
89
910
source ../../functions
1011
11-
init_pod="$(get_primary_from_group_replication)"
12-
kill_pods "${NAMESPACE}" "pod" "$init_pod" "" "primary"
13-
sleep 20 # wait a bit for pod to be killed
14-
15-
if [ "$init_pod" == "$(get_primary_from_group_replication)" ]; then
16-
echo "primary pod was not killed! something went wrong."
17-
exit 1
18-
fi
12+
check_primary_chaos "kill" ${NAMESPACE} $(get_primary_from_group_replication)
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 30
43
commands:
5-
- script: |-
4+
- timeout: 720
5+
script: |-
66
set -o errexit
77
set -o xtrace
88
99
source ../../functions
1010
11-
failure_pod "${NAMESPACE}" "$(get_primary_from_group_replication)" "primary"
12-
sleep 10 # wait a bit for pod to be killed
11+
check_primary_chaos "failure" ${NAMESPACE} $(get_primary_from_group_replication)
Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 90
43
commands:
5-
- script: |-
4+
- timeout: 720
5+
script: |-
66
set -o errexit
77
set -o xtrace
88
99
source ../../functions
1010
11-
network_loss "${NAMESPACE}" "$(get_primary_from_group_replication)" "primary"
12-
sleep 30 # wait for new master to get elected
13-
timeout: 90
11+
check_primary_chaos "network" ${NAMESPACE} $(get_primary_from_group_replication)

e2e-tests/tests/gr-self-healing/12-write-data.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,3 @@ commands:
1010
run_mysql \
1111
"INSERT myDB.myTable (id) VALUES (100503)" \
1212
"-h $(get_mysql_router_service $(get_cluster_name)) -P 6446 -uroot -proot_password"
13-
sleep 10

e2e-tests/tests/gr-self-healing/13-assert.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestAssert
3-
timeout: 240
3+
timeout: 30
44
---
55
apiVersion: ps.percona.com/v1alpha1
66
kind: PerconaServerMySQL

e2e-tests/tests/gr-self-healing/13-read-from-replicas.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 120
43
commands:
54
- script: |-
65
set -o errexit
@@ -13,4 +12,3 @@ commands:
1312
data=$(run_mysql "SELECT * FROM myDB.myTable" "-h ${host} -uroot -proot_password")
1413
kubectl create configmap -n "${NAMESPACE}" 13-read-from-replicas-${i} --from-literal=data="${data}"
1514
done
16-
sleep 20
Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
33
commands:
4-
- script: |-
4+
- timeout: 720
5+
script: |-
56
set -o errexit
67
set -o xtrace
78
89
source ../../functions
910
10-
kill_pods "${NAMESPACE}" "label" "app.kubernetes.io/instance" "gr-self-healing" "cluster-crash"
11-
sleep 30 # wait for crash
12-
timeout: 100
11+
check_primary_chaos "full-cluster-crash" ${NAMESPACE} $(get_primary_from_group_replication)

e2e-tests/tests/gr-self-healing/17-quorum-loss.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
apiVersion: kuttl.dev/v1beta1
22
kind: TestStep
3-
timeout: 480
3+
timeout: 30
44
commands:
55
- script: |-
66
set -o errexit

0 commit comments

Comments
 (0)