From 5dd076054d9e90e3e2d4e354221d1b2087618b50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?=
Date: Tue, 3 Jun 2025 18:11:34 +0300
Subject: [PATCH 1/2] K8SPS-357: Improve full cluster crash recovery

Our full cluster crash recovery procedure requires at least 1 restart of
the primary and 3 restarts of the secondaries:

1. Cluster started after crash
2. Pods are started
3. Full cluster crash detected (1st restart)
4. Operator reboots the cluster
5. Secondary pods are restarted to join the cluster (2nd restart)
6. Secondary pods receive data with Clone (3rd restart)

Even though these restarts are by design, they give the impression that
something is wrong with the cluster. These changes attempt to reduce the
number of restarts to 1. After a successful crash recovery, the operator
deletes all secondary pods so they can rejoin the cluster. The only
remaining restart is the 3rd one, required after the clone.

Secondary pods are deleted on a **best effort** basis: if they can't be
deleted, the operator won't do anything. In that case, secondary pods
should be ready to serve traffic after 3-4 restarts.

---

To recover a cluster from a full cluster crash, we use
`dba.rebootClusterFromCompleteOutage` in mysql-shell. This command
connects to each MySQL pod to find the node with the latest
transactions and reboots the cluster from it. This means mysqld needs
to be up and running during crash recovery.

After these changes, pods will be marked ready only if the MySQL state
recorded in `$MYSQL_STATE_FILE` is ready.

---

This commit also introduces more events in PerconaServerMySQL:

```
Events:
  Type     Reason                     Age                     From           Message
  ----     ------                     ----                    ----           -------
  Warning  ClusterStateChanged        6m33s                   ps-controller  -> Initializing
  Warning  ClusterStateChanged        5m10s                   ps-controller  Initializing -> Error
  Warning  FullClusterCrashDetected   3m32s (x23 over 5m10s)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  2m40s                   ps-controller  Cluster recovered from full cluster crash
  Warning  ClusterStateChanged        2s                      ps-controller  Initializing -> Ready
```
---
 build/Dockerfile                    |  2 +-
 cmd/healthcheck/bypass.go           | 25 ++++++++++++++++++
 cmd/healthcheck/main.go             | 37 +++++++++++---------------
 cmd/healthcheck/state.go            | 22 ++++++++++++++++
 pkg/controller/ps/controller.go     | 19 +++++++++++++
 pkg/controller/ps/crash_recovery.go | 41 ++++++++++++++++++++++++++---
 pkg/controller/ps/status.go         | 19 ++++++-------
 7 files changed, 127 insertions(+), 38 deletions(-)
 create mode 100644 cmd/healthcheck/bypass.go
 create mode 100644 cmd/healthcheck/state.go

diff --git a/build/Dockerfile b/build/Dockerfile
index 68fd6e869..f8d9019b5 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -28,7 +28,7 @@ RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAG
 RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
     go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
     -o build/_output/bin/healthcheck \
-    cmd/healthcheck/main.go \
+    ./cmd/healthcheck/ \
     && cp -r build/_output/bin/healthcheck /usr/local/bin/healthcheck
 RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
     go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
diff --git a/cmd/healthcheck/bypass.go b/cmd/healthcheck/bypass.go
new file mode 100644
index 000000000..25a173aae
--- /dev/null
+++ b/cmd/healthcheck/bypass.go
@@ -0,0 +1,25 @@
+package main
+
+import (
+	"log"
+)
+
+func isFullClusterCrash() bool {
+	fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
+	if err != nil {
+		log.Printf("check /var/lib/mysql/full-cluster-crash: %s", err)
+		return false
+	}
+
+	return fullClusterCrash
+}
+
+func isManualRecovery() bool {
+	manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
+	if err != nil {
+		log.Printf("check /var/lib/mysql/sleep-forever: %s", err)
+		return false
+	}
+
+	return manualRecovery
+}
diff --git a/cmd/healthcheck/main.go b/cmd/healthcheck/main.go
index 66d3e3416..4c781d094 100644
--- a/cmd/healthcheck/main.go
+++ b/cmd/healthcheck/main.go
@@ -18,33 +18,16 @@ import (
 	mysqldb "github.com/percona/percona-server-mysql-operator/pkg/db"
 	"github.com/percona/percona-server-mysql-operator/pkg/k8s"
 	"github.com/percona/percona-server-mysql-operator/pkg/mysql"
-	"github.com/percona/percona-server-mysql-operator/pkg/naming"
 )
 
 func main() {
-	fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
-	if err != nil {
-		log.Fatalf("check /var/lib/mysql/full-cluster-crash: %s", err)
-	}
-	if fullClusterCrash {
-		os.Exit(0)
-	}
-
-	manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
-	if err != nil {
-		log.Fatalf("check /var/lib/mysql/sleep-forever: %s", err)
-	}
-	if manualRecovery {
+	if isManualRecovery() {
 		os.Exit(0)
 	}
 
-	stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
-	if !ok {
-		log.Fatalln("MYSQL_STATE_FILE env variable is required")
-	}
-	mysqlState, err := os.ReadFile(stateFilePath)
+	mysqlState, err := getMySQLState()
 	if err != nil {
-		log.Fatalf("read mysql state: %s", err)
+		log.Fatalf("failed to get MySQL state: %v", err)
 	}
 
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
@@ -52,10 +35,16 @@
 	switch os.Args[1] {
 	case "readiness":
-		if string(mysqlState) != string(state.MySQLReady) {
+		if mysqlState != string(state.MySQLReady) {
 			log.Println("MySQL state is not ready...")
 			os.Exit(1)
 		}
+
+		// mysqld must be up and running during crash recovery
+		if isFullClusterCrash() {
+			os.Exit(0)
+		}
+
 		switch os.Getenv("CLUSTER_TYPE") {
 		case "async":
 			if err := checkReadinessAsync(ctx); err != nil {
@@ -67,7 +56,11 @@
 			}
 		}
 	case "liveness":
-		if string(mysqlState) == string(state.MySQLStartup) {
+		if isFullClusterCrash() {
+			os.Exit(0)
+		}
+
+		if mysqlState == string(state.MySQLStartup) {
 			log.Println("MySQL is starting up, not killing it...")
 			os.Exit(0)
 		}
diff --git a/cmd/healthcheck/state.go b/cmd/healthcheck/state.go
new file mode 100644
index 000000000..c36eb3119
--- /dev/null
+++ b/cmd/healthcheck/state.go
@@ -0,0 +1,22 @@
+package main
+
+import (
+	"os"
+
+	"github.com/percona/percona-server-mysql-operator/pkg/naming"
+	"github.com/pkg/errors"
+)
+
+func getMySQLState() (string, error) {
+	stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
+	if !ok {
+		return "", errors.New("MYSQL_STATE_FILE env variable is required")
+	}
+
+	mysqlState, err := os.ReadFile(stateFilePath)
+	if err != nil {
+		return "", errors.Wrap(err, "read mysql state")
+	}
+
+	return string(mysqlState), nil
+}
diff --git a/pkg/controller/ps/controller.go b/pkg/controller/ps/controller.go
index d792cf4a5..a31697e5a 100644
--- a/pkg/controller/ps/controller.go
+++ b/pkg/controller/ps/controller.go
@@ -1309,6 +1309,25 @@ func (r *PerconaServerMySQLReconciler) getPrimaryHost(ctx context.Context, cr *a
 	return primary.Key.Hostname, nil
 }
 
+func (r *PerconaServerMySQLReconciler) getPrimaryPod(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL) (*corev1.Pod, error) {
+	primaryHost, err := r.getPrimaryHost(ctx, cr)
+	if err != nil {
+		return nil, errors.Wrap(err, "get primary host")
+	}
+
+	idx, err := getPodIndexFromHostname(primaryHost)
+	if err != nil {
+		return nil, errors.Wrapf(err, "get pod index from %s", primaryHost)
+	}
+
+	primPod, err := getMySQLPod(ctx, r.Client, cr, idx)
+	if err != nil {
+		return nil, errors.Wrapf(err, "get primary pod by index %d", idx)
+	}
+
+	return primPod, nil
+}
+
 func (r *PerconaServerMySQLReconciler) stopAsyncReplication(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL, primary *orchestrator.Instance) error {
 	log := logf.FromContext(ctx).WithName("stopAsyncReplication")
 
diff --git a/pkg/controller/ps/crash_recovery.go b/pkg/controller/ps/crash_recovery.go
index 90cb07643..c6d6cdcf3 100644
--- a/pkg/controller/ps/crash_recovery.go
+++ b/pkg/controller/ps/crash_recovery.go
@@ -64,6 +64,10 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con
 			continue
 		}
 
+		if !k8s.IsPodReady(pod) {
+			continue
+		}
+
 		operatorPass, err := k8s.UserPassword(ctx, r.Client, cr, apiv1alpha1.UserOperator)
 		if err != nil {
 			return errors.Wrap(err, "get operator password")
@@ -79,6 +83,7 @@
 		status, err := mysh.ClusterStatusWithExec(ctx, cr.InnoDBClusterName())
 		if err == nil && status.DefaultReplicaSet.Status == innodbcluster.ClusterStatusOK {
+			log.Info("Cluster is healthy", "pod", pod.Name, "host", podFQDN)
 			err := r.cleanupFullClusterCrashFile(ctx, cr)
 			if err != nil {
 				log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
@@ -86,20 +91,48 @@
 			continue
 		}
 
-		log.Info("Attempting to reboot cluster from complete outage")
+		log.Info("Attempting to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
 		err = mysh.RebootClusterFromCompleteOutageWithExec(ctx, cr.InnoDBClusterName())
 		if err == nil {
-			log.Info("Cluster was successfully rebooted")
+			log.Info("Cluster was successfully rebooted", "pod", pod.Name, "host", podFQDN)
 			r.Recorder.Event(cr, "Normal", "FullClusterCrashRecovered", "Cluster recovered from full cluster crash")
 			err := r.cleanupFullClusterCrashFile(ctx, cr)
 			if err != nil {
 				log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
+				break
 			}
+
+			primary, err := r.getPrimaryPod(ctx, cr)
+			if err != nil {
+				log.Error(err, "failed to get primary pod")
+				break
+			}
+
+			log.Info(fmt.Sprintf("Primary pod is %s", primary.Name))
+
+			pods, err := k8s.PodsByLabels(ctx, r.Client, mysql.MatchLabels(cr), cr.Namespace)
+			if err != nil {
+				log.Error(err, "failed to get mysql pods")
+				break
+			}
+
+			for _, pod := range pods {
+				if pod.Name == primary.Name {
+					continue
+				}
+
+				log.Info("Deleting secondary pod", "pod", pod.Name)
+				if err := r.Client.Delete(ctx, &pod); err != nil {
+					log.Error(err, "failed to delete pod", "pod", pod.Name)
+				}
+			}
+
 			break
 		}
 
+		// TODO: This needs to be reconsidered.
 		if strings.Contains(err.Error(), "The Cluster is ONLINE") {
-			log.Info("Tried to reboot the cluster but MySQL says the cluster is already online")
+			log.Info("Tried to reboot the cluster but MySQL says the cluster is already online", "pod", pod.Name, "host", podFQDN)
 			log.Info("Deleting all MySQL pods")
 			err := r.Client.DeleteAllOf(ctx, &corev1.Pod{}, &client.DeleteAllOfOptions{
 				ListOptions: client.ListOptions{
@@ -113,7 +146,7 @@
 			break
 		}
 
-		log.Error(err, "failed to reboot cluster from complete outage")
+		log.Error(err, "failed to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
 	}
 
 	return nil
diff --git a/pkg/controller/ps/status.go b/pkg/controller/ps/status.go
index df1599a73..04c67b462 100644
--- a/pkg/controller/ps/status.go
+++ b/pkg/controller/ps/status.go
@@ -33,6 +33,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr
 		return nil
 	}
 
+	initialState := cr.Status.State
+
 	clusterCondition := metav1.Condition{
 		Status: metav1.ConditionTrue,
 		Type:   apiv1alpha1.StateInitializing.String(),
@@ -48,6 +50,8 @@
 		meta.SetStatusCondition(&cr.Status.Conditions, clusterCondition)
 
 		cr.Status.State = apiv1alpha1.StateError
+
+		r.Recorder.Event(cr, "Warning", "ReconcileError", "Failed to reconcile cluster")
 	}
 
 	nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
@@ -207,17 +211,10 @@
 		}
 	}
 
-	log.V(1).Info(
-		"Writing CR status",
-		"mysql", cr.Status.MySQL,
-		"orchestrator", cr.Status.Orchestrator,
-		"router", cr.Status.Router,
-		"haproxy", cr.Status.HAProxy,
-		"host", cr.Status.Host,
-		"loadbalancers", loadBalancersReady,
-		"conditions", cr.Status.Conditions,
-		"state", cr.Status.State,
-	)
+	if cr.Status.State != initialState {
+		log.Info("Cluster state changed", "previous", initialState, "current", cr.Status.State)
+		r.Recorder.Event(cr, "Warning", "ClusterStateChanged", fmt.Sprintf("%s -> %s", initialState, cr.Status.State))
+	}
 
 	nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
 	return writeStatus(ctx, r.Client, nn, cr.Status)

From c8d26aafd05b0b05c141e2e00b57180bd8f1a3c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?=
Date: Wed, 4 Jun 2025 13:21:44 +0300
Subject: [PATCH 2/2] fix unit tests

---
 pkg/controller/ps/status_test.go | 6 ++++--
 pkg/controller/ps/suite_test.go  | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/controller/ps/status_test.go b/pkg/controller/ps/status_test.go
index 0869e450e..03c84d7ca 100644
--- a/pkg/controller/ps/status_test.go
+++ b/pkg/controller/ps/status_test.go
@@ -380,6 +380,7 @@ func TestReconcileStatusAsync(t *testing.T) {
 		Client:        cb.Build(),
 		Scheme:        scheme,
 		ClientCmd:     cliCmd,
+		Recorder:      new(record.FakeRecorder),
 		ServerVersion: &platform.ServerVersion{
 			Platform: platform.PlatformKubernetes,
 		},
@@ -963,8 +964,9 @@ func TestReconcileErrorStatus(t *testing.T) {
 		WithStatusSubresource(objects...)
 
r := &PerconaServerMySQLReconciler{ - Client: cb.Build(), - Scheme: scheme, + Client: cb.Build(), + Scheme: scheme, + Recorder: new(record.FakeRecorder), ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, diff --git a/pkg/controller/ps/suite_test.go b/pkg/controller/ps/suite_test.go index 5eba1dc18..f5747ba0c 100644 --- a/pkg/controller/ps/suite_test.go +++ b/pkg/controller/ps/suite_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -103,7 +104,8 @@ func reconciler() *PerconaServerMySQLReconciler { ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, - Crons: NewCronRegistry(), + Crons: NewCronRegistry(), + Recorder: new(record.FakeRecorder), } }
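
Note on the test wiring above, not part of the patches: `new(record.FakeRecorder)` leaves the recorder's `Events` channel nil, so client-go drops events silently, which is enough for these tests. If a test needs to assert the new `ClusterStateChanged` or `FullClusterCrashRecovered` events, a buffered recorder exposes them on a channel. A minimal sketch, with a hypothetical test name and a direct `Event` call standing in for a real reconcile:

```go
package ps

import (
	"strings"
	"testing"

	"k8s.io/client-go/tools/record"
)

// TestClusterStateChangedEvent is a hypothetical test, not part of this patch.
func TestClusterStateChangedEvent(t *testing.T) {
	// NewFakeRecorder buffers events in the Events channel;
	// new(record.FakeRecorder) would leave the channel nil and drop them.
	recorder := record.NewFakeRecorder(16)

	// Stand-in for a reconcile that observes a state transition; in a real
	// test, recorder would be set as PerconaServerMySQLReconciler.Recorder.
	recorder.Event(nil, "Warning", "ClusterStateChanged", "Initializing -> Ready")

	select {
	case e := <-recorder.Events:
		// FakeRecorder formats each event as "<type> <reason> <message>".
		if !strings.Contains(e, "ClusterStateChanged") {
			t.Fatalf("unexpected event: %q", e)
		}
	default:
		t.Fatal("expected a ClusterStateChanged event")
	}
}
```

The sketch uses `record.NewFakeRecorder(16)` rather than `new(record.FakeRecorder)` precisely because the buffered `Events` channel is what makes the assertion possible.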