From 5dd076054d9e90e3e2d4e354221d1b2087618b50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?=
Date: Tue, 3 Jun 2025 18:11:34 +0300
Subject: [PATCH 1/2] K8SPS-357: Improve full cluster crash recovery

Our full cluster crash recovery procedure requires at least 1 restart of
the primary and 3 restarts of the secondaries:

1. Cluster started after crash
2. Pods are started
3. Full cluster crash detected (1st restart)
4. Operator reboots the cluster
5. Secondary pods are restarted to join the cluster (2nd restart)
6. Secondary pods receive data with Clone (3rd restart)

Even though these restarts are by design, they give the impression that
something is wrong with the cluster. These changes attempt to reduce the
number of restarts to 1. After a successful crash recovery, the operator
deletes all secondary pods so they can rejoin the cluster. The only
remaining restart is the 3rd one, required after the clone.

Secondary pods are deleted on a **best effort** basis: if they can't be
deleted, the operator won't do anything. In that case, secondary pods
should be ready to serve traffic after 3-4 restarts.

---

To recover a cluster from a full cluster crash, we use
`dba.rebootClusterFromCompleteOutage` in mysql-shell. This command
connects to each MySQL pod to find the node with the latest
transactions and reboots the cluster from it. This means mysqld needs
to be up and running during crash recovery.

After these changes, pods will be marked ready only if the MySQL state
recorded in `$MYSQL_STATE_FILE` is ready.

---

This commit also introduces more events in PerconaServerMySQL:

```
Events:
  Type     Reason                     Age                     From           Message
  ----     ------                     ----                    ----           -------
  Warning  ClusterStateChanged        6m33s                   ps-controller  -> Initializing
  Warning  ClusterStateChanged        5m10s                   ps-controller  Initializing -> Error
  Warning  FullClusterCrashDetected   3m32s (x23 over 5m10s)  ps-controller  Full cluster crash detected
  Normal   FullClusterCrashRecovered  2m40s                   ps-controller  Cluster recovered from full cluster crash
  Warning  ClusterStateChanged        2s                      ps-controller  Initializing -> Ready
```
---
 build/Dockerfile                    |  2 +-
 cmd/healthcheck/bypass.go           | 25 ++++++++++++++++++
 cmd/healthcheck/main.go             | 37 +++++++++++---------------
 cmd/healthcheck/state.go            | 22 ++++++++++++++++
 pkg/controller/ps/controller.go     | 19 +++++++++++++
 pkg/controller/ps/crash_recovery.go | 41 ++++++++++++++++++++++++++---
 pkg/controller/ps/status.go         | 19 ++++++-------
 7 files changed, 127 insertions(+), 38 deletions(-)
 create mode 100644 cmd/healthcheck/bypass.go
 create mode 100644 cmd/healthcheck/state.go

diff --git a/build/Dockerfile b/build/Dockerfile
index 68fd6e869..f8d9019b5 100644
--- a/build/Dockerfile
+++ b/build/Dockerfile
@@ -28,7 +28,7 @@ RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAG
 RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
     go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
     -o build/_output/bin/healthcheck \
-    cmd/healthcheck/main.go \
+    ./cmd/healthcheck/ \
     && cp -r build/_output/bin/healthcheck /usr/local/bin/healthcheck
 RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
     go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
diff --git a/cmd/healthcheck/bypass.go b/cmd/healthcheck/bypass.go
new file mode 100644
index 000000000..25a173aae
--- /dev/null
+++ b/cmd/healthcheck/bypass.go
@@ -0,0 +1,25 @@
+package main
+
+import (
+	"log"
+)
+
+func isFullClusterCrash() bool {
+	fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
+	if err != nil {
+		log.Printf("check /var/lib/mysql/full-cluster-crash: %s", err)
+		return false
+	}
+
+	return fullClusterCrash
+}
+
+func isManualRecovery() bool {
+	manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
+	if err != nil {
+		log.Printf("check /var/lib/mysql/sleep-forever: %s", err)
+		return false
+	}
+
+	return manualRecovery
+}
diff --git a/cmd/healthcheck/main.go b/cmd/healthcheck/main.go
index 66d3e3416..4c781d094 100644
--- a/cmd/healthcheck/main.go
+++ b/cmd/healthcheck/main.go
@@ -18,33 +18,16 @@ import (
 	mysqldb "github.com/percona/percona-server-mysql-operator/pkg/db"
 	"github.com/percona/percona-server-mysql-operator/pkg/k8s"
 	"github.com/percona/percona-server-mysql-operator/pkg/mysql"
-	"github.com/percona/percona-server-mysql-operator/pkg/naming"
 )
 
 func main() {
-	fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
-	if err != nil {
-		log.Fatalf("check /var/lib/mysql/full-cluster-crash: %s", err)
-	}
-	if fullClusterCrash {
-		os.Exit(0)
-	}
-
-	manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
-	if err != nil {
-		log.Fatalf("check /var/lib/mysql/sleep-forever: %s", err)
-	}
-	if manualRecovery {
+	if isManualRecovery() {
 		os.Exit(0)
 	}
 
-	stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
-	if !ok {
-		log.Fatalln("MYSQL_STATE_FILE env variable is required")
-	}
-	mysqlState, err := os.ReadFile(stateFilePath)
+	mysqlState, err := getMySQLState()
 	if err != nil {
-		log.Fatalf("read mysql state: %s", err)
+		log.Fatalf("failed to get MySQL state: %v", err)
 	}
 
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
@@ -52,10 +35,16 @@
 	switch os.Args[1] {
 	case "readiness":
-		if string(mysqlState) != string(state.MySQLReady) {
+		if mysqlState != string(state.MySQLReady) {
 			log.Println("MySQL state is not ready...")
 			os.Exit(1)
 		}
+
+		// mysqld must be up and running during crash recovery
+		if isFullClusterCrash() {
+			os.Exit(0)
+		}
+
 		switch os.Getenv("CLUSTER_TYPE") {
 		case "async":
 			if err := checkReadinessAsync(ctx); err != nil {
@@ -67,7 +56,11 @@
 			}
 		}
 	case "liveness":
-		if string(mysqlState) == string(state.MySQLStartup) {
+		if isFullClusterCrash() {
+			os.Exit(0)
+		}
+
+		if mysqlState == string(state.MySQLStartup) {
 			log.Println("MySQL is starting up, not killing it...")
 			os.Exit(0)
 		}
diff --git a/cmd/healthcheck/state.go b/cmd/healthcheck/state.go
new file mode 100644
index 000000000..c36eb3119
--- /dev/null
+++ b/cmd/healthcheck/state.go
@@ -0,0 +1,22 @@
+package main
+
+import (
+	"os"
+
+	"github.com/percona/percona-server-mysql-operator/pkg/naming"
+	"github.com/pkg/errors"
+)
+
+func getMySQLState() (string, error) {
+	stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
+	if !ok {
+		return "", errors.New("MYSQL_STATE_FILE env variable is required")
+	}
+
+	mysqlState, err := os.ReadFile(stateFilePath)
+	if err != nil {
+		return "", errors.Wrap(err, "read mysql state")
+	}
+
+	return string(mysqlState), nil
+}
diff --git a/pkg/controller/ps/controller.go b/pkg/controller/ps/controller.go
index d792cf4a5..a31697e5a 100644
--- a/pkg/controller/ps/controller.go
+++ b/pkg/controller/ps/controller.go
@@ -1309,6 +1309,25 @@ func (r *PerconaServerMySQLReconciler) getPrimaryHost(ctx context.Context, cr *a
 	return primary.Key.Hostname, nil
 }
 
+func (r *PerconaServerMySQLReconciler) getPrimaryPod(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL) (*corev1.Pod, error) {
+	primaryHost, err := r.getPrimaryHost(ctx, cr)
+	if err != nil {
+		return nil, errors.Wrap(err, "get primary host")
+	}
+
+	idx, err := getPodIndexFromHostname(primaryHost)
+	if err != nil {
+		return nil, errors.Wrapf(err, "get pod index from %s", primaryHost)
+	}
+
+	primPod, err := getMySQLPod(ctx, r.Client, cr, idx)
+	if err != nil {
+		return nil, errors.Wrapf(err, "get primary pod by index %d", idx)
+	}
+
+	return primPod, nil
+}
+
 func (r *PerconaServerMySQLReconciler) stopAsyncReplication(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL, primary *orchestrator.Instance) error {
 	log := logf.FromContext(ctx).WithName("stopAsyncReplication")
 
diff --git a/pkg/controller/ps/crash_recovery.go b/pkg/controller/ps/crash_recovery.go
index 90cb07643..c6d6cdcf3 100644
--- a/pkg/controller/ps/crash_recovery.go
+++ b/pkg/controller/ps/crash_recovery.go
@@ -64,6 +64,10 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con
 			continue
 		}
 
+		if !k8s.IsPodReady(pod) {
+			continue
+		}
+
 		operatorPass, err := k8s.UserPassword(ctx, r.Client, cr, apiv1alpha1.UserOperator)
 		if err != nil {
 			return errors.Wrap(err, "get operator password")
@@ -79,6 +83,7 @@
 		status, err := mysh.ClusterStatusWithExec(ctx, cr.InnoDBClusterName())
 		if err == nil && status.DefaultReplicaSet.Status == innodbcluster.ClusterStatusOK {
+			log.Info("Cluster is healthy", "pod", pod.Name, "host", podFQDN)
 			err := r.cleanupFullClusterCrashFile(ctx, cr)
 			if err != nil {
 				log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
@@ -86,20 +91,48 @@
 			continue
 		}
 
-		log.Info("Attempting to reboot cluster from complete outage")
+		log.Info("Attempting to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
 		err = mysh.RebootClusterFromCompleteOutageWithExec(ctx, cr.InnoDBClusterName())
 		if err == nil {
-			log.Info("Cluster was successfully rebooted")
+			log.Info("Cluster was successfully rebooted", "pod", pod.Name, "host", podFQDN)
 			r.Recorder.Event(cr, "Normal", "FullClusterCrashRecovered", "Cluster recovered from full cluster crash")
 			err := r.cleanupFullClusterCrashFile(ctx, cr)
 			if err != nil {
 				log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
+				break
 			}
+
+			primary, err := r.getPrimaryPod(ctx, cr)
+			if err != nil {
+				log.Error(err, "failed to get primary pod")
+				break
+			}
+
+			log.Info(fmt.Sprintf("Primary pod is %s", primary.Name))
+
+			pods, err := k8s.PodsByLabels(ctx, r.Client, mysql.MatchLabels(cr), cr.Namespace)
+			if err != nil {
+				log.Error(err, "failed to get mysql pods")
+				break
+			}
+
+			for _, pod := range pods {
+				if pod.Name == primary.Name {
+					continue
+				}
+
+				log.Info("Deleting secondary pod", "pod", pod.Name)
+				if err := r.Client.Delete(ctx, &pod); err != nil {
+					log.Error(err, "failed to delete pod", "pod", pod.Name)
+				}
+			}
+
 			break
 		}
 
+		// TODO: This needs to be reconsidered.
 		if strings.Contains(err.Error(), "The Cluster is ONLINE") {
-			log.Info("Tried to reboot the cluster but MySQL says the cluster is already online")
+			log.Info("Tried to reboot the cluster but MySQL says the cluster is already online", "pod", pod.Name, "host", podFQDN)
 			log.Info("Deleting all MySQL pods")
 			err := r.Client.DeleteAllOf(ctx, &corev1.Pod{}, &client.DeleteAllOfOptions{
 				ListOptions: client.ListOptions{
@@ -113,7 +146,7 @@
 			break
 		}
 
-		log.Error(err, "failed to reboot cluster from complete outage")
+		log.Error(err, "failed to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
 	}
 
 	return nil
diff --git a/pkg/controller/ps/status.go b/pkg/controller/ps/status.go
index df1599a73..04c67b462 100644
--- a/pkg/controller/ps/status.go
+++ b/pkg/controller/ps/status.go
@@ -33,6 +33,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr
 		return nil
 	}
 
+	initialState := cr.Status.State
+
 	clusterCondition := metav1.Condition{
 		Status: metav1.ConditionTrue,
 		Type:   apiv1alpha1.StateInitializing.String(),
@@ -48,6 +50,8 @@
 		meta.SetStatusCondition(&cr.Status.Conditions, clusterCondition)
 
 		cr.Status.State = apiv1alpha1.StateError
+
+		r.Recorder.Event(cr, "Warning", "ReconcileError", "Failed to reconcile cluster")
 	}
 
 	nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
@@ -207,17 +211,10 @@
 		}
 	}
 
-	log.V(1).Info(
-		"Writing CR status",
-		"mysql", cr.Status.MySQL,
-		"orchestrator", cr.Status.Orchestrator,
-		"router", cr.Status.Router,
-		"haproxy", cr.Status.HAProxy,
-		"host", cr.Status.Host,
-		"loadbalancers", loadBalancersReady,
-		"conditions", cr.Status.Conditions,
-		"state", cr.Status.State,
-	)
+	if cr.Status.State != initialState {
+		log.Info("Cluster state changed", "previous", initialState, "current", cr.Status.State)
+		r.Recorder.Event(cr, "Warning", "ClusterStateChanged", fmt.Sprintf("%s -> %s", initialState, cr.Status.State))
+	}
 
 	nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
 	return writeStatus(ctx, r.Client, nn, cr.Status)

From c8d26aafd05b0b05c141e2e00b57180bd8f1a3c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ege=20G=C3=BCne=C5=9F?=
Date: Wed, 4 Jun 2025 13:21:44 +0300
Subject: [PATCH 2/2] fix unit tests

---
 pkg/controller/ps/status_test.go | 6 ++++--
 pkg/controller/ps/suite_test.go  | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/pkg/controller/ps/status_test.go b/pkg/controller/ps/status_test.go
index 0869e450e..03c84d7ca 100644
--- a/pkg/controller/ps/status_test.go
+++ b/pkg/controller/ps/status_test.go
@@ -380,6 +380,7 @@ func TestReconcileStatusAsync(t *testing.T) {
 		Client:        cb.Build(),
 		Scheme:        scheme,
 		ClientCmd:     cliCmd,
+		Recorder:      new(record.FakeRecorder),
 		ServerVersion: &platform.ServerVersion{
 			Platform: platform.PlatformKubernetes,
 		},
@@ -963,8 +964,9 @@ func TestReconcileErrorStatus(t *testing.T) {
 		WithStatusSubresource(objects...)
 
r := &PerconaServerMySQLReconciler{ - Client: cb.Build(), - Scheme: scheme, + Client: cb.Build(), + Scheme: scheme, + Recorder: new(record.FakeRecorder), ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, diff --git a/pkg/controller/ps/suite_test.go b/pkg/controller/ps/suite_test.go index 5eba1dc18..f5747ba0c 100644 --- a/pkg/controller/ps/suite_test.go +++ b/pkg/controller/ps/suite_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -103,7 +104,8 @@ func reconciler() *PerconaServerMySQLReconciler { ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, - Crons: NewCronRegistry(), + Crons: NewCronRegistry(), + Recorder: new(record.FakeRecorder), } }
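
Note on the test wiring above, not part of the patches: `new(record.FakeRecorder)` leaves the recorder's `Events` channel nil, so client-go drops events silently, which is enough for these tests. If a test needs to assert the new `ClusterStateChanged` or `FullClusterCrashRecovered` events, a buffered recorder exposes them on a channel. A minimal sketch, with a hypothetical test name and a direct `Event` call standing in for a real reconcile:

```go
package ps

import (
	"strings"
	"testing"

	"k8s.io/client-go/tools/record"
)

// TestClusterStateChangedEvent is a hypothetical test, not part of this patch.
func TestClusterStateChangedEvent(t *testing.T) {
	// NewFakeRecorder buffers events in the Events channel;
	// new(record.FakeRecorder) would leave the channel nil and drop them.
	recorder := record.NewFakeRecorder(16)

	// Stand-in for a reconcile that observes a state transition; in a real
	// test, recorder would be set as PerconaServerMySQLReconciler.Recorder.
	recorder.Event(nil, "Warning", "ClusterStateChanged", "Initializing -> Ready")

	select {
	case e := <-recorder.Events:
		// FakeRecorder formats each event as "<type> <reason> <message>".
		if !strings.Contains(e, "ClusterStateChanged") {
			t.Fatalf("unexpected event: %q", e)
		}
	default:
		t.Fatal("expected a ClusterStateChanged event")
	}
}
```

The sketch uses `record.NewFakeRecorder(16)` rather than `new(record.FakeRecorder)` precisely because the buffered `Events` channel is what makes the assertion possible.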