Skip to content

K8SPS-357: Improve full cluster crash recovery #928

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion build/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAG
RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
-o build/_output/bin/healthcheck \
cmd/healthcheck/main.go \
./cmd/healthcheck/ \
&& cp -r build/_output/bin/healthcheck /usr/local/bin/healthcheck
RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \
Expand Down
25 changes: 25 additions & 0 deletions cmd/healthcheck/bypass.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package main

import (
"log"
)

func isFullClusterCrash() bool {
fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
if err != nil {
log.Printf("check /var/lib/mysql/full-cluster-crash: %s", err)
return false
}

return fullClusterCrash
}

func isManualRecovery() bool {
manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
if err != nil {
log.Printf("check /var/lib/mysql/sleep-forever: %s", err)
return false
}

return manualRecovery
}
37 changes: 15 additions & 22 deletions cmd/healthcheck/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,44 +18,33 @@ import (
mysqldb "github.com/percona/percona-server-mysql-operator/pkg/db"
"github.com/percona/percona-server-mysql-operator/pkg/k8s"
"github.com/percona/percona-server-mysql-operator/pkg/mysql"
"github.com/percona/percona-server-mysql-operator/pkg/naming"
)

func main() {
fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash")
if err != nil {
log.Fatalf("check /var/lib/mysql/full-cluster-crash: %s", err)
}
if fullClusterCrash {
os.Exit(0)
}

manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever")
if err != nil {
log.Fatalf("check /var/lib/mysql/sleep-forever: %s", err)
}
if manualRecovery {
if isManualRecovery() {
os.Exit(0)
}

stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
if !ok {
log.Fatalln("MYSQL_STATE_FILE env variable is required")
}
mysqlState, err := os.ReadFile(stateFilePath)
mysqlState, err := getMySQLState()
if err != nil {
log.Fatalf("read mysql state: %s", err)
log.Fatalf("failed to get MySQL state: %v", err)
}

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

switch os.Args[1] {
case "readiness":
if string(mysqlState) != string(state.MySQLReady) {
if mysqlState != string(state.MySQLReady) {
log.Println("MySQL state is not ready...")
os.Exit(1)
}

// mysqld must be up and running during crash recovery
if isFullClusterCrash() {
os.Exit(0)
}

switch os.Getenv("CLUSTER_TYPE") {
case "async":
if err := checkReadinessAsync(ctx); err != nil {
Expand All @@ -67,7 +56,11 @@ func main() {
}
}
case "liveness":
if string(mysqlState) == string(state.MySQLStartup) {
if isFullClusterCrash() {
os.Exit(0)
}

if mysqlState == string(state.MySQLStartup) {
log.Println("MySQL is starting up, not killing it...")
os.Exit(0)
}
Expand Down
22 changes: 22 additions & 0 deletions cmd/healthcheck/state.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package main

import (
"os"

"github.com/percona/percona-server-mysql-operator/pkg/naming"
"github.com/pkg/errors"
)

func getMySQLState() (string, error) {
stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile)
if !ok {
return "", errors.New("MYSQL_STATE_FILE env variable is required")
}

mysqlState, err := os.ReadFile(stateFilePath)
if err != nil {
return "", errors.Wrap(err, "read mysql state")
}

return string(mysqlState), nil
}
19 changes: 19 additions & 0 deletions pkg/controller/ps/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1309,6 +1309,25 @@ func (r *PerconaServerMySQLReconciler) getPrimaryHost(ctx context.Context, cr *a
return primary.Key.Hostname, nil
}

func (r *PerconaServerMySQLReconciler) getPrimaryPod(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL) (*corev1.Pod, error) {
primaryHost, err := r.getPrimaryHost(ctx, cr)
if err != nil {
return nil, errors.Wrap(err, "get primary host")
}

idx, err := getPodIndexFromHostname(primaryHost)
if err != nil {
return nil, errors.Wrapf(err, "get pod index from %s", primaryHost)
}

primPod, err := getMySQLPod(ctx, r.Client, cr, idx)
if err != nil {
return nil, errors.Wrapf(err, "get primary pod by index %d", idx)
}

return primPod, nil
}

func (r *PerconaServerMySQLReconciler) stopAsyncReplication(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL, primary *orchestrator.Instance) error {
log := logf.FromContext(ctx).WithName("stopAsyncReplication")

Expand Down
41 changes: 37 additions & 4 deletions pkg/controller/ps/crash_recovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@
continue
}

if !k8s.IsPodReady(pod) {
continue
}

operatorPass, err := k8s.UserPassword(ctx, r.Client, cr, apiv1alpha1.UserOperator)
if err != nil {
return errors.Wrap(err, "get operator password")
Expand All @@ -79,27 +83,56 @@

status, err := mysh.ClusterStatusWithExec(ctx, cr.InnoDBClusterName())
if err == nil && status.DefaultReplicaSet.Status == innodbcluster.ClusterStatusOK {
log.Info("Cluster is healthy", "pod", pod.Name, "host", podFQDN)
err := r.cleanupFullClusterCrashFile(ctx, cr)
if err != nil {
log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
}
continue
}

log.Info("Attempting to reboot cluster from complete outage")
log.Info("Attempting to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
err = mysh.RebootClusterFromCompleteOutageWithExec(ctx, cr.InnoDBClusterName())
if err == nil {
log.Info("Cluster was successfully rebooted")
log.Info("Cluster was successfully rebooted", "pod", pod.Name, "host", podFQDN)
r.Recorder.Event(cr, "Normal", "FullClusterCrashRecovered", "Cluster recovered from full cluster crash")
err := r.cleanupFullClusterCrashFile(ctx, cr)
if err != nil {
log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash")
break
}

primary, err := r.getPrimaryPod(ctx, cr)
if err != nil {
log.Error(err, "failed to get primary pod")
break
}

log.Info(fmt.Sprintf("Primary pod is %s", primary.Name))

pods, err := k8s.PodsByLabels(ctx, r.Client, mysql.MatchLabels(cr), cr.Namespace)
if err != nil {
log.Error(err, "failed to get mysql pods")
break
}

for _, pod := range pods {
if pod.Name == primary.Name {
continue
}

log.Info("Deleting secondary pod", "pod", pod.Name)
if err := r.Client.Delete(ctx, &pod); err != nil {

Check failure on line 125 in pkg/controller/ps/crash_recovery.go

View workflow job for this annotation

GitHub Actions / runner / suggester / golangci-lint

QF1008: could remove embedded field "Client" from selector (staticcheck)
log.Error(err, "failed to delete pod", "pod", pod.Name)
}
}

break
}

// TODO: This needs to reconsidered.
if strings.Contains(err.Error(), "The Cluster is ONLINE") {
log.Info("Tried to reboot the cluster but MySQL says the cluster is already online")
log.Info("Tried to reboot the cluster but MySQL says the cluster is already online", "pod", pod.Name, "host", podFQDN)
log.Info("Deleting all MySQL pods")
err := r.Client.DeleteAllOf(ctx, &corev1.Pod{}, &client.DeleteAllOfOptions{
ListOptions: client.ListOptions{
Expand All @@ -113,7 +146,7 @@
break
}

log.Error(err, "failed to reboot cluster from complete outage")
log.Error(err, "failed to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN)
}

return nil
Expand Down
19 changes: 8 additions & 11 deletions pkg/controller/ps/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr
return nil
}

initialState := cr.Status.State

clusterCondition := metav1.Condition{
Status: metav1.ConditionTrue,
Type: apiv1alpha1.StateInitializing.String(),
Expand All @@ -48,6 +50,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr
meta.SetStatusCondition(&cr.Status.Conditions, clusterCondition)

cr.Status.State = apiv1alpha1.StateError

r.Recorder.Event(cr, "Error", "ReconcileError", "Failed to reconcile cluster")
}

nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
Expand Down Expand Up @@ -207,17 +211,10 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr
}
}

log.V(1).Info(
"Writing CR status",
"mysql", cr.Status.MySQL,
"orchestrator", cr.Status.Orchestrator,
"router", cr.Status.Router,
"haproxy", cr.Status.HAProxy,
"host", cr.Status.Host,
"loadbalancers", loadBalancersReady,
"conditions", cr.Status.Conditions,
"state", cr.Status.State,
)
if cr.Status.State != initialState {
log.Info("Cluster state changed", "previous", initialState, "current", cr.Status.State)
r.Recorder.Event(cr, "Warning", "ClusterStateChanged", fmt.Sprintf("%s -> %s", initialState, cr.Status.State))
}

nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace}
return writeStatus(ctx, r.Client, nn, cr.Status)
Expand Down
6 changes: 4 additions & 2 deletions pkg/controller/ps/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ func TestReconcileStatusAsync(t *testing.T) {
Client: cb.Build(),
Scheme: scheme,
ClientCmd: cliCmd,
Recorder: new(record.FakeRecorder),
ServerVersion: &platform.ServerVersion{
Platform: platform.PlatformKubernetes,
},
Expand Down Expand Up @@ -963,8 +964,9 @@ func TestReconcileErrorStatus(t *testing.T) {
WithStatusSubresource(objects...)

r := &PerconaServerMySQLReconciler{
Client: cb.Build(),
Scheme: scheme,
Client: cb.Build(),
Scheme: scheme,
Recorder: new(record.FakeRecorder),
ServerVersion: &platform.ServerVersion{
Platform: platform.PlatformKubernetes,
},
Expand Down
4 changes: 3 additions & 1 deletion pkg/controller/ps/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/util/yaml"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/record"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
logf "sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -103,7 +104,8 @@ func reconciler() *PerconaServerMySQLReconciler {
ServerVersion: &platform.ServerVersion{
Platform: platform.PlatformKubernetes,
},
Crons: NewCronRegistry(),
Crons: NewCronRegistry(),
Recorder: new(record.FakeRecorder),
}
}

Expand Down
Loading