diff --git a/build/Dockerfile b/build/Dockerfile index 68fd6e869..f8d9019b5 100644 --- a/build/Dockerfile +++ b/build/Dockerfile @@ -28,7 +28,7 @@ RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAG RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \ -o build/_output/bin/healthcheck \ - cmd/healthcheck/main.go \ + ./cmd/healthcheck/ \ && cp -r build/_output/bin/healthcheck /usr/local/bin/healthcheck RUN GOOS=$GOOS GOARCH=$TARGETARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \ go build -ldflags "-w -s -X main.GitCommit=$GIT_COMMIT -X main.GitBranch=$GIT_BRANCH -X main.BuildTime=$BUILD_TIME" \ diff --git a/cmd/healthcheck/bypass.go b/cmd/healthcheck/bypass.go new file mode 100644 index 000000000..25a173aae --- /dev/null +++ b/cmd/healthcheck/bypass.go @@ -0,0 +1,25 @@ +package main + +import ( + "log" +) + +func isFullClusterCrash() bool { + fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash") + if err != nil { + log.Printf("check /var/lib/mysql/full-cluster-crash: %s", err) + return false + } + + return fullClusterCrash +} + +func isManualRecovery() bool { + manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever") + if err != nil { + log.Printf("check /var/lib/mysql/sleep-forever: %s", err) + return false + } + + return manualRecovery +} diff --git a/cmd/healthcheck/main.go b/cmd/healthcheck/main.go index 66d3e3416..4c781d094 100644 --- a/cmd/healthcheck/main.go +++ b/cmd/healthcheck/main.go @@ -18,33 +18,16 @@ import ( mysqldb "github.com/percona/percona-server-mysql-operator/pkg/db" "github.com/percona/percona-server-mysql-operator/pkg/k8s" "github.com/percona/percona-server-mysql-operator/pkg/mysql" - "github.com/percona/percona-server-mysql-operator/pkg/naming" ) func main() { - fullClusterCrash, err := fileExists("/var/lib/mysql/full-cluster-crash") - if err != nil { - log.Fatalf("check /var/lib/mysql/full-cluster-crash: %s", err) - } - if fullClusterCrash { - os.Exit(0) - } - - manualRecovery, err := fileExists("/var/lib/mysql/sleep-forever") - if err != nil { - log.Fatalf("check /var/lib/mysql/sleep-forever: %s", err) - } - if manualRecovery { + if isManualRecovery() { os.Exit(0) } - stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile) - if !ok { - log.Fatalln("MYSQL_STATE_FILE env variable is required") - } - mysqlState, err := os.ReadFile(stateFilePath) + mysqlState, err := getMySQLState() if err != nil { - log.Fatalf("read mysql state: %s", err) + log.Fatalf("failed to get MySQL state: %v", err) } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) @@ -52,10 +35,16 @@ func main() { switch os.Args[1] { case "readiness": - if string(mysqlState) != string(state.MySQLReady) { + if mysqlState != string(state.MySQLReady) { log.Println("MySQL state is not ready...") os.Exit(1) } + + // mysqld must be up and running during crash recovery + if isFullClusterCrash() { + os.Exit(0) + } + switch os.Getenv("CLUSTER_TYPE") { case "async": if err := checkReadinessAsync(ctx); err != nil { @@ -67,7 +56,11 @@ func main() { } } case "liveness": - if string(mysqlState) == string(state.MySQLStartup) { + if isFullClusterCrash() { + os.Exit(0) + } + + if mysqlState == string(state.MySQLStartup) { log.Println("MySQL is starting up, not killing it...") os.Exit(0) } diff --git a/cmd/healthcheck/state.go b/cmd/healthcheck/state.go new file mode 100644 index 000000000..c36eb3119 --- /dev/null +++ b/cmd/healthcheck/state.go @@ -0,0 +1,22 @@ +package main + +import ( + "os" + + "github.com/percona/percona-server-mysql-operator/pkg/naming" + "github.com/pkg/errors" +) + +func getMySQLState() (string, error) { + stateFilePath, ok := os.LookupEnv(naming.EnvMySQLStateFile) + if !ok { + return "", errors.New("MYSQL_STATE_FILE env variable is required") + } + + mysqlState, err := os.ReadFile(stateFilePath) + if err != nil { + return "", errors.Wrap(err, "read mysql state") + } + + return string(mysqlState), nil +} diff --git a/pkg/controller/ps/controller.go b/pkg/controller/ps/controller.go index d792cf4a5..a31697e5a 100644 --- a/pkg/controller/ps/controller.go +++ b/pkg/controller/ps/controller.go @@ -1309,6 +1309,25 @@ func (r *PerconaServerMySQLReconciler) getPrimaryHost(ctx context.Context, cr *a return primary.Key.Hostname, nil } +func (r *PerconaServerMySQLReconciler) getPrimaryPod(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL) (*corev1.Pod, error) { + primaryHost, err := r.getPrimaryHost(ctx, cr) + if err != nil { + return nil, errors.Wrap(err, "get primary host") + } + + idx, err := getPodIndexFromHostname(primaryHost) + if err != nil { + return nil, errors.Wrapf(err, "get pod index from %s", primaryHost) + } + + primPod, err := getMySQLPod(ctx, r.Client, cr, idx) + if err != nil { + return nil, errors.Wrapf(err, "get primary pod by index %d", idx) + } + + return primPod, nil +} + func (r *PerconaServerMySQLReconciler) stopAsyncReplication(ctx context.Context, cr *apiv1alpha1.PerconaServerMySQL, primary *orchestrator.Instance) error { log := logf.FromContext(ctx).WithName("stopAsyncReplication") diff --git a/pkg/controller/ps/crash_recovery.go b/pkg/controller/ps/crash_recovery.go index 90cb07643..c6d6cdcf3 100644 --- a/pkg/controller/ps/crash_recovery.go +++ b/pkg/controller/ps/crash_recovery.go @@ -64,6 +64,10 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con continue } + if !k8s.IsPodReady(pod) { + continue + } + operatorPass, err := k8s.UserPassword(ctx, r.Client, cr, apiv1alpha1.UserOperator) if err != nil { return errors.Wrap(err, "get operator password") @@ -79,6 +83,7 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con status, err := mysh.ClusterStatusWithExec(ctx, cr.InnoDBClusterName()) if err == nil && status.DefaultReplicaSet.Status == innodbcluster.ClusterStatusOK { + log.Info("Cluster is healthy", "pod", pod.Name, "host", podFQDN) err := r.cleanupFullClusterCrashFile(ctx, cr) if err != nil { log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash") @@ -86,20 +91,48 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con continue } - log.Info("Attempting to reboot cluster from complete outage") + log.Info("Attempting to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN) err = mysh.RebootClusterFromCompleteOutageWithExec(ctx, cr.InnoDBClusterName()) if err == nil { - log.Info("Cluster was successfully rebooted") + log.Info("Cluster was successfully rebooted", "pod", pod.Name, "host", podFQDN) r.Recorder.Event(cr, "Normal", "FullClusterCrashRecovered", "Cluster recovered from full cluster crash") err := r.cleanupFullClusterCrashFile(ctx, cr) if err != nil { log.Error(err, "failed to remove /var/lib/mysql/full-cluster-crash") + break } + + primary, err := r.getPrimaryPod(ctx, cr) + if err != nil { + log.Error(err, "failed to get primary pod") + break + } + + log.Info(fmt.Sprintf("Primary pod is %s", primary.Name)) + + pods, err := k8s.PodsByLabels(ctx, r.Client, mysql.MatchLabels(cr), cr.Namespace) + if err != nil { + log.Error(err, "failed to get mysql pods") + break + } + + for _, pod := range pods { + if pod.Name == primary.Name { + continue + } + + log.Info("Deleting secondary pod", "pod", pod.Name) + if err := r.Client.Delete(ctx, &pod); err != nil { + log.Error(err, "failed to delete pod", "pod", pod.Name) + } + } + break } + // TODO: This needs to reconsidered. if strings.Contains(err.Error(), "The Cluster is ONLINE") { - log.Info("Tried to reboot the cluster but MySQL says the cluster is already online") + log.Info("Tried to reboot the cluster but MySQL says the cluster is already online", "pod", pod.Name, "host", podFQDN) log.Info("Deleting all MySQL pods") err := r.Client.DeleteAllOf(ctx, &corev1.Pod{}, &client.DeleteAllOfOptions{ ListOptions: client.ListOptions{ @@ -113,7 +146,7 @@ func (r *PerconaServerMySQLReconciler) reconcileFullClusterCrash(ctx context.Con break } - log.Error(err, "failed to reboot cluster from complete outage") + log.Error(err, "failed to reboot cluster from complete outage", "pod", pod.Name, "host", podFQDN) } return nil diff --git a/pkg/controller/ps/status.go b/pkg/controller/ps/status.go index df1599a73..04c67b462 100644 --- a/pkg/controller/ps/status.go +++ b/pkg/controller/ps/status.go @@ -33,6 +33,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr return nil } + initialState := cr.Status.State + clusterCondition := metav1.Condition{ Status: metav1.ConditionTrue, Type: apiv1alpha1.StateInitializing.String(), @@ -48,6 +50,8 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr meta.SetStatusCondition(&cr.Status.Conditions, clusterCondition) cr.Status.State = apiv1alpha1.StateError + + r.Recorder.Event(cr, "Error", "ReconcileError", "Failed to reconcile cluster") } nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace} @@ -207,17 +211,10 @@ func (r *PerconaServerMySQLReconciler) reconcileCRStatus(ctx context.Context, cr } } - log.V(1).Info( - "Writing CR status", - "mysql", cr.Status.MySQL, - "orchestrator", cr.Status.Orchestrator, - "router", cr.Status.Router, - "haproxy", cr.Status.HAProxy, - "host", cr.Status.Host, - "loadbalancers", loadBalancersReady, - "conditions", cr.Status.Conditions, - "state", cr.Status.State, - ) + if cr.Status.State != initialState { + log.Info("Cluster state changed", "previous", initialState, "current", cr.Status.State) + r.Recorder.Event(cr, "Warning", "ClusterStateChanged", fmt.Sprintf("%s -> %s", initialState, cr.Status.State)) + } nn := types.NamespacedName{Name: cr.Name, Namespace: cr.Namespace} return writeStatus(ctx, r.Client, nn, cr.Status) diff --git a/pkg/controller/ps/status_test.go b/pkg/controller/ps/status_test.go index 0869e450e..03c84d7ca 100644 --- a/pkg/controller/ps/status_test.go +++ b/pkg/controller/ps/status_test.go @@ -380,6 +380,7 @@ func TestReconcileStatusAsync(t *testing.T) { Client: cb.Build(), Scheme: scheme, ClientCmd: cliCmd, + Recorder: new(record.FakeRecorder), ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, @@ -963,8 +964,9 @@ func TestReconcileErrorStatus(t *testing.T) { WithStatusSubresource(objects...) r := &PerconaServerMySQLReconciler{ - Client: cb.Build(), - Scheme: scheme, + Client: cb.Build(), + Scheme: scheme, + Recorder: new(record.FakeRecorder), ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, diff --git a/pkg/controller/ps/suite_test.go b/pkg/controller/ps/suite_test.go index 5eba1dc18..f5747ba0c 100644 --- a/pkg/controller/ps/suite_test.go +++ b/pkg/controller/ps/suite_test.go @@ -28,6 +28,7 @@ import ( "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/envtest" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -103,7 +104,8 @@ func reconciler() *PerconaServerMySQLReconciler { ServerVersion: &platform.ServerVersion{ Platform: platform.PlatformKubernetes, }, - Crons: NewCronRegistry(), + Crons: NewCronRegistry(), + Recorder: new(record.FakeRecorder), } }