Skip to content

K8SPSMDB-1211: handle FULL CLUSTER CRASH error during the restore #1926

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e2b7e97
K8SPSMDB-1211: handle `FULL CLUSTER CRASH` error during the restore
pooknull May 16, 2025
47073d9
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 19, 2025
49cc044
remove unused comment
pooknull May 19, 2025
19de9e6
fix lint
pooknull May 19, 2025
9bf2482
remove common reconciler
pooknull May 20, 2025
879163f
fix
pooknull May 20, 2025
f87bcc8
fix unit-test
pooknull May 20, 2025
a566737
fix
pooknull May 21, 2025
20e0558
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 21, 2025
35a2e22
fix manifests
pooknull May 21, 2025
81186a8
fix tests
pooknull May 21, 2025
2614d85
Merge branch 'main' into K8SPSMDB-1211
hors May 21, 2025
dc8663f
small fix
pooknull May 22, 2025
0a442b9
Merge branch 'main' into K8SPSMDB-1211
pooknull May 22, 2025
b433511
add sleep
pooknull May 23, 2025
81d2898
fix tests
pooknull May 23, 2025
3cd0736
Merge branch 'main' into K8SPSMDB-1211
hors May 23, 2025
f9354c4
wait after adding resync annotation
pooknull May 26, 2025
788505c
backoff wait after adding resync
pooknull May 27, 2025
915ffc8
remove wait and fix tests
pooknull May 27, 2025
9f69da2
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 27, 2025
aaa227b
fix merge
pooknull May 27, 2025
82c139f
fix manifests
pooknull May 28, 2025
050f5ef
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 28, 2025
ae459c2
fix merge
pooknull May 28, 2025
ced15a2
fix merge
pooknull May 28, 2025
756aefe
fix arbiter
pooknull May 28, 2025
6995236
Merge branch 'main' into K8SPSMDB-1211
pooknull May 28, 2025
fd8c741
Merge remote-tracking branch 'origin/main' into K8SPSMDB-1211
pooknull May 29, 2025
3f6285a
remove commented code
pooknull Jun 9, 2025
423d48d
fix lint
pooknull Jun 9, 2025
d4d89df
Merge branch 'main' into K8SPSMDB-1211
pooknull Jun 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions pkg/controller/common/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package common
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think packages named common, utils, etc., tend to be vague, as they imply shared logic without a clearly defined domain or separation of concerns.

In this file, the main struct is CommonReconciler, but it's not clear what exactly is being reconciled. The struct also mixes responsibilities: it constructs and returns heterogeneous components like backup.PBM, mongo.Client, a scheme, and a k8s client.

To improve clarity and maintainability, I'd suggest:

  • Keeping the scheme and the Kubernetes client in ReconcilePerconaServerMongoDB, and giving the related functions receivers of type ReconcilePerconaServerMongoDB.

  • Splitting out PBM-related logic into a dedicated PBM factory/service.

  • Doing the same for the MongoClientProvider.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


import (
"context"

"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/backup"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo"
)

// New assembles a CommonReconciler from the given Kubernetes client, runtime
// scheme, PBM constructor, and Mongo client provider. The arguments are stored
// unchanged; mongoClientProvider may be nil, in which case
// getMongoClientProvider substitutes psmdb.NewProvider(client) on demand.
func New(client client.Client, scheme *runtime.Scheme, newPBMFunc backup.NewPBMFunc, mongoClientProvider psmdb.MongoClientProvider) CommonReconciler {
	var reconciler CommonReconciler

	reconciler.client = client
	reconciler.scheme = scheme
	reconciler.newPBMFunc = newPBMFunc
	reconciler.mongoClientProvider = mongoClientProvider

	return reconciler
}

// CommonReconciler holds a Kubernetes client, a runtime scheme, a PBM
// constructor, and a Mongo client provider, and exposes accessors and
// client-building helpers on top of them. Use New to create one.
type CommonReconciler struct {
// client is the Kubernetes client returned by Client and passed to
// newPBMFunc, psmdb.MongoHost, and psmdb.NewProvider.
client client.Client
// scheme is the runtime scheme returned by Scheme.
scheme *runtime.Scheme
// newPBMFunc is the constructor invoked by NewPBM.
newPBMFunc backup.NewPBMFunc
// mongoClientProvider may be nil; getMongoClientProvider then falls back
// to psmdb.NewProvider(client).
mongoClientProvider psmdb.MongoClientProvider
}

// Client returns the Kubernetes client stored in the reconciler.
func (r *CommonReconciler) Client() client.Client {
return r.client
}

// Scheme returns the runtime scheme stored in the reconciler.
func (r *CommonReconciler) Scheme() *runtime.Scheme {
return r.scheme
}

// NewPBM creates a backup.PBM for the given cluster by calling the stored
// PBM constructor with ctx, the reconciler's Kubernetes client, and cluster.
// The constructor's result and error are returned unchanged.
func (r *CommonReconciler) NewPBM(ctx context.Context, cluster *api.PerconaServerMongoDB) (backup.PBM, error) {
return r.newPBMFunc(ctx, r.client, cluster)
}

// NewPBMFunc returns the PBM constructor stored in the reconciler, i.e. the
// same function that NewPBM invokes.
func (r *CommonReconciler) NewPBMFunc() backup.NewPBMFunc {
return r.newPBMFunc
}

// getMongoClientProvider returns the Mongo client provider configured on the
// reconciler. When none is configured it returns a provider built by
// psmdb.NewProvider from the reconciler's Kubernetes client; that fallback is
// created on every call and is not stored back on the reconciler.
func (r *CommonReconciler) getMongoClientProvider() psmdb.MongoClientProvider {
	if provider := r.mongoClientProvider; provider != nil {
		return provider
	}
	return psmdb.NewProvider(r.client)
}

// MongoClientWithRole asks the reconciler's Mongo client provider for a client
// to the replset rs of cluster cr, using the given system user role. The
// provider's result and error are returned unchanged.
func (r *CommonReconciler) MongoClientWithRole(ctx context.Context, cr *api.PerconaServerMongoDB, rs *api.ReplsetSpec, role api.SystemUserRole) (mongo.Client, error) {
	provider := r.getMongoClientProvider()
	return provider.Mongo(ctx, cr, rs, role)
}

// MongosClientWithRole asks the reconciler's Mongo client provider for a
// mongos client for cluster cr, using the given system user role. The
// provider's result and error are returned unchanged.
func (r *CommonReconciler) MongosClientWithRole(ctx context.Context, cr *api.PerconaServerMongoDB, role api.SystemUserRole) (mongo.Client, error) {
	provider := r.getMongoClientProvider()
	return provider.Mongos(ctx, cr, role)
}

// StandaloneClientWithRole resolves the host of pod via psmdb.MongoHost and
// then asks the reconciler's Mongo client provider for a standalone client to
// that host, using the given system user role and the cluster's TLS setting.
//
// It returns a wrapped error if the host cannot be resolved; otherwise the
// provider's result and error are returned unchanged.
func (r *CommonReconciler) StandaloneClientWithRole(ctx context.Context, cr *api.PerconaServerMongoDB, rs *api.ReplsetSpec, role api.SystemUserRole, pod corev1.Pod) (mongo.Client, error) {
	dnsMode := cr.Spec.ClusterServiceDNSMode
	exposed := rs.Expose.Enabled

	host, err := psmdb.MongoHost(ctx, r.client, cr, dnsMode, rs, exposed, pod)
	if err != nil {
		return nil, errors.Wrap(err, "failed to get mongo host")
	}

	provider := r.getMongoClientProvider()
	return provider.Standalone(ctx, cr, role, host, cr.TLSEnabled())
}
2 changes: 1 addition & 1 deletion pkg/controller/perconaservermongodb/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ func (r *ReconcilePerconaServerMongoDB) createOrUpdateBackupTask(ctx context.Con
if err != nil {
return errors.Wrap(err, "can't create job")
}
err = setControllerReference(cr, &cjob, r.scheme)
err = setControllerReference(cr, &cjob, r.Scheme())
if err != nil {
return errors.Wrapf(err, "set owner reference for backup task %s", cjob.Name)
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/perconaservermongodb/balancer.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ import (
"context"
"time"

"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
k8sErrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/types"
logf "sigs.k8s.io/controller-runtime/pkg/log"

api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
)

func (r *ReconcilePerconaServerMongoDB) enableBalancerIfNeeded(ctx context.Context, cr *api.PerconaServerMongoDB) error {
Expand Down Expand Up @@ -85,7 +85,7 @@ func (r *ReconcilePerconaServerMongoDB) enableBalancerIfNeeded(ctx context.Conte
}
}

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos connection")
}
Expand Down Expand Up @@ -141,7 +141,7 @@ func (r *ReconcilePerconaServerMongoDB) disableBalancer(ctx context.Context, cr
return errors.Wrapf(err, "get mongos statefulset %s", msSts.Name)
}

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos connection")
}
Expand Down
73 changes: 0 additions & 73 deletions pkg/controller/perconaservermongodb/connections.go

This file was deleted.

3 changes: 2 additions & 1 deletion pkg/controller/perconaservermongodb/connections_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"

api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
"github.com/percona/percona-server-mongodb-operator/pkg/controller/common"
"github.com/percona/percona-server-mongodb-operator/pkg/naming"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/mongo"
Expand Down Expand Up @@ -158,7 +159,7 @@ func TestConnectionLeaks(t *testing.T) {
connectionCount := new(int)

r := buildFakeClient(obj...)
r.mongoClientProvider = &fakeMongoClientProvider{pods: rsPods, cr: cr, connectionCount: connectionCount}
r.CommonReconciler = common.New(r.Client(), r.Scheme(), r.NewPBMFunc(), &fakeMongoClientProvider{pods: rsPods, cr: cr, connectionCount: connectionCount})
r.serverVersion = &version.ServerVersion{Platform: version.PlatformKubernetes}
r.crons = NewCronRegistry()

Expand Down
10 changes: 6 additions & 4 deletions pkg/controller/perconaservermongodb/custom_users.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCustomUsers(ctx context.Context
var err error
var mongoCli mongo.Client
if cr.Spec.Sharding.Enabled {
mongoCli, err = r.mongosClientWithRole(ctx, cr, api.RoleUserAdmin)
mongoCli, err = r.MongosClientWithRole(ctx, cr, api.RoleUserAdmin)
} else {
mongoCli, err = r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleUserAdmin)
mongoCli, err = r.MongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleUserAdmin)
}
if err != nil {
return errors.Wrap(err, "failed to get mongo client")
Expand Down Expand Up @@ -310,7 +310,8 @@ func updatePass(
user *api.User,
userInfo *mongo.User,
secret *corev1.Secret,
annotationKey, passKey string) error {
annotationKey, passKey string,
) error {
log := logf.FromContext(ctx)

if userInfo == nil || user.IsExternalDB() {
Expand Down Expand Up @@ -395,7 +396,8 @@ func createUser(
mongoCli mongo.Client,
user *api.User,
secret *corev1.Secret,
annotationKey, passKey string) error {
annotationKey, passKey string,
) error {
log := logf.FromContext(ctx)

roles := make([]mongo.Role, 0)
Expand Down
6 changes: 3 additions & 3 deletions pkg/controller/perconaservermongodb/fcv.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
)

func (r *ReconcilePerconaServerMongoDB) getFCV(ctx context.Context, cr *api.PerconaServerMongoDB) (string, error) {
c, err := r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
c, err := r.MongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
if err != nil {
return "", errors.Wrap(err, "failed to get connection")
}
Expand Down Expand Up @@ -40,9 +40,9 @@ func (r *ReconcilePerconaServerMongoDB) setFCV(ctx context.Context, cr *api.Perc
var connErr error

if cr.Spec.Sharding.Enabled {
cli, connErr = r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, connErr = r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
} else {
cli, connErr = r.mongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
cli, connErr = r.MongoClientWithRole(ctx, cr, cr.Spec.Replsets[0], api.RoleClusterAdmin)
}

if connErr != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/perconaservermongodb/finalizers.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (r *ReconcilePerconaServerMongoDB) checkFinalizers(ctx context.Context, cr
}

func (r *ReconcilePerconaServerMongoDB) deleteAllPITRChunks(ctx context.Context, cr *api.PerconaServerMongoDB) error {
pbmc, err := r.newPBM(ctx, r.client, cr)
pbmc, err := r.NewPBM(ctx, cr)
if err != nil {
return errors.Wrap(err, "new pbm")
}
Expand Down
16 changes: 8 additions & 8 deletions pkg/controller/perconaservermongodb/mgo.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCluster(ctx context.Context, cr
}
}

cli, err := r.mongoClientWithRole(ctx, cr, replset, api.RoleClusterAdmin)
cli, err := r.MongoClientWithRole(ctx, cr, replset, api.RoleClusterAdmin)
if err != nil {
if cr.Spec.Unmanaged {
return api.AppStateInit, nil, nil
Expand Down Expand Up @@ -193,7 +193,7 @@ func (r *ReconcilePerconaServerMongoDB) reconcileCluster(ctx context.Context, cr
replset.ClusterRole == api.ClusterRoleShardSvr &&
len(mongosPods) > 0 && cr.Spec.Sharding.Mongos.Size > 0 {

mongosSession, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
mongosSession, err := r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return api.AppStateError, nil, errors.Wrap(err, "failed to get mongos connection")
}
Expand Down Expand Up @@ -571,7 +571,7 @@ func (r *ReconcilePerconaServerMongoDB) removeRSFromShard(ctx context.Context, c
return nil
}

cli, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, err := r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Errorf("failed to get mongos connection: %v", err)
}
Expand Down Expand Up @@ -621,7 +621,7 @@ func (r *ReconcilePerconaServerMongoDB) handleRsAddToShard(ctx context.Context,
return errors.Wrapf(err, "get rsPod %s host", rspod.Name)
}

cli, err := r.mongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
cli, err := r.MongosClientWithRole(ctx, cr, api.RoleClusterAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongos client")
}
Expand Down Expand Up @@ -724,7 +724,7 @@ func (r *ReconcilePerconaServerMongoDB) handleReplsetInit(ctx context.Context, c
time.Sleep(time.Second * 5)

log.Info("creating user admin", "replset", replsetName, "pod", pod.Name, "user", api.RoleUserAdmin)
userAdmin, err := getInternalCredentials(ctx, r.client, cr, api.RoleUserAdmin)
userAdmin, err := psmdb.GetCredentials(ctx, r.client, cr, api.RoleUserAdmin)
if err != nil {
return nil, nil, errors.Wrap(err, "failed to get userAdmin credentials")
}
Expand Down Expand Up @@ -757,7 +757,7 @@ func (r *ReconcilePerconaServerMongoDB) handleReplicaSetNoPrimary(ctx context.Co
}

log.Info("Connecting to pod", "pod", pod.Name, "user", api.RoleClusterAdmin)
cli, err := r.standaloneClientWithRole(ctx, cr, replset, api.RoleClusterAdmin, pod)
cli, err := r.StandaloneClientWithRole(ctx, cr, replset, api.RoleClusterAdmin, pod)
if err != nil {
return errors.Wrap(err, "get standalone mongo client")
}
Expand Down Expand Up @@ -922,7 +922,7 @@ func compareRoles(x []mongo.Role, y []mongo.Role) bool {
func (r *ReconcilePerconaServerMongoDB) createOrUpdateSystemUsers(ctx context.Context, cr *api.PerconaServerMongoDB, replset *api.ReplsetSpec) error {
log := logf.FromContext(ctx)

cli, err := r.mongoClientWithRole(ctx, cr, replset, api.RoleUserAdmin)
cli, err := r.MongoClientWithRole(ctx, cr, replset, api.RoleUserAdmin)
if err != nil {
return errors.Wrap(err, "failed to get mongo client")
}
Expand Down Expand Up @@ -1013,7 +1013,7 @@ func (r *ReconcilePerconaServerMongoDB) createOrUpdateSystemUsers(ctx context.Co
}

for _, role := range users {
creds, err := getInternalCredentials(ctx, r.client, cr, role)
creds, err := psmdb.GetCredentials(ctx, r.client, cr, role)
if err != nil {
log.Error(err, "failed to get credentials", "role", role)
continue
Expand Down
Loading
Loading