Skip to content

Commit 91c1682

Browse files
pooknull and hors authored
K8SPSMDB-1263: retry on error during running backup (#1838)
* K8SPSMDB-1263: retry on error during running backup

https://perconadev.atlassian.net/browse/K8SPSMDB-1263

* fix

* small improvements

---------

Co-authored-by: Viacheslav Sarzhan <slava.sarzhan@percona.com>
1 parent 97ab2e2 commit 91c1682

File tree

2 files changed

+47
-19
lines changed

2 files changed

+47
-19
lines changed

pkg/controller/perconaservermongodbbackup/backup.go

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,13 +8,15 @@ import (
88

99
"github.com/pkg/errors"
1010
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/apimachinery/pkg/util/wait"
1112
"sigs.k8s.io/controller-runtime/pkg/client"
1213
logf "sigs.k8s.io/controller-runtime/pkg/log"
1314

1415
pbmBackup "github.com/percona/percona-backup-mongodb/pbm/backup"
1516
"github.com/percona/percona-backup-mongodb/pbm/ctrl"
1617
"github.com/percona/percona-backup-mongodb/pbm/defs"
1718
pbmErrors "github.com/percona/percona-backup-mongodb/pbm/errors"
19+
1820
api "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
1921
"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/backup"
2022
)
@@ -25,6 +27,13 @@ const (
2527
pbmStartingDeadlineErrMsg = "starting deadline exceeded"
2628
)
2729

30+
var defaultBackoff = wait.Backoff{
31+
Duration: 10 * time.Second,
32+
Factor: 2.0,
33+
Cap: time.Minute * 5,
34+
Steps: 6,
35+
}
36+
2837
type Backup struct {
2938
pbm backup.PBM
3039
spec api.BackupSpec
@@ -34,7 +43,7 @@ func (r *ReconcilePerconaServerMongoDBBackup) newBackup(ctx context.Context, clu
3443
if cluster == nil {
3544
return new(Backup), nil
3645
}
37-
cn, err := backup.NewPBM(ctx, r.client, cluster)
46+
cn, err := r.newPBMFunc(ctx, r.client, cluster)
3847
if err != nil {
3948
return nil, errors.Wrap(err, "create pbm object")
4049
}

pkg/controller/perconaservermongodbbackup/perconaservermongodbbackup_controller.go

Lines changed: 37 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/percona/percona-backup-mongodb/pbm/storage"
2727
"github.com/percona/percona-backup-mongodb/pbm/storage/azure"
2828
"github.com/percona/percona-backup-mongodb/pbm/storage/s3"
29+
2930
"github.com/percona/percona-server-mongodb-operator/clientcmd"
3031
psmdbv1 "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
3132
"github.com/percona/percona-server-mongodb-operator/pkg/naming"
@@ -169,9 +170,16 @@ func (r *ReconcilePerconaServerMongoDBBackup) Reconcile(ctx context.Context, req
169170
}
170171
}
171172

172-
bcp, err := r.newBackup(ctx, cluster)
173-
if err != nil {
174-
return rr, errors.Wrap(err, "create backup object")
173+
var bcp *Backup
174+
if err = retry.OnError(defaultBackoff, func(err error) bool { return err != nil }, func() error {
175+
var err error
176+
bcp, err = r.newBackup(ctx, cluster)
177+
if err != nil {
178+
return errors.Wrap(err, "create backup object")
179+
}
180+
return nil
181+
}); err != nil {
182+
return rr, err
175183
}
176184
defer bcp.Close(ctx)
177185

@@ -209,26 +217,37 @@ func (r *ReconcilePerconaServerMongoDBBackup) reconcile(
209217
return status, errors.Wrap(err, "failed to run backup")
210218
}
211219

212-
cjobs, err := backup.HasActiveJobs(ctx, r.newPBMFunc, r.client, cluster, backup.NewBackupJob(cr.Name), backup.NotPITRLock)
213-
if err != nil {
214-
return status, errors.Wrap(err, "check for concurrent jobs")
215-
}
216-
217-
if cjobs {
218-
if cr.Status.State != psmdbv1.BackupStateWaiting {
219-
log.Info("Waiting to finish another backup/restore.")
220-
}
221-
status.State = psmdbv1.BackupStateWaiting
222-
return status, nil
223-
}
224-
225-
if cr.Status.State == psmdbv1.BackupStateNew || cr.Status.State == psmdbv1.BackupStateWaiting {
220+
switch cr.Status.State {
221+
case psmdbv1.BackupStateNew, psmdbv1.BackupStateWaiting:
226222
time.Sleep(10 * time.Second)
227223
return bcp.Start(ctx, r.client, cluster, cr)
224+
case psmdbv1.BackupStateRunning:
225+
default:
226+
cjobs, err := backup.HasActiveJobs(ctx, r.newPBMFunc, r.client, cluster, backup.NewBackupJob(cr.Name), backup.NotPITRLock)
227+
if err != nil {
228+
return status, errors.Wrap(err, "check for concurrent jobs")
229+
}
230+
231+
if cjobs {
232+
if cr.Status.State != psmdbv1.BackupStateWaiting {
233+
log.Info("Waiting to finish another backup/restore.")
234+
}
235+
status.State = psmdbv1.BackupStateWaiting
236+
return status, nil
237+
}
228238
}
229239

230240
time.Sleep(5 * time.Second)
231-
return bcp.Status(ctx, cr)
241+
242+
err := retry.OnError(defaultBackoff, func(err error) bool { return err != nil }, func() error {
243+
updatedStatus, err := bcp.Status(ctx, cr)
244+
if err == nil {
245+
status = updatedStatus
246+
}
247+
return err
248+
})
249+
250+
return status, err
232251
}
233252

234253
func (r *ReconcilePerconaServerMongoDBBackup) getPBMStorage(ctx context.Context, cluster *psmdbv1.PerconaServerMongoDB, cr *psmdbv1.PerconaServerMongoDBBackup) (storage.Storage, error) {

0 commit comments

Comments
 (0)