
Commit 6dedac1

K8SPSMDB-1378: Prevent parallel backups (#1920)
* K8SPSMDB-1378: Prevent parallel backups
* add backup existence checks
* fix finalizer
* fix error backups
1 parent b009df6 commit 6dedac1

File tree

12 files changed: +299, −39 lines

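In short, this commit serializes backups per cluster with a Kubernetes coordination Lease: the backup reconciler acquires a lease named after the cluster before starting a backup, parks the backup in the Waiting state while another backup holds the lease, and releases the lease both from a deferred block once the backup reaches Ready or Error and from a dedicated finalizer, so the lock is freed even if the backup object is deleted mid-run. A condensed sketch of the gating logic, assembled from the controller hunks below (error handling trimmed, not a verbatim excerpt):

    // Only one backup per cluster may hold the lease at a time.
    lease, err := k8s.AcquireLease(ctx, r.client, naming.BackupLeaseName(cluster.Name), cr.Namespace, naming.BackupHolderId(cr))
    if err != nil {
        return status, errors.Wrap(err, "acquire backup lock")
    }
    if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity != naming.BackupHolderId(cr) {
        // Another backup owns the lock: stay in Waiting and retry on the next reconcile.
        status.State = psmdbv1.BackupStateWaiting
        return status, nil
    }
    // We hold the lock: guarantee it is released even if this object is deleted.
    if err := r.ensureReleaseLockFinalizer(ctx, cluster, cr); err != nil {
        return status, errors.Wrapf(err, "ensure %s finalizer", naming.FinalizerReleaseLock)
    }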

e2e-tests/demand-backup-incremental-sharded/run

Lines changed: 8 additions & 0 deletions
@@ -127,8 +127,16 @@ if [ -z "$SKIP_BACKUPS_TO_AWS_GCP_AZURE" ]; then
 	run_backup azure-blob ${backup_name_azure}
 
 	wait_backup "${backup_name_aws}"
+	check_backup_in_storage ${backup_name_aws} s3 rs0
+	check_backup_in_storage ${backup_name_aws} s3 cfg
 	wait_backup "${backup_name_gcp}"
+	check_backup_in_storage ${backup_name_gcp} gcs rs0
+	check_backup_in_storage ${backup_name_gcp} gcs cfg
 	wait_backup "${backup_name_azure}"
+	check_backup_in_storage ${backup_name_azure} azure rs0
+	check_backup_in_storage ${backup_name_azure} azure cfg
 fi
 
 backup_name_minio="backup-minio-sharded"

e2e-tests/demand-backup-incremental/run

Lines changed: 5 additions & 0 deletions
@@ -111,8 +111,13 @@ if [ -z "$SKIP_BACKUPS_TO_AWS_GCP_AZURE" ]; then
 	run_backup azure-blob ${backup_name_azure}
 
 	wait_backup "${backup_name_aws}"
+	check_backup_in_storage ${backup_name_aws} s3 rs0
 	wait_backup "${backup_name_gcp}"
+	check_backup_in_storage ${backup_name_gcp} gcs rs0
 	wait_backup "${backup_name_azure}"
+	check_backup_in_storage ${backup_name_azure} azure rs0
 fi
 
 backup_name_minio="backup-minio"

e2e-tests/demand-backup-physical-sharded/run

Lines changed: 8 additions & 0 deletions
@@ -112,8 +112,16 @@ if [ -z "$SKIP_BACKUPS_TO_AWS_GCP_AZURE" ]; then
 	run_backup azure-blob ${backup_name_azure}
 
 	wait_backup "${backup_name_aws}"
+	check_backup_in_storage ${backup_name_aws} s3 rs0
+	check_backup_in_storage ${backup_name_aws} s3 cfg
 	wait_backup "${backup_name_gcp}"
+	check_backup_in_storage ${backup_name_gcp} gcs rs0
+	check_backup_in_storage ${backup_name_gcp} gcs cfg
 	wait_backup "${backup_name_azure}"
+	check_backup_in_storage ${backup_name_azure} azure rs0
+	check_backup_in_storage ${backup_name_azure} azure cfg
 fi
 wait_backup "${backup_name_minio}"

e2e-tests/demand-backup-physical/run

Lines changed: 5 additions & 0 deletions
@@ -98,8 +98,13 @@ if [ -z "$SKIP_BACKUPS_TO_AWS_GCP_AZURE" ]; then
 	run_backup azure-blob ${backup_name_azure}
 
 	wait_backup "${backup_name_aws}"
+	check_backup_in_storage ${backup_name_aws} s3 rs0
 	wait_backup "${backup_name_gcp}"
+	check_backup_in_storage ${backup_name_gcp} gcs rs0
 	wait_backup "${backup_name_azure}"
+	check_backup_in_storage ${backup_name_azure} azure rs0
 fi
 wait_backup "${backup_name_minio}"

e2e-tests/functions

Lines changed: 33 additions & 2 deletions
@@ -17,6 +17,7 @@ IMAGE_PMM_CLIENT=${IMAGE_PMM_CLIENT:-"perconalab/pmm-client:dev-latest"}
 IMAGE_PMM_SERVER=${IMAGE_PMM_SERVER:-"perconalab/pmm-server:dev-latest"}
 CERT_MANAGER_VER="1.16.3"
 UPDATE_COMPARE_FILES=${UPDATE_COMPARE_FILES:-0}
+DELETE_CRD_ON_START=${DELETE_CRD_ON_START:-1}
 tmp_dir=$(mktemp -d)
 sed=$(which gsed || which sed)
 date=$(which gdate || which date)
@@ -1331,8 +1332,10 @@ check_backup_deletion() {
 create_infra() {
 	local ns="$1"
 
-	delete_crd
-	check_crd_for_deletion "${GIT_BRANCH}"
+	if [[ ${DELETE_CRD_ON_START} == 1 ]]; then
+		delete_crd
+		check_crd_for_deletion "${GIT_BRANCH}"
+	fi
 	if [ -n "$OPERATOR_NS" ]; then
 		create_namespace $OPERATOR_NS
 		deploy_operator
@@ -1867,3 +1870,31 @@ wait_for_oplogs() {
 		sleep 10
 	done
 }
+
+check_backup_in_storage() {
+	local backup=$1
+	local storage_type=$2
+	local replset=$3
+	local file=${4:-"filelist.pbm"}
+
+	local endpoint
+	case ${storage_type} in
+		s3)
+			endpoint="s3.amazonaws.com"
+			;;
+		gcs)
+			endpoint="storage.googleapis.com"
+			;;
+		azure)
+			endpoint="engk8soperators.blob.core.windows.net"
+			;;
+		*)
+			echo "unsupported storage type: ${storage_type}"
+			exit 1
+	esac
+
+	backup_dest=$(get_backup_dest "$backup")
+	local url="https://${endpoint}/${backup_dest}/${replset}/${file}"
+	log "checking if ${url} exists"
+	curl --fail --head "${url}"
+}
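A note on the probe: curl --fail --head issues an HTTP HEAD request and exits non-zero when the object is missing, so a missing filelist.pbm (the default probed file) fails the check immediately, assuming the calling test scripts abort on non-zero exit status (errexit). The azure endpoint is hard-coded to the engk8soperators storage account used by these e2e tests.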

pkg/apis/psmdb/v1/perconaservermongodbbackup_types.go

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ import (
 
 // PerconaServerMongoDBBackupSpec defines the desired state of PerconaServerMongoDBBackup
 type PerconaServerMongoDBBackupSpec struct {
+	// Deprecated: Use ClusterName instead
 	PSMDBCluster string `json:"psmdbCluster,omitempty"` // TODO: Remove after v1.15
 	ClusterName  string `json:"clusterName,omitempty"`
 	StorageName  string `json:"storageName,omitempty"`
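For spec authors, the practical takeaway is to set clusterName and leave psmdbCluster unset; the old field remains only for backward compatibility. A minimal illustration (field values are placeholders, not from the commit):

    bcp := psmdbv1.PerconaServerMongoDBBackupSpec{
        ClusterName: "my-cluster", // preferred
        StorageName: "s3-us-west",
        // PSMDBCluster: "my-cluster", // deprecated; use ClusterName
    }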

pkg/controller/perconaservermongodbbackup/perconaservermongodbbackup_controller.go

Lines changed: 79 additions & 12 deletions
@@ -30,6 +30,7 @@ import (
 
 	"github.com/percona/percona-server-mongodb-operator/clientcmd"
 	psmdbv1 "github.com/percona/percona-server-mongodb-operator/pkg/apis/psmdb/v1"
+	"github.com/percona/percona-server-mongodb-operator/pkg/k8s"
 	"github.com/percona/percona-server-mongodb-operator/pkg/naming"
 	"github.com/percona/percona-server-mongodb-operator/pkg/psmdb"
 	"github.com/percona/percona-server-mongodb-operator/pkg/psmdb/backup"
@@ -118,7 +119,7 @@ func (r *ReconcilePerconaServerMongoDBBackup) Reconcile(ctx context.Context, req
 			// Request object not found, could have been deleted after reconcile request.
 			// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
 			// Return and don't requeue
-			return rr, nil
+			return reconcile.Result{}, nil
 		}
 		// Error reading the object - requeue the request.
 		return rr, err
@@ -144,6 +145,16 @@ func (r *ReconcilePerconaServerMongoDBBackup) Reconcile(ctx context.Context, req
 		if uerr != nil {
 			log.Error(uerr, "failed to update backup status", "backup", cr.Name)
 		}
+
+		switch cr.Status.State {
+		case psmdbv1.BackupStateReady, psmdbv1.BackupStateError:
+			log.Info("Releasing backup lock", "lease", naming.BackupLeaseName(cr.Spec.ClusterName))
+
+			err := k8s.ReleaseLease(ctx, r.client, naming.BackupLeaseName(cr.Spec.ClusterName), cr.Namespace, naming.BackupHolderId(cr))
+			if err != nil {
+				log.Error(err, "failed to release the lock")
+			}
+		}
 	}
 }()
@@ -228,14 +239,28 @@ func (r *ReconcilePerconaServerMongoDBBackup) reconcile(
 	}
 
 	if cjobs {
-		if cr.Status.State != psmdbv1.BackupStateWaiting {
-			log.Info("Waiting to finish another backup/restore.")
-		}
+		log.Info("Waiting to finish another backup/restore.")
 		status.State = psmdbv1.BackupStateWaiting
 		return status, nil
 	}
 
-	switch cr.Status.State {
+	log.Info("Acquiring the backup lock")
+	lease, err := k8s.AcquireLease(ctx, r.client, naming.BackupLeaseName(cluster.Name), cr.Namespace, naming.BackupHolderId(cr))
+	if err != nil {
+		return status, errors.Wrap(err, "acquire backup lock")
+	}
+
+	if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity != naming.BackupHolderId(cr) {
+		log.Info("Another backup is holding the lock", "holder", *lease.Spec.HolderIdentity)
+		status.State = psmdbv1.BackupStateWaiting
+		return status, nil
+	}
+
+	if err := r.ensureReleaseLockFinalizer(ctx, cluster, cr); err != nil {
+		return status, errors.Wrapf(err, "ensure %s finalizer", naming.FinalizerReleaseLock)
+	}
+
+	switch status.State {
 	case psmdbv1.BackupStateNew, psmdbv1.BackupStateWaiting:
 		return bcp.Start(ctx, r.client, cluster, cr)
 	}
@@ -251,6 +276,28 @@ func (r *ReconcilePerconaServerMongoDBBackup) reconcile(
 	return status, err
 }
 
+func (r *ReconcilePerconaServerMongoDBBackup) ensureReleaseLockFinalizer(
+	ctx context.Context,
+	cluster *psmdbv1.PerconaServerMongoDB,
+	cr *psmdbv1.PerconaServerMongoDBBackup,
+) error {
+	for _, f := range cr.GetFinalizers() {
+		if f == naming.FinalizerReleaseLock {
+			return nil
+		}
+	}
+
+	orig := cr.DeepCopy()
+	cr.SetFinalizers(append(cr.GetFinalizers(), naming.FinalizerReleaseLock))
+	if err := r.client.Patch(ctx, cr.DeepCopy(), client.MergeFrom(orig)); err != nil {
+		return errors.Wrap(err, "patch finalizers")
+	}
+
+	logf.FromContext(ctx).V(1).Info("Added finalizer", "finalizer", naming.FinalizerReleaseLock)
+
+	return nil
+}
+
 func (r *ReconcilePerconaServerMongoDBBackup) getPBMStorage(ctx context.Context, cluster *psmdbv1.PerconaServerMongoDB, cr *psmdbv1.PerconaServerMongoDBBackup) (storage.Storage, error) {
 	switch {
 	case cr.Status.Azure != nil:
@@ -379,18 +426,24 @@ func (r *ReconcilePerconaServerMongoDBBackup) checkFinalizers(ctx context.Contex
 
 	finalizers := []string{}
 
-	if cr.Status.State == psmdbv1.BackupStateReady {
-		for _, f := range cr.GetFinalizers() {
-			switch f {
-			case "delete-backup":
-				log.Info("delete-backup finalizer is deprecated and will be deleted in 1.20.0. Use percona.com/delete-backup instead")
-				fallthrough
-			case naming.FinalizerDeleteBackup:
+	for _, f := range cr.GetFinalizers() {
+		switch f {
+		case "delete-backup":
+			log.Info("delete-backup finalizer is deprecated and will be deleted in 1.20.0. Use percona.com/delete-backup instead")
+			fallthrough
+		case naming.FinalizerDeleteBackup:
+			if cr.Status.State == psmdbv1.BackupStateReady {
 				if err := r.deleteBackupFinalizer(ctx, cr, cluster, b); err != nil {
 					log.Error(err, "failed to run finalizer", "finalizer", f)
 					finalizers = append(finalizers, f)
 				}
 			}
+		case naming.FinalizerReleaseLock:
+			err = r.runReleaseLockFinalizer(ctx, cr)
+			if err != nil {
+				log.Error(err, "failed to release backup lock")
+				finalizers = append(finalizers, f)
+			}
 		}
 	}
@@ -400,6 +453,20 @@ func (r *ReconcilePerconaServerMongoDBBackup) checkFinalizers(ctx context.Contex
 	return err
 }
 
+func (r *ReconcilePerconaServerMongoDBBackup) runReleaseLockFinalizer(ctx context.Context, cr *psmdbv1.PerconaServerMongoDBBackup) error {
+	leaseName := naming.BackupLeaseName(cr.Spec.ClusterName)
+	holderId := naming.BackupHolderId(cr)
+	log := logf.FromContext(ctx).WithValues("lease", leaseName, "holder", holderId)
+
+	log.Info("releasing backup lock")
+	err := k8s.ReleaseLease(ctx, r.client, leaseName, cr.Namespace, holderId)
+	if k8serrors.IsNotFound(err) || errors.Is(err, k8s.ErrNotTheHolder) {
+		log.V(1).Info("failed to release backup lock", "error", err)
+		return nil
+	}
+	return errors.Wrap(err, "release backup lock")
+}
+
 func (r *ReconcilePerconaServerMongoDBBackup) deleteBackupFinalizer(ctx context.Context, cr *psmdbv1.PerconaServerMongoDBBackup, cluster *psmdbv1.PerconaServerMongoDB, b *Backup) error {
 	if len(cr.Status.PBMname) == 0 {
 		return nil
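The hunks above lean on two pkg/naming helpers (BackupLeaseName, BackupHolderId) and the naming.FinalizerReleaseLock constant, none of which are shown in this commit. Purely for orientation, hypothetical sketches consistent with the call sites; the real bodies in pkg/naming may differ:

    // Hypothetical: one lock per cluster, named after the cluster.
    func BackupLeaseName(clusterName string) string {
        return "psmdb-backup-" + clusterName // assumed naming scheme
    }

    // Hypothetical: a holder identity unique to a single backup object.
    func BackupHolderId(cr *psmdbv1.PerconaServerMongoDBBackup) string {
        return fmt.Sprintf("%s-%s", cr.Name, cr.UID) // assumed composition
    }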

pkg/k8s/lease.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package k8s
2+
3+
import (
4+
"context"
5+
"time"
6+
7+
"github.com/pkg/errors"
8+
coordv1 "k8s.io/api/coordination/v1"
9+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
10+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
11+
"k8s.io/apimachinery/pkg/types"
12+
"sigs.k8s.io/controller-runtime/pkg/client"
13+
)
14+
15+
var (
16+
ErrNotTheHolder = errors.New("not the holder")
17+
)
18+
19+
func AcquireLease(ctx context.Context, c client.Client, name, namespace, holder string) (*coordv1.Lease, error) {
20+
lease := new(coordv1.Lease)
21+
22+
if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, lease); err != nil {
23+
if !k8serrors.IsNotFound(err) {
24+
return lease, errors.Wrap(err, "get lease")
25+
}
26+
27+
lease := &coordv1.Lease{
28+
ObjectMeta: metav1.ObjectMeta{
29+
Name: name,
30+
Namespace: namespace,
31+
},
32+
Spec: coordv1.LeaseSpec{
33+
AcquireTime: &metav1.MicroTime{Time: time.Now()},
34+
HolderIdentity: &holder,
35+
},
36+
}
37+
38+
if err := c.Create(ctx, lease); err != nil {
39+
return lease, errors.Wrap(err, "create lease")
40+
}
41+
42+
return lease, nil
43+
}
44+
45+
return lease, nil
46+
}
47+
48+
func ReleaseLease(ctx context.Context, c client.Client, name, namespace, holder string) error {
49+
lease := new(coordv1.Lease)
50+
51+
if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, lease); err != nil {
52+
return errors.Wrap(err, "get lease")
53+
}
54+
55+
// HolderIdentity can be nil, we allow deleting the lease if that's the case
56+
if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity != holder {
57+
return ErrNotTheHolder
58+
}
59+
60+
if err := c.Delete(ctx, lease); err != nil {
61+
return errors.Wrap(err, "delete lease")
62+
}
63+
64+
return nil
65+
}
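Taken together, the two helpers form a minimal, non-renewing lock: AcquireLease returns an existing Lease untouched when someone else created it first, so callers must compare Spec.HolderIdentity themselves, and ReleaseLease deletes the Lease only while the caller is still the holder. A short usage sketch under those semantics (the runExclusive wrapper and its names are illustrative, not part of the commit):

    package example // illustrative only

    import (
        "context"

        "github.com/pkg/errors"
        "sigs.k8s.io/controller-runtime/pkg/client"
        logf "sigs.k8s.io/controller-runtime/pkg/log"

        "github.com/percona/percona-server-mongodb-operator/pkg/k8s"
    )

    // runExclusive runs fn only while holder owns the lease, then releases it.
    func runExclusive(ctx context.Context, c client.Client, name, ns, holder string, fn func() error) error {
        lease, err := k8s.AcquireLease(ctx, c, name, ns, holder)
        if err != nil {
            return errors.Wrap(err, "acquire lease")
        }
        // AcquireLease never overwrites an existing holder; check ownership explicitly.
        if lease.Spec.HolderIdentity != nil && *lease.Spec.HolderIdentity != holder {
            return errors.Errorf("lease held by %s", *lease.Spec.HolderIdentity)
        }
        defer func() {
            // Best-effort release, mirroring runReleaseLockFinalizer above.
            if err := k8s.ReleaseLease(ctx, c, name, ns, holder); err != nil {
                logf.FromContext(ctx).Error(err, "failed to release lease")
            }
        }()
        return fn()
    }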
