Skip to content

Commit 04dc2f3

Browse files
pooknull and hors authored
K8SPSMDB-993: increase timeout for waiting pbm resync (#1512)
* K8SPSMDB-993: increase timeout for waiting pbm resync

  https://perconadev.atlassian.net/browse/K8SPSMDB-993

* fix tests

---------

Co-authored-by: Viacheslav Sarzhan <slava.sarzhan@percona.com>
1 parent d5e9919 commit 04dc2f3

File tree

4 files changed: +45 -44 lines changed
  • e2e-tests
    • demand-backup-physical-sharded
    • demand-backup-physical
    • pitr-physical
  • pkg/controller/perconaservermongodbrestore

4 files changed: +45 -44 lines changed

e2e-tests/demand-backup-physical-sharded/run

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ run_recovery_check() {
3535
compare_kubectl "statefulset/${cluster}-rs0" ${compare_suffix}
3636

3737
# we don't wait for cluster readiness here because the annotation gets removed then
38-
wait_restore "${backup_name}" "${cluster}" "ready" "0" "900"
38+
wait_restore "${backup_name}" "${cluster}" "ready" "0" "1600"
3939
kubectl_bin get psmdb ${cluster} -o yaml
4040
if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then
4141
echo "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a physical restore"

e2e-tests/demand-backup-physical/run

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ run_recovery_check() {
3535
compare_kubectl "statefulset/${cluster}-rs0" ${compare_suffix}
3636

3737
# we don't wait for cluster readiness here because the annotation gets removed then
38-
wait_restore "${backup_name}" "${cluster}" "ready" "0" "900"
38+
wait_restore "${backup_name}" "${cluster}" "ready" "0" "1600"
3939
kubectl_bin get psmdb ${cluster} -o yaml
4040
if [ $(kubectl_bin get psmdb ${cluster} -o yaml | yq '.metadata.annotations."percona.com/resync-pbm"') == null ]; then
4141
echo "psmdb/${cluster} should be annotated with percona.com/resync-pbm after a physical restore"

e2e-tests/pitr-physical/run

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,7 +106,7 @@ check_recovery() {
106106
# fail faster if we don't reach requested status until some time
107107
wait_restore "$backup_name" "$cluster_name" "requested" "0" "900"
108108
echo
109-
wait_restore "$backup_name" "$cluster_name" "ready" "0" "900"
109+
wait_restore "$backup_name" "$cluster_name" "ready" "0" "1600"
110110
echo
111111
set -o xtrace
112112

pkg/controller/perconaservermongodbrestore/physical.go

Lines changed: 42 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -162,54 +162,55 @@ func (r *ReconcilePerconaServerMongoDBRestore) reconcilePhysicalRestore(ctx cont
162162
return status, errors.Wrapf(err, "resync config stderr: %s stdout: %s", stderrBuf.String(), stdoutBuf.String())
163163
}
164164

165-
ticker := time.NewTicker(5 * time.Second)
166-
defer ticker.Stop()
167-
168-
timeout := time.NewTimer(900 * time.Second)
169-
defer timeout.Stop()
170-
171-
outer:
172-
for {
173-
select {
174-
case <-timeout.C:
175-
return status, errors.Errorf("timeout while waiting PBM operation to finish")
176-
case <-ticker.C:
177-
err := retry.OnError(retry.DefaultBackoff, func(err error) bool { return strings.Contains(err.Error(), "No agent available") }, func() error {
178-
stdoutBuf.Reset()
179-
stderrBuf.Reset()
180-
181-
command := []string{"/opt/percona/pbm", "status", "--out", "json"}
182-
err := r.clientcmd.Exec(ctx, &pod, "mongod", command, nil, stdoutBuf, stderrBuf, false)
183-
if err != nil {
184-
log.Error(err, "failed to get PBM status")
185-
return err
186-
}
187-
188-
log.V(1).Info("PBM status", "status", stdoutBuf.String())
189-
190-
return nil
191-
})
165+
time.Sleep(5 * time.Second) // wait until pbm will start resync
166+
167+
waitErr := errors.New("waiting for PBM operation to finish")
168+
err = retry.OnError(wait.Backoff{
169+
Duration: 5 * time.Second,
170+
Factor: 2.0,
171+
Cap: time.Hour,
172+
Steps: 12,
173+
}, func(err error) bool { return err == waitErr }, func() error {
174+
err := retry.OnError(retry.DefaultBackoff, func(err error) bool { return strings.Contains(err.Error(), "No agent available") }, func() error {
175+
stdoutBuf.Reset()
176+
stderrBuf.Reset()
177+
178+
command := []string{"/opt/percona/pbm", "status", "--out", "json"}
179+
err := r.clientcmd.Exec(ctx, &pod, "mongod", command, nil, stdoutBuf, stderrBuf, false)
192180
if err != nil {
193-
return status, errors.Wrapf(err, "get PBM status stderr: %s stdout: %s", stderrBuf.String(), stdoutBuf.String())
181+
log.Error(err, "failed to get PBM status")
182+
return err
194183
}
195184

196-
var pbmStatus struct {
197-
Running struct {
198-
Type string `json:"type,omitempty"`
199-
OpId string `json:"opID,omitempty"`
200-
} `json:"running"`
201-
}
185+
log.V(1).Info("PBM status", "status", stdoutBuf.String())
202186

203-
if err := json.Unmarshal(stdoutBuf.Bytes(), &pbmStatus); err != nil {
204-
return status, errors.Wrap(err, "unmarshal PBM status output")
205-
}
187+
return nil
188+
})
189+
if err != nil {
190+
return errors.Wrapf(err, "get PBM status stderr: %s stdout: %s", stderrBuf.String(), stdoutBuf.String())
191+
}
206192

207-
if len(pbmStatus.Running.OpId) == 0 {
208-
break outer
209-
}
193+
var pbmStatus struct {
194+
Running struct {
195+
Type string `json:"type,omitempty"`
196+
OpId string `json:"opID,omitempty"`
197+
} `json:"running"`
198+
}
210199

211-
log.Info("Waiting for another PBM operation to finish", "type", pbmStatus.Running.Type, "opID", pbmStatus.Running.OpId)
200+
if err := json.Unmarshal(stdoutBuf.Bytes(), &pbmStatus); err != nil {
201+
return errors.Wrap(err, "unmarshal PBM status output")
212202
}
203+
204+
if len(pbmStatus.Running.OpId) == 0 {
205+
return nil
206+
}
207+
208+
log.Info("Waiting for another PBM operation to finish", "type", pbmStatus.Running.Type, "opID", pbmStatus.Running.OpId)
209+
210+
return waitErr
211+
})
212+
if err != nil {
213+
return status, err
213214
}
214215

215216
var restoreCommand []string

0 commit comments

Comments
 (0)