Skip to content

Commit e6f626b

Browse files
egegunes and hors authored
K8SPSMDB-733: Improve failure detection for PVC resize (#1543)
* K8SPSMDB-733: Improve failure detection for PVC resize The operator was reporting that a PVC resize completed successfully even though it had failed. This commit fixes this behavior and introduces some improvements. The operator tries to determine whether a resize operation is in progress by checking the status.conditions of the PVCs. If a condition says the PVC is resizing, the operation is in progress; if there is no condition, the operation is considered to have completed successfully. This approach is naive, since the resize could have failed. With these changes the operator will check the events related to the PVC and look for a `VolumeResizeFailed` event. If one exists, the resize operation will be declared as failed. If a resize operation fails, the operator will automatically revert the PVC size in cr.yaml. This way the probability of a discrepancy between `CR <-> STS <-> PVC` will be lower. If the resize operation fails only for some PVCs, the operator will still revert the PVC size in cr.yaml to the size of the failed PVCs. * remove unnecessary logs --------- Co-authored-by: Viacheslav Sarzhan <slava.sarzhan@percona.com>
1 parent d6e4d8c commit e6f626b

File tree

10 files changed

+298
-32
lines changed

10 files changed

+298
-32
lines changed

deploy/bundle.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19059,10 +19059,13 @@ rules:
1905919059
- patch
1906019060
- delete
1906119061
- apiGroups:
19062-
- ""
19062+
- events.k8s.io
1906319063
resources:
1906419064
- events
1906519065
verbs:
19066+
- get
19067+
- list
19068+
- watch
1906619069
- create
1906719070
- patch
1906819071
- apiGroups:

deploy/cw-bundle.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19079,10 +19079,13 @@ rules:
1907919079
- patch
1908019080
- delete
1908119081
- apiGroups:
19082-
- ""
19082+
- events.k8s.io
1908319083
resources:
1908419084
- events
1908519085
verbs:
19086+
- get
19087+
- list
19088+
- watch
1908619089
- create
1908719090
- patch
1908819091
- apiGroups:

deploy/cw-rbac.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,13 @@ rules:
111111
- patch
112112
- delete
113113
- apiGroups:
114-
- ""
114+
- events.k8s.io
115115
resources:
116116
- events
117117
verbs:
118+
- get
119+
- list
120+
- watch
118121
- create
119122
- patch
120123
- apiGroups:

deploy/rbac.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,10 +91,13 @@ rules:
9191
- patch
9292
- delete
9393
- apiGroups:
94-
- ""
94+
- events.k8s.io
9595
resources:
9696
- events
9797
verbs:
98+
- get
99+
- list
100+
- watch
98101
- create
99102
- patch
100103
- apiGroups:

e2e-tests/conf/client.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ spec:
1212
labels:
1313
name: psmdb-client
1414
spec:
15+
terminationGracePeriodSeconds: 10
1516
containers:
1617
- name: psmdb-client
1718
image: percona/percona-server-mongodb:4.4
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
apiVersion: v1
2+
kind: ResourceQuota
3+
metadata:
4+
name: default-storage-quota
5+
spec:
6+
hard:
7+
STORAGECLASS.storageclass.storage.k8s.io/requests.storage: QUOTA

e2e-tests/pvc-resize/run

Lines changed: 128 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,68 @@ set -o errexit
55
test_dir=$(realpath $(dirname $0))
66
. ${test_dir}/../functions
77

8+
# Replace the storage request in the persistentVolumeClaim spec of the first
# replset (index 0) of a psmdb custom resource, using a JSON patch.
# Arguments:
#   $1 - name of the psmdb resource to patch
#   $2 - storage size to write into the patch (callers pass values like "2Gi")
# Outputs:
#   a progress line on stdout, followed by whatever kubectl_bin prints
function patch_pvc_request() {
	local cluster=$1
	local size=$2
	local path="/spec/replsets/0/volumeSpec/persistentVolumeClaim/resources/requests/storage"

	echo "Patching PVC request to ${size} in ${cluster}"

	# The resource name is quoted so it always reaches kubectl as exactly one
	# argument, even if it is empty or contains shell metacharacters.
	kubectl_bin patch psmdb "${cluster}" --type=json \
		-p='[{"op": "replace", "path": "'"${path}"'", "value":"'"${size}"'"}]'
}
16+
17+
# Print the name of every storage class whose
# storageclass.kubernetes.io/is-default-class annotation equals "true".
# Outputs:
#   the matching metadata.name value(s), as printed by kubectl_bin
function get_default_storageclass() {
	# Single-quoted so the backslash-escaped dots of the annotation key reach
	# the jsonpath parser unchanged.
	local default_filter='@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true"'

	kubectl_bin get sc -o jsonpath="{.items[?(${default_filter})].metadata.name}"
}
20+
21+
# Gate for the test: continue only when the default storage class reports
# allowVolumeExpansion == "true".
#
# Behaviour:
#   - expansion allowed     -> returns normally, the script continues
#   - expansion not allowed -> prints a notice and ends the script with status 0
#   - a lookup command fails -> ends the script with status 1
#
# The last case is why declaration and assignment are split: `local v=$(cmd)`
# reports the status of `local`, so a failed lookup used to produce an empty
# value and take the "not allowed" branch, ending the script with status 0.
function ensure_default_sc_allows_expansion() {
	local default_sc
	local allow_expansion

	default_sc=$(get_default_storageclass) || exit 1

	echo "Checking if default storageclass ${default_sc} allows volume expansion"

	allow_expansion=$(kubectl_bin get sc -o jsonpath='{.items[?(@.metadata.name=="'"${default_sc}"'")].allowVolumeExpansion}') || exit 1

	if [[ ${allow_expansion} != "true" ]]; then
		echo "Default storageclass ${default_sc} does not allow volume expansion"
		exit 0
	fi
}
33+
34+
# Render ${test_dir}/conf/resourcequota.yml for the default storage class and
# apply it through kubectl_bin.
# Arguments:
#   $1 - value substituted for the QUOTA placeholder (callers pass e.g. "7Gi")
# Globals:
#   test_dir (read) - directory that contains conf/resourcequota.yml
# Outputs:
#   a progress line on stdout, followed by whatever kubectl_bin prints
# Exits with status 1 if the default storage class cannot be looked up.
function apply_resourcequota() {
	local quota=$1
	local default_sc

	# Assigned separately from `local` so a failed lookup is not masked.
	default_sc=$(get_default_storageclass) || exit 1

	echo "Applying resourcequota for default storageclass ${default_sc} with quota ${quota}"

	# Both placeholders are filled in by one sed process reading the template
	# directly; the expressions run in order, first STORAGECLASS, then QUOTA.
	sed -e "s/STORAGECLASS/${default_sc}/" -e "s/QUOTA/${quota}/" \
		"${test_dir}/conf/resourcequota.yml" \
		| kubectl_bin apply -f -
}
45+
46+
# Poll a psmdb resource until its .status.state equals the expected value.
# Arguments:
#   $1 - name of the psmdb resource
#   $2 - state to wait for (compared literally)
# Outputs:
#   a progress line with one dot per poll, then a confirmation line
# Exits the script with status 1 once 60 polls (5 seconds apart) have passed
# without the state matching.
function wait_cluster_status() {
	local cluster=$1
	local expected=$2
	local max_retries=60
	# Local and zeroed on every call: the counter must not start from a value
	# left in a global ${retry} by earlier loops or by a previous call, which
	# would shorten the retry budget.
	local retry=0

	echo -n "Waiting for psmdb/${cluster} status to be ${expected}"
	# The right-hand side is quoted so it is compared as a string, not a glob.
	until [[ $(kubectl_bin get psmdb "${cluster}" -o jsonpath='{.status.state}') == "${expected}" ]]; do
		if [[ ${retry} -ge ${max_retries} ]]; then
			echo
			echo "psmdb/${cluster} did not reach ${expected} status, max retries exceeded"
			exit 1
		fi
		echo -n "."
		sleep 5

		retry=$((retry + 1))
	done

	echo
	echo "psmdb/${cluster} status is ${expected}"
}
66+
867
set_debug
968

10-
if [[ $EKS == 1 ]]; then
11-
echo "Skip the test. We don't run it for EKS."
12-
exit 0
13-
fi
69+
ensure_default_sc_allows_expansion
1470

1571
create_infra ${namespace}
1672

@@ -23,24 +79,87 @@ desc 'create PSMDB cluster'
2379
cluster="some-name"
2480
spinup_psmdb "${cluster}-rs0" "$conf_dir/$cluster-rs0.yml"
2581

26-
kubectl_bin patch psmdb ${cluster} --type=json -p='[{"op": "replace", "path": "/spec/replsets/0/volumeSpec/persistentVolumeClaim/resources/requests/storage", "value":"5Gi"}]'
82+
patch_pvc_request "${cluster}" "2Gi"
2783
wait_cluster_consistency "$cluster" 3 2
84+
echo
2885

2986
for pvc in $(kubectl_bin get pvc -l app.kubernetes.io/component=mongod -o name); do
3087
retry=0
31-
until [[ $(kubectl_bin get ${pvc} -o jsonpath={.status.capacity.storage}) == "5Gi" ]]; do
88+
echo -n "Waiting for pvc/${pvc} to be resized"
89+
until [[ $(kubectl_bin get ${pvc} -o jsonpath={.status.capacity.storage}) == "2Gi" ]]; do
3290
if [[ $retry -ge 60 ]]; then
33-
echo "PVC ${pvc} was not resized, max retries exceeded"
91+
echo
92+
echo "pvc/${pvc} was not resized, max retries exceeded"
3493
exit 1
3594
fi
95+
echo -n "."
96+
sleep 5
3697

37-
echo "Waiting for PVC ${pvc} to be resized"
98+
retry=$((retry + 1))
99+
done
100+
echo
101+
echo "pvc/${pvc} was resized"
102+
done
103+
104+
desc 'create resourcequota'
105+
106+
# We're setting the quota to 7Gi, so we can only resize the first PVC to 3Gi
107+
# the others should fail to resize due to the exceeded quota but operator should
108+
# handle the error and keep the cluster ready
109+
110+
apply_resourcequota 7Gi
111+
patch_pvc_request "${cluster}" "3Gi"
112+
wait_cluster_consistency "$cluster" 3 2
113+
echo
114+
115+
# Wait until the capacity of pvc/mongod-data-some-name-rs0-0 reports 3Gi,
# polling every 5 seconds and giving up with status 1 after 60 retries.
# The counter is reset here because the preceding per-PVC loop leaves its
# final value in ${retry}, which would otherwise shorten this loop's budget.
retry=0
echo -n "Waiting for pvc/mongod-data-some-name-rs0-0 to be resized"
until [[ $(kubectl_bin get pvc mongod-data-some-name-rs0-0 -o jsonpath='{.status.capacity.storage}') == "3Gi" ]]; do
	if [[ ${retry} -ge 60 ]]; then
		echo
		echo "pvc/mongod-data-some-name-rs0-0 was not resized, max retries exceeded"
		exit 1
	fi
	echo -n "."
	sleep 5

	retry=$((retry + 1))
done
echo
echo "pvc/mongod-data-some-name-rs0-0 was resized"
129+
130+
# We're setting the quota to 9Gi, so we can resize all PVCs to 3Gi
131+
132+
apply_resourcequota 9Gi
133+
patch_pvc_request "${cluster}" "3Gi"
134+
wait_cluster_consistency "$cluster" 3 2
135+
echo
136+
for pvc in $(kubectl_bin get pvc -l app.kubernetes.io/component=mongod -o name); do
137+
retry=0
138+
echo -n "Waiting for pvc/${pvc} to be resized"
139+
until [[ $(kubectl_bin get ${pvc} -o jsonpath={.status.capacity.storage}) == "3Gi" ]]; do
140+
if [[ $retry -ge 60 ]]; then
141+
echo
142+
echo "pvc/${pvc} was not resized, max retries exceeded"
143+
exit 1
144+
fi
145+
echo -n "."
38146
sleep 5
39147

40148
retry=$((retry + 1))
41149
done
42-
echo "PVC ${pvc} was resized"
150+
echo
151+
echo "pvc/${pvc} was resized"
43152
done
44153

154+
desc "test downscale"
155+
156+
# operator shouldn't try to downscale the PVCs and set status to error
157+
patch_pvc_request "${cluster}" "1Gi"
158+
wait_cluster_status ${cluster} "error"
159+
160+
# user should be able to restore to the previous size and make the cluster ready
161+
patch_pvc_request "${cluster}" "3Gi"
162+
wait_cluster_status ${cluster} "ready"
163+
45164
destroy "${namespace}"
46165
desc "test passed"

pkg/apis/psmdb/v1/psmdb_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1224,4 +1224,4 @@ const (
12241224
func (cr *PerconaServerMongoDB) PVCResizeInProgress() bool {
12251225
_, ok := cr.Annotations[AnnotationPVCResizeInProgress]
12261226
return ok
1227-
}
1227+
}

pkg/controller/perconaservermongodb/statefulset.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ func (r *ReconcilePerconaServerMongoDB) reconcileStatefulSet(ctx context.Context
5050
return nil, errors.Wrapf(err, "reconcile PVCs for %s", sfs.Name)
5151
}
5252

53+
if _, ok := sfs.Annotations[api.AnnotationPVCResizeInProgress]; ok {
54+
log.V(1).Info("PVC resize in progress, skipping reconciliation of statefulset", "name", sfs.Name)
55+
return sfs, nil
56+
}
57+
5358
err = r.createOrUpdate(ctx, sfs)
5459
if err != nil {
5560
return nil, errors.Wrapf(err, "update StatefulSet %s", sfs.Name)

0 commit comments

Comments
 (0)