Skip to content

Commit b6613ec

Browse files
authored
Add support for HA restores (#672)
* Add support for HA restores
1 parent 9b75d0e commit b6613ec

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

47 files changed

+1581
-272
lines changed

.github/actions/e2e/action.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ inputs:
44
test-name:
55
description: 'individual test to run'
66
required: true
7+
is-large-runner:
8+
description: 'Whether the test is running on a large runner'
9+
required: true
710
airgap-license-id:
811
description: 'airgap-enabled license id to use for e2e tests'
912
required: true
@@ -61,6 +64,7 @@ runs:
6164
external_ids:ovn-encap-type=geneve \
6265
external_ids:ovn-encap-ip=127.0.0.1
6366
- name: Free up runner disk space
67+
if: ${{ inputs.is-large-runner == 'false' }}
6468
shell: bash
6569
run: |
6670
df -h

.github/workflows/pull-request.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,15 @@ jobs:
194194
- TestMultiNodeHAInstallation
195195
- TestMultiNodeAirgapHAInstallation
196196
- TestMultiNodeAirgapUpgradeSameK0s
197+
- TestMultiNodeHADisasterRecovery
198+
- TestMultiNodeAirgapHADisasterRecovery
197199
include:
198200
- test: TestMultiNodeAirgapUpgrade
199201
runner: embedded-cluster
200202
- test: TestMultiNodeAirgapHAInstallation
201203
runner: embedded-cluster
204+
- test: TestMultiNodeAirgapHADisasterRecovery
205+
runner: embedded-cluster
202206
steps:
203207
- name: Checkout
204208
uses: actions/checkout@v4
@@ -211,6 +215,7 @@ jobs:
211215
- uses: ./.github/actions/e2e
212216
with:
213217
test-name: '${{ matrix.test }}'
218+
is-large-runner: ${{ matrix.runner == 'embedded-cluster' }}
214219
airgap-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_AIRGAP_LICENSE_ID }}
215220
snapshot-license-id: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE_ID }}
216221
snapshot-license: ${{ secrets.STAGING_EMBEDDED_CLUSTER_SNAPSHOT_LICENSE }}

.github/workflows/release-dev.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,15 @@ jobs:
147147
- TestMultiNodeHAInstallation
148148
- TestMultiNodeAirgapHAInstallation
149149
- TestMultiNodeAirgapUpgradeSameK0s
150+
- TestMultiNodeHADisasterRecovery
151+
- TestMultiNodeAirgapHADisasterRecovery
150152
include:
151153
- test: TestMultiNodeAirgapUpgrade
152154
runner: embedded-cluster
153155
- test: TestMultiNodeAirgapHAInstallation
154156
runner: embedded-cluster
157+
- test: TestMultiNodeAirgapHADisasterRecovery
158+
runner: embedded-cluster
155159
steps:
156160
- name: Checkout
157161
uses: actions/checkout@v4

Makefile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@ ARCH := $(shell uname -m)
44
APP_NAME = embedded-cluster
55
ADMIN_CONSOLE_CHART_URL = oci://registry.replicated.com/library
66
ADMIN_CONSOLE_CHART_NAME = admin-console
7-
ADMIN_CONSOLE_CHART_VERSION = 1.109.9-build.1
7+
ADMIN_CONSOLE_CHART_VERSION = 1.109.12
88
ADMIN_CONSOLE_IMAGE_OVERRIDE =
99
ADMIN_CONSOLE_MIGRATIONS_IMAGE_OVERRIDE =
1010
EMBEDDED_OPERATOR_CHART_URL = oci://registry.replicated.com/library
1111
EMBEDDED_OPERATOR_CHART_NAME = embedded-cluster-operator
12-
EMBEDDED_OPERATOR_CHART_VERSION = 0.34.6
12+
EMBEDDED_OPERATOR_CHART_VERSION = 0.34.9
1313
EMBEDDED_OPERATOR_UTILS_IMAGE = busybox:1.36.1
1414
EMBEDDED_CLUSTER_OPERATOR_IMAGE_OVERRIDE =
1515
OPENEBS_CHART_URL = https://openebs.github.io/openebs
@@ -18,7 +18,7 @@ OPENEBS_CHART_VERSION = 4.0.1
1818
OPENEBS_UTILS_VERSION = 4.0.0
1919
SEAWEEDFS_CHART_URL = https://seaweedfs.github.io/seaweedfs/helm
2020
SEAWEEDFS_CHART_NAME = seaweedfs/seaweedfs
21-
SEAWEEDFS_CHART_VERSION = 3.67.0
21+
SEAWEEDFS_CHART_VERSION = 3.68.0
2222
REGISTRY_CHART_URL = https://helm.twun.io
2323
REGISTRY_CHART_NAME = twuni/docker-registry
2424
REGISTRY_CHART_VERSION = 2.2.3
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
version: v1
2+
resourceModifierRules:
3+
# convert kotsadm components (rqlite) to non-HA mode
4+
# as kotsadm will always be restored to a single node
5+
# because it is used during the restore process to add nodes
6+
- conditions:
7+
groupResource: statefulsets.apps
8+
resourceNameRegex: "^kotsadm-rqlite$"
9+
namespaces:
10+
- kotsadm
11+
patches:
12+
- operation: replace
13+
path: "/spec/replicas"
14+
value: 1
15+
- operation: replace
16+
path: "/spec/template/spec/containers/0/args/2"
17+
value: "-bootstrap-expect=1"
18+
# decouple kotsadm components PVCs from nodes
19+
# this allows the PVCs to be created on the correct nodes
20+
# when restoring HA kotsadm to a single node and then converting it to HA again
21+
- conditions:
22+
groupResource: persistentvolumeclaims
23+
resourceNameRegex: "kotsadm-rqlite"
24+
namespaces:
25+
- kotsadm
26+
mergePatches:
27+
- patchData: |
28+
{
29+
"metadata": {
30+
"annotations": {
31+
"volume.kubernetes.io/selected-node": null
32+
}
33+
}
34+
}
35+
# preserve the registry service IP from the original cluster
36+
- conditions:
37+
groupResource: services
38+
resourceNameRegex: "^registry$"
39+
namespaces:
40+
- registry
41+
patches:
42+
- operation: replace
43+
path: "/spec/clusterIP"
44+
value: "__REGISTRY_SERVICE_IP__"
45+
# preserve the seaweedfs s3 service IP from the original cluster
46+
- conditions:
47+
groupResource: services
48+
resourceNameRegex: "^ec-seaweedfs-s3$"
49+
namespaces:
50+
- seaweedfs
51+
patches:
52+
- operation: replace
53+
path: "/spec/clusterIP"
54+
value: "__SEAWEEDFS_S3_SERVICE_IP__"

cmd/embedded-cluster/join.go

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,9 @@ import (
1818
"github.com/urfave/cli/v2"
1919
"gopkg.in/yaml.v2"
2020
corev1 "k8s.io/api/core/v1"
21+
"k8s.io/apimachinery/pkg/api/errors"
2122
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
22-
"k8s.io/apimachinery/pkg/labels"
23+
"k8s.io/apimachinery/pkg/types"
2324
"sigs.k8s.io/controller-runtime/pkg/client"
2425
k8syaml "sigs.k8s.io/yaml"
2526

@@ -212,9 +213,8 @@ var joinCommand = &cli.Command{
212213
}
213214

214215
logrus.Debugf("creating systemd unit files")
215-
// both controller and worker nodes will have 'worker' in the join command, but only controllers will have 'enable-worker'
216-
// https://github.com/replicatedhq/kots/blob/6a0602f4054d5d5f2d97e649b3303a059f0064d9/pkg/embeddedcluster/node_join.go#L183
217-
if err := createSystemdUnitFiles(!strings.Contains(jcmd.K0sJoinCommand, "enable-worker")); err != nil {
216+
// both controller and worker nodes will have 'worker' in the join command, so controllers are identified by the presence of 'controller' instead
217+
if err := createSystemdUnitFiles(!strings.Contains(jcmd.K0sJoinCommand, "controller")); err != nil {
218218
err := fmt.Errorf("unable to create systemd unit files: %w", err)
219219
metrics.ReportJoinFailed(c.Context, jcmd.MetricsBaseURL, jcmd.ClusterID, err)
220220
return err
@@ -461,17 +461,16 @@ func canEnableHA(ctx context.Context, kcli client.Client) (bool, error) {
461461
if installation.Spec.HighAvailability {
462462
return false, nil
463463
}
464-
var nodes corev1.NodeList
465-
labelSelector := labels.Set(map[string]string{
466-
"node-role.kubernetes.io/control-plane": "true",
467-
}).AsSelector()
468-
if err := kcli.List(ctx, &nodes, &client.ListOptions{LabelSelector: labelSelector}); err != nil {
469-
return false, fmt.Errorf("unable to list nodes: %w", err)
464+
if err := kcli.Get(ctx, types.NamespacedName{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"}, &corev1.ConfigMap{}); err == nil {
465+
return false, nil // cannot enable HA during a restore
466+
} else if !errors.IsNotFound(err) {
467+
return false, fmt.Errorf("unable to get restore state configmap: %w", err)
470468
}
471-
if len(nodes.Items) < 3 {
472-
return false, nil
469+
ncps, err := kubeutils.NumOfControlPlaneNodes(ctx, kcli)
470+
if err != nil {
471+
return false, fmt.Errorf("unable to check control plane nodes: %w", err)
473472
}
474-
return true, nil
473+
return ncps >= 3, nil
475474
}
476475

477476
// enableHA enables high availability in the installation object

cmd/embedded-cluster/join_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,24 @@ func Test_canEnableHA(t *testing.T) {
190190
},
191191
want: false,
192192
},
193+
{
194+
name: "high availability is not enabled and there is three or more controller nodes but a restore is in progress",
195+
args: args{
196+
kcli: fake.NewClientBuilder().WithScheme(scheme).WithObjects(
197+
&embeddedclusterv1beta1.Installation{
198+
ObjectMeta: metav1.ObjectMeta{Name: "test-installation"},
199+
Spec: embeddedclusterv1beta1.InstallationSpec{HighAvailability: false},
200+
},
201+
&corev1.ConfigMap{
202+
ObjectMeta: metav1.ObjectMeta{Name: ecRestoreStateCMName, Namespace: "embedded-cluster"},
203+
},
204+
&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node1", Labels: controllerLabels}},
205+
&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node2", Labels: controllerLabels}},
206+
&corev1.Node{ObjectMeta: metav1.ObjectMeta{Name: "node3", Labels: controllerLabels}},
207+
).Build(),
208+
},
209+
want: false,
210+
},
193211
}
194212
for _, tt := range tests {
195213
t.Run(tt.name, func(t *testing.T) {

0 commit comments

Comments
 (0)