Skip to content

Commit c5b4148

Browse files
feat(ha): print warning on control plane node reset (#677)
warn about removing the antepenultimate (third to last) controller node if ha is enabled.
1 parent 0fb943d commit c5b4148

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

cmd/embedded-cluster/uninstall.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@ import (
1111
autopilot "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2"
1212
"github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1"
1313
"github.com/k0sproject/k0s/pkg/etcd"
14+
embeddedclusterv1beta1 "github.com/replicatedhq/embedded-cluster-kinds/apis/v1beta1"
1415
"github.com/sirupsen/logrus"
1516
"github.com/urfave/cli/v2"
1617
corev1 "k8s.io/api/core/v1"
18+
"k8s.io/apimachinery/pkg/labels"
1719
"sigs.k8s.io/controller-runtime/pkg/client"
1820

1921
"github.com/replicatedhq/embedded-cluster/pkg/defaults"
@@ -55,6 +57,8 @@ var (
5557
k0s = "/usr/local/bin/k0s"
5658
)
5759

60+
// haWarningMessage is the text logged at warning level by maybePrintHAWarning
// when a reset is run against a high-availability cluster that has exactly
// three control plane nodes.
var haWarningMessage = "WARNING: High-availability clusters must maintain at least three controller nodes, but resetting this node will leave only two. This can lead to a loss of functionality and non-recoverable failures. You should re-add a third node as soon as possible."
61+
5862
// deleteNode removes the node from the cluster
5963
func (h *hostInfo) deleteNode(ctx context.Context) error {
6064
if h.KclientError != nil {
@@ -287,6 +291,43 @@ func checkErrPrompt(c *cli.Context, err error) bool {
287291
return prompts.New().Confirm("Do you want to continue anyway?", false)
288292
}
289293

294+
// maybePrintHAWarning prints a warning message when the user is running a reset a node
295+
// in a high availability cluster and there are only 3 control nodes.
296+
func maybePrintHAWarning(c *cli.Context) error {
297+
kubeconfig := defaults.PathToKubeConfig()
298+
if _, err := os.Stat(kubeconfig); err != nil {
299+
return nil
300+
}
301+
302+
os.Setenv("KUBECONFIG", kubeconfig)
303+
kubecli, err := kubeutils.KubeClient()
304+
if err != nil {
305+
return fmt.Errorf("unable to create kube client: %w", err)
306+
}
307+
embeddedclusterv1beta1.AddToScheme(kubecli.Scheme())
308+
309+
if in, err := kubeutils.GetLatestInstallation(c.Context, kubecli); err != nil {
310+
return fmt.Errorf("unable to get installation: %w", err)
311+
} else if !in.Spec.HighAvailability {
312+
return nil
313+
}
314+
315+
opts := &client.ListOptions{
316+
LabelSelector: labels.SelectorFromSet(
317+
labels.Set{"node-role.kubernetes.io/control-plane": "true"},
318+
),
319+
}
320+
var nodes corev1.NodeList
321+
if err := kubecli.List(c.Context, &nodes, opts); err != nil {
322+
return fmt.Errorf("unable to list nodes: %w", err)
323+
}
324+
if len(nodes.Items) == 3 {
325+
logrus.Warn(haWarningMessage)
326+
logrus.Info("")
327+
}
328+
return nil
329+
}
330+
290331
var resetCommand = &cli.Command{
291332
Name: "reset",
292333
Before: func(c *cli.Context) error {
@@ -315,6 +356,10 @@ var resetCommand = &cli.Command{
315356
},
316357
Usage: fmt.Sprintf("Remove %s from the current node", binName),
317358
Action: func(c *cli.Context) error {
359+
if err := maybePrintHAWarning(c); err != nil && !c.Bool("force") {
360+
return err
361+
}
362+
318363
logrus.Info("This will remove this node from the cluster and completely reset it, removing all data stored on the node.")
319364
logrus.Info("Do not reset another node until this is complete.")
320365
if !c.Bool("force") && !c.Bool("no-prompt") && !prompts.New().Confirm("Do you want to continue?", false) {

e2e/install_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,6 +1051,17 @@ func TestMultiNodeHAInstallation(t *testing.T) {
10511051
t.Fatalf("fail to check post ha state: %v", err)
10521052
}
10531053

1054+
bin := strings.Split(command, " ")[0]
1055+
t.Logf("%s: resetting controller node", time.Now().Format(time.RFC3339))
1056+
stdout, stderr, err = RunCommandOnNode(t, tc, 2, []string{bin, "reset", "--no-prompt"})
1057+
if err != nil {
1058+
t.Fatalf("fail to remove controller node %s:", err)
1059+
}
1060+
if !strings.Contains(stderr, "High-availability clusters must maintain at least three controller nodes") {
1061+
t.Errorf("reset output does not contain the ha warning")
1062+
t.Logf("stdout: %s\nstderr: %s", stdout, stderr)
1063+
}
1064+
10541065
t.Logf("%s: test complete", time.Now().Format(time.RFC3339))
10551066
}
10561067

0 commit comments

Comments
 (0)