feat: Add preflight checks framework (#1129)

dlipovetsky · web-flow · commit 178ede7333a5 · 2025-06-17T08:20:28.000-07:00
**What problem does this PR solve?**:
Adds a framework for preflight checks. A preflight check is a type of
validation that typically requires access to an infrastructure API.

A validating webhook on the Cluster resource executes all preflight
checks and returns failures and warnings to the client.

**Which issue(s) this PR fixes**:
Fixes #

**How Has This Been Tested?**:
<!--
Please describe the tests that you ran to verify your changes.
Provide output from the tests and any manual steps needed to replicate
the tests.
-->

**Special notes for your reviewer**:
<!--
Use this to provide any additional information to the reviewers.
This may include:
- Best way to review the PR.
- Where the author wants the most review attention.
- etc.
-->
diff --git a/charts/cluster-api-runtime-extensions-nutanix/templates/webhooks.yaml b/charts/cluster-api-runtime-extensions-nutanix/templates/webhooks.yaml
@@ -56,3 +56,23 @@ webhooks:
         resources:
           - clusters
     sideEffects: None
+  - admissionReviewVersions:
+      - v1
+    clientConfig:
+      service:
+        name: '{{ include "chart.name" . }}-admission'
+        namespace: '{{ .Release.Namespace }}'
+        path: /preflight-v1beta1-cluster
+    failurePolicy: Fail
+    name: preflight.cluster.caren.nutanix.com
+    rules:
+      - apiGroups:
+          - cluster.x-k8s.io
+        apiVersions:
+          - '*'
+        operations:
+          - CREATE
+        resources:
+          - clusters
+    sideEffects: None
+    timeoutSeconds: 30
diff --git a/cmd/main.go b/cmd/main.go
@@ -41,6 +41,7 @@ import (
 	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/handlers/nutanix"
 	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/handlers/options"
 	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/cluster"
+	"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/preflight"
 )
 
 func main() {
@@ -219,6 +220,13 @@ func main() {
 		Handler: cluster.NewValidator(mgr.GetClient(), admission.NewDecoder(mgr.GetScheme())),
 	})
 
+	mgr.GetWebhookServer().Register("/preflight-v1beta1-cluster", &webhook.Admission{
+		Handler: preflight.New(mgr.GetClient(), admission.NewDecoder(mgr.GetScheme()),
+			[]preflight.Checker{
+				// Add your preflight checkers here.
+			}...,
+		),
+	})
 	if err := mgr.Start(signalCtx); err != nil {
 		setupLog.Error(err, "unable to start controller manager")
 		os.Exit(1)
diff --git a/hack/update-webhook-configurations.yq b/hack/update-webhook-configurations.yq
@@ -8,7 +8,7 @@ with(.metadata;
   .name = "{{ include \"chart.name\" . }}-" + .name,
   .annotations["cert-manager.io/inject-ca-from"] = "{{ .Release.Namespace}}/{{ template \"chart.name\" . }}-admission-tls"
 ),
-with(.webhooks[0].clientConfig.service;
+with(.webhooks[].clientConfig.service;
   .name = "{{ include \"chart.name\" . }}-admission",
   .namespace = "{{ .Release.Namespace }}"
 )
diff --git a/pkg/webhook/preflight/doc.go b/pkg/webhook/preflight/doc.go
@@ -0,0 +1,5 @@
+// Copyright 2025 Nutanix. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+// Package preflight provides an admission webhook handler that runs preflight checks
+// against Cluster resources and reports failures and warnings in the admission response.
+package preflight
+
+// +kubebuilder:webhook:path=/preflight-v1beta1-cluster,mutating=false,failurePolicy=fail,groups="cluster.x-k8s.io",resources=clusters,verbs=create,versions=*,name=preflight.cluster.caren.nutanix.com,admissionReviewVersions=v1,sideEffects=None,timeoutSeconds=30
diff --git a/pkg/webhook/preflight/preflight.go b/pkg/webhook/preflight/preflight.go
@@ -0,0 +1,210 @@
+// Copyright 2025 Nutanix. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package preflight
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"runtime/debug"
+	"sync"
+
+	admissionv1 "k8s.io/api/admission/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
+	ctrl "sigs.k8s.io/controller-runtime"
+	ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
+)
+
+type (
+	// Checker returns a set of checks that have been initialized with common dependencies,
+	// such as an infrastructure API client.
+	Checker interface {
+		// Init returns the checks that should run for the cluster.
+		Init(ctx context.Context, client ctrlclient.Client, cluster *clusterv1.Cluster) []Check
+	}
+
+	// Check represents a single preflight check that can be run against a cluster.
+	// It has a Name method that returns the name of the check, and a Run method that
+	// executes the check and returns a CheckResult.
+	// The Name method is used to identify the check if Run fails to return a result, for
+	// example if it panics.
+	Check interface {
+		Name() string
+		Run(ctx context.Context) CheckResult
+	}
+
+	// CheckResult represents the result of a check.
+	// It contains a boolean indicating whether the check passed, a boolean indicating
+	// whether there was an internal error running the check, a list of causes for the
+	// failure, and a list of warnings that were generated during the check.
+	// The name of the check is not part of the result; it is obtained from Check.Name.
+	CheckResult struct {
+		Allowed bool
+		Error   bool
+
+		Causes   []Cause
+		Warnings []string
+	}
+
+	// Cause represents a cause of a check failure. It contains a message and an optional
+	// field that the cause relates to. The field is used to indicate which part of the
+	// cluster configuration the cause relates to.
+	Cause struct {
+		Message string
+		Field   string
+	}
+)
+
+// WebhookHandler is an admission handler that runs the checks provided by its checkers
+// against a Cluster, and summarizes the results in the admission response.
+type WebhookHandler struct {
+	client   ctrlclient.Client
+	decoder  admission.Decoder
+	checkers []Checker
+}
+
+// New returns a WebhookHandler that decodes requests with the given decoder, and passes
+// the given client to every checker when initializing its checks.
+func New(client ctrlclient.Client, decoder admission.Decoder, checkers ...Checker) *WebhookHandler {
+	return &WebhookHandler{
+		client:   client,
+		decoder:  decoder,
+		checkers: checkers,
+	}
+}
+
+// namedResult pairs a CheckResult with the name of the check that produced it.
+type namedResult struct {
+	Name string
+	CheckResult
+}
+
+// Handle runs the preflight checks for a Cluster admission request and summarizes the
+// results in a single admission response.
+//
+// Delete requests, and clusters without a topology, are allowed without running any checks.
+// A request that cannot be decoded as a Cluster results in an errored response with status
+// 400. Otherwise, every cause and warning reported by the checks is included in the
+// response, and the request is denied if any check does not allow it, or if any check
+// reports an internal error.
+func (h *WebhookHandler) Handle(ctx context.Context, req admission.Request) admission.Response {
+	if req.Operation == admissionv1.Delete {
+		return admission.Allowed("")
+	}
+
+	cluster := &clusterv1.Cluster{}
+	err := h.decoder.Decode(req, cluster)
+	if err != nil {
+		return admission.Errored(http.StatusBadRequest, err)
+	}
+
+	// Checks run only for ClusterClass-based clusters.
+	if cluster.Spec.Topology == nil {
+		return admission.Allowed("")
+	}
+
+	resultsOrderedByCheckerAndCheck := run(ctx, h.client, cluster, h.checkers)
+
+	// Summarize the results.
+	resp := admission.Response{
+		AdmissionResponse: admissionv1.AdmissionResponse{
+			Allowed: true,
+			Result: &metav1.Status{
+				Details: &metav1.StatusDetails{},
+			},
+		},
+	}
+	internalError := false
+	for _, results := range resultsOrderedByCheckerAndCheck {
+		for _, result := range results {
+			if result.Error {
+				internalError = true
+			}
+			if !result.Allowed {
+				resp.Allowed = false
+			}
+			for _, cause := range result.Causes {
+				resp.Result.Details.Causes = append(resp.Result.Details.Causes, metav1.StatusCause{
+					Type:    metav1.CauseType(fmt.Sprintf("FailedPreflight%s", result.Name)),
+					Message: cause.Message,
+					Field:   cause.Field,
+				})
+			}
+			resp.Warnings = append(resp.Warnings, result.Warnings...)
+		}
+	}
+
+	switch {
+	case internalError:
+		// Internal errors take precedence over check failures. Deny the request explicitly,
+		// even if every check reported Allowed: the Result of an allowed response is not
+		// consulted, so the internal error would otherwise be silently admitted.
+		resp.Allowed = false
+		resp.Result.Message = "preflight checks failed due to an internal error"
+		resp.Result.Code = http.StatusInternalServerError
+		resp.Result.Reason = metav1.StatusReasonInternalError
+	case !resp.Allowed:
+		// Because the response is not allowed, preflights must have failed.
+		resp.Result.Message = "preflight checks failed"
+		resp.Result.Code = http.StatusUnprocessableEntity
+		resp.Result.Reason = metav1.StatusReasonInvalid
+	}
+
+	return resp
+}
+
+// run initializes every checker and runs all of the resulting checks for the cluster.
+// Checkers are initialized concurrently, and the checks returned by each checker run
+// concurrently as well. The results are ordered by checker, in the order of checkers,
+// and then by check, in the order returned by the checker's Init method.
+func run(ctx context.Context,
+	client ctrlclient.Client,
+	cluster *clusterv1.Cluster,
+	checkers []Checker,
+) [][]namedResult {
+	resultsOrderedByCheckerAndCheck := make([][]namedResult, len(checkers))
+
+	checkersWG := sync.WaitGroup{}
+	for i, checker := range checkers {
+		checkersWG.Add(1)
+		go func(ctx context.Context, client ctrlclient.Client, cluster *clusterv1.Cluster, checker Checker, i int) {
+			defer checkersWG.Done()
+
+			// NOTE(review): a panic in Init is not recovered in this goroutine, and an
+			// unrecovered panic in a goroutine terminates the process. Only panics in
+			// Check.Run are converted to internal-error results.
+			checks := checker.Init(ctx, client, cluster)
+			resultsOrderedByCheck := make([]namedResult, len(checks))
+
+			checksWG := sync.WaitGroup{}
+			for j, check := range checks {
+				checksWG.Add(1)
+				go func(ctx context.Context, check Check, j int) {
+					defer checksWG.Done()
+					// Each goroutine writes only to its own index, so no locking is needed.
+					resultsOrderedByCheck[j] = runCheck(ctx, cluster, check)
+				}(ctx, check, j)
+			}
+			checksWG.Wait()
+			resultsOrderedByCheckerAndCheck[i] = resultsOrderedByCheck
+		}(
+			ctx,
+			client,
+			cluster,
+			checker,
+			i,
+		)
+	}
+	checkersWG.Wait()
+
+	return resultsOrderedByCheckerAndCheck
+}
+
+// runCheck runs a single check and returns its result together with the name of the check.
+// If the check panics, the panic is recovered and logged with a stack trace, and a result
+// that reports an internal error is returned instead.
+func runCheck(ctx context.Context, cluster *clusterv1.Cluster, check Check) (result namedResult) {
+	defer func() {
+		r := recover()
+		if r == nil {
+			return
+		}
+		result = namedResult{
+			Name: check.Name(),
+			CheckResult: CheckResult{
+				Error: true,
+				Causes: []Cause{
+					{
+						// Use %v, because the recovered value can be of any type.
+						Message: fmt.Sprintf("internal error (panic): %v", r),
+						Field:   "",
+					},
+				},
+			},
+		}
+		ctrl.LoggerFrom(ctx).Error(
+			fmt.Errorf("preflight check panic"),
+			fmt.Sprintf("%v", r),
+			"checkName", check.Name(),
+			"clusterName", cluster.Name,
+			"clusterNamespace", cluster.Namespace,
+			"stackTrace", string(debug.Stack()),
+		)
+	}()
+
+	checkResult := check.Run(ctx)
+	return namedResult{
+		Name:        check.Name(),
+		CheckResult: checkResult,
+	}
+}
diff --git a/pkg/webhook/preflight/preflight_test.go b/pkg/webhook/preflight/preflight_test.go

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ with(.metadata;`
`8`	`8`	`.name = "{{ include \"chart.name\" . }}-" + .name,`
`9`	`9`	`.annotations["cert-manager.io/inject-ca-from"] = "{{ .Release.Namespace}}/{{ template \"chart.name\" . }}-admission-tls"`
`10`	`10`	`),`
`11`		`-with(.webhooks[0].clientConfig.service;`
	`11`	`+with(.webhooks[].clientConfig.service;`
`12`	`12`	`.name = "{{ include \"chart.name\" . }}-admission",`
`13`	`13`	`.namespace = "{{ .Release.Namespace }}"`
`14`	`14`	`)`