Skip to content

feat: Add preflight checks framework #1129

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,23 @@ webhooks:
resources:
- clusters
sideEffects: None
- admissionReviewVersions:
- v1
clientConfig:
service:
name: '{{ include "chart.name" . }}-admission'
namespace: '{{ .Release.Namespace }}'
path: /preflight-v1beta1-cluster
failurePolicy: Fail
name: preflight.cluster.caren.nutanix.com
rules:
- apiGroups:
- cluster.x-k8s.io
apiVersions:
- '*'
operations:
- CREATE
resources:
- clusters
sideEffects: None
timeoutSeconds: 30
8 changes: 8 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/handlers/nutanix"
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/handlers/options"
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/cluster"
"github.com/nutanix-cloud-native/cluster-api-runtime-extensions-nutanix/pkg/webhook/preflight"
)

func main() {
Expand Down Expand Up @@ -219,6 +220,13 @@ func main() {
Handler: cluster.NewValidator(mgr.GetClient(), admission.NewDecoder(mgr.GetScheme())),
})

mgr.GetWebhookServer().Register("/preflight-v1beta1-cluster", &webhook.Admission{
Handler: preflight.New(mgr.GetClient(), admission.NewDecoder(mgr.GetScheme()),
[]preflight.Checker{
// Add your preflight checkers here.
}...,
),
})
if err := mgr.Start(signalCtx); err != nil {
setupLog.Error(err, "unable to start controller manager")
os.Exit(1)
Expand Down
2 changes: 1 addition & 1 deletion hack/update-webhook-configurations.yq
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ with(.metadata;
.name = "{{ include \"chart.name\" . }}-" + .name,
.annotations["cert-manager.io/inject-ca-from"] = "{{ .Release.Namespace}}/{{ template \"chart.name\" . }}-admission-tls"
),
with(.webhooks[0].clientConfig.service;
with(.webhooks[].clientConfig.service;
.name = "{{ include \"chart.name\" . }}-admission",
.namespace = "{{ .Release.Namespace }}"
)
5 changes: 5 additions & 0 deletions pkg/webhook/preflight/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Copyright 2025 Nutanix. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package preflight

// +kubebuilder:webhook:path=/preflight-v1beta1-cluster,mutating=false,failurePolicy=fail,groups="cluster.x-k8s.io",resources=clusters,verbs=create,versions=*,name=preflight.cluster.caren.nutanix.com,admissionReviewVersions=v1,sideEffects=None,timeoutSeconds=30
153 changes: 153 additions & 0 deletions pkg/webhook/preflight/preflight.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// Copyright 2025 Nutanix. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package preflight

import (
"context"
"fmt"
"net/http"
"sync"

admissionv1 "k8s.io/api/admission/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
)

type Cause struct {
Message string
Field string
}

type CheckResult struct {
Name string

Allowed bool
Error bool

Causes []Cause
Warnings []string
}

type Check = func(ctx context.Context) CheckResult

type Checker interface {
// Init decides which of its checks should run for the cluster. It then initializes the checks
// with common dependencies, such as an infrastructure client. Finally, it returns the initialized checks,
// ready to be run.
//
// Init can store the client and cluster, but not the context, because each check will accept its own context.
Init(ctx context.Context, client ctrlclient.Client, cluster *clusterv1.Cluster) []Check
}

type WebhookHandler struct {
client ctrlclient.Client
decoder admission.Decoder
checkers []Checker
}

func New(client ctrlclient.Client, decoder admission.Decoder, checkers ...Checker) *WebhookHandler {
h := &WebhookHandler{
client: client,
decoder: decoder,
checkers: checkers,
}
return h
}

func (h *WebhookHandler) Handle(ctx context.Context, req admission.Request) admission.Response {
if req.Operation == admissionv1.Delete {
return admission.Allowed("")
}

cluster := &clusterv1.Cluster{}
err := h.decoder.Decode(req, cluster)
if err != nil {
return admission.Errored(http.StatusBadRequest, err)
}

// Checks run only for ClusterClass-based clusters.
if cluster.Spec.Topology == nil {
return admission.Allowed("")
}

resp := admission.Response{
AdmissionResponse: admissionv1.AdmissionResponse{
Allowed: true,
Result: &metav1.Status{
Details: &metav1.StatusDetails{},
},
},
}

// Collect all checks in parallel.
checkerWG := &sync.WaitGroup{}
resultCh := make(chan CheckResult)
for _, checker := range h.checkers {
checkerWG.Add(1)

go func(ctx context.Context, checker Checker, resultCh chan CheckResult) {
// Initialize the checker.
checks := checker.Init(ctx, h.client, cluster)

// Run its checks in parallel.
checksWG := &sync.WaitGroup{}
for _, check := range checks {
checksWG.Add(1)
go func(ctx context.Context, check Check, resultCh chan CheckResult) {
result := check(ctx)
resultCh <- result
checksWG.Done()
}(ctx, check, resultCh)
}
checksWG.Wait()

checkerWG.Done()
}(ctx, checker, resultCh)
}
Comment on lines +85 to +108
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One of the side effects of using channels like this is that results would be non-deterministically ordered, as well as check results from different checkers being interleaved. I would prefer deterministic ordering of results.

How about gathering all checks prior to running them (i.e. run checker.Init for each checker in a loop to get all checks in order) and then run each check asynchronously sending check index back to channel along with result, gathering results in a heap/priority queue using the check index as the priority?

This would enable using a buffered channel for results (single loop with known number of checks to run) so non-blocking and return results in deterministic order, as well as keeping checks originating from a single checker together (non-interleaved).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! +1 to deterministically ordered results.

Since checker.Init may call out to an external API, I wanted to do initialization concurrently.

gathering results in a heap/priority queue using the check index as the priority?

Sounds good.


// Close the channel when all checkers are done.
go func(wg *sync.WaitGroup, resultCh chan CheckResult) {
wg.Wait()
close(resultCh)
}(checkerWG, resultCh)
Comment on lines +111 to +114
Copy link
Contributor

@faiq faiq May 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can the function return before this go routing closes the channel?

what happens with slow checks?

(EDIT) goroutines will run even after the function returns


// Collect all check results from the channel.
internalError := false
for result := range resultCh {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very clean, on any Error to run a Check or when a check returns a Allowed==false the resp will always be set to resp.Allowed = false and will keep appending to the causes.

(just mainly thinking out loud)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right. We want to keep track of whether we had a rejection, or an error, but collect all the check results, regardless.

if result.Error {
internalError = true
}

if !result.Allowed {
resp.Allowed = false
}

for _, cause := range result.Causes {
resp.Result.Details.Causes = append(resp.Result.Details.Causes, metav1.StatusCause{
Type: metav1.CauseType(fmt.Sprintf("FailedPreflight%s", result.Name)),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any concern around unbounded cardinality here?

Message: cause.Message,
Field: cause.Field,
})
}

resp.Warnings = append(resp.Warnings, result.Warnings...)
}

if len(resp.Result.Details.Causes) == 0 {
return resp
}

// Because we have some causes, we construct the response message and code.
resp.Result.Message = "preflight checks failed"
resp.Result.Code = http.StatusForbidden
resp.Result.Reason = metav1.StatusReasonForbidden
if internalError {
// Internal errors take precedence over check failures.
resp.Result.Code = http.StatusInternalServerError
resp.Result.Reason = metav1.StatusReasonInternalError
}

Comment on lines +138 to +151
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't checking if causes is empty is enough here, does it handle the error response correctly? How about a switch instead to cover error and allowed state? Also think it should use StatusReasonInvalid rather than StatusReasonForbidden.

Suggested change
if len(resp.Result.Details.Causes) == 0 {
return resp
}
// Because we have some causes, we construct the response message and code.
resp.Result.Message = "preflight checks failed"
resp.Result.Code = http.StatusForbidden
resp.Result.Reason = metav1.StatusReasonForbidden
if internalError {
// Internal errors take precedence over check failures.
resp.Result.Code = http.StatusInternalServerError
resp.Result.Reason = metav1.StatusReasonInternalError
}
switch {
case internalError:
// Internal errors take precedence over check failures.
resp.Result.Code = http.StatusInternalServerError
resp.Result.Reason = metav1.StatusReasonInternalError
case !resp.Allowed:
// Because the response is not allowed, preflights must have failed.
resp.Result.Message = "preflight checks failed"
resp.Result.Code = http.StatusInvalid
resp.Result.Reason = metav1.StatusReasonInvalid
}

return resp
}
Loading
Loading