
fix: Ensure DaemonSets with '/initialized' and '/registered' are considered during nodeclaim calculations #2161

Open · wants to merge 2 commits into main
16 changes: 15 additions & 1 deletion pkg/controllers/provisioning/scheduling/nodeclaimtemplate.go
@@ -70,6 +70,12 @@ func NewNodeClaimTemplate(nodePool *v1.NodePool) *NodeClaimTemplate {
})
nct.Requirements.Add(scheduling.NewNodeSelectorRequirementsWithMinValues(nct.Spec.Requirements...).Values()...)
nct.Requirements.Add(scheduling.NewLabelRequirements(nct.Labels).Values()...)

// Add requirements for DaemonSet scheduling calculations
Member: This looks good! Can we add one test to make sure that this works properly on the two new labels? Maybe something with affinity?

Author: I have implemented a test that verifies nodes are launched with properly calculated daemonOverhead and scheduled correctly when NodeAffinity is configured for each respective label.

// These ensure DaemonSets with nodeAffinity for these labels are considered
nct.Requirements.Add(scheduling.NewRequirement(v1.NodeRegisteredLabelKey, corev1.NodeSelectorOpIn, "true"))
nct.Requirements.Add(scheduling.NewRequirement(v1.NodeInitializedLabelKey, corev1.NodeSelectorOpIn, "true"))

return nct
}
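
For context, below is a minimal, hypothetical sketch (assembled from standard k8s.io/api/core/v1 types for this review; it is not code from the PR) of the node affinity that the affected DaemonSets carry. The string value is what v1.NodeInitializedLabelKey resolves to in upstream Karpenter. Before this fix, a daemon pod requiring these labels was treated as incompatible with the NodeClaimTemplate and its requests were left out of daemonOverhead; with the two requirements added above, such a pod is considered schedulable on the prospective node and is counted.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// Required node affinity equivalent to what the DaemonSets described in
	// issue #2116 would declare: the pod may only run on nodes labeled
	// karpenter.sh/initialized=true.
	affinity := &corev1.Affinity{
		NodeAffinity: &corev1.NodeAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
				NodeSelectorTerms: []corev1.NodeSelectorTerm{{
					MatchExpressions: []corev1.NodeSelectorRequirement{{
						Key:      "karpenter.sh/initialized", // value of v1.NodeInitializedLabelKey
						Operator: corev1.NodeSelectorOpIn,
						Values:   []string{"true"},
					}},
				}},
			},
		},
	}
	fmt.Println(affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms[0].MatchExpressions[0])
}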

@@ -80,6 +86,14 @@ func (i *NodeClaimTemplate) ToNodeClaim() *v1.NodeClaim {
return i.Name
})...))

// Filter out DaemonSet scheduling-only requirements for the actual NodeClaim
Member: nit: These may not just be DaemonSet scheduling requirements -- I'm not sure why you would do this on things other than DaemonSets, but we should technically support this for any pod that selected against this label

Author: @jonathan-innis Thanks for your comment, and apologies for the delay.
We exclude other workloads such as Deployments because of the following scenarios:

  • When node affinity for these labels is applied to a Deployment, the current validation rejects it during NodeClaim creation, so no NodeClaim is created at all.
  • Simply removing that validation leads to unintended behavior:
    • The Deployment creates Pending Pods, which trigger new NodeClaim creation.
    • Those Pods remain unscheduled until the Nodes become registered/initialized, causing redundant NodeClaim creation.

Addressing this would require logic that prevents excessive NodeClaim creation while Pending Pods selecting the registered/initialized labels exist. That is a non-trivial change beyond the scope of this issue, so we excluded it.
(Please let me know if we should open a separate Issue for investigation, or if any comments need modification.)

requirements := scheduling.NewRequirements()
for key, req := range i.Requirements {
if key != v1.NodeRegisteredLabelKey && key != v1.NodeInitializedLabelKey {
requirements.Add(req)
}
}

nc := &v1.NodeClaim{
ObjectMeta: metav1.ObjectMeta{
GenerateName: fmt.Sprintf("%s-", i.NodePoolName),
@@ -97,7 +111,7 @@ func (i *NodeClaimTemplate) ToNodeClaim() *v1.NodeClaim {
},
Spec: i.Spec,
}
nc.Spec.Requirements = i.Requirements.NodeSelectorRequirements()
nc.Spec.Requirements = requirements.NodeSelectorRequirements()
if nc.Spec.TerminationGracePeriod == nil {
nc.Spec.TerminationGracePeriod = DefaultTerminationGracePeriod
}
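
To make the filtering step concrete, here is a hypothetical, standalone helper that mirrors the loop above using only k8s.io/api/core/v1 types (an illustration for this review, not the PR code). The two scheduling-only keys are dropped and everything else passes through unchanged; in the actual change, the surviving set is what populates nc.Spec.Requirements via requirements.NodeSelectorRequirements().

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// stripSchedulingOnlyKeys mirrors the filter in ToNodeClaim: requirements on
// karpenter.sh/registered and karpenter.sh/initialized exist on the template
// only so that daemon pods selecting those labels are counted during overhead
// calculation; they must not end up in the persisted NodeClaim spec.
func stripSchedulingOnlyKeys(reqs []corev1.NodeSelectorRequirement) []corev1.NodeSelectorRequirement {
	schedulingOnly := map[string]bool{
		"karpenter.sh/registered":  true, // value of v1.NodeRegisteredLabelKey
		"karpenter.sh/initialized": true, // value of v1.NodeInitializedLabelKey
	}
	out := make([]corev1.NodeSelectorRequirement, 0, len(reqs))
	for _, r := range reqs {
		if !schedulingOnly[r.Key] {
			out = append(out, r)
		}
	}
	return out
}

func main() {
	reqs := []corev1.NodeSelectorRequirement{
		{Key: "karpenter.sh/registered", Operator: corev1.NodeSelectorOpIn, Values: []string{"true"}},
		{Key: corev1.LabelArchStable, Operator: corev1.NodeSelectorOpIn, Values: []string{"amd64"}},
	}
	// Only the kubernetes.io/arch requirement survives the filter.
	fmt.Println(stripSchedulingOnlyKeys(reqs))
}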
119 changes: 119 additions & 0 deletions pkg/controllers/provisioning/scheduling/suite_test.go
@@ -2326,6 +2326,125 @@ var _ = Context("Scheduling", func() {
// must create a new node
Expect(node1.Name).ToNot(Equal(node2.Name))
})
It("Should properly handle DaemonSet resources with Node Affinity for `karpenter.sh/initialized` and `karpenter.sh/registered`", func() {

// This test verifies that when DaemonSets are configured with node affinity for the "karpenter.sh/initialized"
// and "karpenter.sh/registered" labels, nodes are launched with properly calculated
// daemonOverhead. (Issue #2116)

// DaemonSet with node affinity for `karpenter.sh/initialized`
ds1 := test.DaemonSet(
test.DaemonSetOptions{
PodOptions: test.PodOptions{
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: v1.NodeInitializedLabelKey,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"true"},
},
},
ResourceRequirements: corev1.ResourceRequirements{Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
}},
},
},
)
ExpectApplied(ctx, env.Client, nodePool, ds1)
Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(ds1), ds1)).To(Succeed())

// DaemonSet with node affinity for `karpenter.sh/registered`
ds2 := test.DaemonSet(
test.DaemonSetOptions{
PodOptions: test.PodOptions{
NodeRequirements: []corev1.NodeSelectorRequirement{
{
Key: v1.NodeRegisteredLabelKey,
Operator: corev1.NodeSelectorOpIn,
Values: []string{"true"},
},
},
ResourceRequirements: corev1.ResourceRequirements{Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("1"),
}},
},
},
)
ExpectApplied(ctx, env.Client, nodePool, ds2)
Expect(env.Client.Get(ctx, client.ObjectKeyFromObject(ds2), ds2)).To(Succeed())

opts := test.PodOptions{ResourceRequirements: corev1.ResourceRequirements{
Limits: map[corev1.ResourceName]resource.Quantity{
corev1.ResourceCPU: resource.MustParse("2"),
},
}}
initialPod := test.UnschedulablePod(opts)
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, initialPod)
node1 := ExpectScheduled(ctx, env.Client, initialPod)

// create our daemonset pod and manually bind it to the node
ds1Pod := test.UnschedulablePod(test.PodOptions{
ResourceRequirements: corev1.ResourceRequirements{
Requests: map[corev1.ResourceName]resource.Quantity{
corev1.ResourceCPU: resource.MustParse("1"),
}},
})
ds1Pod.OwnerReferences = append(ds1Pod.OwnerReferences, metav1.OwnerReference{
APIVersion: "apps/v1",
Kind: "DaemonSet",
Name: ds1.Name,
UID: ds1.UID,
Controller: lo.ToPtr(true),
BlockOwnerDeletion: lo.ToPtr(true),
})
ExpectApplied(ctx, env.Client, nodePool, ds1Pod)

ds2Pod := test.UnschedulablePod(test.PodOptions{
ResourceRequirements: corev1.ResourceRequirements{
Requests: map[corev1.ResourceName]resource.Quantity{
corev1.ResourceCPU: resource.MustParse("1"),
}},
})
ds2Pod.OwnerReferences = append(ds2Pod.OwnerReferences, metav1.OwnerReference{
APIVersion: "apps/v1",
Kind: "DaemonSet",
Name: ds2.Name,
UID: ds2.UID,
Controller: lo.ToPtr(true),
BlockOwnerDeletion: lo.ToPtr(true),
})
ExpectApplied(ctx, env.Client, nodePool, ds2Pod)

// delete the pod so that the node is empty
ExpectDeleted(ctx, env.Client, initialPod)
ExpectReconcileSucceeded(ctx, nodeStateController, client.ObjectKeyFromObject(node1))

cluster.ForEachNode(func(f *state.StateNode) bool {
dsRequests := f.DaemonSetRequests()
available := f.Available()
Expect(dsRequests.Cpu().AsApproximateFloat64()).To(BeNumerically("~", 0))
// When daemonOverhead is considered for both ds1 and ds2, a 16 CPU Node is launched,
// whereas when only one or neither is considered, only a 4 CPU Node is launched.
// no pods so we have the full (16 cpu - 100m overhead)
Expect(available.Cpu().AsApproximateFloat64()).To(BeNumerically("~", 15.9))
return true
})

// manually bind the daemonset pods to the node
ExpectManualBinding(ctx, env.Client, ds1Pod, node1)
ExpectReconcileSucceeded(ctx, podStateController, client.ObjectKeyFromObject(ds1Pod))

ExpectManualBinding(ctx, env.Client, ds2Pod, node1)
ExpectReconcileSucceeded(ctx, podStateController, client.ObjectKeyFromObject(ds2Pod))

cluster.ForEachNode(func(f *state.StateNode) bool {
dsRequests := f.DaemonSetRequests()
available := f.Available()
Expect(dsRequests.Cpu().AsApproximateFloat64()).To(BeNumerically("~", 2))
// only the DS pods are bound, so available is reduced by two and the DS requested is incremented by two
Expect(available.Cpu().AsApproximateFloat64()).To(BeNumerically("~", 13.9))
return true
})
})

})
// nolint:gosec