-
Couldn't load subscription status.
- Fork 128
add controllerrevisions collection in ksm check for agents > 7.72.x #2249
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
d502386
a9c0528
2a3c448
400cfed
1f0a581
ef59033
4834283
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,10 +46,11 @@ func buildKSMFeature(options *feature.Options) feature.Feature { | |
| } | ||
|
|
||
| type ksmFeature struct { | ||
| runInClusterChecksRunner bool | ||
| collectCRDMetrics bool | ||
| collectCrMetrics []v2alpha1.Resource | ||
| collectAPIServiceMetrics bool | ||
| runInClusterChecksRunner bool | ||
| collectCRDMetrics bool | ||
| collectCrMetrics []v2alpha1.Resource | ||
| collectAPIServiceMetrics bool | ||
| collectControllerRevisions bool | ||
|
|
||
| rbacSuffix string | ||
| serviceAccountName string | ||
|
|
@@ -67,6 +68,9 @@ type ksmFeature struct { | |
| // Add "-0" so that prerelase versions are considered sufficient. https://github.com/Masterminds/semver#working-with-prerelease-versions | ||
| const crdAPIServiceCollectionMinVersion = "7.46.0-0" | ||
|
|
||
| // Minimum agent version that supports collection of controllerrevisions | ||
| const controllerRevisionsCollectionMinVersion = "7.72.0-0" | ||
|
||
|
|
||
| // ID returns the ID of the Feature | ||
| func (f *ksmFeature) ID() feature.IDType { | ||
| return feature.KubernetesStateCoreIDType | ||
|
|
@@ -78,6 +82,7 @@ func (f *ksmFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgent | |
| var output feature.RequiredComponents | ||
|
|
||
| if ddaSpec.Features != nil && ddaSpec.Features.KubeStateMetricsCore != nil && apiutils.BoolValue(ddaSpec.Features.KubeStateMetricsCore.Enabled) { | ||
| f.logger.Info("KubeStateMetricsCore feature enabled") | ||
|
||
| output.ClusterAgent.IsRequired = apiutils.NewBoolPointer(true) | ||
| output.ClusterAgent.Containers = []apicommon.AgentContainerName{apicommon.ClusterAgentContainerName} | ||
| output.Agent.IsRequired = apiutils.NewBoolPointer(true) | ||
|
|
@@ -88,6 +93,14 @@ func (f *ksmFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgent | |
| f.collectCrMetrics = ddaSpec.Features.KubeStateMetricsCore.CollectCrMetrics | ||
| f.serviceAccountName = constants.GetClusterAgentServiceAccount(dda.GetName(), ddaSpec) | ||
|
|
||
| // Get the CollectControllerRevisions value from spec (default to false if not set) | ||
| collectControllerRevisionsExplicitlySet := ddaSpec.Features.KubeStateMetricsCore.CollectControllerRevisions != nil | ||
| if collectControllerRevisionsExplicitlySet { | ||
| f.collectControllerRevisions = apiutils.BoolValue(ddaSpec.Features.KubeStateMetricsCore.CollectControllerRevisions) | ||
| } else { | ||
| f.collectControllerRevisions = false | ||
| } | ||
|
|
||
| // This check will only run in the Cluster Checks Runners or Cluster Agent (not the Node Agent) | ||
| if ddaSpec.Features.ClusterChecks != nil && apiutils.BoolValue(ddaSpec.Features.ClusterChecks.Enabled) && apiutils.BoolValue(ddaSpec.Features.ClusterChecks.UseClusterChecksRunners) { | ||
| f.runInClusterChecksRunner = true | ||
|
|
@@ -97,17 +110,43 @@ func (f *ksmFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgent | |
| output.ClusterChecksRunner.Containers = []apicommon.AgentContainerName{apicommon.CoreAgentContainerName} | ||
|
|
||
| if ccrOverride, ok := ddaSpec.Override[v2alpha1.ClusterChecksRunnerComponentName]; ok { | ||
| if ccrOverride.Image != nil && !utils.IsAboveMinVersion(common.GetAgentVersionFromImage(*ccrOverride.Image), crdAPIServiceCollectionMinVersion) { | ||
| // Disable if image is overridden to an unsupported version | ||
| f.collectAPIServiceMetrics = false | ||
| f.collectCRDMetrics = false | ||
| if ccrOverride.Image != nil { | ||
| agentVersion := common.GetAgentVersionFromImage(*ccrOverride.Image) | ||
|
|
||
| if !utils.IsAboveMinVersion(agentVersion, crdAPIServiceCollectionMinVersion) { | ||
| f.collectAPIServiceMetrics = false | ||
| f.collectCRDMetrics = false | ||
| } | ||
|
|
||
| // Only apply version check if CollectControllerRevisions was not explicitly set | ||
| if !collectControllerRevisionsExplicitlySet { | ||
| if utils.IsAboveMinVersion(agentVersion, controllerRevisionsCollectionMinVersion) { | ||
| f.collectControllerRevisions = true | ||
| } else { | ||
| f.collectControllerRevisions = false | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } else if clusterAgentOverride, ok := ddaSpec.Override[v2alpha1.ClusterAgentComponentName]; ok { | ||
| if clusterAgentOverride.Image != nil && !utils.IsAboveMinVersion(common.GetAgentVersionFromImage(*clusterAgentOverride.Image), crdAPIServiceCollectionMinVersion) { | ||
| // Disable if image is overridden to an unsupported version | ||
| f.collectAPIServiceMetrics = false | ||
| f.collectCRDMetrics = false | ||
| } else { | ||
| if clusterAgentOverride, ok := ddaSpec.Override[v2alpha1.ClusterAgentComponentName]; ok { | ||
| if clusterAgentOverride.Image != nil { | ||
| agentVersion := common.GetAgentVersionFromImage(*clusterAgentOverride.Image) | ||
|
|
||
| if !utils.IsAboveMinVersion(agentVersion, crdAPIServiceCollectionMinVersion) { | ||
| f.collectAPIServiceMetrics = false | ||
| f.collectCRDMetrics = false | ||
| } | ||
|
|
||
| // Only apply version check if CollectControllerRevisions was not explicitly set | ||
| if !collectControllerRevisionsExplicitlySet { | ||
| if utils.IsAboveMinVersion(agentVersion, controllerRevisionsCollectionMinVersion) { | ||
| f.collectControllerRevisions = true | ||
| } else { | ||
| f.collectControllerRevisions = false | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -124,16 +163,24 @@ func (f *ksmFeature) Configure(dda metav1.Object, ddaSpec *v2alpha1.DatadogAgent | |
| } | ||
|
|
||
| f.configConfigMapName = constants.GetConfName(dda, f.customConfig, defaultKubeStateMetricsCoreConf) | ||
|
|
||
| // Log final configuration state | ||
| f.logger.Info("KubeStateMetricsCore configuration finalized", | ||
| "collectAPIServiceMetrics", f.collectAPIServiceMetrics, | ||
| "collectCRDMetrics", f.collectCRDMetrics, | ||
| "collectControllerRevisions", f.collectControllerRevisions, | ||
| "runInClusterChecksRunner", f.runInClusterChecksRunner) | ||
| } | ||
|
|
||
| return output | ||
| } | ||
|
|
||
| type collectorOptions struct { | ||
| enableVPA bool | ||
| enableAPIService bool | ||
| enableCRD bool | ||
| customResources []v2alpha1.Resource | ||
| enableVPA bool | ||
| enableAPIService bool | ||
| enableCRD bool | ||
| enableControllerRevisions bool | ||
| customResources []v2alpha1.Resource | ||
| } | ||
|
|
||
| // ManageDependencies allows a feature to manage its dependencies. | ||
|
|
@@ -143,10 +190,11 @@ func (f *ksmFeature) ManageDependencies(managers feature.ResourceManagers, provi | |
| // OR if the default configMap is needed. | ||
| pInfo := managers.Store().GetPlatformInfo() | ||
| collectorOpts := collectorOptions{ | ||
| enableVPA: pInfo.IsResourceSupported("VerticalPodAutoscaler"), | ||
| enableAPIService: f.collectAPIServiceMetrics, | ||
| enableCRD: f.collectCRDMetrics, | ||
| customResources: f.collectCrMetrics, | ||
| enableVPA: pInfo.IsResourceSupported("VerticalPodAutoscaler"), | ||
| enableAPIService: f.collectAPIServiceMetrics, | ||
| enableCRD: f.collectCRDMetrics, | ||
| enableControllerRevisions: f.collectControllerRevisions, | ||
| customResources: f.collectCrMetrics, | ||
| } | ||
| configCM, err := f.buildKSMCoreConfigMap(collectorOpts) | ||
| if err != nil { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Open question: Do we want to expose a configuration option that enables the collection of ControllerRevision resources — or should we instead use these resources internally to derive higher-level insights?
Based on our previous discussion, this seems related to exposing rollout duration for
DaemonSetsandStatefulSets.If that’s the case, I wonder whether the new setting should be framed around the benefit or outcome (e.g., enabling rollout duration metrics) rather than the low-level implementation detail (collecting ControllerRevisions).
What do you think? Open to discussion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@clamoriniere So the collection of DaemonSets and StatefulSets will still occur by default, it's just that the "start time" of the deployment would be less accurate without controllerrevisions access. However, this flag was meant as more of a killswitch than anything else, because enabling this on agent < 7.72.0 breaks the KSM check, or if there was a custom version that was 7.72 but tagged as something non-parsable, we wanted to be able to turn it on manually. I'm open to removing this option and just strictly having it version gated in the feature.go code.
Also - and this is something I'd probably rather not do (hence being extra careful with the operator and helm-chart deployments) - there's the option of modifying the agent code to NOT fail the ksm check if it's configured to collect controllerrevisions but doesn't have access. We just remove it from the list and continue normal function.
Happy to hear your thoughts
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the explanation, that makes sense.
I think a good approach would be to always add the RBAC for
controllerrevisionswhen the KSM check is enabled. Then, starting from the next Agent/Cluster-Agent version, we could detect at runtime whether the permission is available. If it isn’t, we simply skip generating the metrics that rely oncontrollerrevisions.This way, the check can still run successfully without introducing a new configuration parameter, which should provide a smoother experience for users and reduce long-term config maintenance.
Additionally, we could emit a check configuration warning when the RBAC is missing — the check would continue running, but the user would see a warning in the Agent status suggesting they add the missing permission or upgrade their Operator/Helm chart version.
I realize this involves a bit more work, but it would likely result in a better UX and a cleaner setup overall.
let me know what is your opinion?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So based on this PR the rbac will always be enabled, regardless of agent version. As far as the agent code changes, I may have misspoke - newer agents that don't have controllerrevisions enabled will still function, it's just that the start times for statefulsets will be less accurate. The case where the ksm check would break is in older agents < 7.72.0, if we enable collection of controllerrevisions (adding the rbac is fine). Unfortunately we can't fix that in the code, so we just have to make sure it's not enabled for those agents.
I'm not against the runtime check for resources in the agent, but it won't help us with this release, so I think it can be something we implement in the future?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a particular reason for having a config option in the CRD as opposed to automatically enabling controller revisions metric collection based on the agent version? I'd prefer to keep the feature config options simple and limited since a user can always provide a custom config if they want more fine grained control over what's collected