Skip to content

Commit 7f4de50

Browse files
authored
Merge pull request #10202 from chrischdi/pr-manual-machine-remediation-annotation
✨ MHC: implement annotation to manually mark machines for remediation
2 parents c356333 + 63d56b2 commit 7f4de50

File tree

6 files changed

+42
-2
lines changed

6 files changed

+42
-2
lines changed

api/v1beta1/common_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ const (
128128
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
129129
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"
130130

131+
// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
132+
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"
133+
131134
// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
132135
// preflight checks that should be skipped during the MachineSet reconciliation.
133136
// Supported items are:

api/v1beta1/condition_consts.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ const (
152152
// MachineHasFailureReason is the reason used when a machine has either a FailureReason or a FailureMessage set on its status.
153153
MachineHasFailureReason = "MachineHasFailure"
154154

155+
// HasRemediateMachineAnnotationReason is the reason that gets set on the MachineHealthCheckSucceededCondition when a machine
156+
// has the RemediateMachineAnnotation set.
157+
HasRemediateMachineAnnotationReason = "HasRemediateMachineAnnotation"
158+
155159
// NodeStartupTimeoutReason is the reason used when a machine's node does not appear within the specified timeout.
156160
NodeStartupTimeoutReason = "NodeStartupTimeout"
157161

docs/book/src/reference/labels_and_annotations.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
| cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
3939
| cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
4040
| cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. |
41+
| cluster.x-k8s.io/remediate-machine | It can be applied to a machine to manually mark it for remediation by MachineHealthCheck reconciler. |
4142
| cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. |
4243
| cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. |
4344
| cluster.x-k8s.io/skip-machineset-preflight-checks | It can be applied on MachineDeployment and MachineSet resources to specify a comma-separated list of preflight checks that should be skipped during MachineSet reconciliation. Supported preflight checks are: All, KubeadmVersionSkew, KubernetesVersionSkew, ControlPlaneIsStable. |

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ func (t *healthCheckTarget) nodeName() string {
8282

8383
// Determine whether or not a given target needs remediation.
8484
// The target will need remediation if any of the following are true:
85+
// - The Machine has the remediate machine annotation
8586
// - The Machine has failed for some reason
8687
// - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
8788
// - The Node has gone away
@@ -93,6 +94,12 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
9394
var nextCheckTimes []time.Duration
9495
now := time.Now()
9596

97+
if annotations.HasRemediateMachine(t.Machine) {
98+
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
99+
logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
100+
return true, time.Duration(0)
101+
}
102+
96103
if t.Machine.Status.FailureReason != nil {
97104
conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "FailureReason: %v", *t.Machine.Status.FailureReason)
98105
logger.V(3).Info("Target is unhealthy", "failureReason", t.Machine.Status.FailureReason)

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ func TestGetTargetsFromMHC(t *testing.T) {
8787
// machines for skip remediation
8888
testNode5 := newTestNode("node5")
8989
testMachine5 := newTestMachine("machine5", namespace, clusterName, testNode5.Name, mhcSelector)
90-
testMachine5.Annotations = map[string]string{"cluster.x-k8s.io/skip-remediation": ""}
90+
testMachine5.Annotations = map[string]string{clusterv1.MachineSkipRemediationAnnotation: ""}
9191
testNode6 := newTestNode("node6")
9292
testMachine6 := newTestMachine("machine6", namespace, clusterName, testNode6.Name, mhcSelector)
93-
testMachine6.Annotations = map[string]string{"cluster.x-k8s.io/paused": ""}
93+
testMachine6.Annotations = map[string]string{clusterv1.PausedAnnotation: ""}
9494

9595
testCases := []struct {
9696
desc string
@@ -340,6 +340,18 @@ func TestHealthCheckTargets(t *testing.T) {
340340
}
341341
machineFailureMsgCondition := newFailedHealthCheckCondition(clusterv1.MachineHasFailureReason, "FailureMessage: %s", failureMsg)
342342

343+
// Target for when the machine has the remediate machine annotation
344+
annotationRemediationMsg := "Marked for remediation via remediate-machine annotation"
345+
testMachineAnnotationRemediation := testMachine.DeepCopy()
346+
testMachineAnnotationRemediation.Annotations = map[string]string{clusterv1.RemediateMachineAnnotation: ""}
347+
machineAnnotationRemediation := healthCheckTarget{
348+
Cluster: cluster,
349+
MHC: testMHC,
350+
Machine: testMachineAnnotationRemediation,
351+
Node: nil,
352+
}
353+
machineAnnotationRemediationCondition := newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg)
354+
343355
testCases := []struct {
344356
desc string
345357
targets []healthCheckTarget
@@ -426,6 +438,14 @@ func TestHealthCheckTargets(t *testing.T) {
426438
expectedNeedsRemediationCondition: []clusterv1.Condition{machineFailureMsgCondition},
427439
expectedNextCheckTimes: []time.Duration{},
428440
},
441+
{
442+
desc: "when the machine is manually marked for remediation",
443+
targets: []healthCheckTarget{machineAnnotationRemediation},
444+
expectedHealthy: []healthCheckTarget{},
445+
expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation},
446+
expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition},
447+
expectedNextCheckTimes: []time.Duration{},
448+
},
429449
}
430450

431451
for _, tc := range testCases {

util/annotations/helpers.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@ func HasSkipRemediation(o metav1.Object) bool {
4848
return hasAnnotation(o, clusterv1.MachineSkipRemediationAnnotation)
4949
}
5050

51+
// HasRemediateMachine returns true if the object has the `remediate-machine` annotation.
52+
func HasRemediateMachine(o metav1.Object) bool {
53+
return hasAnnotation(o, clusterv1.RemediateMachineAnnotation)
54+
}
55+
5156
// HasWithPrefix returns true if at least one of the annotations has the prefix specified.
5257
func HasWithPrefix(prefix string, annotations map[string]string) bool {
5358
for key := range annotations {

0 commit comments

Comments
 (0)