Skip to content

Commit ff13a64

Browse files
authored
Merge pull request #11290 from fabriziopandini/update-v1beta2-status-mhc-controller
✨ Update MHC with v1Beta2 status
2 parents 06a67c1 + fcb8cd5 commit ff13a64

File tree

11 files changed

+723
-421
lines changed

11 files changed

+723
-421
lines changed

api/v1beta1/machine_types.go

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -200,22 +200,71 @@ const (
200200
// Note: this could happen when creating the machine. However, this state should be treated as an error if it lasts indefinitely.
201201
MachineNodeDoesNotExistV1Beta2Reason = ObjectDoesNotExistV1Beta2Reason
202202

203-
// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
203+
// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
204204
// Note: controllers can't identify if the Node was deleted by the controller itself, e.g.
205205
// during the deletion workflow, or by a user.
206206
MachineNodeDeletedV1Beta2Reason = ObjectDeletedV1Beta2Reason
207207
)
208208

209-
// Machine's HealthCheckSucceeded and OwnerRemediated conditions and corresponding reasons that will be used in v1Beta2 API version.
210-
// Note: HealthCheckSucceeded and OwnerRemediated condition are set by the MachineHealthCheck controller.
209+
// Machine's HealthCheckSucceeded condition and corresponding reasons that will be used in v1Beta2 API version.
210+
// Note: HealthCheckSucceeded condition is set by the MachineHealthCheck controller.
211211
const (
212212
// MachineHealthCheckSucceededV1Beta2Condition is true if MHC instances targeting this machine report the Machine
213213
// is healthy according to the definition of healthy present in the spec of the MachineHealthCheck object.
214214
MachineHealthCheckSucceededV1Beta2Condition = "HealthCheckSucceeded"
215215

216+
// MachineHealthCheckSucceededV1Beta2Reason surfaces when a machine passes all the health checks defined by a MachineHealthCheck object.
217+
MachineHealthCheckSucceededV1Beta2Reason = "HealthCheckSucceeded"
218+
219+
// MachineHealthCheckUnhealthyNodeV1Beta2Reason surfaces when the node hosted on the machine does not pass the health checks
220+
// defined by a MachineHealthCheck object.
221+
MachineHealthCheckUnhealthyNodeV1Beta2Reason = "UnhealthyNode"
222+
223+
// MachineHealthCheckNodeStartupTimeoutV1Beta2Reason surfaces when the node hosted on the machine does not appear within
224+
// the timeout defined by a MachineHealthCheck object.
225+
MachineHealthCheckNodeStartupTimeoutV1Beta2Reason = "NodeStartupTimeout"
226+
227+
// MachineHealthCheckNodeDeletedV1Beta2Reason surfaces when a MachineHealthCheck detects that the node hosted on the
228+
// machine has been deleted while the Machine is still running.
229+
MachineHealthCheckNodeDeletedV1Beta2Reason = "NodeDeleted"
230+
231+
// MachineHealthCheckHasRemediateAnnotationV1Beta2Reason surfaces when a MachineHealthCheck detects a machine manually remediated
232+
// via the remediate-machine annotation.
233+
MachineHealthCheckHasRemediateAnnotationV1Beta2Reason = "HasRemediateAnnotation"
234+
)
235+
236+
// Machine's OwnerRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
237+
// Note: OwnerRemediated condition is initially set by the MachineHealthCheck controller; then it is up to the Machine's
238+
// owner controller to update or delete this condition.
239+
const (
216240
// MachineOwnerRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
217241
// determine that the controller owning this machine should perform remediation.
218242
MachineOwnerRemediatedV1Beta2Condition = "OwnerRemediated"
243+
244+
// MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason surfaces when the machine is waiting for the owner controller
245+
// to start remediation.
246+
MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason = "WaitingForRemediation"
247+
)
248+
249+
// Machine's ExternallyRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
250+
// Note: ExternallyRemediated condition is initially set by the MachineHealthCheck controller; then it is up to the external
251+
// remediation controller to update or delete this condition.
252+
const (
253+
// MachineExternallyRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
254+
// determine that an external controller should perform remediation.
255+
MachineExternallyRemediatedV1Beta2Condition = "ExternallyRemediated"
256+
257+
// MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason surfaces when the machine is waiting for the
258+
// external remediation controller to start remediation.
259+
MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason = "WaitingForRemediation"
260+
261+
// MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason surfaces that the MachineHealthCheck cannot
262+
// find the template for an external remediation request.
263+
MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason = "RemediationTemplateNotFound"
264+
265+
// MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason surfaces that the MachineHealthCheck cannot
266+
// create a request for the external remediation controller.
267+
MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason = "RemediationRequestCreationFailed"
219268
)
220269

221270
// Machine's Deleting condition and corresponding reasons that will be used in v1Beta2 API version.

api/v1beta1/machinehealthcheck_types.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,27 @@ import (
2424
"k8s.io/apimachinery/pkg/util/intstr"
2525
)
2626

27+
// MachineHealthCheck's RemediationAllowed condition and corresponding reasons that will be used in v1Beta2 API version.
28+
const (
29+
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
30+
// allowed to remediate any Machines or whether it is blocked from remediating any further.
31+
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"
32+
33+
// MachineHealthCheckTooManyUnhealthyV1Beta2Reason is the reason used when too many Machines are unhealthy and
34+
// the MachineHealthCheck is blocked from making any further remediation.
35+
MachineHealthCheckTooManyUnhealthyV1Beta2Reason = "TooManyUnhealthy"
36+
37+
// MachineHealthCheckRemediationAllowedV1Beta2Reason is the reason used when the number of unhealthy machines
38+
// is within the limits defined by the MachineHealthCheck, and thus remediation is allowed.
39+
MachineHealthCheckRemediationAllowedV1Beta2Reason = "RemediationAllowed"
40+
)
41+
42+
// MachineHealthCheck's Paused condition and corresponding reasons that will be used in v1Beta2 API version.
43+
const (
44+
// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to is paused.
45+
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
46+
)
47+
2748
var (
2849
// DefaultNodeStartupTimeout is the time allowed for a node to start up.
2950
// Can be made longer as part of spec if required for particular provider.

api/v1beta1/v1beta2_condition_consts.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -225,16 +225,6 @@ const (
225225
ClusterPausedV1Beta2Condition = PausedV1Beta2Condition
226226
)
227227

228-
// Conditions that will be used for the MachineHealthCheck object in v1Beta2 API version.
229-
const (
230-
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
231-
// allowed to remediate any Machines or whether it is blocked from remediating any further.
232-
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"
233-
234-
// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to are paused.
235-
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
236-
)
237-
238228
// Conditions that will be used for the ClusterClass object in v1Beta2 API version.
239229
const (
240230
// ClusterClassVariablesReadyV1Beta2Condition is true if the ClusterClass variables, including both inline and external

internal/controllers/machinehealthcheck/machinehealthcheck_controller.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ import (
5050
"sigs.k8s.io/cluster-api/util"
5151
"sigs.k8s.io/cluster-api/util/annotations"
5252
"sigs.k8s.io/cluster-api/util/conditions"
53+
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
5354
"sigs.k8s.io/cluster-api/util/patch"
5455
"sigs.k8s.io/cluster-api/util/predicates"
5556
)
@@ -279,6 +280,13 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
279280
Message: message,
280281
})
281282

283+
v1beta2conditions.Set(m, metav1.Condition{
284+
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
285+
Status: metav1.ConditionFalse,
286+
Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason,
287+
Message: message,
288+
})
289+
282290
// If there are no unhealthy targets, skip publishing the `RemediationRestricted` event to avoid misleading users.
283291
if len(unhealthy) != 0 {
284292
r.recorder.Event(
@@ -321,6 +329,12 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
321329
m.Status.RemediationsAllowed = remediationCount
322330
conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)
323331

332+
v1beta2conditions.Set(m, metav1.Condition{
333+
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
334+
Status: metav1.ConditionTrue,
335+
Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason,
336+
})
337+
324338
errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
325339
errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)
326340

@@ -399,6 +413,13 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
399413
from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
400414
if err != nil {
401415
conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())
416+
417+
v1beta2conditions.Set(t.Machine, metav1.Condition{
418+
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
419+
Status: metav1.ConditionFalse,
420+
Reason: clusterv1.MachineExternallyRemediatedRemediationTemplateNotFoundV1Beta2Reason,
421+
Message: fmt.Sprintf("error retrieving remediation template %s %s", m.Spec.RemediationTemplate.Kind, klog.KRef(t.Machine.Namespace, m.Spec.RemediationTemplate.Name)),
422+
})
402423
errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
403424
return errList
404425
}
@@ -428,16 +449,37 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
428449
// Create the external clone.
429450
if err := r.Client.Create(ctx, to); err != nil {
430451
conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())
452+
453+
v1beta2conditions.Set(t.Machine, metav1.Condition{
454+
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
455+
Status: metav1.ConditionFalse,
456+
Reason: clusterv1.MachineExternallyRemediatedRemediationRequestCreationFailedV1Beta2Reason,
457+
Message: "Please check controller logs for errors",
458+
})
431459
errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
432460
return errList
433461
}
462+
463+
v1beta2conditions.Set(t.Machine, metav1.Condition{
464+
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
465+
Status: metav1.ConditionFalse,
466+
Reason: clusterv1.MachineExternallyRemediatedWaitingForRemediationV1Beta2Reason,
467+
})
434468
} else {
435469
logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
436470
// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or to trigger another remediation if the previous one is completed;
437471
// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
438472
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
439473
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
440474
}
475+
476+
if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue {
477+
v1beta2conditions.Set(t.Machine, metav1.Condition{
478+
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
479+
Status: metav1.ConditionFalse,
480+
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
481+
})
482+
}
441483
}
442484
}
443485

0 commit comments

Comments
 (0)