Commit 9ee55a8

Ensure reconcile external references and reconcile node are always run
1 parent 4439c69 commit 9ee55a8

6 files changed: 298 additions & 278 deletions

internal/controllers/machine/machine_controller.go

Lines changed: 83 additions & 108 deletions
@@ -196,6 +196,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
     }

     // Initialize the patch helper
+    s := &scope{
+        cluster: cluster,
+        machine: m,
+    }
+
     patchHelper, err := patch.NewHelper(m, r.Client)
     if err != nil {
         return ctrl.Result{}, err
@@ -215,15 +220,34 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
         }
     }()

-    // Reconcile labels.
+    // Always add the cluster label.
     if m.Labels == nil {
         m.Labels = make(map[string]string)
     }
     m.Labels[clusterv1.ClusterNameLabel] = m.Spec.ClusterName

+    // Add finalizer first if not set to avoid the race condition between init and delete.
+    // Note: Finalizers in general can only be added when the deletionTimestamp is not set.
+    if !controllerutil.ContainsFinalizer(m, clusterv1.MachineFinalizer) && m.ObjectMeta.DeletionTimestamp.IsZero() {
+        controllerutil.AddFinalizer(m, clusterv1.MachineFinalizer)
+        return ctrl.Result{}, nil
+    }
+
+    alwaysReconcile := []machineReconcileFunc{
+        r.reconcileBootstrap,
+        r.reconcileInfrastructure,
+        r.reconcileNode,
+        r.reconcileCertificateExpiry,
+    }
+
     // Handle deletion reconciliation loop.
     if !m.ObjectMeta.DeletionTimestamp.IsZero() {
-        res, err := r.reconcileDelete(ctx, cluster, m)
+        reconcileDelete := append(
+            alwaysReconcile,
+            r.reconcileDelete,
+        )
+
+        res, err := doReconcile(ctx, reconcileDelete, s)
         // Requeue if the reconcile failed because the ClusterCacheTracker was locked for
         // the current cluster because of concurrent access.
         if errors.Is(err, remote.ErrClusterLocked) {
@@ -233,15 +257,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
         return res, err
     }

-    // Add finalizer first if not set to avoid the race condition between init and delete.
-    // Note: Finalizers in general can only be added when the deletionTimestamp is not set.
-    if !controllerutil.ContainsFinalizer(m, clusterv1.MachineFinalizer) {
-        controllerutil.AddFinalizer(m, clusterv1.MachineFinalizer)
-        return ctrl.Result{}, nil
-    }
-
     // Handle normal reconciliation loop.
-    res, err := r.reconcile(ctx, cluster, m)
+    reconcileNormal := append(
+        []machineReconcileFunc{r.reconcileMachineOwner},
+        alwaysReconcile...,
+    )
+
+    res, err := doReconcile(ctx, reconcileNormal, s)
     // Requeue if the reconcile failed because the ClusterCacheTracker was locked for
     // the current cluster because of concurrent access.
     if errors.Is(err, remote.ErrClusterLocked) {
@@ -285,36 +307,26 @@ func patchMachine(ctx context.Context, patchHelper *patch.Helper, machine *clust
             clusterv1.MachineHealthCheckSucceededCondition,
             clusterv1.MachineOwnerRemediatedCondition,
         }},
+        patch.WithOwnedV1Beta2Conditions{Conditions: []string{
+            clusterv1.MachineAvailableV1Beta2Condition,
+            clusterv1.MachineReadyV1Beta2Condition,
+            clusterv1.MachineBootstrapConfigReadyV1Beta2Condition,
+            clusterv1.MachineInfrastructureReadyV1Beta2Condition,
+            clusterv1.MachineNodeReadyV1Beta2Condition,
+            clusterv1.MachineNodeHealthyV1Beta2Condition,
+            clusterv1.MachineDeletingV1Beta2Condition,
+            clusterv1.MachinePausedV1Beta2Condition,
+        }},
     )

     return patchHelper.Patch(ctx, machine, options...)
 }

-func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) {
-    // If the machine is a stand-alone one, meaning not originated from a MachineDeployment, then set it as directly
-    // owned by the Cluster (if not already present).
-    if r.shouldAdopt(m) {
-        m.SetOwnerReferences(util.EnsureOwnerRef(m.GetOwnerReferences(), metav1.OwnerReference{
-            APIVersion: clusterv1.GroupVersion.String(),
-            Kind:       "Cluster",
-            Name:       cluster.Name,
-            UID:        cluster.UID,
-        }))
-    }
-
-    phases := []func(context.Context, *scope) (ctrl.Result, error){
-        r.reconcileBootstrap,
-        r.reconcileInfrastructure,
-        r.reconcileNode,
-        r.reconcileCertificateExpiry,
-    }
+type machineReconcileFunc func(context.Context, *scope) (ctrl.Result, error)

+func doReconcile(ctx context.Context, phases []machineReconcileFunc, s *scope) (ctrl.Result, error) {
     res := ctrl.Result{}
     errs := []error{}
-    s := &scope{
-        cluster: cluster,
-        machine: m,
-    }
     for _, phase := range phases {
         // Call the inner reconciliation methods.
         phaseResult, err := phase(ctx, s)
@@ -346,10 +358,30 @@ type scope struct {
     // bootstrapConfig is the BootstrapConfig object that is referenced by the
     // Machine. It is set after reconcileBootstrap is called.
     bootstrapConfig *unstructured.Unstructured
+
+    // node is the Kubernetes node hosted on the machine.
+    node *corev1.Node
+}
+
+func (r *Reconciler) reconcileMachineOwner(_ context.Context, s *scope) (ctrl.Result, error) {
+    // If the machine is a stand-alone one, meaning not originated from a MachineDeployment, then set it as directly
+    // owned by the Cluster (if not already present).
+    if r.shouldAdopt(s.machine) {
+        s.machine.SetOwnerReferences(util.EnsureOwnerRef(s.machine.GetOwnerReferences(), metav1.OwnerReference{
+            APIVersion: clusterv1.GroupVersion.String(),
+            Kind:       "Cluster",
+            Name:       s.cluster.Name,
+            UID:        s.cluster.UID,
+        }))
+    }
+
+    return ctrl.Result{}, nil
 }

-func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (ctrl.Result, error) { //nolint:gocyclo
+func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope) (ctrl.Result, error) { //nolint:gocyclo
     log := ctrl.LoggerFrom(ctx)
+    cluster := s.cluster
+    m := s.machine

     err := r.isDeleteNodeAllowed(ctx, cluster, m)
     isDeleteNodeAllowed := err == nil
@@ -463,20 +495,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
     }
     conditions.MarkTrue(m, clusterv1.PreTerminateDeleteHookSucceededCondition)

-    // Return early and don't remove the finalizer if we got an error or
-    // the external reconciliation deletion isn't ready.
-
-    patchHelper, err := patch.NewHelper(m, r.Client)
-    if err != nil {
-        return ctrl.Result{}, err
-    }
-    conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
-    if err := patchMachine(ctx, patchHelper, m); err != nil {
-        conditions.MarkFalse(m, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletionFailedReason, clusterv1.ConditionSeverityInfo, "")
-        return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine")
-    }
-
-    infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m)
+    infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, s)
     if err != nil {
         return ctrl.Result{}, err
     }
@@ -485,7 +504,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu
         return ctrl.Result{}, nil
     }

-    bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m)
+    bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, s)
     if err != nil {
         return ctrl.Result{}, err
     }
@@ -849,78 +868,34 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster,
     return nil
 }

-func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
-    obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef)
-    if err != nil {
-        return false, err
-    }
-
-    if obj == nil {
-        // Marks the bootstrap as deleted
-        conditions.MarkFalse(m, clusterv1.BootstrapReadyCondition, clusterv1.DeletedReason, clusterv1.ConditionSeverityInfo, "")
+func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, s *scope) (bool, error) {
+    if s.bootstrapConfig == nil {
+        conditions.MarkFalse(s.machine, clusterv1.BootstrapReadyCondition, clusterv1.DeletedReason, clusterv1.ConditionSeverityInfo, "")
         return true, nil
     }

-    // Report a summary of current status of the bootstrap object defined for this machine.
-    conditions.SetMirror(m, clusterv1.BootstrapReadyCondition,
-        conditions.UnstructuredGetter(obj),
-        conditions.WithFallbackValue(false, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, ""),
-    )
-    return false, nil
-}
-
-func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) {
-    obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef)
-    if err != nil {
-        return false, err
-    }
-
-    if obj == nil {
-        // Marks the infrastructure as deleted
-        conditions.MarkFalse(m, clusterv1.InfrastructureReadyCondition, clusterv1.DeletedReason, clusterv1.ConditionSeverityInfo, "")
-        return true, nil
+    if err := r.Client.Delete(ctx, s.bootstrapConfig); err != nil && !apierrors.IsNotFound(err) {
+        return false, errors.Wrapf(err,
+            "failed to delete %v %q for Machine %q in namespace %q",
+            s.bootstrapConfig.GroupVersionKind(), s.bootstrapConfig.GetName(), s.machine.Name, s.machine.Namespace)
     }

-    // Report a summary of current status of the bootstrap object defined for this machine.
-    conditions.SetMirror(m, clusterv1.InfrastructureReadyCondition,
-        conditions.UnstructuredGetter(obj),
-        conditions.WithFallbackValue(false, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, ""),
-    )
     return false, nil
 }

-// reconcileDeleteExternal tries to delete external references.
-func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) {
-    if ref == nil {
-        return nil, nil
-    }
-
-    // get the external object
-    obj, err := external.Get(ctx, r.Client, ref, m.Namespace)
-    if err != nil && !apierrors.IsNotFound(errors.Cause(err)) {
-        return nil, errors.Wrapf(err, "failed to get %s %q for Machine %q in namespace %q",
-            ref.GroupVersionKind(), ref.Name, m.Name, m.Namespace)
+func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, s *scope) (bool, error) {
+    if s.infraMachine == nil {
+        conditions.MarkFalse(s.machine, clusterv1.InfrastructureReadyCondition, clusterv1.DeletedReason, clusterv1.ConditionSeverityInfo, "")
+        return true, nil
     }

-    if obj != nil {
-        // reconcileExternal ensures that we set the object's OwnerReferences correctly and watch the object.
-        // The machine delete logic depends on reconciling the machine when the external objects are deleted.
-        // This avoids a race condition where the machine is deleted before the external objects are ever reconciled
-        // by this controller.
-        if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil {
-            return nil, err
-        }
-
-        // Issue a delete request.
-        if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
-            return obj, errors.Wrapf(err,
-                "failed to delete %v %q for Machine %q in namespace %q",
-                obj.GroupVersionKind(), obj.GetName(), m.Name, m.Namespace)
-        }
+    if err := r.Client.Delete(ctx, s.infraMachine); err != nil && !apierrors.IsNotFound(err) {
+        return false, errors.Wrapf(err,
+            "failed to delete %v %q for Machine %q in namespace %q",
+            s.infraMachine.GroupVersionKind(), s.infraMachine.GetName(), s.machine.Name, s.machine.Namespace)
     }

-    // Return true if there are no more external objects.
-    return obj, nil
+    return false, nil
 }

 // shouldAdopt returns true if the Machine should be adopted as a stand-alone Machine directly owned by the Cluster.
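For readers skimming the diff above, the core of the refactor is the small driver that replaces the old reconcile/reconcileDelete split: a shared list of always-run phases is combined with path-specific phases and executed by one loop. Below is a minimal, self-contained sketch of that composition idea; the result, scope, and reconcileFunc types are simplified stand-ins rather than the Cluster API or controller-runtime types, and errors.Join stands in for whatever error aggregation the controller actually uses.

package main

import (
    "context"
    "errors"
    "fmt"
)

type result struct{ requeue bool }

type scope struct{ deleting bool }

type reconcileFunc func(context.Context, *scope) (result, error)

// doReconcile runs every phase, remembers whether any phase asked for a
// requeue, and aggregates errors instead of stopping at the first failure.
func doReconcile(ctx context.Context, phases []reconcileFunc, s *scope) (result, error) {
    res := result{}
    var errs []error
    for _, phase := range phases {
        phaseResult, err := phase(ctx, s)
        if err != nil {
            errs = append(errs, err)
            continue
        }
        if phaseResult.requeue {
            res.requeue = true
        }
    }
    return res, errors.Join(errs...)
}

func main() {
    // Phases that run on every reconcile, mirroring alwaysReconcile above.
    alwaysReconcile := []reconcileFunc{
        func(context.Context, *scope) (result, error) { fmt.Println("bootstrap"); return result{}, nil },
        func(context.Context, *scope) (result, error) { fmt.Println("infrastructure"); return result{}, nil },
        func(context.Context, *scope) (result, error) { fmt.Println("node"); return result{}, nil },
    }

    s := &scope{deleting: true}
    if s.deleting {
        // Deletion path: the shared phases still run, then the delete-specific phase.
        reconcileDelete := append(alwaysReconcile, func(context.Context, *scope) (result, error) {
            fmt.Println("delete")
            return result{}, nil
        })
        if _, err := doReconcile(context.Background(), reconcileDelete, s); err != nil {
            fmt.Println("aggregated error:", err)
        }
    }
}

One detail worth noting in the real change: the delete path appends r.reconcileDelete to alwaysReconcile, while the normal path spreads alwaysReconcile onto a fresh slice that starts with r.reconcileMachineOwner, so the shared phases run in both cases.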

internal/controllers/machine/machine_controller_noderef.go

Lines changed: 23 additions & 8 deletions
@@ -72,6 +72,11 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
     node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID)
     if err != nil {
         if err == ErrNodeNotFound {
+            if !s.machine.DeletionTimestamp.IsZero() {
+                // Tolerate node not found when the machine is being deleted.
+                return ctrl.Result{}, nil
+            }
+
             // While a NodeRef is set in the status, failing to get that node means the node is deleted.
             // If Status.NodeRef is not set before, node still can be in the provisioning state.
             if machine.Status.NodeRef != nil {
@@ -87,21 +92,22 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
         conditions.MarkUnknown(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeInspectionFailedReason, "Failed to get the Node for this Machine by ProviderID")
         return ctrl.Result{}, err
     }
+    s.node = node

     // Set the Machine NodeRef.
     if machine.Status.NodeRef == nil {
         machine.Status.NodeRef = &corev1.ObjectReference{
             APIVersion: corev1.SchemeGroupVersion.String(),
             Kind:       "Node",
-            Name:       node.Name,
-            UID:        node.UID,
+            Name:       s.node.Name,
+            UID:        s.node.UID,
         }
         log.Info("Infrastructure provider reporting spec.providerID, Kubernetes node is now available", machine.Spec.InfrastructureRef.Kind, klog.KRef(machine.Spec.InfrastructureRef.Namespace, machine.Spec.InfrastructureRef.Name), "providerID", *machine.Spec.ProviderID, "Node", klog.KRef("", machine.Status.NodeRef.Name))
         r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetNodeRef", machine.Status.NodeRef.Name)
     }

     // Set the NodeSystemInfo.
-    machine.Status.NodeInfo = &node.Status.NodeInfo
+    machine.Status.NodeInfo = &s.node.Status.NodeInfo

     // Compute all the annotations that CAPI is setting on nodes;
     // CAPI only enforces some annotations and never changes or removes them.
@@ -134,21 +140,26 @@ func (r *Reconciler) reconcileNode(ctx context.Context, s *scope) (ctrl.Result,
         }
     }

-    _, nodeHadInterruptibleLabel := node.Labels[clusterv1.InterruptibleLabel]
+    _, nodeHadInterruptibleLabel := s.node.Labels[clusterv1.InterruptibleLabel]

     // Reconcile node taints
-    if err := r.patchNode(ctx, remoteClient, node, nodeLabels, nodeAnnotations, machine); err != nil {
-        return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile Node %s", klog.KObj(node))
+    if err := r.patchNode(ctx, remoteClient, s.node, nodeLabels, nodeAnnotations, machine); err != nil {
+        return ctrl.Result{}, errors.Wrapf(err, "failed to reconcile Node %s", klog.KObj(s.node))
     }
     if !nodeHadInterruptibleLabel && interruptible {
         // If the interruptible label is added to the node then record the event.
         // Nb. Only record the event if the node previously did not have the label to avoid recording
         // the event during every reconcile.
-        r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetInterruptibleNodeLabel", node.Name)
+        r.recorder.Event(machine, corev1.EventTypeNormal, "SuccessfulSetInterruptibleNodeLabel", s.node.Name)
     }

     // Do the remaining node health checks, then set the node health to true if all checks pass.
-    status, message := summarizeNodeConditions(node)
+    if s.infraMachine == nil || !s.infraMachine.GetDeletionTimestamp().IsZero() {
+        conditions.MarkFalse(s.machine, clusterv1.MachineNodeHealthyCondition, clusterv1.DeletingReason, clusterv1.ConditionSeverityInfo, "")
+        return ctrl.Result{}, nil
+    }
+
+    status, message := summarizeNodeConditions(s.node)
     if status == corev1.ConditionFalse {
         conditions.MarkFalse(machine, clusterv1.MachineNodeHealthyCondition, clusterv1.NodeConditionsFailedReason, clusterv1.ConditionSeverityWarning, message)
         return ctrl.Result{}, nil
@@ -214,6 +225,10 @@ func summarizeNodeConditions(node *corev1.Node) (corev1.ConditionStatus, string)
             }
         }
     }
+    message = strings.TrimSpace(message)
+    if strings.HasSuffix(message, ".") {
+        message = strings.TrimSuffix(message, ".")
+    }
     if semanticallyFalseStatus > 0 {
         return corev1.ConditionFalse, message
     }
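The new tail of summarizeNodeConditions only normalizes the aggregated condition message, so it can be checked in isolation. A minimal sketch of just that normalization, using nothing but the standard strings package (the surrounding condition bookkeeping and corev1 types are omitted):

package main

import (
    "fmt"
    "strings"
)

// normalizeConditionMessage mirrors the message clean-up added to
// summarizeNodeConditions: trim surrounding whitespace, then drop a
// single trailing period.
func normalizeConditionMessage(message string) string {
    message = strings.TrimSpace(message)
    // strings.TrimSuffix is already a no-op when the suffix is absent, so the
    // HasSuffix guard simply mirrors the diff rather than being required.
    if strings.HasSuffix(message, ".") {
        message = strings.TrimSuffix(message, ".")
    }
    return message
}

func main() {
    fmt.Println(normalizeConditionMessage("Node is not ready. "))  // "Node is not ready"
    fmt.Println(normalizeConditionMessage("Node is provisioning")) // unchanged
}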

internal/controllers/machine/machine_controller_noderef_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,34 @@ func TestReconcileNode(t *testing.T) {
159159
expectResult: ctrl.Result{},
160160
expectError: true,
161161
},
162+
{
163+
name: "node not found is tolerated when machine is deleting",
164+
machine: &clusterv1.Machine{
165+
ObjectMeta: metav1.ObjectMeta{
166+
Name: "machine-test",
167+
Namespace: metav1.NamespaceDefault,
168+
Labels: map[string]string{
169+
clusterv1.ClusterNameLabel: "test-cluster",
170+
},
171+
DeletionTimestamp: ptr.To(metav1.Now()),
172+
Finalizers: []string{"foo"},
173+
},
174+
Spec: clusterv1.MachineSpec{
175+
ProviderID: ptr.To("aws://us-east-1/test-node-1"),
176+
},
177+
Status: clusterv1.MachineStatus{
178+
NodeRef: &corev1.ObjectReference{
179+
Kind: "Node",
180+
Name: "test-node-1",
181+
APIVersion: "v1",
182+
},
183+
},
184+
},
185+
node: nil,
186+
nodeGetErr: false,
187+
expectResult: ctrl.Result{},
188+
expectError: false,
189+
},
162190
}
163191

164192
for _, tc := range testCases {
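The added case feeds reconcileNode a Machine that carries a deletion timestamp together with a nil node and expects neither a requeue nor an error, matching the new early return in machine_controller_noderef.go. Assuming a standard Go toolchain and the repository's usual module layout, running go test ./internal/controllers/machine/ -run TestReconcileNode should exercise this table, including the new deleting-machine case.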
