Commit 920ee7d

Merge pull request #11771 from sbueringer/pr-fix-topo-controller
🐛 Ensure Cluster topology controller is not stuck when MDs are stuck in deletion
2 parents ffd15a9 + d07153a commit 920ee7d

3 files changed: +134 -48 lines changed


internal/controllers/topology/cluster/current_state.go

Lines changed: 50 additions & 44 deletions
@@ -204,58 +204,64 @@ func (r *Reconciler) getCurrentMachineDeploymentState(ctx context.Context, bluep
             return nil, fmt.Errorf("duplicate MachineDeployment %s found for label %s: %s", klog.KObj(m), clusterv1.ClusterTopologyMachineDeploymentNameLabel, mdTopologyName)
         }
 
-        // Gets the bootstrapRef.
-        bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
-        if bootstrapRef == nil {
-            return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
-        }
-        // Gets the infraRef.
-        infraRef := &m.Spec.Template.Spec.InfrastructureRef
-        if infraRef.Name == "" {
-            return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
-        }
+        // Skip getting templates for MachineDeployments that are in deleting.
+        // Note: We don't need these templates for deleting MDs, but also this would likely fail because usually
+        // the MachineDeployment topology controller deletes the templates as soon as the MD is in deleting.
+        var bootstrapTemplate, infraMachineTemplate *unstructured.Unstructured
+        if m.DeletionTimestamp.IsZero() {
+            // Gets the bootstrapRef.
+            bootstrapRef := m.Spec.Template.Spec.Bootstrap.ConfigRef
+            if bootstrapRef == nil {
+                return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a Bootstrap Config", klog.KObj(m))
+            }
+            // Gets the infraRef.
+            infraRef := &m.Spec.Template.Spec.InfrastructureRef
+            if infraRef.Name == "" {
+                return nil, fmt.Errorf("MachineDeployment %s does not have a reference to a InfrastructureMachineTemplate", klog.KObj(m))
+            }
 
-        // If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
-        // the apiVersions in the bootstrapRef and infraRef.
-        // If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
-        // **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
-        if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
-            mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
-            if !ok {
-                return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
+            // If the mdTopology exists in the Cluster, lookup the corresponding mdBluePrint and align
+            // the apiVersions in the bootstrapRef and infraRef.
+            // If the mdTopology doesn't exist, do nothing (this can happen if the mdTopology was deleted).
+            // **Note** We can't check if the MachineDeployment has a DeletionTimestamp, because at this point it could not be set yet.
+            if mdTopologyExistsInCluster, mdClassName := getMDClassName(cluster, mdTopologyName); mdTopologyExistsInCluster {
+                mdBluePrint, ok := blueprintMachineDeployments[mdClassName]
+                if !ok {
+                    return nil, fmt.Errorf("failed to find MachineDeployment class %s in ClusterClass", mdClassName)
+                }
+                bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)
+                if err != nil {
+                    return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
+                }
+                infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
+                if err != nil {
+                    return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
+                }
             }
-            bootstrapRef, err = alignRefAPIVersion(mdBluePrint.BootstrapTemplate, bootstrapRef)
+
+            // Get the BootstrapTemplate.
+            bootstrapTemplate, err = r.getReference(ctx, bootstrapRef)
             if err != nil {
                 return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
             }
-            infraRef, err = alignRefAPIVersion(mdBluePrint.InfrastructureMachineTemplate, infraRef)
+            // check that the referenced object has the ClusterTopologyOwnedLabel label.
+            // Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
+            // owned by the topology.
+            if !labels.IsTopologyOwned(bootstrapTemplate) {
+                return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
+            }
+
+            // Get the InfraMachineTemplate.
+            infraMachineTemplate, err = r.getReference(ctx, infraRef)
             if err != nil {
                 return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
             }
-        }
-
-        // Get the BootstrapTemplate.
-        bootstrapTemplate, err := r.getReference(ctx, bootstrapRef)
-        if err != nil {
-            return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Bootstrap reference could not be retrieved", klog.KObj(m)))
-        }
-        // check that the referenced object has the ClusterTopologyOwnedLabel label.
-        // Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
-        // owned by the topology.
-        if !labels.IsTopologyOwned(bootstrapTemplate) {
-            return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", bootstrapTemplate.GetKind(), klog.KObj(bootstrapTemplate), klog.KObj(m))
-        }
-
-        // Get the InfraMachineTemplate.
-        infraMachineTemplate, err := r.getReference(ctx, infraRef)
-        if err != nil {
-            return nil, errors.Wrap(err, fmt.Sprintf("MachineDeployment %s Infrastructure reference could not be retrieved", klog.KObj(m)))
-        }
-        // check that the referenced object has the ClusterTopologyOwnedLabel label.
-        // Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
-        // owned by the topology.
-        if !labels.IsTopologyOwned(infraMachineTemplate) {
-            return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
+            // check that the referenced object has the ClusterTopologyOwnedLabel label.
+            // Nb. This is to make sure that a managed topology cluster does not have a reference to an object that is not
+            // owned by the topology.
+            if !labels.IsTopologyOwned(infraMachineTemplate) {
+                return nil, fmt.Errorf("%s %s referenced from MachineDeployment %s is not topology owned", infraMachineTemplate.GetKind(), klog.KObj(infraMachineTemplate), klog.KObj(m))
+            }
         }
 
         // Gets the MachineHealthCheck.
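
The core of this change is the new guard in getCurrentMachineDeploymentState: template lookups only happen while m.DeletionTimestamp.IsZero(), so a MachineDeployment stuck in deletion (whose templates are usually already deleted, per the comment above) no longer fails the whole topology reconcile. Below is a minimal, self-contained sketch of that guard pattern, not the controller code itself; fakeMD and needsTemplates are hypothetical names used only for illustration.

package main

import (
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// fakeMD stands in for any API object carrying standard object metadata.
type fakeMD struct {
    metav1.ObjectMeta
}

// needsTemplates reports whether referenced templates should still be looked up:
// only while the object has not been marked for deletion.
func needsTemplates(md *fakeMD) bool {
    // (*metav1.Time).IsZero is nil-safe, so this works whether or not a
    // deletion timestamp has been set.
    return md.DeletionTimestamp.IsZero()
}

func main() {
    md := &fakeMD{}
    fmt.Println(needsTemplates(md)) // true: templates are fetched as before

    now := metav1.Now()
    md.DeletionTimestamp = &now
    fmt.Println(needsTemplates(md)) // false: template lookup is skipped
}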

internal/controllers/topology/cluster/current_state_test.go

Lines changed: 74 additions & 0 deletions
@@ -22,9 +22,11 @@ import (
 
     "github.com/google/go-cmp/cmp"
     . "github.com/onsi/gomega"
+    "golang.org/x/exp/maps"
     corev1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+    "k8s.io/utils/ptr"
     "sigs.k8s.io/controller-runtime/pkg/client"
     "sigs.k8s.io/controller-runtime/pkg/client/fake"
     . "sigs.k8s.io/controller-runtime/pkg/envtest/komega"
@@ -115,6 +117,9 @@ func TestGetCurrentState(t *testing.T) {
        WithBootstrapTemplate(machineDeploymentBootstrap).
        WithInfrastructureTemplate(machineDeploymentInfrastructure).
        Build()
+    machineDeploymentWithDeletionTimestamp := machineDeployment.DeepCopy()
+    machineDeploymentWithDeletionTimestamp.Finalizers = []string{clusterv1.MachineDeploymentFinalizer} // required by fake client
+    machineDeploymentWithDeletionTimestamp.DeletionTimestamp = ptr.To(metav1.Now())
     machineDeployment2 := builder.MachineDeployment(metav1.NamespaceDefault, "md2").
        WithLabels(map[string]string{
            clusterv1.ClusterNameLabel: "cluster1",
@@ -1056,6 +1061,70 @@ func TestGetCurrentState(t *testing.T) {
                MachinePools: emptyMachinePools,
            },
        },
+        {
+            name: "Pass reading a full Cluster with a deleting MachineDeployment",
+            cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
+                WithInfrastructureCluster(infraCluster).
+                WithControlPlane(controlPlaneWithInfra).
+                WithTopology(builder.ClusterTopology().
+                    WithMachineDeployment(clusterv1.MachineDeploymentTopology{
+                        Class: "mdClass",
+                        Name:  "md1",
+                    }).
+                    Build()).
+                Build(),
+            blueprint: &scope.ClusterBlueprint{
+                ClusterClass:                  clusterClassWithControlPlaneInfra,
+                InfrastructureClusterTemplate: infraClusterTemplate,
+                ControlPlane: &scope.ControlPlaneBlueprint{
+                    Template:                      controlPlaneTemplateWithInfrastructureMachine,
+                    InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
+                },
+                MachineDeployments: map[string]*scope.MachineDeploymentBlueprint{
+                    "mdClass": {
+                        BootstrapTemplate:             machineDeploymentBootstrap,
+                        InfrastructureMachineTemplate: machineDeploymentInfrastructure,
+                    },
+                },
+            },
+            objects: []client.Object{
+                infraCluster,
+                clusterClassWithControlPlaneInfra,
+                controlPlaneInfrastructureMachineTemplate,
+                controlPlaneWithInfra,
+                machineDeploymentWithDeletionTimestamp,
+                machineHealthCheckForMachineDeployment,
+                machineHealthCheckForControlPlane,
+            },
+            // Expect valid return of full ClusterState with MachineDeployment without corresponding templates.
+            want: &scope.ClusterState{
+                Cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").
+                    WithInfrastructureCluster(infraCluster).
+                    WithControlPlane(controlPlaneWithInfra).
+                    WithTopology(builder.ClusterTopology().
+                        WithMachineDeployment(clusterv1.MachineDeploymentTopology{
+                            Class: "mdClass",
+                            Name:  "md1",
+                        }).
+                        Build()).
+                    Build(),
+                ControlPlane: &scope.ControlPlaneState{
+                    Object:                        controlPlaneWithInfra,
+                    InfrastructureMachineTemplate: controlPlaneInfrastructureMachineTemplate,
+                    MachineHealthCheck:            machineHealthCheckForControlPlane,
+                },
+                InfrastructureCluster: infraCluster,
+                MachineDeployments: map[string]*scope.MachineDeploymentState{
+                    "md1": {
+                        Object:                        machineDeploymentWithDeletionTimestamp,
+                        BootstrapTemplate:             nil,
+                        InfrastructureMachineTemplate: nil,
+                        MachineHealthCheck:            machineHealthCheckForMachineDeployment,
+                    },
+                },
+                MachinePools: emptyMachinePools,
+            },
+        },
     }
     for _, tt := range tests {
        t.Run(tt.name, func(t *testing.T) {
@@ -1096,6 +1165,11 @@ func TestGetCurrentState(t *testing.T) {
                return
            }
 
+            // Don't compare the deletionTimestamps as there are some minor differences in how they are stored pre/post fake client.
+            for _, md := range append(maps.Values(got.MachineDeployments), maps.Values(tt.want.MachineDeployments)...) {
+                md.Object.DeletionTimestamp = nil
+            }
+
             // Use EqualObject where the compared object is passed through the fakeClient. Elsewhere the Equal method is
             // good enough to establish equality.
             g.Expect(got.Cluster).To(EqualObject(tt.want.Cluster, IgnoreAutogeneratedMetadata))
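
The added normalization loop uses maps.Values from golang.org/x/exp/maps to collect the MachineDeployment states from both the actual and the expected map, then clears their deletion timestamps before comparison; per the test comment, the timestamps are stored slightly differently before and after the fake client round-trip. A minimal sketch of that idiom, with a hypothetical mdState type standing in for the real scope.MachineDeploymentState:

package main

import (
    "fmt"

    "golang.org/x/exp/maps"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/utils/ptr"
)

// mdState is a hypothetical stand-in for the state type held in the compared maps.
type mdState struct {
    DeletionTimestamp *metav1.Time
}

func main() {
    got := map[string]*mdState{"md1": {DeletionTimestamp: ptr.To(metav1.Now())}}
    want := map[string]*mdState{"md1": {DeletionTimestamp: ptr.To(metav1.Now())}}

    // maps.Values returns a slice of the map's values; appending both slices lets a
    // single loop clear the field that is expected to differ before the comparison.
    for _, md := range append(maps.Values(got), maps.Values(want)...) {
        md.DeletionTimestamp = nil
    }

    fmt.Println(got["md1"].DeletionTimestamp == nil, want["md1"].DeletionTimestamp == nil) // true true
}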

internal/controllers/topology/cluster/reconcile_state.go

Lines changed: 10 additions & 4 deletions
@@ -677,6 +677,10 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope
        }
    }
 
+    if !currentMD.Object.DeletionTimestamp.IsZero() {
+        return nil
+    }
+
     // Return early if the MachineDeployment is pending an upgrade.
     // Do not reconcile the MachineDeployment yet to avoid updating the MachineDeployment while it is still pending a
     // version upgrade. This will prevent the MachineDeployment from performing a double rollout.
@@ -809,11 +813,13 @@ func (r *Reconciler) deleteMachineDeployment(ctx context.Context, cluster *clust
            return err
        }
    }
-    log.Info("Deleting MachineDeployment")
-    if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
-        return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
+    if md.Object.DeletionTimestamp.IsZero() {
+        log.Info("Deleting MachineDeployment")
+        if err := r.Client.Delete(ctx, md.Object); err != nil && !apierrors.IsNotFound(err) {
+            return errors.Wrapf(err, "failed to delete MachineDeployment %s", klog.KObj(md.Object))
+        }
+        r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
     }
-    r.recorder.Eventf(cluster, corev1.EventTypeNormal, deleteEventReason, "Deleted MachineDeployment %q", klog.KObj(md.Object))
     return nil
 }
 
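Both hunks in this file apply the same guard: updateMachineDeployment now returns early when the current MachineDeployment already has a deletion timestamp, and deleteMachineDeployment only issues the Delete call and emits the event while the deletion timestamp is still unset, so repeated reconciles of a stuck deletion stay no-ops. A minimal sketch of that idempotent-delete pattern, using a hypothetical deleteOnce helper rather than the controller's own method:

package sketch

import (
    "context"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
    "sigs.k8s.io/controller-runtime/pkg/client"
)

// deleteOnce issues a delete request only if the object is not already marked for
// deletion and treats NotFound as success, so calling it on every reconcile does
// not re-delete or re-report an object whose deletion is blocked by finalizers.
func deleteOnce(ctx context.Context, c client.Client, obj client.Object) error {
    if !obj.GetDeletionTimestamp().IsZero() {
        // Deletion is already in progress; nothing left to do in this reconcile.
        return nil
    }
    if err := c.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) {
        return err
    }
    return nil
}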