Commit 5211c42

Merge pull request #11991 from sbueringer/pr-improve-crd-migration
🐛 CRD migration: Fix cases where update validation fails
2 parents: d69751f + 71f75a2

15 files changed: +124 -74 lines changed

bootstrap/kubeadm/main.go

Lines changed: 1 addition & 1 deletion
@@ -365,7 +365,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
 		// Note: The kubebuilder RBAC markers above has to be kept in sync
 		// with the CRDs that should be migrated by this provider.
 		Config: map[client.Object]crdmigrator.ByObjectConfig{
-			&bootstrapv1.KubeadmConfig{}:         {UseCache: true},
+			&bootstrapv1.KubeadmConfig{}:         {UseCache: true, UseStatusForStorageVersionMigration: true},
 			&bootstrapv1.KubeadmConfigTemplate{}: {UseCache: false},
 		},
 		// The CRDMigrator is run with only concurrency 1 to ensure we don't overwhelm the apiserver by patching a

config/rbac/role.yaml

Lines changed: 7 additions & 0 deletions
@@ -159,6 +159,13 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - ipam.cluster.x-k8s.io
+  resources:
+  - ipaddressclaims/status
+  verbs:
+  - patch
+  - update
 - apiGroups:
   - runtime.cluster.x-k8s.io
   resources:

controllers/crdmigrator/crd_migrator.go

Lines changed: 35 additions & 13 deletions
@@ -22,6 +22,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"strconv"
+	"time"
 
 	"github.com/pkg/errors"
 	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
@@ -101,6 +102,12 @@ type ByObjectConfig struct {
 	// Note: If this is enabled, we will use the corresponding Go type of the object for Get & List calls to avoid
 	// creating additional informers for UnstructuredList/PartialObjectMetadataList.
 	UseCache bool
+
+	// UseStatusForStorageVersionMigration configures if the storage version migration for this CRD should
+	// be triggered via the status endpoint instead of an update on the CRs directly (which is the default).
+	// As mutating and validating webhooks are usually not configured on the status subresource this can help to
+	// avoid mutating & validation webhook errors that would block the no-op updates and thus the storage migration.
+	UseStatusForStorageVersionMigration bool
 }
 
 func (r *CRDMigrator) SetupWithManager(ctx context.Context, mgr ctrl.Manager, controllerOptions controller.Options) error {
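
For provider authors, opting into the new behavior is a one-line change in the CRDMigrator configuration. A minimal sketch of the pattern (mirroring the provider main.go changes in this commit; the infrav1 types and the helper name are placeholders, not part of this change):

// crdMigratorConfig is a hypothetical helper for a provider's setupReconcilers;
// crdmigrator and client are the real Cluster API / controller-runtime packages,
// infrav1 stands in for the provider's own API package.
func crdMigratorConfig() map[client.Object]crdmigrator.ByObjectConfig {
	return map[client.Object]crdmigrator.ByObjectConfig{
		// CRs whose webhooks may reject the no-op update: trigger the storage version
		// migration through the /status subresource instead of the CR itself.
		&infrav1.MyMachine{}: {UseCache: true, UseStatusForStorageVersionMigration: true},
		// Everything else keeps the default behavior (patch on the CR directly).
		&infrav1.MyMachineTemplate{}: {UseCache: false},
	}
}
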
@@ -164,7 +171,7 @@ func (r *CRDMigrator) setup(scheme *runtime.Scheme) error {
 		r.configByCRDName[contract.CalculateCRDName(gvk.Group, gvk.Kind)] = cfg
 	}
 
-	r.storageVersionMigrationCache = cache.New[objectEntry]()
+	r.storageVersionMigrationCache = cache.New[objectEntry](1 * time.Hour)
 	return nil
 }
 
@@ -230,7 +237,7 @@ func (r *CRDMigrator) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.R
 
 	// If phase should be run and .status.storedVersions != [storageVersion], run storage version migration.
 	if r.crdMigrationPhasesToRun.Has(StorageVersionMigrationPhase) && storageVersionMigrationRequired(crd, storageVersion) {
-		if err := r.reconcileStorageVersionMigration(ctx, crd, customResourceObjects, storageVersion); err != nil {
+		if err := r.reconcileStorageVersionMigration(ctx, crd, migrationConfig, customResourceObjects, storageVersion); err != nil {
 			return ctrl.Result{}, err
 		}
 
@@ -252,18 +259,13 @@ func (r *CRDMigrator) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.R
 }
 
 func storageVersionForCRD(crd *apiextensionsv1.CustomResourceDefinition) (string, error) {
-	var storageVersion string
 	for _, v := range crd.Spec.Versions {
 		if v.Storage {
-			storageVersion = v.Name
-			break
+			return v.Name, nil
 		}
 	}
-	if storageVersion == "" {
-		return "", errors.Errorf("could not find storage version for CustomResourceDefinition %s", crd.Name)
-	}
 
-	return storageVersion, nil
+	return "", errors.Errorf("could not find storage version for CustomResourceDefinition %s", crd.Name)
 }
 
 func storageVersionMigrationRequired(crd *apiextensionsv1.CustomResourceDefinition, storageVersion string) bool {
@@ -360,7 +362,7 @@ func listObjectsFromAPIReader(ctx context.Context, c client.Reader, objectList c
 	return objs, nil
 }
 
-func (r *CRDMigrator) reconcileStorageVersionMigration(ctx context.Context, crd *apiextensionsv1.CustomResourceDefinition, customResourceObjects []client.Object, storageVersion string) error {
+func (r *CRDMigrator) reconcileStorageVersionMigration(ctx context.Context, crd *apiextensionsv1.CustomResourceDefinition, migrationConfig ByObjectConfig, customResourceObjects []client.Object, storageVersion string) error {
 	if len(customResourceObjects) == 0 {
 		return nil
 	}
@@ -374,6 +376,7 @@ func (r *CRDMigrator) reconcileStorageVersionMigration(ctx context.Context, crd
 		Kind:    crd.Spec.Names.Kind,
 	}
 
+	errs := []error{}
 	for _, obj := range customResourceObjects {
 		e := objectEntry{
 			Kind: gvk.Kind,
@@ -384,6 +387,9 @@ func (r *CRDMigrator) reconcileStorageVersionMigration(ctx context.Context, crd
 		}
 
 		if _, alreadyMigrated := r.storageVersionMigrationCache.Has(e.Key()); alreadyMigrated {
+			// Refresh the cache entry, so that we don't try to migrate the storage version for CRs that were
+			// already migrated successfully in cases where storage migrations failed for a subset of the CRs.
+			r.storageVersionMigrationCache.Add(e)
 			continue
 		}
 
@@ -402,16 +408,26 @@ func (r *CRDMigrator) reconcileStorageVersionMigration(ctx context.Context, crd
 		u.SetResourceVersion(obj.GetResourceVersion())
 
 		log.V(4).Info("Migrating to new storage version", gvk.Kind, klog.KObj(u))
-		err := r.Client.Patch(ctx, u, client.Apply, client.FieldOwner("crdmigrator"))
+		var err error
+		if migrationConfig.UseStatusForStorageVersionMigration {
+			err = r.Client.Status().Patch(ctx, u, client.Apply, client.FieldOwner("crdmigrator"))
+		} else {
+			err = r.Client.Patch(ctx, u, client.Apply, client.FieldOwner("crdmigrator"))
+		}
 		// If we got a NotFound error, the object no longer exists so no need to update it.
 		// If we got a Conflict error, another client wrote the object already so no need to update it.
 		if err != nil && !apierrors.IsNotFound(err) && !apierrors.IsConflict(err) {
-			return errors.Wrapf(err, "failed to migrate storage version of %s %s", gvk.Kind, klog.KObj(u))
+			errs = append(errs, errors.Wrap(err, klog.KObj(u).String()))
+			continue
 		}
 
 		r.storageVersionMigrationCache.Add(e)
 	}
 
+	if len(errs) > 0 {
+		return errors.Wrapf(kerrors.NewAggregate(errs), "failed to migrate storage version of %s objects", gvk.Kind)
+	}
+
 	return nil
 }
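
For context, the migration write itself is a no-op server-side apply: any successful write makes the apiserver re-encode and store the object at the current storage version. A rough sketch of the idea (the object construction below is an assumption for illustration; only the SetResourceVersion call and the two patch calls appear verbatim in this diff):

// Build a minimal unstructured object at the CRD's storage version and apply it
// without changing any fields; the write alone re-stores the CR at that version.
u := &unstructured.Unstructured{}
u.SetGroupVersionKind(gvk)
u.SetNamespace(obj.GetNamespace())
u.SetName(obj.GetName())
// Fail with a Conflict instead of overwriting if another client wrote in the meantime.
u.SetResourceVersion(obj.GetResourceVersion())

var err error
if migrationConfig.UseStatusForStorageVersionMigration {
	// Writes to /status are stored like any other write, but mutating/validating
	// webhooks are usually not registered for the status subresource, so strict
	// update validation on the main resource cannot block the migration.
	err = r.Client.Status().Patch(ctx, u, client.Apply, client.FieldOwner("crdmigrator"))
} else {
	err = r.Client.Patch(ctx, u, client.Apply, client.FieldOwner("crdmigrator"))
}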

@@ -431,6 +447,7 @@ func (r *CRDMigrator) reconcileCleanupManagedFields(ctx context.Context, crd *ap
 		}
 	}
 
+	errs := []error{}
 	for _, obj := range customResourceObjects {
 		if len(obj.GetManagedFields()) == 0 {
 			continue
@@ -512,10 +529,15 @@ func (r *CRDMigrator) reconcileCleanupManagedFields(ctx context.Context, crd *ap
 			// Note: We always have to return the conflict error directly (instead of an aggregate) so retry on conflict works.
 			return err
 		}); err != nil {
-			return errors.Wrapf(kerrors.NewAggregate([]error{err, getErr}), "failed to cleanup managedFields of %s %s", crd.Spec.Names.Kind, klog.KObj(obj))
+			errs = append(errs, errors.Wrap(kerrors.NewAggregate([]error{err, getErr}), klog.KObj(obj).String()))
+			continue
 		}
 	}
 
+	if len(errs) > 0 {
+		return errors.Wrapf(kerrors.NewAggregate(errs), "failed to cleanup managedFields of %s objects", crd.Spec.Names.Kind)
+	}
+
 	return nil
 }

controllers/crdmigrator/crd_migrator_test.go

Lines changed: 47 additions & 27 deletions
@@ -63,43 +63,63 @@ func TestReconcile(t *testing.T) {
 	crdObjectKey := client.ObjectKey{Name: crdName}
 
 	tests := []struct {
-		name                   string
-		skipCRDMigrationPhases []Phase
-		useCache               bool
+		name                                string
+		skipCRDMigrationPhases              []Phase
+		useCache                            bool
+		useStatusForStorageVersionMigration bool
 	}{
 		{
-			name:                   "run both StorageVersionMigration and CleanupManagedFields with cache",
-			skipCRDMigrationPhases: nil,
-			useCache:               true,
+			name:                                "run both StorageVersionMigration and CleanupManagedFields with cache",
+			skipCRDMigrationPhases:              nil,
+			useCache:                            true,
+			useStatusForStorageVersionMigration: false,
 		},
 		{
-			name:                   "run both StorageVersionMigration and CleanupManagedFields without cache",
-			skipCRDMigrationPhases: nil,
-			useCache:               false,
+			name:                                "run both StorageVersionMigration and CleanupManagedFields without cache",
+			skipCRDMigrationPhases:              nil,
+			useCache:                            false,
+			useStatusForStorageVersionMigration: false,
 		},
 		{
-			name:                   "run only CleanupManagedFields with cache",
-			skipCRDMigrationPhases: []Phase{StorageVersionMigrationPhase},
-			useCache:               true,
+			name:                                "run both StorageVersionMigration and CleanupManagedFields with cache (using status)",
+			skipCRDMigrationPhases:              nil,
+			useCache:                            true,
+			useStatusForStorageVersionMigration: true,
 		},
 		{
-			name:                   "run only CleanupManagedFields without cache",
-			skipCRDMigrationPhases: []Phase{StorageVersionMigrationPhase},
-			useCache:               false,
+			name:                                "run both StorageVersionMigration and CleanupManagedFields without cache (using status)",
+			skipCRDMigrationPhases:              nil,
+			useCache:                            false,
+			useStatusForStorageVersionMigration: true,
 		},
 		{
-			name:                   "run only StorageVersionMigration with cache",
-			skipCRDMigrationPhases: []Phase{CleanupManagedFieldsPhase},
-			useCache:               true,
+			name:                                "run only CleanupManagedFields with cache",
+			skipCRDMigrationPhases:              []Phase{StorageVersionMigrationPhase},
+			useCache:                            true,
+			useStatusForStorageVersionMigration: false,
 		},
 		{
-			name:                   "run only StorageVersionMigration without cache",
-			skipCRDMigrationPhases: []Phase{CleanupManagedFieldsPhase},
-			useCache:               false,
+			name:                                "run only CleanupManagedFields without cache",
+			skipCRDMigrationPhases:              []Phase{StorageVersionMigrationPhase},
+			useCache:                            false,
+			useStatusForStorageVersionMigration: false,
 		},
 		{
-			name:                   "skip all",
-			skipCRDMigrationPhases: []Phase{StorageVersionMigrationPhase, CleanupManagedFieldsPhase},
+			name:                                "run only StorageVersionMigration with cache",
+			skipCRDMigrationPhases:              []Phase{CleanupManagedFieldsPhase},
+			useCache:                            true,
+			useStatusForStorageVersionMigration: false,
+		},
+		{
+			name:                                "run only StorageVersionMigration without cache",
+			skipCRDMigrationPhases:              []Phase{CleanupManagedFieldsPhase},
+			useCache:                            false,
+			useStatusForStorageVersionMigration: false,
+		},
+		{
+			name:                                "skip all",
+			skipCRDMigrationPhases:              []Phase{StorageVersionMigrationPhase, CleanupManagedFieldsPhase},
+			useStatusForStorageVersionMigration: false,
 		},
 	}
 	for _, tt := range tests {
@@ -127,19 +147,19 @@ func TestReconcile(t *testing.T) {
 			// Create manager for all steps.
 			skipCRDMigrationPhases := sets.Set[Phase]{}.Insert(tt.skipCRDMigrationPhases...)
 			managerT1, err := createManagerWithCRDMigrator(tt.skipCRDMigrationPhases, map[client.Object]ByObjectConfig{
-				&t1v1beta1.TestCluster{}: {UseCache: tt.useCache},
+				&t1v1beta1.TestCluster{}: {UseCache: tt.useCache, UseStatusForStorageVersionMigration: tt.useStatusForStorageVersionMigration},
 			}, t1v1beta1.AddToScheme)
 			g.Expect(err).ToNot(HaveOccurred())
 			managerT2, err := createManagerWithCRDMigrator(tt.skipCRDMigrationPhases, map[client.Object]ByObjectConfig{
-				&t2v1beta2.TestCluster{}: {UseCache: tt.useCache},
+				&t2v1beta2.TestCluster{}: {UseCache: tt.useCache, UseStatusForStorageVersionMigration: tt.useStatusForStorageVersionMigration},
 			}, t2v1beta2.AddToScheme)
 			g.Expect(err).ToNot(HaveOccurred())
 			managerT3, err := createManagerWithCRDMigrator(tt.skipCRDMigrationPhases, map[client.Object]ByObjectConfig{
-				&t3v1beta2.TestCluster{}: {UseCache: tt.useCache},
+				&t3v1beta2.TestCluster{}: {UseCache: tt.useCache, UseStatusForStorageVersionMigration: tt.useStatusForStorageVersionMigration},
 			}, t3v1beta2.AddToScheme)
 			g.Expect(err).ToNot(HaveOccurred())
 			managerT4, err := createManagerWithCRDMigrator(tt.skipCRDMigrationPhases, map[client.Object]ByObjectConfig{
-				&t4v1beta2.TestCluster{}: {UseCache: tt.useCache},
+				&t4v1beta2.TestCluster{}: {UseCache: tt.useCache, UseStatusForStorageVersionMigration: tt.useStatusForStorageVersionMigration},
 			}, t4v1beta2.AddToScheme)
 			g.Expect(err).ToNot(HaveOccurred())

controlplane/kubeadm/main.go

Lines changed: 1 addition & 1 deletion
@@ -408,7 +408,7 @@ func setupReconcilers(ctx context.Context, mgr ctrl.Manager) {
 		// Note: The kubebuilder RBAC markers above has to be kept in sync
 		// with the CRDs that should be migrated by this provider.
 		Config: map[client.Object]crdmigrator.ByObjectConfig{
-			&controlplanev1.KubeadmControlPlane{}:         {UseCache: true},
+			&controlplanev1.KubeadmControlPlane{}:         {UseCache: true, UseStatusForStorageVersionMigration: true},
 			&controlplanev1.KubeadmControlPlaneTemplate{}: {UseCache: false},
 		},
 		// The CRDMigrator is run with only concurrency 1 to ensure we don't overwhelm the apiserver by patching a

docs/book/src/developer/providers/migrations/v1.9-to-v1.10.md

Lines changed: 1 addition & 0 deletions
@@ -39,3 +39,4 @@ maintainers of providers and consumers of our Go API.
 - Note: The CRD migrator will add the `crd-migration.cluster.x-k8s.io/observed-generation` annotation on the CRD object,
   please ensure that if these CRD objects are deployed with a tool like kapp / Argo / Flux the annotation is not continuously removed.
 - For all CRs that should be migrated by the `CRDMigrator`: verbs: `get;list;watch;patch;update`
+- For all CRs with `UseStatusForStorageVersionMigration: true` verbs: `update;patch` on their `/status` resource (e.g. `ipaddressclaims/status`)
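
For providers that generate RBAC with kubebuilder/controller-gen, this typically means one extra marker next to the existing CRD migration markers. A sketch for the IPAddressClaim case added to role.yaml above (the marker placement is an assumption; adjust group and resource to the CRs your provider migrates via status):

// Placed e.g. in the provider's main.go next to the other CRD migration RBAC markers;
// controller-gen expands it into the ipaddressclaims/status rule shown in role.yaml above.
// +kubebuilder:rbac:groups=ipam.cluster.x-k8s.io,resources=ipaddressclaims/status,verbs=update;patch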

internal/controllers/clusterclass/clusterclass_controller.go

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
 		return errors.Wrap(err, "failed setting up with a controller manager")
 	}
 
-	r.discoverVariablesCache = cache.New[runtimeclient.CallExtensionCacheEntry]()
+	r.discoverVariablesCache = cache.New[runtimeclient.CallExtensionCacheEntry](cache.DefaultTTL)
 	return nil
 }

internal/controllers/clusterclass/clusterclass_controller_test.go

Lines changed: 1 addition & 1 deletion
@@ -1158,7 +1158,7 @@ func TestReconciler_reconcileVariables(t *testing.T) {
 
 	r := &Reconciler{
 		RuntimeClient:          fakeRuntimeClient,
-		discoverVariablesCache: cache.New[runtimeclient.CallExtensionCacheEntry](),
+		discoverVariablesCache: cache.New[runtimeclient.CallExtensionCacheEntry](cache.DefaultTTL),
 	}
 
 	// Pin the compatibility version used in variable CEL validation to 1.29, so we don't have to continuously refactor

internal/controllers/machine/machine_controller.go

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
 		Scheme:          mgr.GetScheme(),
 		PredicateLogger: r.predicateLog,
 	}
-	r.reconcileDeleteCache = cache.New[cache.ReconcileEntry]()
+	r.reconcileDeleteCache = cache.New[cache.ReconcileEntry](cache.DefaultTTL)
 	return nil
 }

internal/controllers/machine/machine_controller_test.go

Lines changed: 8 additions & 10 deletions
@@ -827,8 +827,6 @@ func TestReconcileRequest(t *testing.T) {
 			},
 		},
 	}
 
-	time := metav1.Now()
-
 	testCluster := clusterv1.Cluster{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: "test-cluster",
@@ -922,7 +920,7 @@ func TestReconcileRequest(t *testing.T) {
 					clusterv1.MachineControlPlaneLabel: "",
 				},
 				Finalizers:        []string{clusterv1.MachineFinalizer},
-				DeletionTimestamp: &time,
+				DeletionTimestamp: ptr.To(metav1.Now()),
 			},
 			Spec: clusterv1.MachineSpec{
 				ClusterName: "test-cluster",
@@ -958,7 +956,7 @@ func TestReconcileRequest(t *testing.T) {
 				Client:               clientFake,
 				ClusterCache:         clustercache.NewFakeClusterCache(clientFake, client.ObjectKey{Name: testCluster.Name, Namespace: testCluster.Namespace}),
 				recorder:             record.NewFakeRecorder(10),
-				reconcileDeleteCache: cache.New[cache.ReconcileEntry](),
+				reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 				externalTracker: external.ObjectTracker{
 					Controller: externalfake.Controller{},
 					Cache:      &informertest.FakeInformers{},
@@ -1314,7 +1312,7 @@ func TestRemoveMachineFinalizerAfterDeleteReconcile(t *testing.T) {
 	mr := &Reconciler{
 		Client:               c,
 		ClusterCache:         clustercache.NewFakeClusterCache(c, client.ObjectKeyFromObject(testCluster)),
-		reconcileDeleteCache: cache.New[cache.ReconcileEntry](),
+		reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 	}
 	_, err := mr.Reconcile(ctx, reconcile.Request{NamespacedName: key})
 	g.Expect(err).ToNot(HaveOccurred())
@@ -1709,7 +1707,7 @@ func TestDrainNode(t *testing.T) {
 			r := &Reconciler{
 				Client:               c,
 				ClusterCache:         clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)),
-				reconcileDeleteCache: cache.New[cache.ReconcileEntry](),
+				reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 			}
 
 			testMachine.Status.NodeRef = &corev1.ObjectReference{
@@ -1840,7 +1838,7 @@ func TestDrainNode_withCaching(t *testing.T) {
 		WithObjects(remoteObjs...).
 		Build()
 
-	reconcileDeleteCache := cache.New[cache.ReconcileEntry]()
+	reconcileDeleteCache := cache.New[cache.ReconcileEntry](cache.DefaultTTL)
 	r := &Reconciler{
 		Client:       c,
 		ClusterCache: clustercache.NewFakeClusterCache(remoteClient, client.ObjectKeyFromObject(testCluster)),
@@ -2498,7 +2496,7 @@ func TestShouldWaitForNodeVolumes(t *testing.T) {
 			r := &Reconciler{
 				Client:               fakeClient,
 				ClusterCache:         clustercache.NewFakeClusterCache(remoteFakeClient, client.ObjectKeyFromObject(testCluster)),
-				reconcileDeleteCache: cache.New[cache.ReconcileEntry](),
+				reconcileDeleteCache: cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 			}
 
 			testMachine.Status.NodeRef = &corev1.ObjectReference{
@@ -3335,7 +3333,7 @@ func TestNodeDeletion(t *testing.T) {
 				ClusterCache:             clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)),
 				recorder:                 record.NewFakeRecorder(10),
 				nodeDeletionRetryTimeout: 10 * time.Millisecond,
-				reconcileDeleteCache:     cache.New[cache.ReconcileEntry](),
+				reconcileDeleteCache:     cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 			}
 
 			cluster := testCluster.DeepCopy()
@@ -3469,7 +3467,7 @@ func TestNodeDeletionWithoutNodeRefFallback(t *testing.T) {
 				ClusterCache:             clustercache.NewFakeClusterCache(fakeClient, client.ObjectKeyFromObject(&testCluster)),
 				recorder:                 record.NewFakeRecorder(10),
 				nodeDeletionRetryTimeout: 10 * time.Millisecond,
-				reconcileDeleteCache:     cache.New[cache.ReconcileEntry](),
+				reconcileDeleteCache:     cache.New[cache.ReconcileEntry](cache.DefaultTTL),
 			}
 
 			s := &scope{
