@@ -19,6 +19,7 @@ package machineset
19
19
import (
20
20
"context"
21
21
"fmt"
22
+ "math"
22
23
"sort"
23
24
"strings"
24
25
"time"
@@ -54,6 +55,7 @@ import (
54
55
"sigs.k8s.io/cluster-api/util"
55
56
"sigs.k8s.io/cluster-api/util/collections"
56
57
"sigs.k8s.io/cluster-api/util/conditions"
58
+ v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
57
59
utilconversion "sigs.k8s.io/cluster-api/util/conversion"
58
60
"sigs.k8s.io/cluster-api/util/finalizers"
59
61
"sigs.k8s.io/cluster-api/util/labels/format"
@@ -1195,19 +1197,77 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
1195
1197
1196
1198
cluster := s .cluster
1197
1199
ms := s .machineSet
1198
- filteredMachines := s .machines
1200
+ machines := s .machines
1199
1201
owner := s .owningMachineDeployment
1200
1202
log := ctrl .LoggerFrom (ctx )
1201
1203
1204
+ // Remove OwnerRemediated condition from Machines that have HealthCheckSucceeded condition true
1205
+ // and OwnerRemediated condition false
1206
+ errList := []error {}
1207
+ for _ , m := range machines {
1208
+ if ! m .DeletionTimestamp .IsZero () {
1209
+ continue
1210
+ }
1211
+
1212
+ shouldCleanup := conditions .IsTrue (m , clusterv1 .MachineHealthCheckSucceededCondition ) && conditions .IsFalse (m , clusterv1 .MachineOwnerRemediatedCondition )
1213
+ shouldCleanupV1Beta2 := v1beta2conditions .IsTrue (m , clusterv1 .MachineHealthCheckSucceededV1Beta2Condition ) && v1beta2conditions .IsFalse (m , clusterv1 .MachineOwnerRemediatedV1Beta2Condition )
1214
+
1215
+ if ! (shouldCleanup || shouldCleanupV1Beta2 ) {
1216
+ continue
1217
+ }
1218
+
1219
+ patchHelper , err := patch .NewHelper (m , r .Client )
1220
+ if err != nil {
1221
+ errList = append (errList , err )
1222
+ continue
1223
+ }
1224
+
1225
+ if shouldCleanup {
1226
+ conditions .Delete (m , clusterv1 .MachineOwnerRemediatedCondition )
1227
+ }
1228
+
1229
+ if shouldCleanupV1Beta2 {
1230
+ v1beta2conditions .Delete (m , clusterv1 .MachineOwnerRemediatedV1Beta2Condition )
1231
+ }
1232
+
1233
+ if err := patchHelper .Patch (ctx , m , patch.WithOwnedConditions {Conditions : []clusterv1.ConditionType {
1234
+ clusterv1 .MachineOwnerRemediatedCondition ,
1235
+ }}, patch.WithOwnedV1Beta2Conditions {Conditions : []string {
1236
+ clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1237
+ }}); err != nil {
1238
+ errList = append (errList , err )
1239
+ }
1240
+ }
1241
+ if len (errList ) > 0 {
1242
+ return ctrl.Result {}, errors .Wrapf (kerrors .NewAggregate (errList ), "failed to remove OwnerRemediated condition from healhty Machines" )
1243
+ }
1244
+
1245
+ // Calculates the Machines to be remediated.
1246
+ // Note: Machines already deleting are not included, there is no need to trigger remediation for them again.
1247
+ machinesToRemediate := collections .FromMachines (machines ... ).Filter (collections .IsUnhealthyAndOwnerRemediated , collections .Not (collections .HasDeletionTimestamp )).UnsortedList ()
1248
+
1249
+ // If there are no machines to remediate return early.
1250
+ if len (machinesToRemediate ) == 0 {
1251
+ return ctrl.Result {}, nil
1252
+ }
1253
+
1202
1254
// Calculate how many in flight machines we should remediate.
1203
1255
// By default, we allow all machines to be remediated at the same time.
1204
- maxInFlight := len ( filteredMachines )
1256
+ maxInFlight := math . MaxInt
1205
1257
1206
1258
// If the MachineSet is part of a MachineDeployment, only allow remediations if
1207
1259
// it's the desired revision.
1208
1260
if isDeploymentChild (ms ) {
1209
1261
if owner .Annotations [clusterv1 .RevisionAnnotation ] != ms .Annotations [clusterv1 .RevisionAnnotation ] {
1210
1262
// MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed.
1263
+ if err := patchMachineConditions (ctx , r .Client , machinesToRemediate , metav1.Condition {
1264
+ Type : clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1265
+ Status : metav1 .ConditionFalse ,
1266
+ Reason : clusterv1 .MachineSetMachineCannotBeRemediatedV1Beta2Reason ,
1267
+ Message : "Machine won't be remediated because it is pending removal due to rollout" ,
1268
+ }, nil ); err != nil {
1269
+ return ctrl.Result {}, err
1270
+ }
1211
1271
return ctrl.Result {}, nil
1212
1272
}
1213
1273
@@ -1224,31 +1284,33 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
1224
1284
}
1225
1285
}
1226
1286
1227
- // List all unhealthy machines.
1228
- machinesToRemediate := make ([]* clusterv1.Machine , 0 , len (filteredMachines ))
1229
- for _ , m := range filteredMachines {
1230
- // filteredMachines contains machines in deleting status to calculate correct status.
1231
- // skip remediation for those in deleting status.
1287
+ // Update maxInFlight based on remediations that are in flight.
1288
+ // A Machine has a remediation in flight when Machine's OwnerRemediated condition
1289
+ // reports that remediation has been completed and the Machine is still being deleted (its deletionTimestamp is set).
1290
+ for _ , m := range machines {
1232
1291
if ! m .DeletionTimestamp .IsZero () {
1292
+ // TODO: Check for Status: False and Reason: MachineSetMachineRemediationMachineDeletedV1Beta2Reason
1293
+ // instead when starting to use v1beta2 conditions for control flow.
1233
1294
if conditions .IsTrue (m , clusterv1 .MachineOwnerRemediatedCondition ) {
1234
- // Machine has been remediated by this controller and still in flight.
1295
+ // Remediation for this Machine has been triggered by this controller but it is still in flight,
1296
+ // i.e. it still goes through the deletion workflow and exists in etcd.
1235
1297
maxInFlight --
1236
1298
}
1237
- continue
1238
- }
1239
- if conditions .IsFalse (m , clusterv1 .MachineOwnerRemediatedCondition ) {
1240
- machinesToRemediate = append (machinesToRemediate , m )
1241
1299
}
1242
1300
}
1243
1301
1244
- // If there are no machines to remediate return early.
1245
- if len (machinesToRemediate ) == 0 {
1246
- return ctrl.Result {}, nil
1247
- }
1248
1302
// Check if we can remediate any machines.
1249
1303
if maxInFlight <= 0 {
1250
1304
// No tokens available to remediate machines.
1251
1305
log .V (3 ).Info ("Remediation strategy is set, and maximum in flight has been reached" , "machinesToBeRemediated" , len (machinesToRemediate ))
1306
+ if err := patchMachineConditions (ctx , r .Client , machinesToRemediate , metav1.Condition {
1307
+ Type : clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1308
+ Status : metav1 .ConditionFalse ,
1309
+ Reason : clusterv1 .MachineSetMachineRemediationDeferredV1Beta2Reason ,
1310
+ Message : fmt .Sprintf ("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)" , owner .Spec .Strategy .Remediation .MaxInFlight ),
1311
+ }, nil ); err != nil {
1312
+ return ctrl.Result {}, err
1313
+ }
1252
1314
return ctrl.Result {}, nil
1253
1315
}
1254
1316
@@ -1263,11 +1325,22 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
1263
1325
if len (machinesToRemediate ) > maxInFlight {
1264
1326
log .V (5 ).Info ("Remediation strategy is set, limiting in flight operations" , "machinesToBeRemediated" , len (machinesToRemediate ))
1265
1327
// We have more machines to remediate than tokens available.
1266
- machinesToRemediate = machinesToRemediate [:maxInFlight ]
1328
+ allMachinesToRemediate := machinesToRemediate
1329
+ machinesToRemediate = allMachinesToRemediate [:maxInFlight ]
1330
+ machinesToDeferRemediation := allMachinesToRemediate [maxInFlight :]
1331
+
1332
+ if err := patchMachineConditions (ctx , r .Client , machinesToDeferRemediation , metav1.Condition {
1333
+ Type : clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1334
+ Status : metav1 .ConditionFalse ,
1335
+ Reason : clusterv1 .MachineSetMachineRemediationDeferredV1Beta2Reason ,
1336
+ Message : fmt .Sprintf ("Waiting because there are already too many remediations in progress (spec.strategy.remediation.maxInFlight is %s)" , owner .Spec .Strategy .Remediation .MaxInFlight ),
1337
+ }, nil ); err != nil {
1338
+ return ctrl.Result {}, err
1339
+ }
1267
1340
}
1268
1341
1269
1342
// Run preflight checks.
1270
- preflightChecksResult , preflightCheckErrMessage , err := r .runPreflightChecks (ctx , cluster , ms , "Machine Remediation " )
1343
+ preflightChecksResult , preflightCheckErrMessage , err := r .runPreflightChecks (ctx , cluster , ms , "Machine remediation " )
1271
1344
if err != nil {
1272
1345
// If err is not nil use that as the preflightCheckErrMessage
1273
1346
preflightCheckErrMessage = err .Error ()
@@ -1277,48 +1350,84 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
1277
1350
if preflightChecksFailed {
1278
1351
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
1279
1352
// WaitingForRemediationReason reason.
1280
- var errs []error
1281
- for _ , m := range machinesToRemediate {
1282
- patchHelper , err := patch .NewHelper (m , r .Client )
1283
- if err != nil {
1284
- errs = append (errs , err )
1285
- continue
1286
- }
1287
- conditions .MarkFalse (m , clusterv1 .MachineOwnerRemediatedCondition , clusterv1 .WaitingForRemediationReason , clusterv1 .ConditionSeverityWarning , preflightCheckErrMessage )
1288
- if err := patchHelper .Patch (ctx , m ); err != nil {
1289
- errs = append (errs , err )
1290
- }
1291
- }
1292
-
1293
- if len (errs ) > 0 {
1294
- return ctrl.Result {}, errors .Wrapf (kerrors .NewAggregate (errs ), "failed to patch unhealthy Machines" )
1353
+ if err := patchMachineConditions (ctx , r .Client , machinesToRemediate , metav1.Condition {
1354
+ Type : clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1355
+ Status : metav1 .ConditionFalse ,
1356
+ Reason : clusterv1 .MachineSetMachineRemediationDeferredV1Beta2Reason ,
1357
+ Message : preflightCheckErrMessage ,
1358
+ }, & clusterv1.Condition {
1359
+ Type : clusterv1 .MachineOwnerRemediatedCondition ,
1360
+ Status : corev1 .ConditionFalse ,
1361
+ Reason : clusterv1 .WaitingForRemediationReason ,
1362
+ Severity : clusterv1 .ConditionSeverityWarning ,
1363
+ Message : preflightCheckErrMessage ,
1364
+ }); err != nil {
1365
+ return ctrl.Result {}, err
1295
1366
}
1296
1367
return preflightChecksResult , nil
1297
1368
}
1298
1369
1299
- // PreflightChecks passed, so it is safe to remediate unhealthy machines.
1300
- // Remediate unhealthy machines by deleting them.
1370
+ // PreflightChecks passed, so it is safe to remediate unhealthy machines by deleting them.
1371
+
1372
+ // Note: We intentionally patch the Machines before we delete them to make this code reentrant.
1373
+ // If we delete the Machine first, the Machine would be filtered out on next reconcile because
1374
+ // it has a deletionTimestamp so it would never get the condition.
1375
+ // Instead if we set the condition but the deletion does not go through on next reconcile either the
1376
+ // condition will be fixed/updated or the Machine deletion will be retried.
1377
+ if err := patchMachineConditions (ctx , r .Client , machinesToRemediate , metav1.Condition {
1378
+ Type : clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1379
+ Status : metav1 .ConditionFalse ,
1380
+ Reason : clusterv1 .MachineSetMachineRemediationMachineDeletedV1Beta2Reason ,
1381
+ }, & clusterv1.Condition {
1382
+ Type : clusterv1 .MachineOwnerRemediatedCondition ,
1383
+ Status : corev1 .ConditionTrue ,
1384
+ }); err != nil {
1385
+ return ctrl.Result {}, err
1386
+ }
1301
1387
var errs []error
1302
1388
for _ , m := range machinesToRemediate {
1303
1389
log .Info ("Deleting unhealthy Machine" , "Machine" , klog .KObj (m ))
1304
- patch := client .MergeFrom (m .DeepCopy ())
1305
1390
if err := r .Client .Delete (ctx , m ); err != nil && ! apierrors .IsNotFound (err ) {
1306
1391
errs = append (errs , errors .Wrapf (err , "failed to delete Machine %s" , klog .KObj (m )))
1307
- continue
1308
- }
1309
- conditions .MarkTrue (m , clusterv1 .MachineOwnerRemediatedCondition )
1310
- if err := r .Client .Status ().Patch (ctx , m , patch ); err != nil && ! apierrors .IsNotFound (err ) {
1311
- errs = append (errs , errors .Wrapf (err , "failed to update status of Machine %s" , klog .KObj (m )))
1312
1392
}
1313
1393
}
1314
-
1315
1394
if len (errs ) > 0 {
1316
1395
return ctrl.Result {}, errors .Wrapf (kerrors .NewAggregate (errs ), "failed to delete unhealthy Machines" )
1317
1396
}
1318
1397
1319
1398
return ctrl.Result {}, nil
1320
1399
}
1321
1400
1401
+ func patchMachineConditions (ctx context.Context , c client.Client , machines []* clusterv1.Machine , v1beta2Condition metav1.Condition , condition * clusterv1.Condition ) error {
1402
+ var errs []error
1403
+ for _ , m := range machines {
1404
+ patchHelper , err := patch .NewHelper (m , c )
1405
+ if err != nil {
1406
+ errs = append (errs , err )
1407
+ continue
1408
+ }
1409
+
1410
+ if condition != nil {
1411
+ conditions .Set (m , condition )
1412
+ }
1413
+ v1beta2conditions .Set (m , v1beta2Condition )
1414
+
1415
+ if err := patchHelper .Patch (ctx , m ,
1416
+ patch.WithOwnedConditions {Conditions : []clusterv1.ConditionType {
1417
+ clusterv1 .MachineOwnerRemediatedCondition ,
1418
+ }}, patch.WithOwnedV1Beta2Conditions {Conditions : []string {
1419
+ clusterv1 .MachineOwnerRemediatedV1Beta2Condition ,
1420
+ }}); err != nil {
1421
+ errs = append (errs , err )
1422
+ }
1423
+ }
1424
+ if len (errs ) > 0 {
1425
+ return errors .Wrapf (kerrors .NewAggregate (errs ), "failed to patch Machines" )
1426
+ }
1427
+
1428
+ return nil
1429
+ }
1430
+
1322
1431
func (r * Reconciler ) reconcileExternalTemplateReference (ctx context.Context , cluster * clusterv1.Cluster , ms * clusterv1.MachineSet , owner * clusterv1.MachineDeployment , ref * corev1.ObjectReference ) (objectNotFound bool , err error ) {
1323
1432
if ! strings .HasSuffix (ref .Kind , clusterv1 .TemplateSuffix ) {
1324
1433
return false , nil
0 commit comments