Skip to content

Commit 8f27824

Browse files
Read etcd alarm list once per reconcile in KCP
1 parent c0cf7c6 commit 8f27824

File tree

2 files changed

+28
-24
lines changed

2 files changed

+28
-24
lines changed

controlplane/kubeadm/internal/etcd/etcd.go

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -114,9 +114,6 @@ type Member struct {
114114

115115
// IsLearner indicates if the member is raft learner.
116116
IsLearner bool
117-
118-
// Alarms is the list of alarms for a member.
119-
Alarms []AlarmType
120117
}
121118

122119
// pbMemberToMember converts the protobuf representation of a cluster member to a Member struct.
@@ -127,7 +124,6 @@ func pbMemberToMember(m *etcdserverpb.Member) *Member {
127124
PeerURLs: m.GetPeerURLs(),
128125
ClientURLs: m.GetClientURLs(),
129126
IsLearner: m.GetIsLearner(),
130-
Alarms: []AlarmType{},
131127
}
132128
}
133129

@@ -217,21 +213,11 @@ func (c *Client) Members(ctx context.Context) ([]*Member, error) {
217213
return nil, errors.Wrap(err, "failed to get list of members for etcd cluster")
218214
}
219215

220-
alarms, err := c.Alarms(ctx)
221-
if err != nil {
222-
return nil, err
223-
}
224-
225216
clusterID := response.Header.GetClusterId()
226217
members := make([]*Member, 0)
227218
for _, m := range response.Members {
228219
newMember := pbMemberToMember(m)
229220
newMember.ClusterID = clusterID
230-
for _, c := range alarms {
231-
if c.MemberID == newMember.ID {
232-
newMember.Alarms = append(newMember.Alarms, c.Type)
233-
}
234-
}
235221
members = append(members, newMember)
236222
}
237223

controlplane/kubeadm/internal/workload_cluster_conditions.go

Lines changed: 28 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
183183
continue
184184
}
185185

186-
currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name)
186+
currentMembers, alarms, err := w.getCurrentEtcdMembers(ctx, machine, node.Name)
187187
if err != nil {
188188
// Note: even if we fail to read the member list from one node/etcd member, we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
189189
// (this information is computed from what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).
@@ -232,14 +232,18 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
232232
})
233233
continue
234234
}
235-
if len(member.Alarms) > 0 {
235+
if len(alarms) > 0 {
236236
alarmList := []string{}
237-
for _, alarm := range member.Alarms {
238-
switch alarm {
237+
for _, alarm := range alarms {
238+
if alarm.MemberID != member.ID {
239+
continue
240+
}
241+
242+
switch alarm.Type {
239243
case etcd.AlarmOK:
240244
continue
241245
default:
242-
alarmList = append(alarmList, etcd.AlarmTypeName[alarm])
246+
alarmList = append(alarmList, etcd.AlarmTypeName[alarm.Type])
243247
}
244248
}
245249
if len(alarmList) > 0 {
@@ -330,7 +334,7 @@ func unwrapAll(err error) error {
330334
return err
331335
}
332336

333-
func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
337+
func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, []etcd.MemberAlarm, error) {
334338
// Create the etcd Client for the etcd Pod scheduled on the Node
335339
etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName})
336340
if err != nil {
@@ -342,7 +346,7 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
342346
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
343347
Message: fmt.Sprintf("Failed to connect to the etcd Pod on the %s Node: %s", nodeName, unwrapAll(err)),
344348
})
345-
return nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd Pod on the %s Node", nodeName)
349+
return nil, nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd Pod on the %s Node", nodeName)
346350
}
347351
defer etcdClient.Close()
348352

@@ -356,7 +360,7 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
356360
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
357361
Message: fmt.Sprintf("Etcd reports errors: %s", strings.Join(etcdClient.Errors, ", ")),
358362
})
359-
return nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
363+
return nil, nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
360364
}
361365

362366
// Gets the list of etcd members known by this member.
@@ -372,10 +376,24 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
372376
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
373377
Message: fmt.Sprintf("Failed to get answer from the etcd member on the %s Node: %s", nodeName, err.Error()),
374378
})
375-
return nil, errors.Wrapf(err, "failed to get answer from the etcd member on the %s Node", nodeName)
379+
return nil, nil, errors.Wrapf(err, "failed to get answer from the etcd member on the %s Node", nodeName)
380+
}
381+
382+
// Gets the list of etcd alarms.
383+
alarms, err := etcdClient.Alarms(ctx)
384+
if err != nil {
385+
conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd alarms on the %s Node", nodeName)
386+
387+
v1beta2conditions.Set(machine, metav1.Condition{
388+
Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
389+
Status: metav1.ConditionUnknown,
390+
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
391+
Message: fmt.Sprintf("Failed to get answer from the etcd alarms on the %s Node: %s", nodeName, err.Error()),
392+
})
393+
return nil, nil, errors.Wrapf(err, "failed to get answer from the etcd alarms on the %s Node", nodeName)
376394
}
377395

378-
return currentMembers, nil
396+
return currentMembers, alarms, nil
379397
}
380398

381399
func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeList, members []*etcd.Member) (bool, []string) {

0 commit comments

Comments
 (0)