Skip to content

Commit ea27d40

Browse files
authored
Merge pull request #11796 from fabriziopandini/kcp-read-etcd-alarms-once
🌱 Read etcd alarm list once per reconcile in KCP
2 parents e14cfb4 + 8f27824 commit ea27d40

File tree

2 files changed: 28 additions and 24 deletions.

controlplane/kubeadm/internal/etcd/etcd.go

Lines changed: 0 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -113,9 +113,6 @@ type Member struct {
113113

114114
// IsLearner indicates if the member is raft learner.
115115
IsLearner bool
116-
117-
// Alarms is the list of alarms for a member.
118-
Alarms []AlarmType
119116
}
120117

121118
// pbMemberToMember converts the protobuf representation of a cluster member to a Member struct.
@@ -126,7 +123,6 @@ func pbMemberToMember(m *etcdserverpb.Member) *Member {
126123
PeerURLs: m.GetPeerURLs(),
127124
ClientURLs: m.GetClientURLs(),
128125
IsLearner: m.GetIsLearner(),
129-
Alarms: []AlarmType{},
130126
}
131127
}
132128

@@ -216,21 +212,11 @@ func (c *Client) Members(ctx context.Context) ([]*Member, error) {
216212
return nil, errors.Wrap(err, "failed to get list of members for etcd cluster")
217213
}
218214

219-
alarms, err := c.Alarms(ctx)
220-
if err != nil {
221-
return nil, err
222-
}
223-
224215
clusterID := response.Header.GetClusterId()
225216
members := make([]*Member, 0)
226217
for _, m := range response.Members {
227218
newMember := pbMemberToMember(m)
228219
newMember.ClusterID = clusterID
229-
for _, c := range alarms {
230-
if c.MemberID == newMember.ID {
231-
newMember.Alarms = append(newMember.Alarms, c.Type)
232-
}
233-
}
234220
members = append(members, newMember)
235221
}
236222

controlplane/kubeadm/internal/workload_cluster_conditions.go

Lines changed: 28 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,7 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
150150
continue
151151
}
152152

153-
currentMembers, err := w.getCurrentEtcdMembers(ctx, machine, node.Name)
153+
currentMembers, alarms, err := w.getCurrentEtcdMembers(ctx, machine, node.Name)
154154
if err != nil {
155155
// Note. even if we fail reading the member list from one node/etcd members we do not set EtcdMembersAgreeOnMemberList and EtcdMembersAgreeOnClusterID to false
156156
// (those info are computed on what we can collect during inspection, so we can reason about availability even if there is a certain degree of problems in the cluster).
@@ -197,14 +197,18 @@ func (w *Workload) updateManagedEtcdConditions(ctx context.Context, controlPlane
197197
})
198198
continue
199199
}
200-
if len(member.Alarms) > 0 {
200+
if len(alarms) > 0 {
201201
alarmList := []string{}
202-
for _, alarm := range member.Alarms {
203-
switch alarm {
202+
for _, alarm := range alarms {
203+
if alarm.MemberID != member.ID {
204+
continue
205+
}
206+
207+
switch alarm.Type {
204208
case etcd.AlarmOK:
205209
continue
206210
default:
207-
alarmList = append(alarmList, etcd.AlarmTypeName[alarm])
211+
alarmList = append(alarmList, etcd.AlarmTypeName[alarm.Type])
208212
}
209213
}
210214
if len(alarmList) > 0 {
@@ -294,7 +298,7 @@ func unwrapAll(err error) error {
294298
return err
295299
}
296300

297-
func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, error) {
301+
func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1.Machine, nodeName string) ([]*etcd.Member, []etcd.MemberAlarm, error) {
298302
// Create the etcd Client for the etcd Pod scheduled on the Node
299303
etcdClient, err := w.etcdClientGenerator.forFirstAvailableNode(ctx, []string{nodeName})
300304
if err != nil {
@@ -306,7 +310,7 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
306310
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
307311
Message: fmt.Sprintf("Failed to connect to the etcd Pod on the %s Node: %s", nodeName, unwrapAll(err)),
308312
})
309-
return nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd Pod on the %s Node", nodeName)
313+
return nil, nil, errors.Wrapf(err, "failed to get current etcd members: failed to connect to the etcd Pod on the %s Node", nodeName)
310314
}
311315
defer etcdClient.Close()
312316

@@ -320,7 +324,7 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
320324
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberNotHealthyV1Beta2Reason,
321325
Message: fmt.Sprintf("Etcd reports errors: %s", strings.Join(etcdClient.Errors, ", ")),
322326
})
323-
return nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
327+
return nil, nil, errors.Errorf("failed to get current etcd members: etcd member status reports errors: %s", strings.Join(etcdClient.Errors, ", "))
324328
}
325329

326330
// Gets the list etcd members known by this member.
@@ -336,10 +340,24 @@ func (w *Workload) getCurrentEtcdMembers(ctx context.Context, machine *clusterv1
336340
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
337341
Message: fmt.Sprintf("Failed to get answer from the etcd member on the %s Node: %s", nodeName, err.Error()),
338342
})
339-
return nil, errors.Wrapf(err, "failed to get answer from the etcd member on the %s Node", nodeName)
343+
return nil, nil, errors.Wrapf(err, "failed to get answer from the etcd member on the %s Node", nodeName)
344+
}
345+
346+
// Gets the list of etcd alarms.
347+
alarms, err := etcdClient.Alarms(ctx)
348+
if err != nil {
349+
conditions.MarkFalse(machine, controlplanev1.MachineEtcdMemberHealthyCondition, controlplanev1.EtcdMemberUnhealthyReason, clusterv1.ConditionSeverityError, "Failed to get answer from the etcd alarms on the %s Node", nodeName)
350+
351+
v1beta2conditions.Set(machine, metav1.Condition{
352+
Type: controlplanev1.KubeadmControlPlaneMachineEtcdMemberHealthyV1Beta2Condition,
353+
Status: metav1.ConditionUnknown,
354+
Reason: controlplanev1.KubeadmControlPlaneMachineEtcdMemberInspectionFailedV1Beta2Reason,
355+
Message: fmt.Sprintf("Failed to get answer from the etcd alarms on the %s Node: %s", nodeName, err.Error()),
356+
})
357+
return nil, nil, errors.Wrapf(err, "failed to get answer from the etcd alarms on the %s Node", nodeName)
340358
}
341359

342-
return currentMembers, nil
360+
return currentMembers, alarms, nil
343361
}
344362

345363
func compareMachinesAndMembers(controlPlane *ControlPlane, nodes *corev1.NodeList, members []*etcd.Member) (bool, []string) {

0 commit comments

Comments
 (0)