Skip to content

[PLAT-129166] fix parallel db update #12

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 15, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 26 additions & 3 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"net/http"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -424,6 +425,13 @@ func (c *RolloutController) listStatefulSetsWithRolloutGroup() ([]*v1.StatefulSe
}

func (c *RolloutController) hasStatefulSetNotReadyPods(sts *v1.StatefulSet) (bool, error) {
if getMaxUnavailableForStatefulSet(sts, c.logger) > 1 && *sts.Spec.Replicas != sts.Status.AvailableReplicas {
// The old behavior causes issues when parallel db update is enabled (multiple pods are deleted at the same time), so:
// 1. Use Spec.Replicas instead of Status.Replicas, because deleting multiple pods at the same time can cause Status.Replicas < Spec.Replicas.
// 2. Use Status.AvailableReplicas instead of Status.ReadyReplicas, to account for minReadySeconds > 0 and for stability.
return true, nil
}
// Fall back to the old behavior if maxUnavailable is <= 1.
// We can quickly check the number of ready replicas reported by the StatefulSet.
// If they don't match the total number of replicas, then we're sure there are some
// not ready pods.
Expand Down Expand Up @@ -509,7 +517,7 @@ func (c *RolloutController) listPods(sel labels.Selector) ([]*corev1.Pod, error)
}

func (c *RolloutController) updateStatefulSetPods(ctx context.Context, sts *v1.StatefulSet) (bool, error) {
level.Debug(c.logger).Log("msg", "reconciling StatefulSet", "statefulset", sts.Name)
level.Debug(c.logger).Log("msg", "reconciling StatefulSet==============", "statefulset", sts.Name)

podsToUpdate, err := c.podsNotMatchingUpdateRevision(sts)
if err != nil {
Expand All @@ -520,11 +528,14 @@ func (c *RolloutController) updateStatefulSetPods(ctx context.Context, sts *v1.S
maxUnavailable := getMaxUnavailableForStatefulSet(sts, c.logger)
var numNotAvailable int
if sts.Spec.MinReadySeconds > 0 {
level.Info(c.logger).Log("msg", "StatefulSet has minReadySeconds set, waiting before terminating pods", "statefulset", sts.Name, "min_ready_seconds", sts.Spec.MinReadySeconds)
numNotAvailable = int(sts.Status.Replicas - sts.Status.AvailableReplicas)
} else {
numNotAvailable = int(sts.Status.Replicas - sts.Status.ReadyReplicas)
}
if maxUnavailable > 1 {
// When deleting multiple pods at the same time, the number of not-available pods should also include pods that aren't yet managed by the controller (Spec.Replicas - Status.Replicas).
numNotAvailable += int(*sts.Spec.Replicas - sts.Status.Replicas)
}

// Compute the number of pods we should update, honoring the configured maxUnavailable.
numPods := max(0, min(
Expand All @@ -537,11 +548,13 @@ func (c *RolloutController) updateStatefulSetPods(ctx context.Context, sts *v1.S
"msg", "StatefulSet has some pods to be updated but maxUnavailable pods has been reached",
"statefulset", sts.Name,
"pods_to_update", len(podsToUpdate),
"pod[0]", podsToUpdate[0].Name,
"expected_replicas", sts.Spec.Replicas,
"replicas", sts.Status.Replicas,
"ready_replicas", sts.Status.ReadyReplicas,
"available_replicas", sts.Status.AvailableReplicas,
"num_not_available", numNotAvailable,
"max_unavailable", maxUnavailable)

return true, nil
}

Expand Down Expand Up @@ -626,6 +639,16 @@ func (c *RolloutController) podsNotMatchingUpdateRevision(sts *v1.StatefulSet) (

// Sort pods in order to provide a deterministic behaviour.
util.SortPods(pods)
// Sort pods so not running pods will be updated first.
sort.Slice(pods, func(i, j int) bool {
rank := func(p *corev1.Pod) int {
if p.Status.Phase == corev1.PodRunning {
return 1 // Running pods are ranked higher and will be updated last.
}
return 0 // Non-running pods are ranked lower and will be updated first.
}
return rank(pods[i]) < rank(pods[j])
})

return pods, nil
}
Expand Down