
Commit 46d566b

[#2273] Prevent a GracefulShutdown if view is != #replicas or any cache is not in a HEALTHY state

1 parent: e86b91a

3 files changed: +97 −21 lines

pkg/http/curl/curl.go

Lines changed: 10 additions & 1 deletion

@@ -120,6 +120,11 @@ func (c *Client) executeCurlWithAuth(httpURL, headers, args string) (*http.Respo
         return nil, err
     }
 
+    // Handle anonymous endpoints such as /health/status that will not return a 401 response
+    if rsp.StatusCode >= 200 && rsp.StatusCode < 300 {
+        return processResponse(rsp)
+    }
+
     if rsp.StatusCode != http.StatusUnauthorized {
         return rsp, fmt.Errorf("expected 401 DIGEST response before content. Received '%s'", rsp.Status)
     }
@@ -172,9 +177,13 @@ func handleContent(reader *bufio.Reader) (*http.Response, error) {
         }
     }
 
+    return processResponse(rsp)
+}
+
+func processResponse(rsp *http.Response) (*http.Response, error) {
     // Save response body
     b := new(bytes.Buffer)
-    if _, err = io.Copy(b, rsp.Body); err != nil {
+    if _, err := io.Copy(b, rsp.Body); err != nil {
         return nil, err
     }
     if err := rsp.Body.Close(); err != nil {

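The refactor extracts the response-body buffering from handleContent into a shared processResponse helper, which the new 2xx short-circuit also calls so that anonymous endpoints such as /health/status bypass the DIGEST challenge entirely. The hunk above ends mid-function, so the full helper is not shown; a plausible completion, in which the re-wrap of the drained body via io.NopCloser is an assumption rather than confirmed code, might look like:

package curl // package name inferred from the file path above

import (
    "bytes"
    "io"
    "net/http"
)

// processResponse buffers and closes the response body. Only the buffering
// and Close lines are confirmed by the diff; restoring a readable body with
// io.NopCloser is an assumption about the unshown remainder of the helper.
func processResponse(rsp *http.Response) (*http.Response, error) {
    // Save response body
    b := new(bytes.Buffer)
    if _, err := io.Copy(b, rsp.Body); err != nil {
        return nil, err
    }
    if err := rsp.Body.Close(); err != nil {
        return nil, err
    }
    rsp.Body = io.NopCloser(b) // assumption: let callers re-read the body
    return rsp, nil
}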
pkg/infinispan/client/api/infinispan.go

Lines changed: 6 additions & 4 deletions

@@ -106,7 +106,7 @@ type HealthStatus string
 
 const (
     HealthStatusDegraded          HealthStatus = "DEGRADED"
-    HealthStatusHealth            HealthStatus = "HEALTHY"
+    HealthStatusHealthy           HealthStatus = "HEALTHY"
     HealthStatusHealthRebalancing HealthStatus = "HEALTHY_REBALANCING"
     HealthStatusFailed            HealthStatus = "FAILED"
 )
@@ -152,9 +152,11 @@ type BackupRestoreResources struct {
 }
 
 type ContainerInfo struct {
-    Coordinator bool           `json:"coordinator"`
-    SitesView   *[]interface{} `json:"sites_view,omitempty"`
-    Version     string         `json:"version"`
+    ClusterMembers []string       `json:"cluster_members"`
+    ClusterSize    int32          `json:"cluster_size"`
+    Coordinator    bool           `json:"coordinator"`
+    SitesView      *[]interface{} `json:"sites_view,omitempty"`
+    Version        string         `json:"version"`
 }
 
 type NotSupportedError struct {

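The two new fields map to the cluster_members and cluster_size keys of the server's container info payload, and the renamed HealthStatusHealthy constant now matches the HEALTHY value it represents. A minimal sketch of decoding such a payload, where the sample values are illustrative rather than taken from a real server response:

package main

import (
    "encoding/json"
    "fmt"
)

// ContainerInfo as defined in the diff above.
type ContainerInfo struct {
    ClusterMembers []string       `json:"cluster_members"`
    ClusterSize    int32          `json:"cluster_size"`
    Coordinator    bool           `json:"coordinator"`
    SitesView      *[]interface{} `json:"sites_view,omitempty"`
    Version        string         `json:"version"`
}

func main() {
    // Illustrative payload; field values are invented for the example.
    payload := []byte(`{
        "cluster_members": ["infinispan-0", "infinispan-1", "infinispan-2"],
        "cluster_size": 3,
        "coordinator": true,
        "version": "15.0.0"
    }`)
    var info ContainerInfo
    if err := json.Unmarshal(payload, &info); err != nil {
        panic(err)
    }
    fmt.Printf("size=%d members=%v\n", info.ClusterSize, info.ClusterMembers)
}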
pkg/reconcile/pipeline/infinispan/handler/manage/upgrades.go

Lines changed: 81 additions & 16 deletions

@@ -7,8 +7,8 @@ import (
 
     ispnv1 "github.com/infinispan/infinispan-operator/api/v1"
     consts "github.com/infinispan/infinispan-operator/controllers/constants"
+    ispnApi "github.com/infinispan/infinispan-operator/pkg/infinispan/client/api"
     "github.com/infinispan/infinispan-operator/pkg/infinispan/version"
-    kube "github.com/infinispan/infinispan-operator/pkg/kubernetes"
     pipeline "github.com/infinispan/infinispan-operator/pkg/reconcile/pipeline/infinispan"
     "github.com/infinispan/infinispan-operator/pkg/reconcile/pipeline/infinispan/handler/provision"
     routev1 "github.com/openshift/api/route/v1"
@@ -130,7 +130,8 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
     // Initiate the GracefulShutdown if it's not already in progress
     if i.Spec.Replicas == 0 {
         logger.Info(".Spec.Replicas==0")
-        if *statefulSet.Spec.Replicas != 0 {
+        replicas := *statefulSet.Spec.Replicas
+        if replicas != 0 {
             logger.Info("StatefulSet.Spec.Replicas!=0")
             // Only send a GracefulShutdown request to the server if it hasn't succeeded already
             if !i.IsConditionTrue(ispnv1.ConditionStopping) {
@@ -141,18 +142,80 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
                     return
                 }
 
-                var rebalanceDisabled bool
+                type PodMeta struct {
+                    client ispnApi.Infinispan
+                    skip   bool
+                    state  string
+                }
+
+                // Loop through all pods to initiate the clients, determine that all pods have the expected
+                // number of cluster members and ensure that the cluster is in a HEALTHY state
+                var podMetaMap = make(map[string]*PodMeta, len(podList.Items))
+                var shutdownAlreadyInitiated bool
                 for _, pod := range podList.Items {
-                    ispnClient, err := ctx.InfinispanClientUnknownVersion(pod.Name)
+                    podMeta := &PodMeta{}
+                    podMetaMap[pod.Name] = podMeta
+                    podMeta.client, err = ctx.InfinispanClientUnknownVersion(pod.Name)
                     if err != nil {
                         if shutdown, state := containerAlreadyShutdown(err); shutdown {
-                            logger.Info("Skipping pod whose cache-container has already been shutdown by the Operator", "pod", pod.Name, "state", state)
+                            podMeta.skip, podMeta.state = true, state
+                            logger.Info("At least one cache-container has already been shutdown by the Operator, resuming shutdown", "pod", pod.Name, "state", state)
+                            shutdownAlreadyInitiated = true
+                            // Continue processing other pods here as it's possible that one or more pods still haven't
+                            // been shutdown and we need to initiate the client
                             continue
                         }
-                        ctx.Requeue(fmt.Errorf("unable to create Infinispan client for cluster being upgraded: %w", err))
+                        ctx.Requeue(fmt.Errorf("unable to create Infinispan client to determine if split-brain is present: %w", err))
+                        return
+                    }
+
+                    // If one or more of the pods have already been shutdown then we must continue to shutdown the remaining
+                    // members of the cluster
+                    if shutdownAlreadyInitiated {
+                        continue
+                    }
+
+                    info, err := podMeta.client.Container().Info()
+                    if err != nil {
+                        ctx.Requeue(fmt.Errorf("unable to retrieve cache-container info for pod '%s': %w", pod.Name, err))
                         return
                     }
 
+                    if info.ClusterSize != replicas {
+                        err = fmt.Errorf(
+                            "unable to proceed with GracefulShutdown as pod '%s' has '%d' cluster members, expected '%d'. Members: '%s'",
+                            pod.Name,
+                            info.ClusterSize,
+                            i.Spec.Replicas,
+                            strings.Join(info.ClusterMembers, ","),
+                        )
+                        ctx.Requeue(err)
+                        return
+                    }
+
+                    health, err := podMeta.client.Container().HealthStatus()
+                    if err != nil {
+                        ctx.Requeue(fmt.Errorf("unable to retrieve cluster health status for pod '%s': %w", pod.Name, err))
+                        return
+                    }
+
+                    // If any of the caches are not marked as HEALTHY we must prevent a GracefulShutdown to prevent
+                    // the cluster from entering an unexpected state
+                    if health != ispnApi.HealthStatusHealthy {
+                        ctx.Requeue(fmt.Errorf("unable to proceed with GracefulShutdown as the cluster health is '%s'", health))
+                        return
+                    }
+                }
+
+                var rebalanceDisabled bool
+                for _, pod := range podList.Items {
+                    podMeta := podMetaMap[pod.Name]
+                    if podMeta.skip {
+                        logger.Info("Skipping pod whose cache-container has already been shutdown by the Operator", "pod", pod, "state", podMeta.state)
+                        continue
+                    }
+                    ispnClient := podMeta.client
+
                     // Disabling rebalancing is a cluster-wide operation so we only need to perform this on a single pod
                     // However, multiple calls to this endpoint should be safe, so it's ok if a subsequent reconciliation
                     // executes this again
@@ -164,13 +227,11 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
                        rebalanceDisabled = true
                     }
 
-                    if kube.IsPodReady(pod) {
-                        if err := ispnClient.Container().Shutdown(); err != nil {
-                            ctx.Requeue(fmt.Errorf("error encountered on container shutdown: %w", err))
-                            return
-                        } else {
-                            logger.Info("Executed Container Shutdown on pod: ", "Pod.Name", pod.Name)
-                        }
+                    if err := ispnClient.Container().Shutdown(); err != nil {
+                        ctx.Requeue(fmt.Errorf("error encountered on container shutdown: %w", err))
+                        return
+                    } else {
+                        logger.Info("Executed Container Shutdown on pod: ", "Pod.Name", pod.Name)
                     }
                 }
 
@@ -286,10 +347,14 @@ func EnableRebalanceAfterScaleUp(i *ispnv1.Infinispan, ctx pipeline.Context) {
         return
     }
 
-    if members, err := ispnClient.Container().Members(); err != nil {
-        ctx.Requeue(fmt.Errorf("unable to retrieve cluster members on scale up: %w", err))
+    // TODO why is this failing in TestOperandUpgrade for 14.0.x servers?
+    /*
+        2025-05-16T17:11:58.093+0100  ERROR  Reconciler error  {"controller": "infinispan", "controllerGroup": "infinispan.org", "controllerKind": "Infinispan", "infinispan": {"name":"test-operand-upgrade","namespace":"namespace-for-testing"}, "namespace": "namespace-for-testing", "name": "test-operand-upgrade", "reconcileID": "eda96c7e-fd83-43cc-b903-4a623e4e2785", "error": "unable to retrieve cluster information on scale up: unexpected error getting cache manager info: stderr: , err: the server does not allow this method on the requested resource"}
+    */
+    if info, err := ispnClient.Container().Info(); err != nil {
+        ctx.Requeue(fmt.Errorf("unable to retrieve cluster information on scale up: %w", err))
         return
-    } else if len(members) != int(i.Spec.Replicas) {
+    } else if info.ClusterSize != i.Spec.Replicas {
         ctx.Log().Info("waiting for cluster to form", "replicas", i.Spec.Replicas)
         ctx.RequeueAfter(consts.DefaultWaitClusterPodsNotReady, nil)
         return

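The reworked handler splits GracefulShutdown into two passes: a read-only pass that verifies every pod reports the expected cluster size and a HEALTHY status, followed by a mutating pass that disables rebalancing and shuts the containers down. A condensed sketch of that shape, where checkPod and shutdownPod are hypothetical stand-ins for the per-pod client calls in the real handler:

package main

import "fmt"

// gracefulShutdown validates every pod before any irreversible action.
func gracefulShutdown(
    pods []string,
    replicas int32,
    checkPod func(pod string) (size int32, health string, err error),
    shutdownPod func(pod string) error,
) error {
    // Pass 1: fail fast if any pod reports an unexpected view size or an
    // unhealthy cache; nothing has been mutated yet, so returning an error
    // (a requeue in the operator) is safe.
    for _, pod := range pods {
        size, health, err := checkPod(pod)
        if err != nil {
            return err
        }
        if size != replicas {
            return fmt.Errorf("pod '%s' sees %d cluster members, expected %d", pod, size, replicas)
        }
        if health != "HEALTHY" {
            return fmt.Errorf("unable to proceed, cluster health is '%s'", health)
        }
    }
    // Pass 2: every pod passed validation, so the shutdown can proceed.
    for _, pod := range pods {
        if err := shutdownPod(pod); err != nil {
            return err
        }
    }
    return nil
}

func main() {
    pods := []string{"infinispan-0", "infinispan-1"}
    check := func(pod string) (int32, string, error) { return 2, "HEALTHY", nil }
    stop := func(pod string) error { fmt.Println("shutdown", pod); return nil }
    if err := gracefulShutdown(pods, 2, check, stop); err != nil {
        fmt.Println("requeue:", err)
    }
}

Because the first pass touches no state, a validation failure simply requeues the reconciliation, which is exactly the behaviour the commit message describes.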