	ispnv1 "github.com/infinispan/infinispan-operator/api/v1"
	consts "github.com/infinispan/infinispan-operator/controllers/constants"
+	ispnApi "github.com/infinispan/infinispan-operator/pkg/infinispan/client/api"
	"github.com/infinispan/infinispan-operator/pkg/infinispan/version"
-	kube "github.com/infinispan/infinispan-operator/pkg/kubernetes"
	pipeline "github.com/infinispan/infinispan-operator/pkg/reconcile/pipeline/infinispan"
	"github.com/infinispan/infinispan-operator/pkg/reconcile/pipeline/infinispan/handler/provision"
	routev1 "github.com/openshift/api/route/v1"
@@ -130,7 +130,8 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
	// Initiate the GracefulShutdown if it's not already in progress
	if i.Spec.Replicas == 0 {
		logger.Info(".Spec.Replicas==0")
-		if *statefulSet.Spec.Replicas != 0 {
+		replicas := *statefulSet.Spec.Replicas
+		if replicas != 0 {
			logger.Info("StatefulSet.Spec.Replicas!=0")
			// Only send a GracefulShutdown request to the server if it hasn't succeeded already
			if !i.IsConditionTrue(ispnv1.ConditionStopping) {
@@ -141,18 +142,80 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
					return
				}

-				var rebalanceDisabled bool
+				type PodMeta struct {
+					client ispnApi.Infinispan
+					skip   bool
+					state  string
+				}
+
+				// Loop through all pods to initialise their clients, verify that every pod sees the expected
+				// number of cluster members and ensure that the cluster is in a HEALTHY state
+				var podMetaMap = make(map[string]*PodMeta, len(podList.Items))
+				var shutdownAlreadyInitiated bool
				for _, pod := range podList.Items {
-					ispnClient, err := ctx.InfinispanClientUnknownVersion(pod.Name)
+					podMeta := &PodMeta{}
+					podMetaMap[pod.Name] = podMeta
+					podMeta.client, err = ctx.InfinispanClientUnknownVersion(pod.Name)
					if err != nil {
						if shutdown, state := containerAlreadyShutdown(err); shutdown {
-							logger.Info("Skipping pod whose cache-container has already been shutdown by the Operator", "pod", pod.Name, "state", state)
+							podMeta.skip, podMeta.state = true, state
+							logger.Info("At least one cache-container has already been shutdown by the Operator, resuming shutdown", "pod", pod.Name, "state", state)
+							shutdownAlreadyInitiated = true
+							// Continue processing the other pods as it's possible that one or more of them still haven't
+							// been shut down and we still need to initialise their clients
							continue
						}
-						ctx.Requeue(fmt.Errorf("unable to create Infinispan client for cluster being upgraded: %w", err))
+						ctx.Requeue(fmt.Errorf("unable to create Infinispan client to determine if split-brain is present: %w", err))
+						return
+					}
+
+					// If one or more of the pods has already been shut down then we must continue to shut down the remaining
+					// members of the cluster
+					if shutdownAlreadyInitiated {
+						continue
+					}
+
+					info, err := podMeta.client.Container().Info()
+					if err != nil {
+						ctx.Requeue(fmt.Errorf("unable to retrieve cache-container info for pod '%s': %w", pod.Name, err))
						return
					}

+					if info.ClusterSize != replicas {
+						err = fmt.Errorf(
+							"unable to proceed with GracefulShutdown as pod '%s' has '%d' cluster members, expected '%d'. Members: '%s'",
+							pod.Name,
+							info.ClusterSize,
+							replicas,
+							strings.Join(info.ClusterMembers, ","),
+						)
+						ctx.Requeue(err)
+						return
+					}
+
+					health, err := podMeta.client.Container().HealthStatus()
+					if err != nil {
+						ctx.Requeue(fmt.Errorf("unable to retrieve cluster health status for pod '%s': %w", pod.Name, err))
+						return
+					}
+
+					// If any of the caches are not marked as HEALTHY we must abort the GracefulShutdown to prevent
+					// the cluster from entering an unexpected state
+					if health != ispnApi.HealthStatusHealthy {
+						ctx.Requeue(fmt.Errorf("unable to proceed with GracefulShutdown as the cluster health is '%s'", health))
+						return
+					}
+				}
+
+				var rebalanceDisabled bool
+				for _, pod := range podList.Items {
+					podMeta := podMetaMap[pod.Name]
+					if podMeta.skip {
+						logger.Info("Skipping pod whose cache-container has already been shutdown by the Operator", "pod", pod.Name, "state", podMeta.state)
+						continue
+					}
+					ispnClient := podMeta.client
+
					// Disabling rebalancing is a cluster-wide operation so we only need to perform this on a single pod
					// However, multiple calls to this endpoint should be safe, so it's ok if a subsequent reconciliation
					// executes this again
@@ -164,13 +227,11 @@ func GracefulShutdown(i *ispnv1.Infinispan, ctx pipeline.Context) {
						rebalanceDisabled = true
					}

-					if kube.IsPodReady(pod) {
-						if err := ispnClient.Container().Shutdown(); err != nil {
-							ctx.Requeue(fmt.Errorf("error encountered on container shutdown: %w", err))
-							return
-						} else {
-							logger.Info("Executed Container Shutdown on pod: ", "Pod.Name", pod.Name)
-						}
+					if err := ispnClient.Container().Shutdown(); err != nil {
+						ctx.Requeue(fmt.Errorf("error encountered on container shutdown: %w", err))
+						return
+					} else {
+						logger.Info("Executed Container Shutdown on pod: ", "Pod.Name", pod.Name)
					}
				}

@@ -286,10 +347,14 @@ func EnableRebalanceAfterScaleUp(i *ispnv1.Infinispan, ctx pipeline.Context) {
		return
	}

-	if members, err := ispnClient.Container().Members(); err != nil {
-		ctx.Requeue(fmt.Errorf("unable to retrieve cluster members on scale up: %w", err))
+	// TODO why is this failing in TestOperandUpgrade for 14.0.x servers?
+	/*
+		2025-05-16T17:11:58.093+0100 ERROR Reconciler error {"controller": "infinispan", "controllerGroup": "infinispan.org", "controllerKind": "Infinispan", "infinispan": {"name":"test-operand-upgrade","namespace":"namespace-for-testing"}, "namespace": "namespace-for-testing", "name": "test-operand-upgrade", "reconcileID": "eda96c7e-fd83-43cc-b903-4a623e4e2785", "error": "unable to retrieve cluster information on scale up: unexpected error getting cache manager info: stderr: , err: the server does not allow this method on the requested resource"}
+	*/
+	if info, err := ispnClient.Container().Info(); err != nil {
+		ctx.Requeue(fmt.Errorf("unable to retrieve cluster information on scale up: %w", err))
		return
-	} else if len(members) != int(i.Spec.Replicas) {
+	} else if info.ClusterSize != i.Spec.Replicas {
		ctx.Log().Info("waiting for cluster to form", "replicas", i.Spec.Replicas)
		ctx.RequeueAfter(consts.DefaultWaitClusterPodsNotReady, nil)
		return
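
Taken together, the change splits GracefulShutdown into two passes over the pod list: the first pass builds a per-pod client and checks that every pod sees the expected number of cluster members and a HEALTHY status (skipping pods whose cache-container the Operator has already stopped), while the second pass disables rebalancing once and then shuts each remaining container down. The Go sketch below illustrates that flow in isolation; the containerClient interface, errAlreadyShutdown sentinel, fakeClient stub and gracefulShutdown function are simplified assumptions for illustration only, not the operator's actual types, which use ctx.InfinispanClientUnknownVersion and the Container().Info()/HealthStatus()/RebalanceDisable()/Shutdown() client API shown in the diff.

package main

import (
	"errors"
	"fmt"
)

// containerClient is a hypothetical stand-in for the per-pod client returned by
// ctx.InfinispanClientUnknownVersion in the real handler.
type containerClient interface {
	ClusterSize() (int32, error)
	Healthy() (bool, error)
	RebalanceDisable() error
	Shutdown() error
}

type podMeta struct {
	client containerClient
	skip   bool // container already shut down by a previous reconciliation
}

// errAlreadyShutdown is a hypothetical sentinel playing the role of containerAlreadyShutdown.
var errAlreadyShutdown = errors.New("cache-container already shut down")

func gracefulShutdown(pods []string, expected int32, newClient func(string) (containerClient, error)) error {
	meta := make(map[string]*podMeta, len(pods))
	resuming := false

	// Pass 1: build a client per pod and verify cluster size and health.
	for _, pod := range pods {
		m := &podMeta{}
		meta[pod] = m
		c, err := newClient(pod)
		if err != nil {
			if errors.Is(err, errAlreadyShutdown) {
				// Shutdown was already initiated; keep going so the remaining pods are stopped too.
				m.skip, resuming = true, true
				continue
			}
			return fmt.Errorf("unable to create client for pod %q: %w", pod, err)
		}
		m.client = c
		if resuming {
			continue // size/health checks no longer apply once shutdown has started
		}
		if size, err := c.ClusterSize(); err != nil || size != expected {
			return fmt.Errorf("pod %q: size=%d expected=%d err=%v", pod, size, expected, err)
		}
		if ok, err := c.Healthy(); err != nil || !ok {
			return fmt.Errorf("cluster unhealthy as seen from pod %q: %v", pod, err)
		}
	}

	// Pass 2: disable rebalancing once, then shut down every container not already stopped.
	rebalanceDisabled := false
	for _, pod := range pods {
		m := meta[pod]
		if m.skip {
			continue
		}
		if !rebalanceDisabled {
			if err := m.client.RebalanceDisable(); err != nil {
				return fmt.Errorf("unable to disable rebalancing: %w", err)
			}
			rebalanceDisabled = true
		}
		if err := m.client.Shutdown(); err != nil {
			return fmt.Errorf("shutdown of pod %q failed: %w", pod, err)
		}
	}
	return nil
}

// fakeClient is an in-memory stub used only to exercise the sketch.
type fakeClient struct{}

func (fakeClient) ClusterSize() (int32, error) { return 3, nil }
func (fakeClient) Healthy() (bool, error)      { return true, nil }
func (fakeClient) RebalanceDisable() error     { return nil }
func (fakeClient) Shutdown() error             { return nil }

func main() {
	pods := []string{"pod-0", "pod-1", "pod-2"}
	err := gracefulShutdown(pods, 3, func(string) (containerClient, error) { return fakeClient{}, nil })
	fmt.Println("graceful shutdown err:", err)
}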