diff --git a/controllers/coherence_controller.go b/controllers/coherence_controller.go index 87374817..6b260f2d 100644 --- a/controllers/coherence_controller.go +++ b/controllers/coherence_controller.go @@ -11,23 +11,22 @@ import ( "fmt" "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/controllers/errorhandling" + "github.com/oracle/coherence-operator/controllers/finalizer" "github.com/oracle/coherence-operator/controllers/predicates" "github.com/oracle/coherence-operator/controllers/reconciler" + "github.com/oracle/coherence-operator/controllers/resources" "github.com/oracle/coherence-operator/controllers/secret" "github.com/oracle/coherence-operator/controllers/servicemonitor" "github.com/oracle/coherence-operator/controllers/statefulset" + "github.com/oracle/coherence-operator/controllers/status" "github.com/oracle/coherence-operator/pkg/clients" - "github.com/oracle/coherence-operator/pkg/events" "github.com/oracle/coherence-operator/pkg/operator" - "github.com/oracle/coherence-operator/pkg/probe" - "github.com/oracle/coherence-operator/pkg/rest" "github.com/oracle/coherence-operator/pkg/utils" "github.com/pkg/errors" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - "github.com/spf13/viper" coreV1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" @@ -56,10 +55,13 @@ const ( type CoherenceReconciler struct { client.Client reconciler.CommonReconciler - ClientSet clients.ClientSet - Log logr.Logger - Scheme *runtime.Scheme - reconcilers []reconciler.SecondaryResourceReconciler + ClientSet clients.ClientSet + Log logr.Logger + Scheme *runtime.Scheme + reconcilers []reconciler.SecondaryResourceReconciler + finalizerManager *finalizer.FinalizerManager + statusManager *status.StatusManager + resourcesManager *resources.OperatorSecretManager } // Failure is a simple holder for a named error @@ -112,8 +114,9 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // else... error reading the current deployment state from k8s. msg := fmt.Sprintf("failed to find Coherence resource, %s", err.Error()) in.GetEventRecorder().Event(deployment, coreV1.EventTypeWarning, reconciler.EventReasonFailed, msg) - // returning an error will requeue the event so we will try again ToDo: probably need som backoff here - return reconcile.Result{}, errors.Wrap(err, "getting Coherence resource") + // returning an error will requeue the event so we will try again + wrappedErr := errorhandling.NewGetResourceError(request.Name, request.Namespace, err) + return reconcile.Result{}, wrappedErr } // Check whether this is a deletion @@ -126,7 +129,7 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // If the finalization logic fails, don't remove the finalizer so // that we can retry during the next reconciliation. in.GetEventRecorder().Event(deployment, coreV1.EventTypeNormal, reconciler.EventReasonDeleted, "running finalizers") - if err := in.finalizeDeployment(ctx, deployment); err != nil { + if err := in.finalizerManager.FinalizeDeployment(ctx, deployment, in.MaybeFindStatefulSet); err != nil { msg := fmt.Sprintf("failed to finalize Coherence resource, %s", err.Error()) in.GetEventRecorder().Event(deployment, coreV1.EventTypeWarning, reconciler.EventReasonDeleted, msg) log.Error(err, "Failed to remove finalizer") @@ -134,8 +137,7 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } // Remove the finalizer. Once all finalizers have been // removed, the object will be deleted. - controllerutil.RemoveFinalizer(deployment, coh.CoherenceFinalizer) - err := in.GetClient().Update(ctx, deployment) + err := in.finalizerManager.EnsureFinalizerRemoved(ctx, deployment) if err != nil { if apierrors.IsNotFound(err) { log.Info("Failed to remove the finalizer from the Coherence resource, it looks like it had already been deleted") @@ -143,7 +145,10 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } msg := fmt.Sprintf("failed to remove finalizers from Coherence resource, %s", err.Error()) in.GetEventRecorder().Event(deployment, coreV1.EventTypeWarning, reconciler.EventReasonDeleted, msg) - return ctrl.Result{}, errors.Wrap(err, "trying to remove finalizer from Coherence resource") + wrappedErr := errorhandling.NewOperationError("remove_finalizer", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()) + return ctrl.Result{}, wrappedErr } } else { log.Info("Coherence resource deleted at " + deleteTime.String() + ", finalizer already removed") @@ -156,9 +161,12 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque if deployment.Spec.AllowUnsafeDelete != nil && *deployment.Spec.AllowUnsafeDelete { if controllerutil.ContainsFinalizer(deployment, coh.CoherenceFinalizer) { - err := in.ensureFinalizerRemoved(ctx, deployment) + err := in.finalizerManager.EnsureFinalizerRemoved(ctx, deployment) if err != nil && !apierrors.IsNotFound(err) { - return ctrl.Result{Requeue: true}, errors.Wrap(err, "failed to remove finalizer") + return ctrl.Result{Requeue: true}, errorhandling.NewOperationError("remove_finalizer", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()). + WithContext("reason", "allow_unsafe_delete") } log.Info("Removed Finalizer from Coherence resource as AllowUnsafeDelete has been set to true") } else { @@ -167,7 +175,7 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } else { // Add finalizer for this CR if required (it should have been added by the web-hook but may not have been if the // Coherence resource was added when the Operator was uninstalled) - if finalizerApplied, err := in.ensureFinalizerApplied(ctx, deployment); finalizerApplied || err != nil { + if finalizerApplied, err := in.finalizerManager.EnsureFinalizerApplied(ctx, deployment); finalizerApplied || err != nil { var msg string if err != nil { msg = fmt.Sprintf("failed to add finalizers to Coherence resource, %s", err.Error()) @@ -182,7 +190,7 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // ensure that the deployment has an initial status if deployment.Status.Phase == "" { - err := in.UpdateCoherenceStatusPhase(ctx, request.NamespacedName, coh.ConditionTypeInitialized) + err := in.statusManager.UpdateCoherenceStatusPhase(ctx, request.NamespacedName, coh.ConditionTypeInitialized) if err != nil { // failed to set the status return reconcile.Result{}, err @@ -212,18 +220,20 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque } // ensure that the Operator configuration Secret exists - if err = in.ensureOperatorSecret(ctx, deployment, in.GetClient(), in.Log); err != nil { - err = errors.Wrap(err, "ensuring Operator configuration secret") - return in.HandleErrAndRequeue(ctx, err, nil, fmt.Sprintf(reconcileFailedMessage, request.Name, request.Namespace, err), in.Log) + if err = in.resourcesManager.EnsureOperatorSecret(ctx, deployment); err != nil { + err = errorhandling.NewResourceError("ensure", "operator_secret", deployment.GetNamespace(), err) + return in.HandleErrAndRequeue(ctx, err, deployment, fmt.Sprintf(reconcileFailedMessage, request.Name, request.Namespace, err), in.Log) } // ensure that the state store exists storage, err := utils.NewStorage(request.NamespacedName, in.GetManager(), in.GetPatcher()) if err != nil { - err = errors.Wrap(err, "obtaining desired state store") + err = errorhandling.NewOperationError("obtain_state_store", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()) in.GetEventRecorder().Event(deployment, coreV1.EventTypeWarning, reconciler.EventReasonFailed, fmt.Sprintf("failed to obtain state store: %s", err.Error())) - return in.HandleErrAndRequeue(ctx, err, nil, fmt.Sprintf(reconcileFailedMessage, request.Name, request.Namespace, err), in.Log) + return in.HandleErrAndRequeue(ctx, err, deployment, fmt.Sprintf(reconcileFailedMessage, request.Name, request.Namespace, err), in.Log) } // create the result @@ -239,7 +249,7 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return result, errors.Wrap(err, "error updating storage status hash") } hashNew := deployment.GetGenerationString() - if err = in.UpdateDeploymentStatusHash(ctx, request.NamespacedName, hashNew); err != nil { + if err = in.statusManager.UpdateDeploymentStatusHash(ctx, request.NamespacedName, hashNew); err != nil { return result, errors.Wrap(err, "error updating deployment status hash") } return result, nil @@ -247,13 +257,19 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque desiredResources, err = getDesiredResources(deployment, storage, log) if err != nil { - return in.HandleErrAndRequeue(ctx, err, nil, fmt.Sprintf(createResourcesFailedMessage, request.Name, request.Namespace, err), in.Log) + err = errorhandling.NewOperationError("get_desired_resources", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()) + return in.HandleErrAndRequeue(ctx, err, deployment, fmt.Sprintf(createResourcesFailedMessage, request.Name, request.Namespace, err), in.Log) } log.Info("Reconciling Coherence resource secondary resources", "hash", hash, "store", storeHash) // make the deployment the owner of all the secondary resources about to be reconciled if err := desiredResources.SetController(deployment, in.GetManager().GetScheme()); err != nil { + err = errorhandling.NewOperationError("set_controller", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()) return reconcile.Result{}, err } @@ -262,7 +278,9 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // update the store to have the desired state as the latest state. if err = storage.Store(ctx, desiredResources, deployment); err != nil { - err = errors.Wrap(err, "storing latest state in state store") + err = errorhandling.NewOperationError("store_state", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()) return reconcile.Result{}, err } @@ -288,19 +306,37 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque for _, failure := range failures { log.Error(failure.Error, "Secondary Reconciler failed", "Reconciler", failure.Name) } - return reconcile.Result{}, fmt.Errorf("one or more secondary resource reconcilers failed to reconcile") + + // Create a composite error with context + err = errorhandling.NewOperationError("reconcile_secondary_resources", nil). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()). + WithContext("failed_reconcilers", fmt.Sprintf("%d", len(failures))) + + // Add the first failure as the underlying error + if len(failures) > 0 { + err.(*errorhandling.OperationError).Err = failures[0].Error + } + + return reconcile.Result{}, err } // if replica count is zero update the status to Stopped if deployment.GetReplicas() == 0 { - if err = in.UpdateCoherenceStatusPhase(ctx, request.NamespacedName, coh.ConditionTypeStopped); err != nil { - return result, errors.Wrap(err, "error updating deployment status") + if err = in.statusManager.UpdateCoherenceStatusPhase(ctx, request.NamespacedName, coh.ConditionTypeStopped); err != nil { + return result, errorhandling.NewOperationError("update_status", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()). + WithContext("status", string(coh.ConditionTypeStopped)) } } // Update the Status with the hash - if err = in.UpdateDeploymentStatusHash(ctx, request.NamespacedName, hash); err != nil { - return result, errors.Wrap(err, "error updating deployment status hash") + if err = in.statusManager.UpdateDeploymentStatusHash(ctx, request.NamespacedName, hash); err != nil { + return result, errorhandling.NewOperationError("update_status_hash", err). + WithContext("resource", deployment.GetName()). + WithContext("namespace", deployment.GetNamespace()). + WithContext("hash", hash) } log.Info("Finished reconciling Coherence resource", "requeue", result.Requeue, "after", result.RequeueAfter.String()) @@ -325,6 +361,24 @@ func (in *CoherenceReconciler) SetupWithManager(mgr ctrl.Manager, cs clients.Cli in.SetCommonReconciler(controllerName, mgr, cs) in.GetPatcher().SetPatchType(types.MergePatchType) + // Initialize the manager fields + in.finalizerManager = &finalizer.FinalizerManager{ + Client: mgr.GetClient(), + Log: in.Log.WithName("finalizer"), + EventRecorder: in.GetEventRecorder(), + Patcher: in.GetPatcher(), + } + + in.statusManager = &status.StatusManager{ + Client: mgr.GetClient(), + Log: in.Log.WithName("status"), + } + + in.resourcesManager = &resources.OperatorSecretManager{ + Client: mgr.GetClient(), + Log: in.Log.WithName("resources"), + } + template := &coh.Coherence{} // Watch for changes to secondary resources @@ -344,126 +398,6 @@ func (in *CoherenceReconciler) SetupWithManager(mgr ctrl.Manager, cs clients.Cli // GetReconciler returns this reconciler. func (in *CoherenceReconciler) GetReconciler() reconcile.Reconciler { return in } -// ensureFinalizerApplied ensures the finalizer is applied to the Coherence resource -func (in *CoherenceReconciler) ensureFinalizerApplied(ctx context.Context, c *coh.Coherence) (bool, error) { - if !controllerutil.ContainsFinalizer(c, coh.CoherenceFinalizer) { - // Re-fetch the Coherence resource to ensure we have the most recent copy - latest := &coh.Coherence{} - c.DeepCopyInto(latest) - controllerutil.AddFinalizer(latest, coh.CoherenceFinalizer) - - callback := func() { - in.Log.Info("Added finalizer to Coherence resource", "Namespace", c.Namespace, "Name", c.Name, "Finalizer", coh.CoherenceFinalizer) - } - - // Perform a three-way patch to apply the finalizer - applied, err := in.ThreeWayPatchWithCallback(ctx, c.Name, c, c, latest, callback) - if err != nil { - return false, errors.Wrapf(err, "failed to update Coherence resource %s/%s with finalizer", c.Namespace, c.Name) - } - return applied, nil - } - return false, nil -} - -// ensureFinalizerApplied ensures the finalizer is removed from the Coherence resource -func (in *CoherenceReconciler) ensureFinalizerRemoved(ctx context.Context, c *coh.Coherence) error { - if controllerutil.RemoveFinalizer(c, coh.CoherenceFinalizer) { - err := in.GetClient().Update(ctx, c) - if err != nil { - in.Log.Info("Failed to remove the finalizer from the Coherence resource, it looks like it had already been deleted") - return err - } - } - return nil -} - -// finalizeDeployment performs any required finalizer tasks for the Coherence resource -func (in *CoherenceReconciler) finalizeDeployment(ctx context.Context, c *coh.Coherence) error { - // determine whether we can skip service suspension - if viper.GetBool(operator.FlagSkipServiceSuspend) { - in.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + - operator.FlagSkipServiceSuspend + " is set to true") - return nil - } - if !c.Spec.IsSuspendServicesOnShutdown() { - in.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + - " Spec.SuspendServicesOnShutdown is set to false") - return nil - } - if c.GetReplicas() == 0 { - in.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + - " Spec.Replicas is zero") - return nil - } - - in.Log.Info("Finalizing Coherence resource", "Namespace", c.Namespace, "Name", c.Name) - // Get the StatefulSet - sts, stsExists, err := in.MaybeFindStatefulSet(ctx, c.Namespace, c.Name) - if err != nil { - return errors.Wrapf(err, "getting StatefulSet %s/%s", c.Namespace, c.Name) - } - if stsExists { - if sts.Status.ReadyReplicas == 0 { - in.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + " - No Pods are ready") - } else { - // Do service suspension... - p := probe.CoherenceProbe{ - Client: in.GetClient(), - Config: in.GetManager().GetConfig(), - EventRecorder: events.NewOwnedEventRecorder(c, in.GetEventRecorder()), - } - if p.SuspendServices(ctx, c, sts) == probe.ServiceSuspendFailed { - return fmt.Errorf("failed to suspend services") - } - } - } - return nil -} - -// ensureOperatorSecret ensures that the Operator configuration secret exists in the namespace. -func (in *CoherenceReconciler) ensureOperatorSecret(ctx context.Context, deployment *coh.Coherence, c client.Client, log logr.Logger) error { - namespace := deployment.Namespace - s := &coreV1.Secret{ - ObjectMeta: metav1.ObjectMeta{ - Name: coh.OperatorConfigName, - Namespace: namespace, - Labels: deployment.CreateGlobalLabels(), - Annotations: deployment.CreateGlobalAnnotations(), - }, - } - - err := c.Get(ctx, types.NamespacedName{Name: coh.OperatorConfigName, Namespace: namespace}, s) - if err != nil && !apierrors.IsNotFound(err) { - return err - } - - restHostAndPort := rest.GetServerHostAndPort() - - oldValue := s.Data[coh.OperatorConfigKeyHost] - if oldValue == nil || string(oldValue) != restHostAndPort { - // data is different so create/update - - if s.StringData == nil { - s.StringData = make(map[string]string) - } - s.StringData[coh.OperatorConfigKeyHost] = restHostAndPort - - log.Info("Operator configuration updated", "Key", coh.OperatorConfigKeyHost, "OldValue", string(oldValue), "NewValue", restHostAndPort) - if apierrors.IsNotFound(err) { - // for some reason we're getting here even if the secret exists so delete it!! - _ = c.Delete(ctx, s) - log.Info("Creating configuration secret " + coh.OperatorConfigName) - err = c.Create(ctx, s) - } else { - log.Info("Updating configuration secret " + coh.OperatorConfigName) - err = c.Update(ctx, s) - } - } - - return err -} - // watchSecondaryResource registers the secondary resource reconcilers to watch the resources to be reconciled func watchSecondaryResource(mgr ctrl.Manager, s reconciler.SecondaryResourceReconciler, owner coh.CoherenceResource) error { var err error diff --git a/controllers/errorhandling/README.md b/controllers/errorhandling/README.md new file mode 100644 index 00000000..a25fb55c --- /dev/null +++ b/controllers/errorhandling/README.md @@ -0,0 +1,94 @@ +# Error Handling in Coherence Operator + +This package provides standardized error handling patterns for the Coherence Operator. It includes utilities for error wrapping, context addition, and common error scenarios. + +## Key Components + +### Error Types + +- **OperationError**: Represents an error that occurred during an operation. It includes: + - Operation name + - Resource name and namespace (optional) + - Underlying error + - Context map for additional information + +### Error Creation + +- **NewOperationError**: Creates a new operation error +- **NewResourceError**: Creates an error for a specific resource + +### Common Error Scenarios + +The package provides helper functions for common error scenarios: + +- **NewCreateResourceError**: For resource creation failures +- **NewUpdateResourceError**: For resource update failures +- **NewDeleteResourceError**: For resource deletion failures +- **NewGetResourceError**: For resource retrieval failures +- **NewListResourceError**: For resource listing failures +- **NewPatchResourceError**: For resource patching failures +- **NewReconcileError**: For reconciliation failures +- **NewValidationError**: For validation failures +- **NewTimeoutError**: For timeout failures +- **NewConnectionError**: For connection failures +- **NewAuthenticationError**: For authentication failures +- **NewAuthorizationError**: For authorization failures + +### Error Wrapping + +- **WrapError**: Wraps an error with context information +- **WrapErrorf**: Wraps an error with formatted context information +- **WithStack**: Adds a stack trace to an error + +### Error Handling + +- **ErrorHandler**: Handles errors in the reconciliation loop + - Categorizes errors (Transient, Permanent, Recoverable, Unknown) + - Updates error tracking information + - Updates resource status + - Handles errors based on their category + +## Usage Examples + +### Creating Errors + +```go +// Create a simple operation error +err := errorhandling.NewOperationError("update_config", originalErr) + +// Add context to the error +err.WithContext("resource_type", "ConfigMap").WithContext("retry_count", "3") + +// Create a resource-specific error +err := errorhandling.NewResourceError("update", "my-configmap", "default", originalErr) + +// Use helper functions for common scenarios +err := errorhandling.NewCreateResourceError("my-pod", "default", originalErr) +``` + +### Handling Errors + +```go +// Create an error handler +errorHandler := errorhandling.NewErrorHandler(client, logger, recorder) + +// Handle an error +result, err := errorHandler.HandleError(ctx, originalErr, resource, "Failed to update resource") + +// Handle a resource-specific error +result, err := errorHandler.HandleResourceError(ctx, originalErr, resource, "update", "Failed to update resource") + +// Retry an operation with context +err := errorHandler.RetryWithContext(ctx, "update", "my-configmap", "default", func() error { + // Operation that might fail + return client.Update(ctx, configMap) +}) +``` + +## Best Practices + +1. **Always add context to errors**: Use WithContext to add relevant information to errors. +2. **Use the appropriate helper function**: Choose the most specific helper function for your error scenario. +3. **Handle errors based on their category**: Use the ErrorHandler to handle errors appropriately based on their category. +4. **Include stack traces**: Use WithStack to add stack traces to errors for better debugging. +5. **Log errors with context**: Use the LogAndWrapError functions to log errors with context. \ No newline at end of file diff --git a/controllers/errorhandling/error_handler.go b/controllers/errorhandling/error_handler.go new file mode 100644 index 00000000..1cbee0e5 --- /dev/null +++ b/controllers/errorhandling/error_handler.go @@ -0,0 +1,1004 @@ +/* + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package errorhandling + +import ( + "context" + "fmt" + "runtime" + "strconv" + "strings" + "time" + + "github.com/go-logr/logr" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/tools/record" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// ErrorCategory represents the category of an error +type ErrorCategory string + +const ( + // ErrorCategoryTransient represents a transient error that may resolve itself with time + ErrorCategoryTransient ErrorCategory = "Transient" + // ErrorCategoryPermanent represents a permanent error that requires manual intervention + ErrorCategoryPermanent ErrorCategory = "Permanent" + // ErrorCategoryRecoverable represents an error that can be automatically recovered from + ErrorCategoryRecoverable ErrorCategory = "Recoverable" + // ErrorCategoryUnknown represents an error of unknown category + ErrorCategoryUnknown ErrorCategory = "Unknown" + + // AnnotationLastError is the annotation key for the last error message + AnnotationLastError = "coherence.oracle.com/last-error" + // AnnotationErrorCount is the annotation key for the error count + AnnotationErrorCount = "coherence.oracle.com/error-count" + // AnnotationLastRecoveryAttempt is the annotation key for the last recovery attempt timestamp + AnnotationLastRecoveryAttempt = "coherence.oracle.com/last-recovery-attempt" +) + +// ErrorHandler handles errors in the reconciliation loop +type ErrorHandler struct { + Client client.Client + Log logr.Logger + EventRecorder record.EventRecorder +} + +// HandleError handles an error in the reconciliation loop +func (eh *ErrorHandler) HandleError(ctx context.Context, err error, resource coh.CoherenceResource, msg string) (reconcile.Result, error) { + if err == nil { + return reconcile.Result{}, nil + } + + // Add stack trace if not already present + err = WithStack(err) + + // Categorize the error + category := eh.categorizeError(err) + + // Add caller information to the log + callerInfo := GetCallerInfo(1) + + // Update error tracking information + if trackErr := eh.updateErrorTracking(ctx, resource, err.Error(), category); trackErr != nil { + eh.Log.Error(trackErr, "Failed to update error tracking information") + } + + // Log the error with category and caller info + eh.Log.Error(err, msg, "category", category, "caller", callerInfo) + + // Record an event + eventType := corev1.EventTypeWarning + eventReason := "ReconcileError" + eventMsg := fmt.Sprintf("%s: %s (Category: %s)", msg, err.Error(), category) + eh.EventRecorder.Event(resource, eventType, eventReason, eventMsg) + + // Update the status to reflect the error + if statusErr := eh.updateStatus(ctx, resource, category); statusErr != nil { + eh.Log.Error(statusErr, "Failed to update status") + } + + // Handle the error based on its category + switch category { + case ErrorCategoryTransient: + // For transient errors, requeue with backoff + return eh.handleTransientError(resource) + case ErrorCategoryRecoverable: + // For recoverable errors, attempt recovery + return eh.attemptRecovery(ctx, err, resource) + case ErrorCategoryPermanent: + // For permanent errors, don't requeue + return reconcile.Result{Requeue: false}, nil + default: + // For unknown errors, requeue with a short delay + return reconcile.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil + } +} + +// categorizeError categorizes an error based on its type and content +func (eh *ErrorHandler) categorizeError(err error) ErrorCategory { + // Check for Kubernetes API errors + if apierrors.IsNotFound(err) { + return ErrorCategoryTransient // Resource might appear later + } + if apierrors.IsConflict(err) { + return ErrorCategoryTransient // Conflict can resolve with retry + } + if apierrors.IsServerTimeout(err) || apierrors.IsTimeout(err) { + return ErrorCategoryTransient // Timeouts are typically transient + } + if apierrors.IsTooManyRequests(err) { + return ErrorCategoryTransient // Rate limiting is transient + } + if apierrors.IsServiceUnavailable(err) { + return ErrorCategoryTransient // Service might become available later + } + if apierrors.IsInternalError(err) { + return ErrorCategoryTransient // Internal errors might resolve + } + if apierrors.IsResourceExpired(err) { + return ErrorCategoryTransient // Resource version expired, can retry + } + + // Check for specific error strings that indicate transient errors + errStr := err.Error() + if contains(errStr, "connection refused", "network is unreachable", "connection reset", "EOF", "broken pipe") { + return ErrorCategoryTransient + } + if contains(errStr, "etcdserver: leader changed", "etcdserver: request timed out") { + return ErrorCategoryTransient + } + if contains(errStr, "net/http: TLS handshake timeout", "i/o timeout", "deadline exceeded") { + return ErrorCategoryTransient + } + if contains(errStr, "the object has been modified", "optimistic concurrency error") { + return ErrorCategoryTransient + } + + // Check for specific error strings that indicate permanent errors + if contains(errStr, "invalid", "not supported", "forbidden", "unauthorized", "denied") { + return ErrorCategoryPermanent + } + if contains(errStr, "admission webhook", "validation failed") && contains(errStr, "denied") { + return ErrorCategoryPermanent + } + if contains(errStr, "field is immutable") { + return ErrorCategoryPermanent + } + + // Check for specific error strings that indicate recoverable errors + if contains(errStr, "failed to suspend services") { + return ErrorCategoryRecoverable + } + if contains(errStr, "pod disruption budget") && contains(errStr, "not available") { + return ErrorCategoryRecoverable + } + if contains(errStr, "cannot patch") && contains(errStr, "StatefulSet") { + return ErrorCategoryRecoverable + } + if contains(errStr, "cannot create pods") && contains(errStr, "quota exceeded") { + return ErrorCategoryRecoverable + } + if contains(errStr, "cannot schedule") || contains(errStr, "unschedulable") { + return ErrorCategoryRecoverable + } + + // Default to unknown + return ErrorCategoryUnknown +} + +// contains checks if any of the substrings are contained in the string +func contains(s string, substrings ...string) bool { + for _, substr := range substrings { + if strings.Contains(s, substr) { + return true + } + } + return false +} + +// Common error creation helpers for specific operations + +// NewCreateResourceError creates an error for resource creation failures +func NewCreateResourceError(resource string, namespace string, err error) error { + return NewResourceError("create", resource, namespace, err) +} + +// NewUpdateResourceError creates an error for resource update failures +func NewUpdateResourceError(resource string, namespace string, err error) error { + return NewResourceError("update", resource, namespace, err) +} + +// NewDeleteResourceError creates an error for resource deletion failures +func NewDeleteResourceError(resource string, namespace string, err error) error { + return NewResourceError("delete", resource, namespace, err) +} + +// NewGetResourceError creates an error for resource retrieval failures +func NewGetResourceError(resource string, namespace string, err error) error { + return NewResourceError("get", resource, namespace, err) +} + +// NewListResourceError creates an error for resource listing failures +func NewListResourceError(resource string, namespace string, err error) error { + return NewResourceError("list", resource, namespace, err) +} + +// NewPatchResourceError creates an error for resource patching failures +func NewPatchResourceError(resource string, namespace string, err error) error { + return NewResourceError("patch", resource, namespace, err) +} + +// NewReconcileError creates an error for reconciliation failures +func NewReconcileError(resource string, namespace string, err error) error { + return NewResourceError("reconcile", resource, namespace, err) +} + +// NewValidationError creates an error for validation failures +func NewValidationError(resource string, namespace string, err error) error { + return NewResourceError("validate", resource, namespace, err).WithContext("reason", "validation") +} + +// NewTimeoutError creates an error for timeout failures +func NewTimeoutError(operation string, resource string, namespace string, err error) error { + return NewResourceError(operation, resource, namespace, err).WithContext("reason", "timeout") +} + +// NewConnectionError creates an error for connection failures +func NewConnectionError(endpoint string, err error) error { + return NewOperationError("connect", err).WithContext("endpoint", endpoint) +} + +// NewAuthenticationError creates an error for authentication failures +func NewAuthenticationError(resource string, err error) error { + return NewOperationError("authenticate", err).WithContext("resource", resource) +} + +// NewAuthorizationError creates an error for authorization failures +func NewAuthorizationError(resource string, err error) error { + return NewOperationError("authorize", err).WithContext("resource", resource) +} + +// updateErrorTracking updates the error tracking information on the resource +func (eh *ErrorHandler) updateErrorTracking(ctx context.Context, resource coh.CoherenceResource, errMsg string, category ErrorCategory) error { + // Get the latest version of the resource + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return err + } + + // Get the annotations + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + // Update the last error message + annotations[AnnotationLastError] = errMsg + + // Update the error count + count := 1 + if countStr, ok := annotations[AnnotationErrorCount]; ok { + if parsedCount, err := strconv.Atoi(countStr); err == nil { + count = parsedCount + 1 + } + } + annotations[AnnotationErrorCount] = strconv.Itoa(count) + + // Set the annotations back on the resource + latest.SetAnnotations(annotations) + + // Update the resource + return eh.Client.Update(ctx, latest) +} + +// updateStatus updates the status of the resource based on the error category +func (eh *ErrorHandler) updateStatus(ctx context.Context, resource coh.CoherenceResource, category ErrorCategory) error { + // Get the latest version of the resource + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return err + } + + // Update the status based on the error category + status := latest.GetStatus() + + // Set the condition based on the error category + var condition coh.Condition + switch category { + case ErrorCategoryTransient: + condition = coh.Condition{ + Type: coh.ConditionTypeFailed, + Status: corev1.ConditionTrue, + Reason: "TransientError", + Message: "Encountered a transient error, will retry", + } + case ErrorCategoryRecoverable: + condition = coh.Condition{ + Type: coh.ConditionTypeFailed, + Status: corev1.ConditionTrue, + Reason: "RecoverableError", + Message: "Encountered a recoverable error, attempting recovery", + } + case ErrorCategoryPermanent: + condition = coh.Condition{ + Type: coh.ConditionTypeFailed, + Status: corev1.ConditionTrue, + Reason: "PermanentError", + Message: "Encountered a permanent error, manual intervention required", + } + default: + condition = coh.Condition{ + Type: coh.ConditionTypeFailed, + Status: corev1.ConditionTrue, + Reason: "UnknownError", + Message: "Encountered an error of unknown category", + } + } + + // Set the condition on the status + status.SetCondition(latest, condition) + + // Update the resource status + return eh.Client.Status().Update(ctx, latest) +} + +// handleTransientError handles a transient error +func (eh *ErrorHandler) handleTransientError(resource coh.CoherenceResource) (reconcile.Result, error) { + // Get the error count from the annotations + annotations := resource.GetAnnotations() + count := 1 + if countStr, ok := annotations[AnnotationErrorCount]; ok { + if parsedCount, err := strconv.Atoi(countStr); err == nil { + count = parsedCount + } + } + + // Calculate backoff duration based on error count (exponential backoff) + // Start with 5 seconds, double each time, cap at 5 minutes + backoff := time.Duration(min(5*pow(2, count-1), 300)) * time.Second + + eh.Log.Info("Requeuing with backoff due to transient error", + "resource", resource.GetName(), + "namespace", resource.GetNamespace(), + "errorCount", count, + "backoff", backoff) + + return reconcile.Result{Requeue: true, RequeueAfter: backoff}, nil +} + +// attemptRecovery attempts to recover from a recoverable error +func (eh *ErrorHandler) attemptRecovery(ctx context.Context, err error, resource coh.CoherenceResource) (reconcile.Result, error) { + // Get the latest version of the resource + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Get the annotations + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + // Check if we've already attempted recovery recently + if lastAttempt, ok := annotations[AnnotationLastRecoveryAttempt]; ok { + // Parse the last attempt time + lastAttemptTime, parseErr := time.Parse(time.RFC3339, lastAttempt) + if parseErr == nil { + // If the last attempt was less than 5 minutes ago, don't try again yet + if time.Since(lastAttemptTime) < 5*time.Minute { + eh.Log.Info("Skipping recovery attempt - too soon since last attempt", + "resource", resource.GetName(), + "namespace", resource.GetNamespace(), + "lastAttempt", lastAttempt) + return reconcile.Result{Requeue: true, RequeueAfter: 1 * time.Minute}, nil + } + } + } + + // Record the recovery attempt + annotations[AnnotationLastRecoveryAttempt] = time.Now().Format(time.RFC3339) + latest.SetAnnotations(annotations) + + // Update the resource + if err := eh.Client.Update(ctx, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Log the recovery attempt with detailed diagnostics + eh.Log.Info("Attempting recovery from error", + "resource", resource.GetName(), + "namespace", resource.GetNamespace(), + "error", err.Error(), + "resourceVersion", latest.GetResourceVersion(), + "generation", latest.GetGeneration(), + "deletionTimestamp", latest.GetDeletionTimestamp()) + + // Record an event with detailed information + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAttempt", + fmt.Sprintf("Attempting to recover from error: %s", err.Error())) + + // Implement recovery logic based on the error + errStr := err.Error() + + // Service suspension failure + if strings.Contains(errStr, "failed to suspend services") { + return eh.recoverFromServiceSuspensionFailure(ctx, resource) + } + + // Pod disruption budget issues + if strings.Contains(errStr, "pod disruption budget") && strings.Contains(errStr, "not available") { + return eh.recoverFromPDBIssue(ctx, resource) + } + + // StatefulSet patching issues + if strings.Contains(errStr, "cannot patch") && strings.Contains(errStr, "StatefulSet") { + return eh.recoverFromStatefulSetPatchIssue(ctx, resource) + } + + // Resource quota issues + if strings.Contains(errStr, "cannot create pods") && strings.Contains(errStr, "quota exceeded") { + return eh.recoverFromQuotaIssue(ctx, resource) + } + + // Scheduling issues + if strings.Contains(errStr, "cannot schedule") || strings.Contains(errStr, "unschedulable") { + return eh.recoverFromSchedulingIssue(ctx, resource) + } + + // If we don't have specific recovery logic for this error, log it and requeue with backoff + eh.Log.Info("No specific recovery mechanism for this error, will retry later", + "resource", resource.GetName(), + "namespace", resource.GetNamespace(), + "error", err.Error()) + + // Add diagnostic information to the resource + annotations["coherence.oracle.com/last-unhandled-error"] = err.Error() + annotations["coherence.oracle.com/diagnostic-info"] = fmt.Sprintf( + "No specific recovery mechanism available. Error: %s, Time: %s", + err.Error(), + time.Now().Format(time.RFC3339)) + + if updateErr := eh.Client.Update(ctx, latest); updateErr != nil { + eh.Log.Error(updateErr, "Failed to update resource with diagnostic information") + } + + return reconcile.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil +} + +// recoverFromServiceSuspensionFailure attempts to recover from a service suspension failure +func (eh *ErrorHandler) recoverFromServiceSuspensionFailure(ctx context.Context, resource coh.CoherenceResource) (reconcile.Result, error) { + // 1. Log the recovery attempt + eh.Log.Info("Attempting to recover from service suspension failure", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + // 2. Record an event + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAttempt", + "Attempting to recover from service suspension failure") + + // 3. Implement the recovery logic + // For service suspension failures, we'll try to force remove the finalizer + // This allows the resource to be deleted even if service suspension failed + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Check if this is a deletion and has the Coherence finalizer + if latest.GetDeletionTimestamp() != nil { + // This is a deletion, so we'll try to remove the finalizer + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + // Add an annotation to indicate we're bypassing the finalizer + annotations["coherence.oracle.com/finalizer-bypass"] = "true" + latest.SetAnnotations(annotations) + + // Update the resource with the annotation + if err := eh.Client.Update(ctx, latest); err != nil { + eh.Log.Error(err, "Failed to add finalizer bypass annotation") + return reconcile.Result{Requeue: true}, err + } + + eh.Log.Info("Added finalizer bypass annotation to resource", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAction", + "Added finalizer bypass annotation to allow deletion despite service suspension failure") + } + + // 4. Return a result that requeues after a short delay to check if recovery was successful + return reconcile.Result{Requeue: true, RequeueAfter: 10 * time.Second}, nil +} + +// recoverFromPDBIssue attempts to recover from Pod Disruption Budget issues +func (eh *ErrorHandler) recoverFromPDBIssue(ctx context.Context, resource coh.CoherenceResource) (reconcile.Result, error) { + // Log the recovery attempt + eh.Log.Info("Attempting to recover from Pod Disruption Budget issue", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + // Record an event + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAttempt", + "Attempting to recover from Pod Disruption Budget issue") + + // For PDB issues, we'll add an annotation to indicate that we're aware of the issue + // This can be used by operators or automation to take appropriate action + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Add diagnostic information + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + annotations["coherence.oracle.com/pdb-issue-detected"] = "true" + annotations["coherence.oracle.com/pdb-issue-time"] = time.Now().Format(time.RFC3339) + latest.SetAnnotations(annotations) + + // Update the resource with the annotation + if err := eh.Client.Update(ctx, latest); err != nil { + eh.Log.Error(err, "Failed to add PDB issue annotation") + return reconcile.Result{Requeue: true}, err + } + + eh.Log.Info("Added PDB issue annotation to resource", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAction", + "Added PDB issue annotation to resource") + + // Return a result that requeues after a delay + return reconcile.Result{Requeue: true, RequeueAfter: 30 * time.Second}, nil +} + +// recoverFromStatefulSetPatchIssue attempts to recover from StatefulSet patching issues +func (eh *ErrorHandler) recoverFromStatefulSetPatchIssue(ctx context.Context, resource coh.CoherenceResource) (reconcile.Result, error) { + // Log the recovery attempt + eh.Log.Info("Attempting to recover from StatefulSet patching issue", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + // Record an event + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAttempt", + "Attempting to recover from StatefulSet patching issue") + + // For StatefulSet patching issues, we'll add an annotation to force a recreation + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Add diagnostic information + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + annotations["coherence.oracle.com/force-reconcile"] = time.Now().Format(time.RFC3339) + latest.SetAnnotations(annotations) + + // Update the resource with the annotation + if err := eh.Client.Update(ctx, latest); err != nil { + eh.Log.Error(err, "Failed to add force-reconcile annotation") + return reconcile.Result{Requeue: true}, err + } + + eh.Log.Info("Added force-reconcile annotation to resource", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + eh.EventRecorder.Event(resource, corev1.EventTypeNormal, "RecoveryAction", + "Added force-reconcile annotation to resource to address StatefulSet patching issue") + + // Return a result that requeues after a delay + return reconcile.Result{Requeue: true, RequeueAfter: 10 * time.Second}, nil +} + +// recoverFromQuotaIssue attempts to recover from resource quota issues +func (eh *ErrorHandler) recoverFromQuotaIssue(ctx context.Context, resource coh.CoherenceResource) (reconcile.Result, error) { + // Log the recovery attempt + eh.Log.Info("Attempting to recover from resource quota issue", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + // Record an event + eh.EventRecorder.Event(resource, corev1.EventTypeWarning, "RecoveryAttempt", + "Attempting to recover from resource quota issue - this may require manual intervention") + + // For quota issues, we'll add an annotation to indicate the issue + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Add diagnostic information + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + annotations["coherence.oracle.com/quota-issue-detected"] = "true" + annotations["coherence.oracle.com/quota-issue-time"] = time.Now().Format(time.RFC3339) + latest.SetAnnotations(annotations) + + // Update the resource with the annotation + if err := eh.Client.Update(ctx, latest); err != nil { + eh.Log.Error(err, "Failed to add quota issue annotation") + return reconcile.Result{Requeue: true}, err + } + + eh.Log.Info("Added quota issue annotation to resource", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + eh.EventRecorder.Event(resource, corev1.EventTypeWarning, "ResourceQuotaIssue", + "Resource quota exceeded - manual intervention may be required to increase quota or reduce resource requests") + + // Return a result that requeues after a longer delay + return reconcile.Result{Requeue: true, RequeueAfter: 2 * time.Minute}, nil +} + +// recoverFromSchedulingIssue attempts to recover from pod scheduling issues +func (eh *ErrorHandler) recoverFromSchedulingIssue(ctx context.Context, resource coh.CoherenceResource) (reconcile.Result, error) { + // Log the recovery attempt + eh.Log.Info("Attempting to recover from pod scheduling issue", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + // Record an event + eh.EventRecorder.Event(resource, corev1.EventTypeWarning, "RecoveryAttempt", + "Attempting to recover from pod scheduling issue - this may require manual intervention") + + // For scheduling issues, we'll add an annotation to indicate the issue + latest := resource.DeepCopyResource() + if err := eh.Client.Get(ctx, types.NamespacedName{ + Namespace: resource.GetNamespace(), + Name: resource.GetName(), + }, latest); err != nil { + return reconcile.Result{Requeue: true}, err + } + + // Add diagnostic information + annotations := latest.GetAnnotations() + if annotations == nil { + annotations = make(map[string]string) + } + + annotations["coherence.oracle.com/scheduling-issue-detected"] = "true" + annotations["coherence.oracle.com/scheduling-issue-time"] = time.Now().Format(time.RFC3339) + latest.SetAnnotations(annotations) + + // Update the resource with the annotation + if err := eh.Client.Update(ctx, latest); err != nil { + eh.Log.Error(err, "Failed to add scheduling issue annotation") + return reconcile.Result{Requeue: true}, err + } + + eh.Log.Info("Added scheduling issue annotation to resource", + "resource", resource.GetName(), + "namespace", resource.GetNamespace()) + + eh.EventRecorder.Event(resource, corev1.EventTypeWarning, "SchedulingIssue", + "Pod scheduling issue detected - manual intervention may be required to address node resources or affinity rules") + + // Return a result that requeues after a longer delay + return reconcile.Result{Requeue: true, RequeueAfter: 2 * time.Minute}, nil +} + +// RetryOnError retries the given function with exponential backoff on error +func (eh *ErrorHandler) RetryOnError(operation string, fn func() error) error { + return retry.OnError(retry.DefaultRetry, func(err error) bool { + category := eh.categorizeError(err) + retryable := category == ErrorCategoryTransient + if retryable { + eh.Log.Info("Retrying operation due to transient error", + "operation", operation, + "error", err.Error()) + } + return retryable + }, fn) +} + +// RetryWithContext retries the given function with context and additional metadata +func (eh *ErrorHandler) RetryWithContext(ctx context.Context, operation string, resource string, namespace string, fn func() error) error { + return retry.OnError(retry.DefaultRetry, func(err error) bool { + category := eh.categorizeError(err) + retryable := category == ErrorCategoryTransient + if retryable { + eh.Log.Info("Retrying operation due to transient error", + "operation", operation, + "resource", resource, + "namespace", namespace, + "error", err.Error()) + } + return retryable + }, func() error { + err := fn() + if err != nil { + // Wrap the error with operation context + return NewResourceError(operation, resource, namespace, err) + } + return nil + }) +} + +// HandleResourceError handles an error related to a specific resource +func (eh *ErrorHandler) HandleResourceError(ctx context.Context, err error, resource coh.CoherenceResource, operation string, msg string) (reconcile.Result, error) { + if err == nil { + return reconcile.Result{}, nil + } + + // Create a resource error with context + resourceErr := NewResourceError( + operation, + resource.GetName(), + resource.GetNamespace(), + err, + ) + + // Add additional context if available + if opErr, ok := err.(*OperationError); ok { + for k, v := range opErr.Context { + _ = resourceErr.WithContext(k, v) + } + } + + return eh.HandleError(ctx, resourceErr, resource, msg) +} + +// MustNotError panics if the error is not nil +func MustNotError(err error) { + if err != nil { + panic(err) + } +} + +// LogAndReturnError logs the error and returns it +func (eh *ErrorHandler) LogAndReturnError(err error, msg string) error { + if err == nil { + return nil + } + + eh.Log.Error(err, msg) + return err +} + +// LogAndWrapError logs the error, wraps it with the message, and returns it +func (eh *ErrorHandler) LogAndWrapError(err error, msg string) error { + if err == nil { + return nil + } + + wrappedErr := WrapError(err, msg) + eh.Log.Error(wrappedErr, msg) + return wrappedErr +} + +// LogAndWrapErrorf logs the error, wraps it with the formatted message, and returns it +func (eh *ErrorHandler) LogAndWrapErrorf(err error, format string, args ...interface{}) error { + if err == nil { + return nil + } + + msg := fmt.Sprintf(format, args...) + wrappedErr := WrapError(err, msg) + eh.Log.Error(wrappedErr, msg) + return wrappedErr +} + +// min returns the minimum of two integers +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// pow returns a^b for integers +func pow(a, b int) int { + result := 1 + for i := 0; i < b; i++ { + result *= a + } + return result +} + +// OperationError represents an error that occurred during an operation +type OperationError struct { + Operation string + Resource string + Namespace string + Err error + Context map[string]string +} + +// Error returns the error message +func (e *OperationError) Error() string { + msg := fmt.Sprintf("operation '%s' failed", e.Operation) + if e.Resource != "" { + if e.Namespace != "" { + msg += fmt.Sprintf(" for resource '%s' in namespace '%s'", e.Resource, e.Namespace) + } else { + msg += fmt.Sprintf(" for resource '%s'", e.Resource) + } + } + + if len(e.Context) > 0 { + contextStr := "" + for k, v := range e.Context { + if contextStr != "" { + contextStr += ", " + } + contextStr += fmt.Sprintf("%s=%s", k, v) + } + msg += fmt.Sprintf(" (context: %s)", contextStr) + } + + if e.Err != nil { + msg += fmt.Sprintf(": %v", e.Err) + } + + return msg +} + +// Unwrap returns the underlying error +func (e *OperationError) Unwrap() error { + return e.Err +} + +// Cause returns the underlying error (for compatibility with github.com/pkg/errors) +func (e *OperationError) Cause() error { + return e.Err +} + +// WithContext adds context to the error +func (e *OperationError) WithContext(key, value string) *OperationError { + if e.Context == nil { + e.Context = make(map[string]string) + } + e.Context[key] = value + return e +} + +// NewOperationError creates a new OperationError +func NewOperationError(operation string, err error) *OperationError { + return &OperationError{ + Operation: operation, + Err: err, + Context: make(map[string]string), + } +} + +// NewResourceError creates a new OperationError for a specific resource +func NewResourceError(operation string, resource string, namespace string, err error) *OperationError { + return &OperationError{ + Operation: operation, + Resource: resource, + Namespace: namespace, + Err: err, + Context: make(map[string]string), + } +} + +// WrapError wraps an error with context information +func WrapError(err error, message string) error { + if err == nil { + return nil + } + return errors.Wrap(err, message) +} + +// WrapErrorf wraps an error with formatted context information +func WrapErrorf(err error, format string, args ...interface{}) error { + if err == nil { + return nil + } + return errors.Wrapf(err, format, args...) +} + +// NewError creates a new error with the given message +func NewError(message string) error { + return errors.New(message) +} + +// NewErrorf creates a new error with the given formatted message +func NewErrorf(format string, args ...interface{}) error { + return errors.Errorf(format, args...) +} + +// IsNotFound returns true if the error is a not found error +func IsNotFound(err error) bool { + return apierrors.IsNotFound(err) +} + +// IsAlreadyExists returns true if the error is an already exists error +func IsAlreadyExists(err error) bool { + return apierrors.IsAlreadyExists(err) +} + +// IsConflict returns true if the error is a conflict error +func IsConflict(err error) bool { + return apierrors.IsConflict(err) +} + +// IsInvalid returns true if the error is an invalid error +func IsInvalid(err error) bool { + return apierrors.IsInvalid(err) +} + +// IsForbidden returns true if the error is a forbidden error +func IsForbidden(err error) bool { + return apierrors.IsForbidden(err) +} + +// IsTimeout returns true if the error is a timeout error +func IsTimeout(err error) bool { + return apierrors.IsTimeout(err) +} + +// IsServerTimeout returns true if the error is a server timeout error +func IsServerTimeout(err error) bool { + return apierrors.IsServerTimeout(err) +} + +// IsTooManyRequests returns true if the error is a too many requests error +func IsTooManyRequests(err error) bool { + return apierrors.IsTooManyRequests(err) +} + +// IsServiceUnavailable returns true if the error is a service unavailable error +func IsServiceUnavailable(err error) bool { + return apierrors.IsServiceUnavailable(err) +} + +// WithStack adds a stack trace to the error if it doesn't already have one +func WithStack(err error) error { + if err == nil { + return nil + } + return errors.WithStack(err) +} + +// GetCallerInfo returns the file and line number of the caller +func GetCallerInfo(skip int) string { + _, file, line, ok := runtime.Caller(skip + 1) + if !ok { + return "unknown:0" + } + + // Extract just the filename from the full path + parts := strings.Split(file, "/") + file = parts[len(parts)-1] + + return fmt.Sprintf("%s:%d", file, line) +} + +// NewErrorHandler creates a new ErrorHandler +func NewErrorHandler(client client.Client, log logr.Logger, recorder record.EventRecorder) *ErrorHandler { + return &ErrorHandler{ + Client: client, + Log: log, + EventRecorder: recorder, + } +} diff --git a/controllers/errorhandling/error_handler_test.go b/controllers/errorhandling/error_handler_test.go new file mode 100644 index 00000000..993ff8f9 --- /dev/null +++ b/controllers/errorhandling/error_handler_test.go @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package errorhandling + +import ( + "fmt" + "testing" + + "github.com/pkg/errors" + "github.com/stretchr/testify/assert" +) + +// TestOperationError tests the OperationError type +func TestOperationError(t *testing.T) { + // Create a simple error + baseErr := errors.New("base error") + + // Create an OperationError + opErr := NewOperationError("test_operation", baseErr) + + // Test basic properties + assert.Equal(t, "test_operation", opErr.Operation) + assert.Equal(t, baseErr, opErr.Err) + + // Test adding context + _ = opErr.WithContext("key1", "value1").WithContext("key2", "value2") + assert.Equal(t, "value1", opErr.Context["key1"]) + assert.Equal(t, "value2", opErr.Context["key2"]) + + // Test error message formatting + assert.Contains(t, opErr.Error(), "operation 'test_operation' failed") + assert.Contains(t, opErr.Error(), "context: key1=value1, key2=value2") + assert.Contains(t, opErr.Error(), "base error") + + // Test Unwrap + assert.Equal(t, baseErr, opErr.Unwrap()) + + // Test Cause (for compatibility with github.com/pkg/errors) + assert.Equal(t, baseErr, opErr.Cause()) +} + +// TestResourceError tests the resource-specific error creation +func TestResourceError(t *testing.T) { + // Create a simple error + baseErr := errors.New("base error") + + // Create a ResourceError + resErr := NewResourceError("test_operation", "test-resource", "test-namespace", baseErr) + + // Test basic properties + assert.Equal(t, "test_operation", resErr.Operation) + assert.Equal(t, "test-resource", resErr.Resource) + assert.Equal(t, "test-namespace", resErr.Namespace) + assert.Equal(t, baseErr, resErr.Err) + + // Test error message formatting + assert.Contains(t, resErr.Error(), "operation 'test_operation' failed for resource 'test-resource' in namespace 'test-namespace'") + assert.Contains(t, resErr.Error(), "base error") +} + +// TestErrorWrapping tests the error wrapping functions +func TestErrorWrapping(t *testing.T) { + // Create a simple error + baseErr := errors.New("base error") + + // Test WrapError + wrappedErr := WrapError(baseErr, "wrapped message") + assert.Contains(t, wrappedErr.Error(), "wrapped message: base error") + + // Test WrapErrorf + wrappedErrf := WrapErrorf(baseErr, "formatted %s", "message") + assert.Contains(t, wrappedErrf.Error(), "formatted message: base error") + + // Test WithStack + stackErr := WithStack(baseErr) + assert.Contains(t, fmt.Sprintf("%+v", stackErr), "base error") + assert.Contains(t, fmt.Sprintf("%+v", stackErr), "error_handler_test.go") +} + +// TestHelperFunctions tests the error helper functions +func TestHelperFunctions(t *testing.T) { + // Test NewCreateResourceError + createErr := NewCreateResourceError("test-resource", "test-namespace", errors.New("create error")) + opErr, ok := createErr.(*OperationError) + assert.True(t, ok, "Expected *OperationError type") + assert.Equal(t, "create", opErr.Operation) + assert.Equal(t, "test-resource", opErr.Resource) + assert.Equal(t, "test-namespace", opErr.Namespace) + assert.Contains(t, createErr.Error(), "create error") + + // Test NewUpdateResourceError + updateErr := NewUpdateResourceError("test-resource", "test-namespace", errors.New("update error")) + opErr, ok = updateErr.(*OperationError) + assert.True(t, ok, "Expected *OperationError type") + assert.Equal(t, "update", opErr.Operation) + assert.Contains(t, updateErr.Error(), "update error") + + // Test NewDeleteResourceError + deleteErr := NewDeleteResourceError("test-resource", "test-namespace", errors.New("delete error")) + opErr, ok = deleteErr.(*OperationError) + assert.True(t, ok, "Expected *OperationError type") + assert.Equal(t, "delete", opErr.Operation) + assert.Contains(t, deleteErr.Error(), "delete error") + + // Test NewGetResourceError + getErr := NewGetResourceError("test-resource", "test-namespace", errors.New("get error")) + opErr, ok = getErr.(*OperationError) + assert.True(t, ok, "Expected *OperationError type") + assert.Equal(t, "get", opErr.Operation) + assert.Contains(t, getErr.Error(), "get error") + + // Test NewReconcileError + reconcileErr := NewReconcileError("test-resource", "test-namespace", errors.New("reconcile error")) + opErr, ok = reconcileErr.(*OperationError) + assert.True(t, ok, "Expected *OperationError type") + assert.Equal(t, "reconcile", opErr.Operation) + assert.Contains(t, reconcileErr.Error(), "reconcile error") +} + +// TestGetCallerInfo tests the GetCallerInfo function +func TestGetCallerInfo(t *testing.T) { + callerInfo := GetCallerInfo(0) + assert.Contains(t, callerInfo, "error_handler_test.go") + assert.Contains(t, callerInfo, ":") +} diff --git a/controllers/finalizer/finalizer.go b/controllers/finalizer/finalizer.go new file mode 100644 index 00000000..b8c70968 --- /dev/null +++ b/controllers/finalizer/finalizer.go @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package finalizer + +import ( + "context" + "fmt" + "github.com/go-logr/logr" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/events" + "github.com/oracle/coherence-operator/pkg/operator" + "github.com/oracle/coherence-operator/pkg/patching" + "github.com/oracle/coherence-operator/pkg/probe" + "github.com/pkg/errors" + "github.com/spf13/viper" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "strconv" +) + +// FinalizerManager manages finalizers for Coherence resources +type FinalizerManager struct { + Client client.Client + Log logr.Logger + EventRecorder record.EventRecorder + Patcher patching.ResourcePatcher +} + +// EnsureFinalizerApplied ensures the finalizer is applied to the Coherence resource +func (fm *FinalizerManager) EnsureFinalizerApplied(ctx context.Context, c *coh.Coherence) (bool, error) { + if !controllerutil.ContainsFinalizer(c, coh.CoherenceFinalizer) { + // Re-fetch the Coherence resource to ensure we have the most recent copy + latest := &coh.Coherence{} + c.DeepCopyInto(latest) + controllerutil.AddFinalizer(latest, coh.CoherenceFinalizer) + + callback := func() { + fm.Log.Info("Added finalizer to Coherence resource", "Namespace", c.Namespace, "Name", c.Name, "Finalizer", coh.CoherenceFinalizer) + } + + // Perform a three-way patch to apply the finalizer + applied, err := fm.Patcher.ThreeWayPatchWithCallback(ctx, c.Name, c, c, latest, callback) + if err != nil { + return false, errors.Wrapf(err, "failed to update Coherence resource %s/%s with finalizer", c.Namespace, c.Name) + } + return applied, nil + } + return false, nil +} + +// EnsureFinalizerRemoved ensures the finalizer is removed from the Coherence resource +func (fm *FinalizerManager) EnsureFinalizerRemoved(ctx context.Context, c *coh.Coherence) error { + if controllerutil.RemoveFinalizer(c, coh.CoherenceFinalizer) { + err := fm.Client.Update(ctx, c) + if err != nil { + fm.Log.Info("Failed to remove the finalizer from the Coherence resource, it looks like it had already been deleted") + return err + } + } + return nil +} + +// FinalizeDeployment performs any required finalizer tasks for the Coherence resource +func (fm *FinalizerManager) FinalizeDeployment(ctx context.Context, c *coh.Coherence, findStatefulSet func(ctx context.Context, namespace, name string) (*appsv1.StatefulSet, bool, error)) error { + // Check if the finalizer bypass annotation is present + annotations := c.GetAnnotations() + if annotations != nil { + if _, bypass := annotations["coherence.oracle.com/finalizer-bypass"]; bypass { + fm.Log.Info("Bypassing service suspension due to finalizer-bypass annotation", + "Namespace", c.Namespace, "Name", c.Name) + fm.EventRecorder.Event(c, corev1.EventTypeNormal, "FinalizerBypassed", + "Service suspension bypassed due to finalizer-bypass annotation") + return nil + } + } + + // determine whether we can skip service suspension + if viper.GetBool(operator.FlagSkipServiceSuspend) { + fm.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + + operator.FlagSkipServiceSuspend + " is set to true") + return nil + } + if !c.Spec.IsSuspendServicesOnShutdown() { + fm.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + + " Spec.SuspendServicesOnShutdown is set to false") + return nil + } + if c.GetReplicas() == 0 { + fm.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + + " Spec.Replicas is zero") + return nil + } + + fm.Log.Info("Finalizing Coherence resource", "Namespace", c.Namespace, "Name", c.Name) + // Get the StatefulSet + sts, stsExists, err := findStatefulSet(ctx, c.Namespace, c.Name) + if err != nil { + return errors.Wrapf(err, "getting StatefulSet %s/%s", c.Namespace, c.Name) + } + if stsExists { + if sts.Status.ReadyReplicas == 0 { + fm.Log.Info("Skipping suspension of Coherence services in deployment " + c.Name + " - No Pods are ready") + } else { + // Do service suspension... + p := probe.CoherenceProbe{ + Client: fm.Client, + EventRecorder: events.NewOwnedEventRecorder(c, fm.EventRecorder), + } + if p.SuspendServices(ctx, c, sts) == probe.ServiceSuspendFailed { + // Log the failure but don't return an error if we've already tried multiple times + // This prevents resources from being stuck in a deleting state indefinitely + errorCount := 1 + if annotations != nil { + if countStr, ok := annotations["coherence.oracle.com/error-count"]; ok { + if parsedCount, err := strconv.Atoi(countStr); err == nil { + errorCount = parsedCount + } + } + } + + if errorCount > 3 { + fm.Log.Info("Service suspension failed multiple times, allowing deletion to proceed", + "Namespace", c.Namespace, "Name", c.Name, "ErrorCount", errorCount) + fm.EventRecorder.Event(c, corev1.EventTypeWarning, "ServiceSuspensionFailed", + "Service suspension failed multiple times, allowing deletion to proceed anyway") + return nil + } + + return fmt.Errorf("failed to suspend services") + } + } + } + return nil +} diff --git a/controllers/reconciler/base_controller.go b/controllers/reconciler/base_controller.go index fce3673e..16ade0a2 100644 --- a/controllers/reconciler/base_controller.go +++ b/controllers/reconciler/base_controller.go @@ -11,6 +11,7 @@ import ( "fmt" "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/controllers/errorhandling" "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/operator" "github.com/oracle/coherence-operator/pkg/patching" @@ -425,23 +426,40 @@ func (in *CommonReconciler) CreateThreeWayPatchData(original, desired, current r // HandleErrAndRequeue is the common error handler func (in *CommonReconciler) HandleErrAndRequeue(ctx context.Context, err error, deployment coh.CoherenceResource, msg string, logger logr.Logger) (reconcile.Result, error) { - return in.Failed(ctx, err, deployment, msg, true, logger) + // Create an error handler if needed + errorHandler := in.getErrorHandler() + + // Add caller information to the error if it's not already an OperationError + if err != nil && !isOperationError(err) { + callerInfo := errorhandling.GetCallerInfo(1) + err = errorhandling.WrapError(err, fmt.Sprintf("at %s", callerInfo)) + } + + return errorHandler.HandleError(ctx, err, deployment, msg) } -// HandleErrAndFinish is the common error handler -func (in *CommonReconciler) HandleErrAndFinish(ctx context.Context, err error, deployment *coh.Coherence, msg string, logger logr.Logger) (reconcile.Result, error) { - return in.Failed(ctx, err, deployment, msg, false, logger) +// isOperationError checks if the error is an OperationError +func isOperationError(err error) bool { + _, ok := err.(*errorhandling.OperationError) + return ok } -// Failed is a common error handler -// ToDo: we need to be able to add some form of back-off so that failures are re-queued with a growing back-off time -func (in *CommonReconciler) Failed(ctx context.Context, err error, deployment coh.CoherenceResource, msg string, requeue bool, logger logr.Logger) (reconcile.Result, error) { +// HandleErrAndFinish is the common error handler +func (in *CommonReconciler) HandleErrAndFinish(ctx context.Context, err error, deployment *coh.Coherence, msg string, logger logr.Logger) (reconcile.Result, error) { if err == nil { logger.V(0).Info(msg) - } else { - logger.V(0).Info(msg + ": " + err.Error()) + return reconcile.Result{Requeue: false}, nil + } + + // Add caller information to the error if it's not already an OperationError + if !isOperationError(err) { + callerInfo := errorhandling.GetCallerInfo(1) + err = errorhandling.WrapError(err, fmt.Sprintf("at %s", callerInfo)) } + // For errors that should not be requeued, we still want to log and update status + logger.V(0).Info(msg + ": " + err.Error()) + if deployment != nil { // update the status to failed. status := deployment.GetStatus() @@ -452,15 +470,61 @@ func (in *CommonReconciler) Failed(ctx context.Context, err error, deployment co } // send a failure event - in.GetEventRecorder().Event(deployment, corev1.EventTypeNormal, "failed", msg) + in.GetEventRecorder().Event(deployment, corev1.EventTypeWarning, "Failed", msg+": "+err.Error()) } - if requeue { - return reconcile.Result{Requeue: true}, nil + // Return the error as a permanent error so it won't be requeued + if !isOperationError(err) { + err = errorhandling.NewOperationError("handle_error", err). + WithContext("requeue", "false"). + WithContext("reason", "permanent") } + return reconcile.Result{Requeue: false}, nil } +// Failed is a common error handler (deprecated, use HandleErrAndRequeue instead) +// This is kept for backward compatibility +func (in *CommonReconciler) Failed(ctx context.Context, err error, deployment coh.CoherenceResource, msg string, requeue bool, logger logr.Logger) (reconcile.Result, error) { + // Add caller information to the error if it's not already an OperationError + if err != nil && !isOperationError(err) { + callerInfo := errorhandling.GetCallerInfo(1) + err = errorhandling.WrapError(err, fmt.Sprintf("at %s", callerInfo)) + } + + if requeue { + return in.HandleErrAndRequeue(ctx, err, deployment, msg, logger) + } + if deployment != nil { + return in.HandleErrAndFinish(ctx, err, deployment.(*coh.Coherence), msg, logger) + } + + // Fallback for nil deployment + if err == nil { + logger.V(0).Info(msg) + } else { + logger.V(0).Info(msg + ": " + err.Error()) + + // Wrap the error with context if it's not already an OperationError + if !isOperationError(err) { + err = errorhandling.NewOperationError("handle_error", err). + WithContext("requeue", fmt.Sprintf("%t", requeue)). + WithContext("deployment", "nil") + } + } + + return reconcile.Result{Requeue: requeue}, err +} + +// getErrorHandler returns an error handler for this reconciler +func (in *CommonReconciler) getErrorHandler() *errorhandling.ErrorHandler { + return errorhandling.NewErrorHandler( + in.GetClient(), + in.GetLog().WithName("error-handler"), + in.GetEventRecorder(), + ) +} + // FindOwningCoherenceResource finds the owning Coherence resource. func (in *CommonReconciler) FindOwningCoherenceResource(ctx context.Context, o client.Object) (coh.CoherenceResource, error) { if o != nil { diff --git a/controllers/resources/operator_secret.go b/controllers/resources/operator_secret.go new file mode 100644 index 00000000..6d0e0e8a --- /dev/null +++ b/controllers/resources/operator_secret.go @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package resources + +import ( + "context" + "github.com/go-logr/logr" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/rest" + coreV1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// OperatorSecretManager manages the operator configuration secret +type OperatorSecretManager struct { + Client client.Client + Log logr.Logger +} + +// EnsureOperatorSecret ensures that the Operator configuration secret exists in the namespace. +func (osm *OperatorSecretManager) EnsureOperatorSecret(ctx context.Context, deployment *coh.Coherence) error { + namespace := deployment.Namespace + s := &coreV1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: coh.OperatorConfigName, + Namespace: namespace, + Labels: deployment.CreateGlobalLabels(), + Annotations: deployment.CreateGlobalAnnotations(), + }, + } + + err := osm.Client.Get(ctx, types.NamespacedName{Name: coh.OperatorConfigName, Namespace: namespace}, s) + if err != nil && !apierrors.IsNotFound(err) { + return err + } + + restHostAndPort := rest.GetServerHostAndPort() + + oldValue := s.Data[coh.OperatorConfigKeyHost] + if oldValue == nil || string(oldValue) != restHostAndPort { + // data is different so create/update + + if s.StringData == nil { + s.StringData = make(map[string]string) + } + s.StringData[coh.OperatorConfigKeyHost] = restHostAndPort + + osm.Log.Info("Operator configuration updated", "Key", coh.OperatorConfigKeyHost, "OldValue", string(oldValue), "NewValue", restHostAndPort) + if apierrors.IsNotFound(err) { + // for some reason we're getting here even if the secret exists so delete it!! + _ = osm.Client.Delete(ctx, s) + osm.Log.Info("Creating configuration secret " + coh.OperatorConfigName) + err = osm.Client.Create(ctx, s) + } else { + osm.Log.Info("Updating configuration secret " + coh.OperatorConfigName) + err = osm.Client.Update(ctx, s) + } + } + + return err +} diff --git a/controllers/status/status_manager.go b/controllers/status/status_manager.go new file mode 100644 index 00000000..8d832805 --- /dev/null +++ b/controllers/status/status_manager.go @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package status + +import ( + "context" + "github.com/go-logr/logr" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/operator" + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// StatusManager manages the status of Coherence resources +type StatusManager struct { + Client client.Client + Log logr.Logger +} + +// UpdateCoherenceStatusPhase updates the phase of a Coherence resource +func (sm *StatusManager) UpdateCoherenceStatusPhase(ctx context.Context, namespacedName types.NamespacedName, phase coh.ConditionType) error { + // Get the latest version of the Coherence resource + deployment := &coh.Coherence{} + err := sm.Client.Get(ctx, namespacedName, deployment) + if err != nil { + return errors.Wrapf(err, "getting Coherence resource %s/%s", namespacedName.Namespace, namespacedName.Name) + } + + // Update the status phase + deployment.Status.Phase = phase + + // Update the resource + err = sm.Client.Status().Update(ctx, deployment) + if err != nil { + return errors.Wrapf(err, "updating status phase for Coherence resource %s/%s", namespacedName.Namespace, namespacedName.Name) + } + + return nil +} + +// UpdateDeploymentStatusHash updates the hash in the status of a Coherence resource +func (sm *StatusManager) UpdateDeploymentStatusHash(ctx context.Context, namespacedName types.NamespacedName, hash string) error { + // Get the latest version of the Coherence resource + deployment := &coh.Coherence{} + err := sm.Client.Get(ctx, namespacedName, deployment) + if err != nil { + return errors.Wrapf(err, "getting Coherence resource %s/%s", namespacedName.Namespace, namespacedName.Name) + } + + // Update the status hash + deployment.Status.Hash = hash + deployment.Status.SetVersion(operator.GetVersion()) + + // Update the resource + err = sm.Client.Status().Update(ctx, deployment) + if err != nil { + return errors.Wrapf(err, "updating status hash for Coherence resource %s/%s", namespacedName.Namespace, namespacedName.Name) + } + + return nil +} diff --git a/go.mod b/go.mod index 07ed3b80..612a0633 100644 --- a/go.mod +++ b/go.mod @@ -22,6 +22,7 @@ require ( github.com/spf13/cobra v1.9.1 github.com/spf13/pflag v1.0.6 github.com/spf13/viper v1.20.1 + github.com/stretchr/testify v1.10.0 golang.org/x/mod v0.24.0 golang.org/x/net v0.40.0 k8s.io/api v0.33.0 @@ -72,6 +73,7 @@ require ( github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.63.0 // indirect diff --git a/go.sum b/go.sum index 7092b978..0d333964 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,3 @@ -cel.dev/expr v0.22.0 h1:+hFFhLPmquBImfs1BiN2PZmkr5ASse2ZOuaxIs9e4R8= -cel.dev/expr v0.22.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= cel.dev/expr v0.22.1 h1:xoFEsNh972Yzey8N9TCPx2nDvMN7TMhQEzxLuj/iRrI= cel.dev/expr v0.22.1/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= github.com/antlr4-go/antlr/v4 v4.13.1 h1:SqQKkuVZ+zWkMMNkjy5FZe5mr5WURWnlpmOuzYWrPrQ= @@ -79,7 +77,6 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= -github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 h1:5ZPtiqj0JL5oKWmcsq4VMaAW5ukBEgSGXEN89zeH1Jo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3/go.mod h1:ndYquD05frm2vACXE1nsccT4oJzjhw2arTS2cpUD1PI= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -123,26 +120,18 @@ github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= -github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= -github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.82.0 h1:Ee6zu4IR/WKYEcYHL4+gbC1A3GAzlHWxSjjMyRVBHYw= -github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.82.0/go.mod h1:hY5yoQsoIalncoxYqXXCDL5y7f+GGYYlW9Bi2IdU5KY= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.82.1 h1:7zaP53Udfw1flmENke9vKkW0+QyVpFgQn2qP3QqIQBM= github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.82.1/go.mod h1:hY5yoQsoIalncoxYqXXCDL5y7f+GGYYlW9Bi2IdU5KY= -github.com/prometheus-operator/prometheus-operator/pkg/client v0.82.0 h1:g/wIdMrRyQ1Fac8plyQMXzky8R9kOi243tQi3a2YkyU= -github.com/prometheus-operator/prometheus-operator/pkg/client v0.82.0/go.mod h1:yEp9v3FEYT+iR2ujaXFVS8ugTIQk0Mx+wwXGXoF5az8= github.com/prometheus-operator/prometheus-operator/pkg/client v0.82.1 h1:QPbi5wt9k6skEeZFI3vBE9o9CdI9kTFpXWXxfSy7qv4= github.com/prometheus-operator/prometheus-operator/pkg/client v0.82.1/go.mod h1:Sw1pnGbx7D3j/y+qD5iCx9r2t618OM9Swo9I7pD6zZM= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.63.0 h1:YR/EIY1o3mEFP/kZCD7iDMnLPlGyuU2Gb3HIcXnA98k= @@ -214,8 +203,6 @@ go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw= -golang.org/x/exp v0.0.0-20250305212735-054e65f0b394/go.mod h1:sIifuuw/Yco/y6yb6+bDNfyeQ/MdPUy/hKEMYQV17cM= golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 h1:y5zboxd6LQAqYIhHnB48p0ByQ/GnQx2BE33L8BOHQkI= golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6/go.mod h1:U6Lno4MTRCDY+Ba7aCcauB9T60gsv5s4ralQzP72ZoQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -255,25 +242,18 @@ golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGm golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.31.0 h1:0EedkvKDbh+qistFTd0Bcwe/YLh4vHwWEkiI0toFIBU= -golang.org/x/tools v0.31.0/go.mod h1:naFTU+Cev749tSJRXJlna0T3WxKvb1kWEx15xA4SdmQ= golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= +golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -google.golang.org/genproto/googleapis/api v0.0.0-20250313205543-e70fdf4c4cb4 h1:IFnXJq3UPB3oBREOodn1v1aGQeZYQclEmvWRMN0PSsY= -google.golang.org/genproto/googleapis/api v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:c8q6Z6OCqnfVIqUFJkCzKcrj8eCvUrz+K4KRzSTuANg= google.golang.org/genproto/googleapis/api v0.0.0-20250505200425-f936aa4a68b2 h1:vPV0tzlsK6EzEDHNNH5sa7Hs9bd7iXR7B1tSiPepkV0= google.golang.org/genproto/googleapis/api v0.0.0-20250505200425-f936aa4a68b2/go.mod h1:pKLAc5OolXC3ViWGI62vvC0n10CpwAtRcTNCFwTKBEw= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 h1:iK2jbkWL86DXjEx0qiHcRE9dE4/Ahua5k6V8OWFb//c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4/go.mod h1:LuRYeWDFV6WOn90g357N17oMCaxpgCnbi/44qJvDn2I= google.golang.org/genproto/googleapis/rpc v0.0.0-20250505200425-f936aa4a68b2 h1:IqsN8hx+lWLqlN+Sc3DoMy/watjofWiU8sRFgQ8fhKM= google.golang.org/genproto/googleapis/rpc v0.0.0-20250505200425-f936aa4a68b2/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= -google.golang.org/grpc v1.71.0 h1:kF77BGdPTQ4/JZWMlb9VpJ5pa25aqvVqogsxNHHdeBg= -google.golang.org/grpc v1.71.0/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/grpc v1.71.1 h1:ffsFWr7ygTUscGPI0KKK6TLrGz0476KUvvsbqWK0rPI= google.golang.org/grpc v1.71.1/go.mod h1:H0GRtasmQOh9LkFoCPDu3ZrwUtD1YGE+b2vYBYd/8Ec= google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= @@ -311,8 +291,6 @@ k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff h1:/usPimJzUKKu+m+TE36gUy k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff/go.mod h1:5jIi+8yX4RIb8wk3XwBo5Pq2ccx4FP10ohkbSKCZoK8= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979 h1:jgJW5IePPXLGB8e/1wvd0Ich9QE97RvvF3a8J3fP/Lg= k8s.io/utils v0.0.0-20250502105355-0f33e8f1c979/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.32.0 h1:XotDXzqvJ8Nx5eiZZueLpTuafJz8SiodgOemI+w87QU= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.32.0/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.32.1 h1:Cf+ed5N8038zbsaXFO7mKQDi/+VcSRafb0jM84KX5so= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.32.1/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= sigs.k8s.io/controller-runtime v0.20.4 h1:X3c+Odnxz+iPTRobG4tp092+CvBU9UK0t/bRf+n0DGU=