diff --git a/docs/snippets/providers/base-snippet-autogenerated.mdx b/docs/snippets/providers/base-snippet-autogenerated.mdx index e4e4fdceca..74b7d27f0c 100644 --- a/docs/snippets/providers/base-snippet-autogenerated.mdx +++ b/docs/snippets/providers/base-snippet-autogenerated.mdx @@ -1,4 +1,4 @@ -{/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py +{/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} @@ -43,6 +43,7 @@ Check the following workflow examples: - [elastic_enrich_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/elastic_enrich_example.yml) - [ifelse.yml](https://github.com/keephq/keep/blob/main/examples/workflows/ifelse.yml) - [incident-tier-escalation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/incident-tier-escalation.yml) +- [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) - [query_victoriametrics.yml](https://github.com/keephq/keep/blob/main/examples/workflows/query_victoriametrics.yml) - [raw_sql_query_datetime.yml](https://github.com/keephq/keep/blob/main/examples/workflows/raw_sql_query_datetime.yml) - [webhook_example_foreach.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example_foreach.yml) @@ -50,6 +51,6 @@ Check the following workflow examples: ## Topology -This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) -and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context +This provider pulls [topology](/overview/servicetopology) to Keep. It could be used in [correlations](/overview/correlation-topology) +and [mapping](/overview/enrichment/mapping#mapping-with-topology-data), and as a context for [alerts](/alerts/sidebar#7-alert-topology-view) and [incidents](/overview#17-incident-topology). 
\ No newline at end of file diff --git a/docs/snippets/providers/console-snippet-autogenerated.mdx b/docs/snippets/providers/console-snippet-autogenerated.mdx index 75492c1249..b0e546936f 100644 --- a/docs/snippets/providers/console-snippet-autogenerated.mdx +++ b/docs/snippets/providers/console-snippet-autogenerated.mdx @@ -14,9 +14,9 @@ steps: provider: console config: "{{ provider.my_provider_name }}" with: - message: {value} - logger: {value} - severity: {value} + message: {value} + logger: {value} + severity: {value} ``` @@ -51,6 +51,9 @@ Check the following workflow examples: - [inputs_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/inputs_example.yml) - [multi-condition-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/multi-condition-cel.yml) - [mustache-paths-example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/mustache-paths-example.yml) +- [openshift_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_basic.yml) +- [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) +- [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) - [pattern-matching-cel.yml](https://github.com/keephq/keep/blob/main/examples/workflows/pattern-matching-cel.yml) - [severity_changed.yml](https://github.com/keephq/keep/blob/main/examples/workflows/severity_changed.yml) - [webhook_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/webhook_example.yml) diff --git a/docs/snippets/providers/openshift-snippet-autogenerated.mdx b/docs/snippets/providers/openshift-snippet-autogenerated.mdx index d67a362d49..c31758e7f0 100644 --- a/docs/snippets/providers/openshift-snippet-autogenerated.mdx +++ b/docs/snippets/providers/openshift-snippet-autogenerated.mdx @@ -1,4 +1,4 @@ -{/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py +{/* This snippet is automatically generated using scripts/docs_render_provider_snippets.py Do not edit it manually, as it will be overwritten */} ## Authentication @@ -17,6 +17,25 @@ Certain scopes may be required to perform specific actions or queries via the pr This provider can be used in workflows. +As "step" to query data, example: +```yaml +steps: + - name: Query openshift + provider: openshift + config: "{{ provider.my_provider_name }}" + with: + command_type: {value} # The type of query to perform. Supported queries are: +- get_logs: Get logs from a pod +- get_events: Get events for a namespace or pod +- get_pods: List pods in a namespace or across all namespaces +- get_node_pressure: Get node pressure conditions +- get_pvc: List persistent volume claims +- get_routes: List OpenShift routes +- get_deploymentconfigs: List OpenShift deployment configs +- get_projects: List OpenShift projects + # Additional arguments for the query. +``` + As "action" to make changes or update data, example: ```yaml @@ -25,11 +44,18 @@ actions: provider: openshift config: "{{ provider.my_provider_name }}" with: - kind: {value} # The kind of object to restart. Could be deployments, statefulset, daemonset. - name: {value} # The name of the object to restart - project_name: {value} # The project name where the object is located + action: {value} # The action to perform. 
Supported actions are: +- rollout_restart: Restart a deployment, statefulset, or daemonset +- restart_pod: Restart a pod by deleting it +- scale_deployment: Scale a deployment to specified replicas +- scale_deploymentconfig: Scale a deployment config to specified replicas + # Additional arguments for the action. ``` -If you need workflow examples with this provider, please raise a [GitHub issue](https://github.com/keephq/keep/issues). + +Check the following workflow examples: +- [openshift_basic.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_basic.yml) +- [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) +- [openshift_pod_restart.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_pod_restart.yml) diff --git a/docs/snippets/providers/slack-snippet-autogenerated.mdx b/docs/snippets/providers/slack-snippet-autogenerated.mdx index c552f11d8d..f7e6a15064 100644 --- a/docs/snippets/providers/slack-snippet-autogenerated.mdx +++ b/docs/snippets/providers/slack-snippet-autogenerated.mdx @@ -46,6 +46,7 @@ Check the following workflow examples: - [new-auth0-users-monitor.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new-auth0-users-monitor.yml) - [new_github_stars.yml](https://github.com/keephq/keep/blob/main/examples/workflows/new_github_stars.yml) - [notify-new-trello-card.yml](https://github.com/keephq/keep/blob/main/examples/workflows/notify-new-trello-card.yml) +- [openshift_monitoring_and_remediation.yml](https://github.com/keephq/keep/blob/main/examples/workflows/openshift_monitoring_and_remediation.yml) - [opsgenie_open_alerts.yml](https://github.com/keephq/keep/blob/main/examples/workflows/opsgenie_open_alerts.yml) - [permissions_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/permissions_example.yml) - [posthog_example.yml](https://github.com/keephq/keep/blob/main/examples/workflows/posthog_example.yml) diff --git a/examples/workflows/openshift_basic.yml b/examples/workflows/openshift_basic.yml new file mode 100644 index 0000000000..fd4e538fa0 --- /dev/null +++ b/examples/workflows/openshift_basic.yml @@ -0,0 +1,58 @@ +workflow: + id: openshift-basic-monitoring + name: OpenShift Basic Monitoring + description: Simple OpenShift monitoring workflow that gets cluster status and pod information + triggers: + - type: manual + steps: + # Get all OpenShift projects + - name: get-projects + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_projects + + # Get all pods + - name: get-pods + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_pods + + # Get OpenShift routes + - name: get-routes + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_routes + + actions: + # Display cluster summary + - name: display-cluster-summary + provider: + type: console + with: + message: | + 🔍 OpenShift Cluster Summary: + - Projects: {{ steps.get-projects.results | length }} + - Total Pods: {{ steps.get-pods.results | length }} + - Routes: {{ steps.get-routes.results | length }} + + # Show pod status for each namespace + - name: display-pod-status + foreach: "{{ steps.get-pods.results }}" + provider: + type: console + with: + message: "Pod: {{ foreach.value.metadata.name }} | Namespace: {{ foreach.value.metadata.namespace }} | Status: {{ foreach.value.status.phase }}" + + # List all projects + - name: 
list-projects + foreach: "{{ steps.get-projects.results }}" + provider: + type: console + with: + message: "Project: {{ foreach.value.metadata.name }} | Status: {{ foreach.value.status.phase | default('Active') }}" \ No newline at end of file diff --git a/examples/workflows/openshift_monitoring_and_remediation.yml b/examples/workflows/openshift_monitoring_and_remediation.yml new file mode 100644 index 0000000000..7611387b23 --- /dev/null +++ b/examples/workflows/openshift_monitoring_and_remediation.yml @@ -0,0 +1,229 @@ +workflow: + id: openshift-monitoring-and-remediation + name: OpenShift Monitoring and Remediation + description: | + Comprehensive OpenShift monitoring workflow that demonstrates: + - Getting cluster information (projects, pods, routes, deployment configs) + - Monitoring pod health and events + - Automatic remediation actions (restart pods, scale deployments) + - Alert-driven workflows for OpenShift clusters + triggers: + - type: manual + - type: alert + filters: + - key: source + value: openshift + - key: severity + value: critical + steps: + # Get all OpenShift projects + - name: get-projects + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_projects + + # Get all pods across namespaces + - name: get-all-pods + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_pods + + # Get deployment configs + - name: get-deployment-configs + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_deploymentconfigs + + # Get routes + - name: get-routes + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_routes + + # Get node pressure conditions + - name: get-node-pressure + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_node_pressure + + # Get events for a specific namespace (if alert provides namespace) + - name: get-events + if: "{{ alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_events + namespace: "{{ alert.namespace }}" + + # Get pod logs for failing pods (if alert provides pod name) + - name: get-pod-logs + if: "{{ alert.pod_name and alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_logs + namespace: "{{ alert.namespace }}" + pod_name: "{{ alert.pod_name }}" + tail_lines: 50 + + actions: + # Report cluster overview + - name: report-cluster-overview + provider: + type: console + with: + message: | + 🔍 OpenShift Cluster Overview: + - Projects: {{ steps.get-projects.results | length }} + - Total Pods: {{ steps.get-all-pods.results | length }} + - Deployment Configs: {{ steps.get-deployment-configs.results | length }} + - Routes: {{ steps.get-routes.results | length }} + - Node Pressure Issues: {{ steps.get-node-pressure.results | selectattr('conditions', 'ne', []) | list | length }} + + # Alert on failing pods + - name: alert-failing-pods + foreach: "{{ steps.get-all-pods.results | selectattr('status.phase', 'ne', 'Running') | selectattr('status.phase', 'ne', 'Succeeded') }}" + provider: + type: console + with: + message: | + ⚠️ Pod Issue Detected: + - Pod: {{ foreach.value.metadata.name }} + - Namespace: {{ foreach.value.metadata.namespace }} + - Status: {{ foreach.value.status.phase }} + - Node: {{ foreach.value.spec.nodeName }} + + # Restart failing pods automatically (CrashLoopBackOff, Failed) + - name: restart-failed-pods + 
foreach: "{{ steps.get-all-pods.results | selectattr('status.phase', 'in', ['CrashLoopBackOff', 'Failed']) }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: restart_pod + namespace: "{{ foreach.value.metadata.namespace }}" + pod_name: "{{ foreach.value.metadata.name }}" + message: "Auto-restarting failed pod {{ foreach.value.metadata.name }}" + + # Scale up deployment if alert indicates high load + - name: scale-deployment-on-high-load + if: "{{ alert.deployment_name and alert.namespace and alert.scale_up }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: scale_deployment + namespace: "{{ alert.namespace }}" + deployment_name: "{{ alert.deployment_name }}" + replicas: "{{ alert.target_replicas | default(3) }}" + + # Scale up deployment config if specified + - name: scale-deploymentconfig-on-demand + if: "{{ alert.deploymentconfig_name and alert.namespace and alert.scale_up }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: scale_deploymentconfig + namespace: "{{ alert.namespace }}" + deploymentconfig_name: "{{ alert.deploymentconfig_name }}" + replicas: "{{ alert.target_replicas | default(2) }}" + + # Restart deployment on critical alerts + - name: restart-deployment-on-critical-alert + if: "{{ alert.severity == 'critical' and alert.deployment_name and alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: rollout_restart + kind: "deployment" + name: "{{ alert.deployment_name }}" + namespace: "{{ alert.namespace }}" + + # Restart deployment config on critical alerts + - name: restart-deploymentconfig-on-critical-alert + if: "{{ alert.severity == 'critical' and alert.deploymentconfig_name and alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: rollout_restart + kind: "deploymentconfig" + name: "{{ alert.deploymentconfig_name }}" + namespace: "{{ alert.namespace }}" + + # Send notification with detailed information + - name: send-notification + if: "{{ alert }}" + provider: + type: slack + config: "{{ providers.slack }}" + with: + message: | + 🚨 OpenShift Alert: {{ alert.name }} + + 📊 Cluster Status: + • Projects: {{ steps.get-projects.results | length }} + • Total Pods: {{ steps.get-all-pods.results | length }} + • Failing Pods: {{ steps.get-all-pods.results | selectattr('status.phase', 'ne', 'Running') | selectattr('status.phase', 'ne', 'Succeeded') | list | length }} + + 🔍 Alert Details: + • Severity: {{ alert.severity }} + • Source: {{ alert.source }} + • Namespace: {{ alert.namespace | default('N/A') }} + • Pod: {{ alert.pod_name | default('N/A') }} + + 🛠️ Actions Taken: + {% if alert.deployment_name and alert.scale_up %}• Scaled deployment {{ alert.deployment_name }} to {{ alert.target_replicas | default(3) }} replicas{% endif %} + {% if alert.deploymentconfig_name and alert.scale_up %}• Scaled DeploymentConfig {{ alert.deploymentconfig_name }} to {{ alert.target_replicas | default(2) }} replicas{% endif %} + {% if alert.severity == 'critical' and (alert.deployment_name or alert.deploymentconfig_name) %}• Performed rollout restart{% endif %} + +# Example alert payloads to test this workflow: + +# Manual trigger for cluster overview: +# No additional data needed + +# High load scaling scenario: +# { +# "name": "High CPU Usage", +# "severity": "warning", +# "source": "openshift", +# "namespace": "production", +# "deployment_name": "web-app", +# "scale_up": true, +# 
"target_replicas": 5 +# } + +# Critical pod failure: +# { +# "name": "Pod CrashLoopBackOff", +# "severity": "critical", +# "source": "openshift", +# "namespace": "production", +# "pod_name": "web-app-123-abc", +# "deployment_name": "web-app" +# } + +# DeploymentConfig scaling: +# { +# "name": "Scale DeploymentConfig", +# "severity": "warning", +# "source": "openshift", +# "namespace": "staging", +# "deploymentconfig_name": "api-server", +# "scale_up": true, +# "target_replicas": 3 +# } \ No newline at end of file diff --git a/examples/workflows/openshift_pod_restart.yml b/examples/workflows/openshift_pod_restart.yml new file mode 100644 index 0000000000..c73e3de079 --- /dev/null +++ b/examples/workflows/openshift_pod_restart.yml @@ -0,0 +1,159 @@ +workflow: + id: openshift-pod-restart-remediation + name: OpenShift Pod Restart Remediation + description: Automatically restart failing pods and scale deployments based on alerts or manual triggers + triggers: + - type: manual + - type: alert + filters: + - key: source + value: openshift + - key: pod_status + value: CrashLoopBackOff + steps: + # Get pod details for a specific namespace + - name: get-namespace-pods + if: "{{ alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_pods + namespace: "{{ alert.namespace }}" + + # Get pod logs if specific pod is mentioned + - name: get-failing-pod-logs + if: "{{ alert.pod_name and alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_logs + namespace: "{{ alert.namespace }}" + pod_name: "{{ alert.pod_name }}" + tail_lines: 100 + + # Get events for the namespace to understand issues + - name: get-namespace-events + if: "{{ alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + command_type: get_events + namespace: "{{ alert.namespace }}" + + actions: + # Restart specific pod if mentioned in alert + - name: restart-specific-pod + if: "{{ alert.pod_name and alert.namespace }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: restart_pod + namespace: "{{ alert.namespace }}" + pod_name: "{{ alert.pod_name }}" + message: "Restarting pod due to {{ alert.pod_status | default('failure') }}" + + # Scale deployment if replica count is specified + - name: scale-deployment + if: "{{ alert.deployment_name and alert.namespace and alert.replicas }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: scale_deployment + namespace: "{{ alert.namespace }}" + deployment_name: "{{ alert.deployment_name }}" + replicas: "{{ alert.replicas }}" + + # Scale deployment config if specified + - name: scale-deploymentconfig + if: "{{ alert.deploymentconfig_name and alert.namespace and alert.replicas }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: scale_deploymentconfig + namespace: "{{ alert.namespace }}" + deploymentconfig_name: "{{ alert.deploymentconfig_name }}" + replicas: "{{ alert.replicas }}" + + # Rollout restart deployment + - name: rollout-restart-deployment + if: "{{ alert.deployment_name and alert.namespace and alert.restart_deployment }}" + provider: + type: openshift + config: "{{ providers.openshift }}" + with: + action: rollout_restart + kind: "deployment" + name: "{{ alert.deployment_name }}" + namespace: "{{ alert.namespace }}" + + # Rollout restart deployment config + - name: rollout-restart-deploymentconfig + if: "{{ 
alert.deploymentconfig_name and alert.namespace and alert.restart_deployment }}" +      provider: +        type: openshift +        config: "{{ providers.openshift }}" +      with: +        action: rollout_restart +        kind: "deploymentconfig" +        name: "{{ alert.deploymentconfig_name }}" +        namespace: "{{ alert.namespace }}" + +    # Report remediation actions taken +    - name: report-actions +      provider: +        type: console +      with: +        message: | +          🔧 OpenShift Remediation Actions Completed: +          {% if alert.pod_name %} +          - Restarted pod: {{ alert.pod_name }} in {{ alert.namespace }} +          {% endif %} +          {% if alert.deployment_name and alert.replicas %} +          - Scaled deployment {{ alert.deployment_name }} to {{ alert.replicas }} replicas +          {% endif %} +          {% if alert.deploymentconfig_name and alert.replicas %} +          - Scaled DeploymentConfig {{ alert.deploymentconfig_name }} to {{ alert.replicas }} replicas +          {% endif %} +          {% if alert.restart_deployment %} +          - Performed rollout restart on {{ alert.deployment_name or alert.deploymentconfig_name }} +          {% endif %} + +# Example alert payloads: + +# Restart specific pod: +# { +#   "source": "openshift", +#   "namespace": "production", +#   "pod_name": "web-app-789-xyz", +#   "pod_status": "CrashLoopBackOff" +# } + +# Scale deployment: +# { +#   "source": "openshift", +#   "namespace": "production", +#   "deployment_name": "web-app", +#   "replicas": 5 +# } + +# Scale deployment config: +# { +#   "source": "openshift", +#   "namespace": "staging", +#   "deploymentconfig_name": "api-server", +#   "replicas": 3 +# } + +# Rollout restart deployment: +# { +#   "source": "openshift", +#   "namespace": "production", +#   "deployment_name": "web-app", +#   "restart_deployment": true +# } \ No newline at end of file diff --git a/keep/providers/openshift_provider/openshift_provider.py b/keep/providers/openshift_provider/openshift_provider.py index 0d5df3804f..4ab98d9138 100644 --- a/keep/providers/openshift_provider/openshift_provider.py +++ b/keep/providers/openshift_provider/openshift_provider.py @@ -1,9 +1,12 @@ import dataclasses -import traceback +import datetime -import openshift_client as oc import pydantic +import requests +import warnings +from kubernetes import client +from kubernetes.client.rest import ApiException +from openshift_client import Context from keep.contextmanager.contextmanager import ContextManager from keep.providers.base.base_provider import BaseProvider @@ -44,7 +47,7 @@ class OpenshiftProviderAuthConfig: class OpenshiftProvider(BaseProvider): -    """Perform rollout restart actions on Openshift.""" +    """Perform rollout restart actions and query resources on Openshift.""" provider_id: str PROVIDER_DISPLAY_NAME = "Openshift" @@ -62,11 +65,13 @@ class OpenshiftProvider(BaseProvider): def __init__(self, context_manager, provider_id: str, config: ProviderConfig): super().__init__(context_manager, provider_id, config) self.authentication_config = None +        self._k8s_client = None self.validate_config() def dispose(self): """Dispose the provider.""" -        pass +        if self._k8s_client: +            self._k8s_client.rest_client.pool_manager.clear()  # _k8s_client is the ApiClient itself, which exposes rest_client directly def validate_config(self): """ @@ -87,67 +92,560 @@ def __get_ocp_client(self): oc_context.insecure = self.authentication_config.insecure return oc_context +    def __get_k8s_client(self): +        """Get the Kubernetes client for OpenShift API access.""" +        if self._k8s_client is None: +            client_configuration = client.Configuration() +            client_configuration.host = self.authentication_config.api_server +            client_configuration.verify_ssl = not
self.authentication_config.insecure + client_configuration.api_key = { + "authorization": "Bearer " + self.authentication_config.token + } + self._k8s_client = client.ApiClient(client_configuration) + return self._k8s_client + + def __test_connection_via_rest_api(self): + """ + Test connection to OpenShift using REST API instead of CLI. + This is more reliable as it doesn't depend on oc CLI being installed. + """ + try: + # Suppress SSL warnings if insecure is True + if self.authentication_config.insecure: + # Suppress SSL verification warnings + warnings.filterwarnings('ignore', message='Unverified HTTPS request') + + # Test API connectivity by hitting the /version endpoint + headers = { + 'Authorization': f'Bearer {self.authentication_config.token}', + 'Accept': 'application/json' + } + + verify_ssl = not self.authentication_config.insecure + + # Try to get cluster version info + response = requests.get( + f"{self.authentication_config.api_server}/version", + headers=headers, + verify=verify_ssl, + timeout=30 + ) + + if response.status_code == 200: + self.logger.info("Successfully connected to OpenShift cluster via REST API") + return True, None + else: + error_msg = f"API returned status code {response.status_code}: {response.text}" + self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") + return False, error_msg + + except requests.exceptions.RequestException as e: + error_msg = f"Connection error: {str(e)}" + self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") + return False, error_msg + except Exception as e: + error_msg = f"Unexpected error: {str(e)}" + self.logger.error(f"Failed to connect to OpenShift cluster: {error_msg}") + return False, error_msg + def validate_scopes(self): """ Validates that the provided token has the required scopes to use the provider. + Uses REST API validation instead of CLI commands for better reliability. """ + self.logger.info("Validating scopes for OpenShift provider") + try: - client = self.__get_ocp_client() - with oc.timeout(60 * 30), oc.tracking() as t, client: - if oc.get_config_context() is None: - try: - oc.invoke("login") - except OpenShiftPythonException: - traceback.print_exc() - self.logger.error( - f"Tracking:\n{t.get_result().as_json(redact_streams=False)}\n\n" - ) - self.logger.error("Error logging into the API server") - raise Exception("Error logging into the API server") - scopes = { - "connect_to_openshift": True, - } + # Try REST API approach first + success, error_msg = self.__test_connection_via_rest_api() + + if success: + self.logger.info("Successfully validated OpenShift connection") + scopes = { + "connect_to_openshift": True, + } + else: + self.logger.error(f"OpenShift validation failed: {error_msg}") + scopes = { + "connect_to_openshift": error_msg, + } + except Exception as e: - self.logger.exception("Error validating scopes") + self.logger.exception("Error validating scopes for OpenShift provider") scopes = { "connect_to_openshift": str(e), } + return scopes - def _notify(self, kind: str, name: str, project_name: str): + def _query(self, command_type: str, **kwargs): + """ + Query OpenShift resources. + Args: + command_type (str): The type of query to perform. 
Supported queries are: + - get_logs: Get logs from a pod + - get_events: Get events for a namespace or pod + - get_pods: List pods in a namespace or across all namespaces + - get_node_pressure: Get node pressure conditions + - get_pvc: List persistent volume claims + - get_routes: List OpenShift routes + - get_deploymentconfigs: List OpenShift deployment configs + - get_projects: List OpenShift projects + **kwargs: Additional arguments for the query. + """ + k8s_client = self.__get_k8s_client() + + if command_type == "get_logs": + return self.__get_logs(k8s_client, **kwargs) + elif command_type == "get_events": + return self.__get_events(k8s_client, **kwargs) + elif command_type == "get_pods": + return self.__get_pods(k8s_client, **kwargs) + elif command_type == "get_node_pressure": + return self.__get_node_pressure(k8s_client, **kwargs) + elif command_type == "get_pvc": + return self.__get_pvc(k8s_client, **kwargs) + elif command_type == "get_routes": + return self.__get_routes(**kwargs) + elif command_type == "get_deploymentconfigs": + return self.__get_deploymentconfigs(**kwargs) + elif command_type == "get_projects": + return self.__get_projects(**kwargs) + else: + raise NotImplementedError(f"Command type {command_type} is not implemented") + + def _notify(self, action: str, **kwargs): """ - Rollout restart the specified kind. + Perform actions on OpenShift resources. Args: - kind: The kind of object to restart. Could be deployments, statefulset, daemonset. - name: The name of the object to restart - project_name: The project name where the object is located + action (str): The action to perform. Supported actions are: + - rollout_restart: Restart a deployment, statefulset, or daemonset + - restart_pod: Restart a pod by deleting it + - scale_deployment: Scale a deployment to specified replicas + - scale_deploymentconfig: Scale a deployment config to specified replicas + **kwargs: Additional arguments for the action. 
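+
+        Example (hypothetical values; the call signature mirrors the __main__ block below):
+            provider.notify(action="restart_pod", namespace="production", pod_name="web-app-123-abc")
+            provider.notify(action="scale_deployment", namespace="production", deployment_name="web-app", replicas=3)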
""" - client = self.__get_ocp_client() - client.project_name = project_name + if action == "rollout_restart": + return self.__rollout_restart(**kwargs) + elif action == "restart_pod": + return self.__restart_pod(**kwargs) + elif action == "scale_deployment": + return self.__scale_deployment(**kwargs) + elif action == "scale_deploymentconfig": + return self.__scale_deploymentconfig(**kwargs) + else: + raise NotImplementedError(f"Action {action} is not implemented") + + def __get_logs(self, k8s_client, namespace, pod_name, container_name=None, tail_lines=100, **kwargs): + """Get logs from a pod.""" + self.logger.info(f"Getting logs for pod {pod_name} in namespace {namespace}") + core_v1 = client.CoreV1Api(k8s_client) + + try: + logs = core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=namespace, + container=container_name, + tail_lines=tail_lines, + pretty=True, + ) + return logs.splitlines() + except UnicodeEncodeError: + logs = core_v1.read_namespaced_pod_log( + name=pod_name, + namespace=namespace, + container=container_name, + tail_lines=tail_lines, + ) + return logs.splitlines() + except ApiException as e: + self.logger.error(f"Error getting logs for pod {pod_name}: {e}") + raise Exception(f"Error getting logs for pod {pod_name}: {e}") + + def __get_events(self, k8s_client, namespace, pod_name=None, sort_by=None, **kwargs): + """Get events for a namespace or specific pod.""" self.logger.info( - f"Performing rollout restart for {kind} {name} using openshift provider" + f"Getting events in namespace {namespace}" + + (f" for pod {pod_name}" if pod_name else ""), ) - with oc.timeout(60 * 30), oc.tracking() as t, client: - if oc.get_config_context() is None: - self.logger.error( - f"Current context not set! Logging into API server: {client.api_server}\n" - ) + + core_v1 = client.CoreV1Api(k8s_client) + + try: + if pod_name: + # Get the pod to find its UID + pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace) + field_selector = f"involvedObject.kind=Pod,involvedObject.name={pod_name},involvedObject.uid={pod.metadata.uid}" + else: + field_selector = f"metadata.namespace={namespace}" + + events = core_v1.list_namespaced_event( + namespace=namespace, + field_selector=field_selector, + ) + + if sort_by: + self.logger.info(f"Sorting events by {sort_by}") try: - oc.invoke("login") - except OpenShiftPythonException: - self.logger.error("error occurred logging into API Server") - traceback.print_exc() - self.logger.error( - f"Tracking:\n{t.get_result().as_json(redact_streams=False)}\n\n" + sorted_events = sorted( + events.items, + key=lambda event: getattr(event, sort_by, None), + reverse=True, + ) + return sorted_events + except Exception: + self.logger.exception(f"Error sorting events by {sort_by}") + + # Convert events to dict + return [event.to_dict() for event in events.items] + except ApiException as e: + self.logger.exception("Error getting events") + raise Exception(f"Error getting events: {e}") from e + + def __get_pods(self, k8s_client, namespace=None, label_selector=None, **kwargs): + """List pods in a namespace or across all namespaces.""" + core_v1 = client.CoreV1Api(k8s_client) + + try: + if namespace: + self.logger.info(f"Listing pods in namespace {namespace}") + pods = core_v1.list_namespaced_pod( + namespace=namespace, label_selector=label_selector + ) + else: + self.logger.info("Listing pods across all namespaces") + pods = core_v1.list_pod_for_all_namespaces( + label_selector=label_selector + ) + + return [pod.to_dict() for pod in pods.items] + except 
ApiException as e: + self.logger.error(f"Error listing pods: {e}") + raise Exception(f"Error listing pods: {e}") + + def __get_node_pressure(self, k8s_client, **kwargs): + """Get node pressure conditions (Memory, Disk, PID).""" + self.logger.info("Getting node pressure conditions") + core_v1 = client.CoreV1Api(k8s_client) + + try: + nodes = core_v1.list_node(watch=False) + node_pressures = [] + + for node in nodes.items: + pressures = { + "name": node.metadata.name, + "conditions": [], + } + for condition in node.status.conditions: + if condition.type in [ + "MemoryPressure", + "DiskPressure", + "PIDPressure", + ]: + pressures["conditions"].append(condition.to_dict()) + node_pressures.append(pressures) + + return node_pressures + except ApiException as e: + self.logger.error(f"Error getting node pressures: {e}") + raise Exception(f"Error getting node pressures: {e}") + + def __get_pvc(self, k8s_client, namespace=None, **kwargs): + """List persistent volume claims in a namespace or across all namespaces.""" + core_v1 = client.CoreV1Api(k8s_client) + + try: + if namespace: + self.logger.info(f"Listing PVCs in namespace {namespace}") + pvcs = core_v1.list_namespaced_persistent_volume_claim( + namespace=namespace + ) + else: + self.logger.info("Listing PVCs across all namespaces") + pvcs = core_v1.list_persistent_volume_claim_for_all_namespaces() + + return [pvc.to_dict() for pvc in pvcs.items] + except ApiException as e: + self.logger.error(f"Error listing PVCs: {e}") + raise Exception(f"Error listing PVCs: {e}") + + def __get_routes(self, namespace=None, **kwargs): + """List OpenShift routes.""" + self.logger.info("Getting OpenShift routes") + + try: + # Use REST API to get routes + headers = { + 'Authorization': f'Bearer {self.authentication_config.token}', + 'Accept': 'application/json' + } + + verify_ssl = not self.authentication_config.insecure + + if namespace: + url = f"{self.authentication_config.api_server}/apis/route.openshift.io/v1/namespaces/{namespace}/routes" + else: + url = f"{self.authentication_config.api_server}/apis/route.openshift.io/v1/routes" + + response = requests.get(url, headers=headers, verify=verify_ssl, timeout=30) + response.raise_for_status() + + routes_data = response.json() + return routes_data.get('items', []) + + except Exception as e: + self.logger.error(f"Error getting routes: {e}") + raise Exception(f"Error getting routes: {e}") + + def __get_deploymentconfigs(self, namespace=None, **kwargs): + """List OpenShift deployment configs.""" + self.logger.info("Getting OpenShift deployment configs") + + try: + # Use REST API to get deployment configs + headers = { + 'Authorization': f'Bearer {self.authentication_config.token}', + 'Accept': 'application/json' + } + + verify_ssl = not self.authentication_config.insecure + + if namespace: + url = f"{self.authentication_config.api_server}/apis/apps.openshift.io/v1/namespaces/{namespace}/deploymentconfigs" + else: + url = f"{self.authentication_config.api_server}/apis/apps.openshift.io/v1/deploymentconfigs" + + response = requests.get(url, headers=headers, verify=verify_ssl, timeout=30) + response.raise_for_status() + + dc_data = response.json() + return dc_data.get('items', []) + + except Exception as e: + self.logger.error(f"Error getting deployment configs: {e}") + raise Exception(f"Error getting deployment configs: {e}") + + def __get_projects(self, **kwargs): + """List OpenShift projects.""" + self.logger.info("Getting OpenShift projects") + + try: + # Use REST API to get projects + headers = { + 
'Authorization': f'Bearer {self.authentication_config.token}', +                'Accept': 'application/json' +            } + +            verify_ssl = not self.authentication_config.insecure +            url = f"{self.authentication_config.api_server}/apis/project.openshift.io/v1/projects" + +            response = requests.get(url, headers=headers, verify=verify_ssl, timeout=30) +            response.raise_for_status() + +            projects_data = response.json() +            return projects_data.get('items', []) + +        except Exception as e: +            self.logger.error(f"Error getting projects: {e}") +            raise Exception(f"Error getting projects: {e}") + +    def __rollout_restart(self, kind, name, namespace, labels=None, **kwargs): +        """Perform a rollout restart on a deployment, statefulset, daemonset, or deploymentconfig using the REST API.""" +        self.logger.info(f"Performing rollout restart for {kind} {name} in namespace {namespace}") + +        k8s_client = self.__get_k8s_client() +        now = datetime.datetime.now(datetime.timezone.utc) +        now = now.strftime("%Y-%m-%dT%H:%M:%SZ")  # RFC3339 UTC, as kubectl writes for restartedAt +        body = { +            "spec": { +                "template": { +                    "metadata": { +                        "annotations": {"kubectl.kubernetes.io/restartedAt": now} +                    } +                } +            } +        } + +        apps_v1 = client.AppsV1Api(k8s_client) +        try: +            if kind.lower() == "deployment": +                if labels: +                    deployment_list = apps_v1.list_namespaced_deployment( +                        namespace=namespace, label_selector=labels +                    ) +                    if not deployment_list.items: +                        raise ValueError( +                            f"Deployment with labels {labels} not found in namespace {namespace}" +                        ) +                apps_v1.patch_namespaced_deployment( +                    name=name, namespace=namespace, body=body +                ) +            elif kind.lower() == "statefulset": +                if labels: +                    statefulset_list = apps_v1.list_namespaced_stateful_set( +                        namespace=namespace, label_selector=labels ) -                    raise Exception("Error logging into the API server") -            try: -                oc.invoke("rollout", ["restart", kind, name]) -            except OpenShiftPythonException: -                self.logger.error(f"Error restarting {kind} {name}") -                raise Exception(f"Error restarting {kind} {name}") +                    if not statefulset_list.items: +                        raise ValueError( +                            f"StatefulSet with labels {labels} not found in namespace {namespace}" +                        ) +                apps_v1.patch_namespaced_stateful_set( +                    name=name, namespace=namespace, body=body +                ) +            elif kind.lower() == "daemonset": +                if labels: +                    daemonset_list = apps_v1.list_namespaced_daemon_set( +                        namespace=namespace, label_selector=labels +                    ) +                    if not daemonset_list.items: +                        raise ValueError( +                            f"DaemonSet with labels {labels} not found in namespace {namespace}" +                        ) +                apps_v1.patch_namespaced_daemon_set( +                    name=name, namespace=namespace, body=body +                ) +            elif kind.lower() == "deploymentconfig": +                # Handle OpenShift DeploymentConfig using REST API +                return self.__rollout_restart_deploymentconfig(name, namespace) +            else: +                raise ValueError(f"Unsupported kind {kind} to perform rollout restart") +        except ApiException as e: +            self.logger.error(f"Error performing rollout restart for {kind} {name}: {e}") +            raise Exception(f"Error performing rollout restart for {kind} {name}: {e}") -        self.logger.info(f"Restarted {kind} {name}") +        self.logger.info(f"Successfully performed rollout restart for {kind} {name}") +        return { +            "status": "success", +            "message": f"Successfully performed rollout restart for {kind} {name}", +        } + +    def __rollout_restart_deploymentconfig(self, name, namespace): +        """Restart a DeploymentConfig using OpenShift REST API.""" +        try: +            headers = { +                'Authorization': f'Bearer {self.authentication_config.token}', +                'Content-Type': 'application/json' +            } + +            verify_ssl = not self.authentication_config.insecure +            url =
f"{self.authentication_config.api_server}/apis/apps.openshift.io/v1/namespaces/{namespace}/deploymentconfigs/{name}/instantiate" + + # Trigger a new deployment + body = { + "kind": "DeploymentRequest", + "apiVersion": "apps.openshift.io/v1", + "name": name, + "latest": True, + "force": True + } + + response = requests.post(url, headers=headers, json=body, verify=verify_ssl, timeout=30) + response.raise_for_status() + + self.logger.info(f"Successfully restarted DeploymentConfig {name}") + return { + "status": "success", + "message": f"Successfully restarted DeploymentConfig {name}", + } + + except Exception as e: + self.logger.error(f"Error restarting DeploymentConfig {name}: {e}") + raise Exception(f"Error restarting DeploymentConfig {name}: {e}") + + def __restart_pod(self, namespace, pod_name, container_name=None, message=None, **kwargs): + """Restart a pod by deleting it (it will be recreated by its controller).""" + k8s_client = self.__get_k8s_client() + core_v1 = client.CoreV1Api(k8s_client) + + self.logger.info(f"Restarting pod {pod_name} in namespace {namespace}") + + try: + # Check if the pod exists + pod = core_v1.read_namespaced_pod(name=pod_name, namespace=namespace) + + # If the pod is managed by a controller, it will be recreated + # For standalone pods, this will simply delete the pod + delete_options = client.V1DeleteOptions() + core_v1.delete_namespaced_pod( + name=pod_name, namespace=namespace, body=delete_options + ) + + # Return success message + response_message = ( + message + if message + else f"Pod {pod_name} in namespace {namespace} was restarted" + ) + self.logger.info(response_message) + + return { + "status": "success", + "message": response_message, + "pod_details": { + "name": pod.metadata.name, + "namespace": pod.metadata.namespace, + "status": pod.status.phase, + "containers": [container.name for container in pod.spec.containers], + }, + } + except ApiException as e: + error_message = f"Error restarting pod {pod_name}: {e}" + self.logger.error(error_message) + raise Exception(error_message) + + def __scale_deployment(self, namespace, deployment_name, replicas, **kwargs): + """Scale a deployment to specified replicas.""" + k8s_client = self.__get_k8s_client() + apps_v1 = client.AppsV1Api(k8s_client) + + self.logger.info(f"Scaling deployment {deployment_name} in namespace {namespace} to {replicas} replicas") + + try: + apps_v1.patch_namespaced_deployment_scale( + name=deployment_name, + namespace=namespace, + body={"spec": {"replicas": replicas}}, + ) + + return { + "status": "success", + "message": f"Successfully scaled deployment {deployment_name} to {replicas} replicas", + } + except ApiException as e: + error_message = f"Error scaling deployment {deployment_name}: {e}" + self.logger.error(error_message) + raise Exception(error_message) + + def __scale_deploymentconfig(self, namespace, deploymentconfig_name, replicas, **kwargs): + """Scale a DeploymentConfig to specified replicas using OpenShift REST API.""" + try: + headers = { + 'Authorization': f'Bearer {self.authentication_config.token}', + 'Content-Type': 'application/strategic-merge-patch+json' + } + + verify_ssl = not self.authentication_config.insecure + url = f"{self.authentication_config.api_server}/apis/apps.openshift.io/v1/namespaces/{namespace}/deploymentconfigs/{deploymentconfig_name}/scale" + + body = { + "spec": { + "replicas": replicas + } + } + + response = requests.patch(url, headers=headers, json=body, verify=verify_ssl, timeout=30) + response.raise_for_status() + + 
self.logger.info(f"Successfully scaled DeploymentConfig {deploymentconfig_name} to {replicas} replicas") + return { + "status": "success", + "message": f"Successfully scaled DeploymentConfig {deploymentconfig_name} to {replicas} replicas", + } + + except Exception as e: + self.logger.error(f"Error scaling DeploymentConfig {deploymentconfig_name}: {e}") + raise Exception(f"Error scaling DeploymentConfig {deploymentconfig_name}: {e}") if __name__ == "__main__": @@ -173,5 +671,20 @@ def _notify(self, kind: str, name: str, project_name: str): ) openshift_provider = OpenshiftProvider(context_manager, "openshift-keephq", config) - restart = openshift_provider.notify("deployment", "nginx") - print(restart) + # Test validation + scopes = openshift_provider.validate_scopes() + print("Validation result:", scopes) + + # Test query operations + try: + projects = openshift_provider.query(command_type="get_projects") + print(f"Found {len(projects)} projects") + except Exception as e: + print(f"Error getting projects: {e}") + + # Test restart action + try: + restart = openshift_provider.notify(action="rollout_restart", kind="deployment", name="nginx", namespace="default") + print(restart) + except Exception as e: + print(f"Error restarting: {e}")