diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml index 6eebda55cd1c..08b16a88175a 100644 --- a/_topic_maps/_topic_map.yml +++ b/_topic_maps/_topic_map.yml @@ -3534,7 +3534,7 @@ Topics: Dir: observability Topics: - Name: Observability in OpenShift Container Platform - File: telco-observability + File: observability - Name: Security Dir: security Topics: diff --git a/edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc b/edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc index bb3ae01373f6..f284ff1b8634 100644 --- a/edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +++ b/edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc @@ -1,8 +1,8 @@ :_mod-docs-content-type: ASSEMBLY -[id="telco-observability"] +[id="observability"] = Observability in telco core CNF clusters include::_attributes/common-attributes.adoc[] -:context: telco-observability +:context: observability :imagesdir: images toc::[] @@ -13,7 +13,7 @@ What follows is an outline of best practices for system engineers, architects, a Unless explicitly stated, the material in this document refers to both Edge and Core deployments. 
-include::modules/telco-observability-monitoring-stack.adoc[leveloffset=+1] +include::modules/observability-monitoring-stack.adoc[leveloffset=+1] [role="_additional-resources"] .Additional resources @@ -22,7 +22,7 @@ include::modules/telco-observability-monitoring-stack.adoc[leveloffset=+1] * xref:../../../observability/monitoring/getting-started/core-platform-monitoring-first-steps.adoc#core-platform-monitoring-first-steps[Core platform monitoring first steps] -include::modules/telco-observability-key-performance-metrics.adoc[leveloffset=+1] +include::modules/observability-key-performance-metrics.adoc[leveloffset=+1] [role="_additional-resources"] .Additional resources @@ -31,16 +31,16 @@ include::modules/telco-observability-key-performance-metrics.adoc[leveloffset=+1 * xref:../../../storage/persistent_storage/persistent_storage_local/persistent-storage-local.adoc#local-storage-install_persistent-storage-local[Persistent storage using local volumes] -include::modules/telco-observability-monitoring-the-edge.adoc[leveloffset=+1] +include::modules/observability-monitoring-the-edge.adoc[leveloffset=+1] -include::modules/telco-observability-alerting.adoc[leveloffset=+1] +include::modules/observability-alerting.adoc[leveloffset=+1] [role="_additional-resources"] .Additional resources * xref:../../../observability/monitoring/about-ocp-monitoring/key-concepts.adoc#about-managing-alerts_key-concepts[Managing alerts] -include::modules/telco-observability-workload-monitoring.adoc[leveloffset=+1] +include::modules/observability-workload-monitoring.adoc[leveloffset=+1] [role="_additional-resources"] .Additional resources diff --git a/modules/telco-observability-alerting.adoc b/modules/observability-alerting.adoc similarity index 58% rename from modules/telco-observability-alerting.adoc rename to modules/observability-alerting.adoc index 8c5da7ca687e..d6e678d81724 100644 --- a/modules/telco-observability-alerting.adoc +++ b/modules/observability-alerting.adoc @@ -1,22 +1,22 @@ 
// Module included in the following assemblies: // -// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc :_mod-docs-content-type: PROCEDURE -[id="telco-observability-alerting_{context}"] +[id="observability-alerting_{context}"] = Alerting {product-title} includes a large number of alert rules, which can change from release to release. -[id="viewing-default-alerts"] +[id="viewing-default-alerts_{context}"] == Viewing default alerts -Use the following procedure to review all of the alert rules in a cluster. +Review all of the alert rules in a cluster. .Procedure -* To review all the alert rules in a cluster, you can run the following command: +* To review all the alert rules in a cluster, run the following command: [source,terminal] + ---- @@ -24,7 +24,7 @@ $ oc get cm -n openshift-monitoring prometheus-k8s-rulefiles-0 -o yaml ---- + Rules can include a description and provide a link to additional information and mitigation steps. -For example, this is the rule for `etcdHighFsyncDurations`: +For example, see the rule for `etcdHighFsyncDurations`: + [source,terminal] ---- @@ -43,11 +43,12 @@ For example, this is the rule for `etcdHighFsyncDurations`: ---- [id="alert-notifications"] -== Alert notifications -You can view alerts in the {product-title} console, however an administrator should configure an external receiver to forward the alerts to. +== Alert notifications + +You can view alerts in the {product-title} console. However, an administrator must configure an external receiver to forward the alerts to. 
{product-title} supports the following receiver types: -* PagerDuty: a 3rd party incident response platform -* Webhook: an arbitrary API endpoint that receives an alert via a POST request and can take any necessary action -* Email: sends an email to designated address -* Slack: sends a notification to either a slack channel or an individual user \ No newline at end of file +PagerDuty:: A third-party incident response platform. +Webhook:: An arbitrary API endpoint that receives an alert through a `POST` request and can take any necessary action. +Email:: Sends an email to a designated address. +Slack:: Sends a notification to either a Slack channel or an individual user. \ No newline at end of file diff --git a/modules/telco-observability-key-performance-metrics.adoc b/modules/observability-key-performance-metrics.adoc similarity index 75% rename from modules/telco-observability-key-performance-metrics.adoc rename to modules/observability-key-performance-metrics.adoc index 6eccabf91c5e..b16b03ad59e0 100644 --- a/modules/telco-observability-key-performance-metrics.adoc +++ b/modules/observability-key-performance-metrics.adoc @@ -1,14 +1,14 @@ // Module included in the following assemblies: // -// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc :_mod-docs-content-type: CONCEPT -[id="telco-observability-key-performance-metrics_{context}"] +[id="observability-key-performance-metrics_{context}"] = Key performance metrics -Depending on your system, there can be hundreds of available measurements. +Depending on your system, you can have hundreds of available measurements. 
-Here are some key metrics that you should pay attention to: +You should pay attention to the following key metrics: * `etcd` response times * API response times @@ -17,26 +17,30 @@ Here are some key metrics that you should pay attention to: * OVN health * Overall cluster operator health -A good rule to follow is that if you decide that a metric is important, there should be an alert for it. +A good rule to follow is that if you decide that a metric is important, you should set up an alert for the metric. [NOTE] ==== You can check the available metrics by running the following command: + ++ [source,terminal] ---- $ oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -qsk http://localhost:9090/api/v1/metadata | jq '.data ---- ==== -[id="example-queries-promql"] +[id="example-queries-promql_{context}"] == Example queries in PromQL -The following tables show some queries that you can explore in the metrics query browser using the {product-title} console. +The following tables show queries that you can explore in the metrics query browser by using the {product-title} console. [NOTE] ==== The URL for the console is https:///monitoring/query-browser. 
-You can get the OpenShift Console FQDN by running the following command: +You can get the OpenShift Console FQDN by running the following command: + ++ [source,terminal] ---- $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].host}' @@ -79,7 +83,7 @@ $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].ho |`POST` |`histogram_quantile (0.99, sum by (le,managed_cluster) (sum_over_time(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver\|openshift-apiserver", verb="POST"}[60m])))` -|`LIST` +|`LIST` |`histogram_quantile (0.99, sum by (le,managed_cluster) (sum_over_time(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver\|openshift-apiserver", verb="LIST"}[60m])))` |`PUT` @@ -127,17 +131,15 @@ $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].ho |=== -[id="recommendations-for-storage-of-metrics"] +[id="recommendations-for-storage-of-metrics_{context}"] == Recommendations for storage of metrics -Out of the box, Prometheus does not back up saved metrics with persistent storage. -If you restart the Prometheus pods, all metrics data are lost. -You should configure the monitoring stack to use the back-end storage that is available on the platform. -To meet the high IO demands of Prometheus you should use local storage. - -For Telco core clusters, you can use the Local Storage Operator for persistent storage for Prometheus. +By default, Prometheus does not back up saved metrics with persistent storage. +If you restart the Prometheus pods, all metrics data are lost. +You must configure the monitoring stack to use the back-end storage that is available on the platform. +To meet the high IO demands of Prometheus, you should use local storage. -{odf-first}, which deploys a ceph cluster for block, file, and object storage, is also a suitable candidate for a Telco core cluster. 
+For smaller clusters, you can use the Local Storage Operator for persistent storage for Prometheus. {odf-first}, which deploys a Ceph cluster for block, file, and object storage, is suitable for larger clusters. -To keep system resource requirements low on a RAN {sno} or far edge cluster, you should not provision backend storage for the monitoring stack. -Such clusters forward all metrics to the hub cluster where you can provision a third party monitoring platform. +To keep system resource requirements low on a {sno} cluster, you should not provision back-end storage for the monitoring stack. +Such clusters forward all metrics to the hub cluster where you can provision a third-party monitoring platform. \ No newline at end of file diff --git a/modules/telco-observability-monitoring-stack.adoc b/modules/observability-monitoring-stack.adoc similarity index 69% rename from modules/telco-observability-monitoring-stack.adoc rename to modules/observability-monitoring-stack.adoc index 03dcb6e3653a..c52d1fea89ea 100644 --- a/modules/telco-observability-monitoring-stack.adoc +++ b/modules/observability-monitoring-stack.adoc @@ -1,9 +1,9 @@ // Module included in the following assemblies: // -// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc :_mod-docs-content-type: CONCEPT -[id="telco-observability-monitoring-stack_{context}"] +[id="observability-monitoring-stack_{context}"] = Understanding the monitoring stack The monitoring stack uses the following components: @@ -17,5 +17,5 @@ image::monitoring-architecture.png[{product-title} monitoring architecture] [NOTE] ==== -For a {sno} cluster, you should disable Alertmanager and Thanos because the cluster sends all metrics to the hub cluster for analysis and retention. +For {sno} clusters, you should disable Alertmanager and Thanos because the clusters send all metrics to the hub cluster for analysis and retention. 
==== \ No newline at end of file diff --git a/modules/telco-observability-monitoring-the-edge.adoc b/modules/observability-monitoring-the-edge.adoc similarity index 86% rename from modules/telco-observability-monitoring-the-edge.adoc rename to modules/observability-monitoring-the-edge.adoc index d30837586112..5c94a2cd7596 100644 --- a/modules/telco-observability-monitoring-the-edge.adoc +++ b/modules/observability-monitoring-the-edge.adoc @@ -1,14 +1,14 @@ // Module included in the following assemblies: // -// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc :_mod-docs-content-type: PROCEDURE -[id="telco-observability-monitoring-the-edge_{context}"] +[id="observability-monitoring-the-edge_{context}"] = Monitoring the edge -{sno-caps} at the edge keeps the footprint of the platform components to a minimum. -The following procedure is an example of how you can configure a {sno} node with a small monitoring footprint. +{product-title} clusters at the edge should keep the footprint of the platform components to a minimum. +The following procedure is an example of how you can configure a {sno} or edge node with a small monitoring footprint. .Prerequisites @@ -36,14 +36,14 @@ metadata: retention: 24h ---- -. On the {sno}, apply the `ConfigMap` CR by running the following command: +. Apply the `ConfigMap` CR by running the following command on the {sno} cluster: + [source,terminal] ---- $ oc apply -f monitoringConfigMap.yaml ---- -. Create a `NameSpace` CR, and save it as `monitoringNamespace.yaml`, as in the following example: +. Create a `Namespace` CR, and save it as `monitoringNamespace.yaml`, as in the following example: + [source,yaml] ---- @@ -53,7 +53,7 @@ metadata: name: open-cluster-management-observability ---- -. On the hub cluster, apply the `Namespace` CR on the hub cluster by running the following command: +. 
Apply the `Namespace` CR by running the following command on the hub cluster: + [source,terminal] ---- @@ -75,7 +75,7 @@ spec: generateBucketName: acm-multi ---- -. On the hub cluster, apply the `ObjectBucketClaim` CR, by running the following command: +. Apply the `ObjectBucketClaim` CR by running the following command on the hub cluster: + [source,terminal] ---- @@ -95,14 +95,14 @@ stringData: .dockerconfigjson: 'PULL_SECRET' ---- -. On the hub cluster, apply the `Secret` CR by running the following command: +. Apply the `Secret` CR by running the following command on the hub cluster: + [source,terminal] ---- $ oc apply -f monitoringSecret.yaml ---- -. Get the keys for the NooBaa service and the backend bucket name from the hub cluster by running the following commands: +. Get the keys for the NooBaa service and the back-end bucket name from the hub cluster by running the following commands: + [source,terminal] ---- @@ -140,7 +140,7 @@ stringData: secret_key: ${NOOBAA_SECRET_KEY} ---- -. On the hub cluster, apply the `Secret` CR by running the following command: +. Apply the `Secret` CR by running the following command on the hub cluster: + [source,terminal] ---- @@ -177,7 +177,7 @@ spec: storeStorageSize: 25Gi ---- -. On the hub cluster, apply the `MultiClusterObservability` CR by running the following command: +. 
Apply the `MultiClusterObservability` CR by running the following command on the hub cluster: + [source,terminal] ---- diff --git a/modules/telco-observability-workload-monitoring.adoc b/modules/observability-workload-monitoring.adoc similarity index 88% rename from modules/telco-observability-workload-monitoring.adoc rename to modules/observability-workload-monitoring.adoc index 73adffddcc79..69df943b9eac 100644 --- a/modules/telco-observability-workload-monitoring.adoc +++ b/modules/observability-workload-monitoring.adoc @@ -1,9 +1,9 @@ // Module included in the following assemblies: // -// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc +// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc :_mod-docs-content-type: PROCEDURE -[id="telco-observability-workload-monitoring_{context}"] +[id="observability-workload-monitoring_{context}"] = Workload monitoring By default, {product-title} does not collect metrics for application workloads. You can configure a cluster to collect workload metrics. @@ -67,8 +67,8 @@ spec: $ oc apply -f monitoringServiceMonitor.yaml ---- -Prometheus scrapes the path `/metrics` by default, however you can define a custom path. -It is up to the vendor of the application to expose this endpoint for scraping, with metrics that they deem relevant. +Prometheus scrapes the `/metrics` path by default. However, you can define a custom path. +The vendor of the application must decide whether to expose the endpoint for scraping, with metrics that they deem relevant. == Creating a workload alert