diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a7b0fef..6d3d3839 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Use `--file-log-rotation-period` (or `FILE_LOG_ROTATION_PERIOD`) to configure the frequency of rotation. - Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`. - The operator now defaults to `AES/CTR/NoPadding` for `dfs.encrypt.data.transfer.cipher.suite` to improve security and performance ([#693]). +- The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]). ### Changed @@ -21,7 +22,7 @@ All notable changes to this project will be documented in this file. by `FILE_LOG_DIRECTORY` (or via `--file-log-directory `). - Replace stackable-operator `print_startup_string` with `tracing::info!` with fields. - BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead - of having the operator write it to the vector config ([#671]). + of having the operator write it to the vector config ([#671]). - test: Bump to Vector `0.46.1` ([#677]). - BREAKING: Previously this operator would hardcode the UID and GID of the Pods being created to 1000/0, this has changed now ([#683]) - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator @@ -48,6 +49,7 @@ All notable changes to this project will be documented in this file. 
[#683]: https://github.com/stackabletech/hdfs-operator/pull/683 [#684]: https://github.com/stackabletech/hdfs-operator/pull/684 [#693]: https://github.com/stackabletech/hdfs-operator/pull/693 +[#695]: https://github.com/stackabletech/hdfs-operator/pull/695 ## [25.3.0] - 2025-03-21 diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc index 58329301..53c52956 100644 --- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc +++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc @@ -6,6 +6,12 @@ The cluster can be monitored with Prometheus from inside or outside the K8S clus All services (with the exception of the Zookeeper daemon on the node names) run with the JMX exporter agent enabled and expose metrics on the `metrics` port. This port is available from the container level up to the NodePort services. +[IMPORTANT] +==== +Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services. +The JMX exporter metrics are now deprecated and will be removed in a future release. +==== + The metrics endpoints are also used as liveliness probes by Kubernetes. See xref:operators:monitoring.adoc[] for more details. 
diff --git a/rust/operator-binary/src/config/mod.rs b/rust/operator-binary/src/config/mod.rs index 6f8a6fb2..2d157aa5 100644 --- a/rust/operator-binary/src/config/mod.rs +++ b/rust/operator-binary/src/config/mod.rs @@ -12,7 +12,8 @@ use crate::crd::{ DFS_NAMENODE_HTTP_ADDRESS, DFS_NAMENODE_HTTPS_ADDRESS, DFS_NAMENODE_NAME_DIR, DFS_NAMENODE_RPC_ADDRESS, DFS_NAMENODE_SHARED_EDITS_DIR, DFS_REPLICATION, FS_DEFAULT_FS, HA_ZOOKEEPER_QUORUM, JOURNALNODE_ROOT_DATA_DIR, NAMENODE_ROOT_DATA_DIR, - SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_RPC, + PROMETHEUS_ENDPOINT_ENABLED, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, + SERVICE_PORT_NAME_RPC, }, storage::{DataNodeStorageConfig, DataNodeStorageConfigInnerType}, v1alpha1, @@ -264,6 +265,12 @@ impl CoreSiteConfigBuilder { let transformed_config = transform_for_product_config(&self.config); to_hadoop_xml(transformed_config.iter()) } + + pub fn enable_prometheus_endpoint(&mut self) -> &mut Self { + self.config + .insert(PROMETHEUS_ENDPOINT_ENABLED.to_string(), "true".to_string()); + self + } } fn transform_for_product_config( diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 6bd010b4..ab326f72 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -69,6 +69,7 @@ pub const DFS_HA_NAMENODES: &str = "dfs.ha.namenodes"; // core-site.xml pub const FS_DEFAULT_FS: &str = "fs.defaultFS"; pub const HA_ZOOKEEPER_QUORUM: &str = "ha.zookeeper.quorum"; +pub const PROMETHEUS_ENDPOINT_ENABLED: &str = "hadoop.prometheus.endpoint.enabled"; pub const STACKABLE_ROOT_DATA_DIR: &str = "/stackable/data"; pub const NAMENODE_ROOT_DATA_DIR: &str = "/stackable/data/namenode"; diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs index 98a046cc..6394276d 100644 --- a/rust/operator-binary/src/hdfs_controller.rs +++ b/rust/operator-binary/src/hdfs_controller.rs @@ -682,7 +682,8 
@@ fn rolegroup_config_map( .fs_default_fs() .ha_zookeeper_quorum() .security_config(hdfs, cluster_info) - .context(BuildSecurityConfigSnafu)?; + .context(BuildSecurityConfigSnafu)? + .enable_prometheus_endpoint(); if let Some(hdfs_opa_config) = hdfs_opa_config { hdfs_opa_config.add_core_site_config(&mut core_site); } diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2 index 4a20065d..6f57dda2 100644 --- a/tests/templates/kuttl/smoke/51-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2 @@ -8,5 +8,9 @@ commands: {% else %} PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }} {% endif %} + # Test JMX exported metrics kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION + # Test Prometheus metrics + kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ + python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml index fa17cd19..bb617f97 100644 --- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -3,3 +3,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py new file mode 100644 index 00000000..fb19d908 --- /dev/null +++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py @@ -0,0 +1,109 @@ +# Fetch metrics from the built-in Prometheus endpoint of HDFS components. 
+
+import logging
+import sys
+
+import requests
+
+
+def check_metrics(
+    namespace: str, role: str, port: int, expected_metrics: list[str]
+) -> None:
+    response: requests.Response = requests.get(
+        f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom",
+        timeout=10,
+    )
+    assert response.ok, "Requesting metrics failed"
+
+    # Split the response into lines to check for metric names at the beginning of each line.
+    # This is a bit slower than using a regex but it allows using special characters like "{}" in metric names
+    # without needing to escape them.
+    response_lines = response.text.splitlines()
+    for metric in expected_metrics:
+        # Use any() with a generator to stop early if the metric is found.
+        assert any(line.startswith(metric) for line in response_lines), (
+            f"Metric '{metric}' not found for {role}"
+        )
+
+
+def check_namenode_metrics(
+    namespace: str,
+    product_version: str,
+) -> None:
+    expected_metrics: list[str] = [
+        # Kind "MetricsSystem"
+        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
+        # Counter suffixed with "_total"
+        # The metric attributes can change so we remove them from the expected metric.
+ # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}', + "fs_namesystem_files_total", + # Metric suffixed with "_created" + 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', + # Boolean metric + # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}', + # Non-special metric + 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', + ] + + check_metrics(namespace, "namenode", 9870, expected_metrics) + + +def check_datanode_metrics( + namespace: str, + product_version: str, +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', + # Kind "FSDatasetState" suffixed with "_total" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", + # Kind "FSDatasetState" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity", + # Kind "DataNodeActivity" suffixed with "_info" + 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + # Kind "DataNodeActivity" + 
'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + # Counter suffixed with "_total" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", + # Boolean metric + #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}', + # Non-special metric + 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', + ] + + check_metrics(namespace, "datanode", 9864, expected_metrics) + + +def check_journalnode_metrics( + namespace: str, + product_version: str, +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', + # Non-special metric + 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', + # There is no boolean metric in JournalNode. + ] + + check_metrics(namespace, "journalnode", 8480, expected_metrics) + + +if __name__ == "__main__": + namespace_arg: str = sys.argv[1] + product_version_arg: str = sys.argv[2] + + logging.basicConfig( + level="DEBUG", + format="%(asctime)s %(levelname)s: %(message)s", + stream=sys.stdout, + ) + + check_namenode_metrics(namespace_arg, product_version_arg) + check_datanode_metrics(namespace_arg, product_version_arg) + check_journalnode_metrics(namespace_arg, product_version_arg) + + print("All expected metrics found")