feat: enable built in Prometheus servlet #695

Merged: 7 commits, Jun 16, 2025
Changes from 5 commits
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file.
- Use `--file-log-rotation-period` (or `FILE_LOG_ROTATION_PERIOD`) to configure the frequency of rotation.
- Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`.
- The operator now defaults to `AES/CTR/NoPadding` for `dfs.encrypt.data.transfer.cipher.suite` to improve security and performance ([#693]).
- The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]).

### Changed

@@ -21,7 +22,7 @@ All notable changes to this project will be documented in this file.
by `FILE_LOG_DIRECTORY` (or via `--file-log-directory <DIRECTORY>`).
- Replace stackable-operator `print_startup_string` with `tracing::info!` with fields.
- BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead
of having the operator write it to the vector config ([#671]).
of having the operator write it to the vector config ([#671]).
- test: Bump to Vector `0.46.1` ([#677]).
- BREAKING: Previously this operator hardcoded the UID and GID of the Pods it created to 1000/0; this has changed ([#683])
- The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
@@ -48,6 +49,7 @@ All notable changes to this project will be documented in this file.
[#683]: https://github.com/stackabletech/hdfs-operator/pull/683
[#684]: https://github.com/stackabletech/hdfs-operator/pull/684
[#693]: https://github.com/stackabletech/hdfs-operator/pull/693
[#695]: https://github.com/stackabletech/hdfs-operator/pull/695

## [25.3.0] - 2025-03-21

6 changes: 6 additions & 0 deletions docs/modules/hdfs/pages/usage-guide/monitoring.adoc
@@ -6,6 +6,12 @@ The cluster can be monitored with Prometheus from inside or outside the K8S cluster
All services (with the exception of the Zookeeper daemon on the name nodes) run with the JMX exporter agent enabled and expose metrics on the `metrics` port.
This port is available from the container level up to the NodePort services.

[IMPORTANT]
====
Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all UI services.
The JMX exporter metrics are now deprecated and will be removed in a future release.
====

The metrics endpoints are also used as liveness probes by Kubernetes.

See xref:operators:monitoring.adoc[] for more details.
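
For illustration, the new `/prom` endpoint can be queried directly from inside the cluster. A minimal sketch in the style of the test script added by this PR, assuming an HdfsCluster named `hdfs` with the `default` role group running in the `default` namespace, TLS disabled, and the default NameNode HTTP port 9870 (all of these are assumptions, not taken from this diff):

# Minimal sketch: fetch the built-in Prometheus metrics of a NameNode pod.
# The cluster name "hdfs", role group "default", namespace "default" and the
# plain-HTTP port 9870 are assumptions; adjust them for the cluster at hand.
import requests

url = (
    "http://hdfs-namenode-default-0.hdfs-namenode-default"
    ".default.svc.cluster.local:9870/prom"
)
response = requests.get(url, timeout=10)
response.raise_for_status()

# The body is in the Prometheus text exposition format, one metric per line.
print(response.text.splitlines()[:5])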
9 changes: 8 additions & 1 deletion rust/operator-binary/src/config/mod.rs
@@ -12,7 +12,8 @@ use crate::crd::{
DFS_NAMENODE_HTTP_ADDRESS, DFS_NAMENODE_HTTPS_ADDRESS, DFS_NAMENODE_NAME_DIR,
DFS_NAMENODE_RPC_ADDRESS, DFS_NAMENODE_SHARED_EDITS_DIR, DFS_REPLICATION, FS_DEFAULT_FS,
HA_ZOOKEEPER_QUORUM, JOURNALNODE_ROOT_DATA_DIR, NAMENODE_ROOT_DATA_DIR,
SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_RPC,
PROMETHEUS_ENDPOINT_ENABLED, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS,
SERVICE_PORT_NAME_RPC,
},
storage::{DataNodeStorageConfig, DataNodeStorageConfigInnerType},
v1alpha1,
@@ -264,6 +265,12 @@ impl CoreSiteConfigBuilder {
let transformed_config = transform_for_product_config(&self.config);
to_hadoop_xml(transformed_config.iter())
}

pub fn enable_prometheus_endpoint(&mut self) -> &mut Self {
self.config
.insert(PROMETHEUS_ENDPOINT_ENABLED.to_string(), "true".to_string());
self
}
}

fn transform_for_product_config(
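As a sanity check (not part of this PR), the effect of `enable_prometheus_endpoint` can be verified on a deployed cluster by inspecting the rendered `core-site.xml` in a role group ConfigMap. A sketch, assuming the ConfigMap is named `hdfs-namenode-default` and carries the config under a `core-site.xml` data key (both names are assumptions based on the operator's usual naming):

# Sketch: confirm that the generated core-site.xml enables the Prometheus endpoint.
# ConfigMap name "hdfs-namenode-default" and data key "core-site.xml" are assumptions.
import json
import subprocess
import xml.etree.ElementTree as ET

cm = json.loads(
    subprocess.run(
        ["kubectl", "get", "configmap", "hdfs-namenode-default", "-o", "json"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout
)
core_site = ET.fromstring(cm["data"]["core-site.xml"])

# Hadoop XML config: <configuration><property><name>...</name><value>...</value></property>...
properties = {
    prop.findtext("name"): prop.findtext("value")
    for prop in core_site.findall("property")
}
assert properties.get("hadoop.prometheus.endpoint.enabled") == "true"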
1 change: 1 addition & 0 deletions rust/operator-binary/src/crd/constants.rs
@@ -69,6 +69,7 @@ pub const DFS_HA_NAMENODES: &str = "dfs.ha.namenodes";
// core-site.xml
pub const FS_DEFAULT_FS: &str = "fs.defaultFS";
pub const HA_ZOOKEEPER_QUORUM: &str = "ha.zookeeper.quorum";
pub const PROMETHEUS_ENDPOINT_ENABLED: &str = "hadoop.prometheus.endpoint.enabled";

pub const STACKABLE_ROOT_DATA_DIR: &str = "/stackable/data";
pub const NAMENODE_ROOT_DATA_DIR: &str = "/stackable/data/namenode";
3 changes: 2 additions & 1 deletion rust/operator-binary/src/hdfs_controller.rs
@@ -682,7 +682,8 @@ fn rolegroup_config_map(
.fs_default_fs()
.ha_zookeeper_quorum()
.security_config(hdfs, cluster_info)
.context(BuildSecurityConfigSnafu)?;
.context(BuildSecurityConfigSnafu)?
.enable_prometheus_endpoint();
if let Some(hdfs_opa_config) = hdfs_opa_config {
hdfs_opa_config.add_core_site_config(&mut core_site);
}
4 changes: 4 additions & 0 deletions tests/templates/kuttl/smoke/51-assert.yaml.j2
@@ -8,5 +8,9 @@ commands:
{% else %}
PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }}
{% endif %}
# Test JMX exported metrics
kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION
# Test Prometheus metrics
kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION
@@ -3,3 +3,4 @@ apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp
- script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp
105 changes: 105 additions & 0 deletions tests/templates/kuttl/smoke/test_prometheus_metrics.py
@@ -0,0 +1,105 @@
# Fetch metrics from the built-in Prometheus endpoint of HDFS components.

import logging
import re
import sys

import requests


def check_metrics(
namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
response: requests.Response = requests.get(
f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom",
timeout=10,
)
assert response.ok, "Requesting metrics failed"

for metric in expected_metrics:
assert re.search(f"^{metric}", response.text, re.MULTILINE) is not None, (
f"Metric '{metric}' not found for {role}"
)


def check_namenode_metrics(
namespace: str,
product_version: str,
) -> None:
expected_metrics: list[str] = [
# Kind "MetricsSystem"
'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
# Counter suffixed with "_total"
# The metric attributes can change so use .* for them.
# The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}',
"fs_namesystem_files_total.*",
# Metric suffixed with "_created"
'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
# Boolean metric
# 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}',
# Non-special metric
'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
]

check_metrics(namespace, "namenode", 9870, expected_metrics)


def check_datanode_metrics(
namespace: str,
product_version: str,
) -> None:
expected_metrics: list[str] = [
# Kind "MetricsSystem"
'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}',
# Kind "FSDatasetState" suffixed with "_total"
# 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
"org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
# Kind "FSDatasetState"
# 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
"org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity.*",
# Kind "DataNodeActivity" suffixed with "_info"
'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
# Kind "DataNodeActivity"
'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
# Counter suffixed with "_total"
# 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
"org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*",
# Boolean metric
#'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}',
# Non-special metric
'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}',
]

check_metrics(namespace, "datanode", 9864, expected_metrics)


def check_journalnode_metrics(
namespace: str,
product_version: str,
) -> None:
expected_metrics: list[str] = [
# Kind "MetricsSystem"
'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}',
# Non-special metric
'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}',
# There is no boolean metric in JournalNode.
]

check_metrics(namespace, "journalnode", 8480, expected_metrics)


if __name__ == "__main__":
namespace_arg: str = sys.argv[1]
product_version_arg: str = sys.argv[2]

logging.basicConfig(
level="DEBUG",
format="%(asctime)s %(levelname)s: %(message)s",
stream=sys.stdout,
)

check_namenode_metrics(namespace_arg, product_version_arg)
check_datanode_metrics(namespace_arg, product_version_arg)
check_journalnode_metrics(namespace_arg, product_version_arg)

print("All expected metrics found")