From 3f0d9a39fc540f7b3b3327bd9efd55707ff8bd72 Mon Sep 17 00:00:00 2001
From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com>
Date: Fri, 13 Jun 2025 13:54:42 +0200
Subject: [PATCH 1/7] remove jmx agent and metrics ports

---
 CHANGELOG.md                              |  4 +++-
 rust/operator-binary/src/config/jvm.rs    | 15 ++++--------
 rust/operator-binary/src/container.rs     | 24 ++++++-------------
 rust/operator-binary/src/crd/constants.rs |  4 ----
 rust/operator-binary/src/crd/mod.rs       | 28 ++++++-----------------
 5 files changed, 22 insertions(+), 53 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8a7b0fef..b35d7eeb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,13 +21,14 @@ All notable changes to this project will be documented in this file.
   by `FILE_LOG_DIRECTORY` (or via `--file-log-directory `).
 - Replace stackable-operator `print_startup_string` with `tracing::info!` with fields.
 - BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead
-  of having the operator write it to the vector config ([#671]).
+  of having the operator write it to the vector config ([#671]).
 - test: Bump to Vector `0.46.1` ([#677]).
 - BREAKING: Previously this operator would hardcode the UID and GID of the Pods being created to 1000/0, this has changed now ([#683])
   - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
   - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
   - This is marked as breaking because tools and policies might exist, which require these fields to be set
 - Use versioned common structs ([#684]).
+- BREAKING: The JMX exporter has been replaced with the built-in Prometheus servlet ([#694]).
 
 ### Fixed
 
@@ -48,6 +49,7 @@ All notable changes to this project will be documented in this file.
[#683]: https://github.com/stackabletech/hdfs-operator/pull/683 [#684]: https://github.com/stackabletech/hdfs-operator/pull/684 [#693]: https://github.com/stackabletech/hdfs-operator/pull/693 +[#694]: https://github.com/stackabletech/hdfs-operator/pull/694 ## [25.3.0] - 2025-03-21 diff --git a/rust/operator-binary/src/config/jvm.rs b/rust/operator-binary/src/config/jvm.rs index 11ba39b6..60810261 100644 --- a/rust/operator-binary/src/config/jvm.rs +++ b/rust/operator-binary/src/config/jvm.rs @@ -56,7 +56,6 @@ pub fn construct_role_specific_jvm_args( kerberos_enabled: bool, resources: Option<&ResourceRequirements>, config_dir: &str, - metrics_port: u16, ) -> Result { let mut jvm_args = Vec::new(); @@ -77,10 +76,9 @@ pub fn construct_role_specific_jvm_args( jvm_args.push(format!("-Xmx{heap}")); } - jvm_args.extend([ - format!("-Djava.security.properties={config_dir}/{JVM_SECURITY_PROPERTIES_FILE}"), - format!("-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={metrics_port}:/stackable/jmx/{hdfs_role}.yaml") - ]); + jvm_args.extend([format!( + "-Djava.security.properties={config_dir}/{JVM_SECURITY_PROPERTIES_FILE}" + )]); if kerberos_enabled { jvm_args.push(format!( "-Djava.security.krb5.conf={KERBEROS_CONTAINER_PATH}/krb5.conf" @@ -101,7 +99,7 @@ pub fn construct_role_specific_jvm_args( mod tests { use super::*; - use crate::{container::ContainerConfig, crd::constants::DEFAULT_NAME_NODE_METRICS_PORT}; + use crate::container::ContainerConfig; #[test] fn test_global_jvm_args() { @@ -135,8 +133,7 @@ mod tests { jvm_config, "-Xms819m \ -Xmx819m \ - -Djava.security.properties=/stackable/config/security.properties \ - -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx/namenode.yaml" + -Djava.security.properties=/stackable/config/security.properties" ); } @@ -181,7 +178,6 @@ mod tests { format!( "-Xms34406m \ -Djava.security.properties=/stackable/config/security.properties \ - -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx/namenode.yaml \ -Djava.security.krb5.conf={KERBEROS_CONTAINER_PATH}/krb5.conf \ -Dhttps.proxyHost=proxy.my.corp \ -Djava.net.preferIPv4Stack=true \ @@ -207,7 +203,6 @@ mod tests { kerberos_enabled, resources.as_ref(), "/stackable/config", - DEFAULT_NAME_NODE_METRICS_PORT, ) .unwrap() } diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index e1109f8c..a692f66c 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -61,14 +61,12 @@ use crate::{ AnyNodeConfig, DataNodeContainer, HdfsNodeRole, HdfsPodRef, NameNodeContainer, UpgradeState, constants::{ - DATANODE_ROOT_DATA_DIR_PREFIX, DEFAULT_DATA_NODE_METRICS_PORT, - DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, LISTENER_VOLUME_DIR, - LISTENER_VOLUME_NAME, LIVENESS_PROBE_FAILURE_THRESHOLD, - LIVENESS_PROBE_INITIAL_DELAY_SECONDS, LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, - NAMENODE_ROOT_DATA_DIR, READINESS_PROBE_FAILURE_THRESHOLD, - READINESS_PROBE_INITIAL_DELAY_SECONDS, READINESS_PROBE_PERIOD_SECONDS, - SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, - SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, + DATANODE_ROOT_DATA_DIR_PREFIX, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, + LIVENESS_PROBE_FAILURE_THRESHOLD, LIVENESS_PROBE_INITIAL_DELAY_SECONDS, + LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, NAMENODE_ROOT_DATA_DIR, + READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS, + READINESS_PROBE_PERIOD_SECONDS, 
SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, + SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, }, storage::DataNodeStorageConfig, v1alpha1, @@ -164,8 +162,6 @@ pub enum ContainerConfig { web_ui_http_port_name: &'static str, /// Port name of the web UI HTTPS port, used for the liveness probe. web_ui_https_port_name: &'static str, - /// The JMX Exporter metrics port. - metrics_port: u16, }, Zkfc { /// The provided custom container name. @@ -1226,9 +1222,7 @@ wait_for_termination $! resources: Option<&ResourceRequirements>, ) -> Result { match self { - ContainerConfig::Hdfs { - role, metrics_port, .. - } => { + ContainerConfig::Hdfs { role, .. } => { let cvd = ContainerVolumeDirs::from(role); let config_dir = cvd.final_config(); construct_role_specific_jvm_args( @@ -1238,7 +1232,6 @@ wait_for_termination $! hdfs.has_kerberos_enabled(), resources, config_dir, - *metrics_port, ) .with_context(|_| ConstructJvmArgumentsSnafu { role: role.to_string(), @@ -1379,7 +1372,6 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_RPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, - metrics_port: DEFAULT_NAME_NODE_METRICS_PORT, }, HdfsNodeRole::Data => Self::Hdfs { role, @@ -1388,7 +1380,6 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_IPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, - metrics_port: DEFAULT_DATA_NODE_METRICS_PORT, }, HdfsNodeRole::Journal => Self::Hdfs { role, @@ -1397,7 +1388,6 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_RPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, - metrics_port: DEFAULT_JOURNAL_NODE_METRICS_PORT, }, } } diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 6bd010b4..d5d53b8e 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -19,22 +19,18 @@ pub const SERVICE_PORT_NAME_IPC: &str = "ipc"; pub const SERVICE_PORT_NAME_HTTP: &str = "http"; pub const SERVICE_PORT_NAME_HTTPS: &str = "https"; pub const SERVICE_PORT_NAME_DATA: &str = "data"; -pub const SERVICE_PORT_NAME_METRICS: &str = "metrics"; pub const DEFAULT_LISTENER_CLASS: &str = "cluster-internal"; -pub const DEFAULT_NAME_NODE_METRICS_PORT: u16 = 8183; pub const DEFAULT_NAME_NODE_HTTP_PORT: u16 = 9870; pub const DEFAULT_NAME_NODE_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_RPC_PORT: u16 = 8020; -pub const DEFAULT_DATA_NODE_METRICS_PORT: u16 = 8082; pub const DEFAULT_DATA_NODE_HTTP_PORT: u16 = 9864; pub const DEFAULT_DATA_NODE_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_DATA_PORT: u16 = 9866; pub const DEFAULT_DATA_NODE_IPC_PORT: u16 = 9867; -pub const DEFAULT_JOURNAL_NODE_METRICS_PORT: u16 = 8081; pub const DEFAULT_JOURNAL_NODE_HTTP_PORT: u16 = 8480; pub const DEFAULT_JOURNAL_NODE_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_RPC_PORT: u16 = 8485; diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 5646a9c7..783796e6 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -53,16 +53,14 @@ use crate::crd::{ constants::{ APP_NAME, CORE_SITE_XML, DEFAULT_DATA_NODE_DATA_PORT, DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_DATA_NODE_HTTP_PORT, - DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DATA_NODE_METRICS_PORT, - DEFAULT_DFS_REPLICATION_FACTOR, 
DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, - DEFAULT_JOURNAL_NODE_HTTP_PORT, DEFAULT_JOURNAL_NODE_HTTPS_PORT, - DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, + DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DFS_REPLICATION_FACTOR, + DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_JOURNAL_NODE_HTTP_PORT, + DEFAULT_JOURNAL_NODE_HTTPS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_NAME_NODE_HTTP_PORT, - DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, DEFAULT_NAME_NODE_RPC_PORT, - DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, - LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, - SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_METRICS, - SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, + DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_RPC_PORT, DFS_REPLICATION, + HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_NAME, + SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, + SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, }, security::{AuthenticationConfig, KerberosConfig}, storage::{ @@ -667,10 +665,6 @@ impl v1alpha1::HdfsCluster { pub fn ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_NAME_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_NAME_NODE_RPC_PORT, @@ -688,10 +682,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Data => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_DATA_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_DATA), DEFAULT_DATA_NODE_DATA_PORT, @@ -713,10 +703,6 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Journal => vec![ - ( - String::from(SERVICE_PORT_NAME_METRICS), - DEFAULT_JOURNAL_NODE_METRICS_PORT, - ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_JOURNAL_NODE_RPC_PORT, From 759f88747b96782e926056e8048ad8caf8fa992c Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:33:48 +0200 Subject: [PATCH 2/7] enable prometheus endpoint --- rust/operator-binary/src/config/mod.rs | 9 ++++++++- rust/operator-binary/src/crd/constants.rs | 1 + rust/operator-binary/src/hdfs_controller.rs | 3 ++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/rust/operator-binary/src/config/mod.rs b/rust/operator-binary/src/config/mod.rs index 6f8a6fb2..2d157aa5 100644 --- a/rust/operator-binary/src/config/mod.rs +++ b/rust/operator-binary/src/config/mod.rs @@ -12,7 +12,8 @@ use crate::crd::{ DFS_NAMENODE_HTTP_ADDRESS, DFS_NAMENODE_HTTPS_ADDRESS, DFS_NAMENODE_NAME_DIR, DFS_NAMENODE_RPC_ADDRESS, DFS_NAMENODE_SHARED_EDITS_DIR, DFS_REPLICATION, FS_DEFAULT_FS, HA_ZOOKEEPER_QUORUM, JOURNALNODE_ROOT_DATA_DIR, NAMENODE_ROOT_DATA_DIR, - SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_RPC, + PROMETHEUS_ENDPOINT_ENABLED, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, + SERVICE_PORT_NAME_RPC, }, storage::{DataNodeStorageConfig, DataNodeStorageConfigInnerType}, v1alpha1, @@ -264,6 +265,12 @@ impl CoreSiteConfigBuilder { let transformed_config = transform_for_product_config(&self.config); to_hadoop_xml(transformed_config.iter()) } + + pub fn enable_prometheus_endpoint(&mut self) -> &mut Self { + self.config + 
            .insert(PROMETHEUS_ENDPOINT_ENABLED.to_string(), "true".to_string());
+        self
+    }
 }
 
 fn transform_for_product_config(
diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs
index d5d53b8e..5577fef3 100644
--- a/rust/operator-binary/src/crd/constants.rs
+++ b/rust/operator-binary/src/crd/constants.rs
@@ -65,6 +65,7 @@ pub const DFS_HA_NAMENODES: &str = "dfs.ha.namenodes";
 // core-site.xml
 pub const FS_DEFAULT_FS: &str = "fs.defaultFS";
 pub const HA_ZOOKEEPER_QUORUM: &str = "ha.zookeeper.quorum";
+pub const PROMETHEUS_ENDPOINT_ENABLED: &str = "hadoop.prometheus.endpoint.enabled";
 
 pub const STACKABLE_ROOT_DATA_DIR: &str = "/stackable/data";
 pub const NAMENODE_ROOT_DATA_DIR: &str = "/stackable/data/namenode";
diff --git a/rust/operator-binary/src/hdfs_controller.rs b/rust/operator-binary/src/hdfs_controller.rs
index 98a046cc..6394276d 100644
--- a/rust/operator-binary/src/hdfs_controller.rs
+++ b/rust/operator-binary/src/hdfs_controller.rs
@@ -682,7 +682,8 @@ fn rolegroup_config_map(
         .fs_default_fs()
         .ha_zookeeper_quorum()
         .security_config(hdfs, cluster_info)
-        .context(BuildSecurityConfigSnafu)?;
+        .context(BuildSecurityConfigSnafu)?
+        .enable_prometheus_endpoint();
     if let Some(hdfs_opa_config) = hdfs_opa_config {
         hdfs_opa_config.add_core_site_config(&mut core_site);
     }

From 11d62764282aaf29bc30ee17c251566a23656ebd Mon Sep 17 00:00:00 2001
From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com>
Date: Fri, 13 Jun 2025 18:02:25 +0200
Subject: [PATCH 3/7] Revert "remove jmx agent and metrics ports"

This reverts commit 3f0d9a39fc540f7b3b3327bd9efd55707ff8bd72.
---
 CHANGELOG.md                              |  4 +---
 rust/operator-binary/src/config/jvm.rs    | 15 ++++++++----
 rust/operator-binary/src/container.rs     | 24 +++++++++++++------
 rust/operator-binary/src/crd/constants.rs |  4 ++++
 rust/operator-binary/src/crd/mod.rs       | 28 +++++++++++++++++------
 5 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b35d7eeb..8a7b0fef 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,14 +21,13 @@ All notable changes to this project will be documented in this file.
   by `FILE_LOG_DIRECTORY` (or via `--file-log-directory `).
 - Replace stackable-operator `print_startup_string` with `tracing::info!` with fields.
 - BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead
-  of having the operator write it to the vector config ([#671]).
+  of having the operator write it to the vector config ([#671]).
 - test: Bump to Vector `0.46.1` ([#677]).
 - BREAKING: Previously this operator would hardcode the UID and GID of the Pods being created to 1000/0, this has changed now ([#683])
   - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator
   - The defaults from the docker images itself will now apply, which will be different from 1000/0 going forward
   - This is marked as breaking because tools and policies might exist, which require these fields to be set
 - Use versioned common structs ([#684]).
-- BREAKING: The JMX exporter has been replaced with the built-in Prometheus servlet ([#694]).
 
 ### Fixed
 
@@ -49,7 +48,6 @@ All notable changes to this project will be documented in this file.
[#683]: https://github.com/stackabletech/hdfs-operator/pull/683 [#684]: https://github.com/stackabletech/hdfs-operator/pull/684 [#693]: https://github.com/stackabletech/hdfs-operator/pull/693 -[#694]: https://github.com/stackabletech/hdfs-operator/pull/694 ## [25.3.0] - 2025-03-21 diff --git a/rust/operator-binary/src/config/jvm.rs b/rust/operator-binary/src/config/jvm.rs index 60810261..11ba39b6 100644 --- a/rust/operator-binary/src/config/jvm.rs +++ b/rust/operator-binary/src/config/jvm.rs @@ -56,6 +56,7 @@ pub fn construct_role_specific_jvm_args( kerberos_enabled: bool, resources: Option<&ResourceRequirements>, config_dir: &str, + metrics_port: u16, ) -> Result { let mut jvm_args = Vec::new(); @@ -76,9 +77,10 @@ pub fn construct_role_specific_jvm_args( jvm_args.push(format!("-Xmx{heap}")); } - jvm_args.extend([format!( - "-Djava.security.properties={config_dir}/{JVM_SECURITY_PROPERTIES_FILE}" - )]); + jvm_args.extend([ + format!("-Djava.security.properties={config_dir}/{JVM_SECURITY_PROPERTIES_FILE}"), + format!("-javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar={metrics_port}:/stackable/jmx/{hdfs_role}.yaml") + ]); if kerberos_enabled { jvm_args.push(format!( "-Djava.security.krb5.conf={KERBEROS_CONTAINER_PATH}/krb5.conf" @@ -99,7 +101,7 @@ pub fn construct_role_specific_jvm_args( mod tests { use super::*; - use crate::container::ContainerConfig; + use crate::{container::ContainerConfig, crd::constants::DEFAULT_NAME_NODE_METRICS_PORT}; #[test] fn test_global_jvm_args() { @@ -133,7 +135,8 @@ mod tests { jvm_config, "-Xms819m \ -Xmx819m \ - -Djava.security.properties=/stackable/config/security.properties" + -Djava.security.properties=/stackable/config/security.properties \ + -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx/namenode.yaml" ); } @@ -178,6 +181,7 @@ mod tests { format!( "-Xms34406m \ -Djava.security.properties=/stackable/config/security.properties \ + -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx/namenode.yaml \ -Djava.security.krb5.conf={KERBEROS_CONTAINER_PATH}/krb5.conf \ -Dhttps.proxyHost=proxy.my.corp \ -Djava.net.preferIPv4Stack=true \ @@ -203,6 +207,7 @@ mod tests { kerberos_enabled, resources.as_ref(), "/stackable/config", + DEFAULT_NAME_NODE_METRICS_PORT, ) .unwrap() } diff --git a/rust/operator-binary/src/container.rs b/rust/operator-binary/src/container.rs index a692f66c..e1109f8c 100644 --- a/rust/operator-binary/src/container.rs +++ b/rust/operator-binary/src/container.rs @@ -61,12 +61,14 @@ use crate::{ AnyNodeConfig, DataNodeContainer, HdfsNodeRole, HdfsPodRef, NameNodeContainer, UpgradeState, constants::{ - DATANODE_ROOT_DATA_DIR_PREFIX, LISTENER_VOLUME_DIR, LISTENER_VOLUME_NAME, - LIVENESS_PROBE_FAILURE_THRESHOLD, LIVENESS_PROBE_INITIAL_DELAY_SECONDS, - LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, NAMENODE_ROOT_DATA_DIR, - READINESS_PROBE_FAILURE_THRESHOLD, READINESS_PROBE_INITIAL_DELAY_SECONDS, - READINESS_PROBE_PERIOD_SECONDS, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, - SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, + DATANODE_ROOT_DATA_DIR_PREFIX, DEFAULT_DATA_NODE_METRICS_PORT, + DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, LISTENER_VOLUME_DIR, + LISTENER_VOLUME_NAME, LIVENESS_PROBE_FAILURE_THRESHOLD, + LIVENESS_PROBE_INITIAL_DELAY_SECONDS, LIVENESS_PROBE_PERIOD_SECONDS, LOG4J_PROPERTIES, + NAMENODE_ROOT_DATA_DIR, READINESS_PROBE_FAILURE_THRESHOLD, + READINESS_PROBE_INITIAL_DELAY_SECONDS, READINESS_PROBE_PERIOD_SECONDS, + 
SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, + SERVICE_PORT_NAME_RPC, STACKABLE_ROOT_DATA_DIR, }, storage::DataNodeStorageConfig, v1alpha1, @@ -162,6 +164,8 @@ pub enum ContainerConfig { web_ui_http_port_name: &'static str, /// Port name of the web UI HTTPS port, used for the liveness probe. web_ui_https_port_name: &'static str, + /// The JMX Exporter metrics port. + metrics_port: u16, }, Zkfc { /// The provided custom container name. @@ -1222,7 +1226,9 @@ wait_for_termination $! resources: Option<&ResourceRequirements>, ) -> Result { match self { - ContainerConfig::Hdfs { role, .. } => { + ContainerConfig::Hdfs { + role, metrics_port, .. + } => { let cvd = ContainerVolumeDirs::from(role); let config_dir = cvd.final_config(); construct_role_specific_jvm_args( @@ -1232,6 +1238,7 @@ wait_for_termination $! hdfs.has_kerberos_enabled(), resources, config_dir, + *metrics_port, ) .with_context(|_| ConstructJvmArgumentsSnafu { role: role.to_string(), @@ -1372,6 +1379,7 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_RPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + metrics_port: DEFAULT_NAME_NODE_METRICS_PORT, }, HdfsNodeRole::Data => Self::Hdfs { role, @@ -1380,6 +1388,7 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_IPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + metrics_port: DEFAULT_DATA_NODE_METRICS_PORT, }, HdfsNodeRole::Journal => Self::Hdfs { role, @@ -1388,6 +1397,7 @@ impl From for ContainerConfig { ipc_port_name: SERVICE_PORT_NAME_RPC, web_ui_http_port_name: SERVICE_PORT_NAME_HTTP, web_ui_https_port_name: SERVICE_PORT_NAME_HTTPS, + metrics_port: DEFAULT_JOURNAL_NODE_METRICS_PORT, }, } } diff --git a/rust/operator-binary/src/crd/constants.rs b/rust/operator-binary/src/crd/constants.rs index 5577fef3..ab326f72 100644 --- a/rust/operator-binary/src/crd/constants.rs +++ b/rust/operator-binary/src/crd/constants.rs @@ -19,18 +19,22 @@ pub const SERVICE_PORT_NAME_IPC: &str = "ipc"; pub const SERVICE_PORT_NAME_HTTP: &str = "http"; pub const SERVICE_PORT_NAME_HTTPS: &str = "https"; pub const SERVICE_PORT_NAME_DATA: &str = "data"; +pub const SERVICE_PORT_NAME_METRICS: &str = "metrics"; pub const DEFAULT_LISTENER_CLASS: &str = "cluster-internal"; +pub const DEFAULT_NAME_NODE_METRICS_PORT: u16 = 8183; pub const DEFAULT_NAME_NODE_HTTP_PORT: u16 = 9870; pub const DEFAULT_NAME_NODE_HTTPS_PORT: u16 = 9871; pub const DEFAULT_NAME_NODE_RPC_PORT: u16 = 8020; +pub const DEFAULT_DATA_NODE_METRICS_PORT: u16 = 8082; pub const DEFAULT_DATA_NODE_HTTP_PORT: u16 = 9864; pub const DEFAULT_DATA_NODE_HTTPS_PORT: u16 = 9865; pub const DEFAULT_DATA_NODE_DATA_PORT: u16 = 9866; pub const DEFAULT_DATA_NODE_IPC_PORT: u16 = 9867; +pub const DEFAULT_JOURNAL_NODE_METRICS_PORT: u16 = 8081; pub const DEFAULT_JOURNAL_NODE_HTTP_PORT: u16 = 8480; pub const DEFAULT_JOURNAL_NODE_HTTPS_PORT: u16 = 8481; pub const DEFAULT_JOURNAL_NODE_RPC_PORT: u16 = 8485; diff --git a/rust/operator-binary/src/crd/mod.rs b/rust/operator-binary/src/crd/mod.rs index 783796e6..5646a9c7 100644 --- a/rust/operator-binary/src/crd/mod.rs +++ b/rust/operator-binary/src/crd/mod.rs @@ -53,14 +53,16 @@ use crate::crd::{ constants::{ APP_NAME, CORE_SITE_XML, DEFAULT_DATA_NODE_DATA_PORT, DEFAULT_DATA_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_DATA_NODE_HTTP_PORT, - DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DFS_REPLICATION_FACTOR, - 
DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_JOURNAL_NODE_HTTP_PORT, - DEFAULT_JOURNAL_NODE_HTTPS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, + DEFAULT_DATA_NODE_HTTPS_PORT, DEFAULT_DATA_NODE_IPC_PORT, DEFAULT_DATA_NODE_METRICS_PORT, + DEFAULT_DFS_REPLICATION_FACTOR, DEFAULT_JOURNAL_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, + DEFAULT_JOURNAL_NODE_HTTP_PORT, DEFAULT_JOURNAL_NODE_HTTPS_PORT, + DEFAULT_JOURNAL_NODE_METRICS_PORT, DEFAULT_JOURNAL_NODE_RPC_PORT, DEFAULT_LISTENER_CLASS, DEFAULT_NAME_NODE_GRACEFUL_SHUTDOWN_TIMEOUT, DEFAULT_NAME_NODE_HTTP_PORT, - DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_RPC_PORT, DFS_REPLICATION, - HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, LISTENER_VOLUME_NAME, - SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, SERVICE_PORT_NAME_HTTPS, - SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, + DEFAULT_NAME_NODE_HTTPS_PORT, DEFAULT_NAME_NODE_METRICS_PORT, DEFAULT_NAME_NODE_RPC_PORT, + DFS_REPLICATION, HADOOP_POLICY_XML, HDFS_SITE_XML, JVM_SECURITY_PROPERTIES_FILE, + LISTENER_VOLUME_NAME, SERVICE_PORT_NAME_DATA, SERVICE_PORT_NAME_HTTP, + SERVICE_PORT_NAME_HTTPS, SERVICE_PORT_NAME_IPC, SERVICE_PORT_NAME_METRICS, + SERVICE_PORT_NAME_RPC, SSL_CLIENT_XML, SSL_SERVER_XML, }, security::{AuthenticationConfig, KerberosConfig}, storage::{ @@ -665,6 +667,10 @@ impl v1alpha1::HdfsCluster { pub fn ports(&self, role: &HdfsNodeRole) -> Vec<(String, u16)> { match role { HdfsNodeRole::Name => vec![ + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_NAME_NODE_METRICS_PORT, + ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_NAME_NODE_RPC_PORT, @@ -682,6 +688,10 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Data => vec![ + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_DATA_NODE_METRICS_PORT, + ), ( String::from(SERVICE_PORT_NAME_DATA), DEFAULT_DATA_NODE_DATA_PORT, @@ -703,6 +713,10 @@ impl v1alpha1::HdfsCluster { }, ], HdfsNodeRole::Journal => vec![ + ( + String::from(SERVICE_PORT_NAME_METRICS), + DEFAULT_JOURNAL_NODE_METRICS_PORT, + ), ( String::from(SERVICE_PORT_NAME_RPC), DEFAULT_JOURNAL_NODE_RPC_PORT, From deae3b960fce5790b83e9460a2529c3e8f18f213 Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 16 Jun 2025 12:05:26 +0200 Subject: [PATCH 4/7] add test and update docs --- .../hdfs/pages/usage-guide/monitoring.adoc | 6 + tests/templates/kuttl/smoke/51-assert.yaml.j2 | 4 + .../smoke/51-copy-metrics-test-script.yaml | 1 + .../kuttl/smoke/test_prometheus_metrics.py | 105 ++++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 tests/templates/kuttl/smoke/test_prometheus_metrics.py diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc index 58329301..ef1bf6de 100644 --- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc +++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc @@ -6,6 +6,12 @@ The cluster can be monitored with Prometheus from inside or outside the K8S clus All services (with the exception of the Zookeeper daemon on the node names) run with the JMX exporter agent enabled and expose metrics on the `metrics` port. This port is available from the container level up to the NodePort services. +[IMPORTANT] +==== +Starting with Stackable Data Platform 25.7, the bult-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services. +The JMX exporter metrics are now deprecated and will be removed in a future release. 
+==== + The metrics endpoints are also used as liveliness probes by Kubernetes. See xref:operators:monitoring.adoc[] for more details. diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2 index 4a20065d..6f57dda2 100644 --- a/tests/templates/kuttl/smoke/51-assert.yaml.j2 +++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2 @@ -8,5 +8,9 @@ commands: {% else %} PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }} {% endif %} + # Test JMX exported metrics kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION + # Test Prometheus metrics + kubectl exec --namespace=$NAMESPACE test-runner-0 -- \ + python /tmp/test_prometheus_metrics.py $NAMESPACE $PRODUCT_VERSION diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml index fa17cd19..bb617f97 100644 --- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml +++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml @@ -3,3 +3,4 @@ apiVersion: kuttl.dev/v1beta1 kind: TestStep commands: - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp + - script: kubectl cp -n $NAMESPACE ./test_prometheus_metrics.py test-runner-0:/tmp diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py new file mode 100644 index 00000000..35fd0199 --- /dev/null +++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py @@ -0,0 +1,105 @@ +# Fetch metrics from the built-in Prometheus endpoint of HDFS components. + +import logging +import re +import sys + +import requests + + +def check_metrics( + namespace: str, role: str, port: int, expected_metrics: list[str] +) -> None: + response: requests.Response = requests.get( + f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom", + timeout=10, + ) + assert response.ok, "Requesting metrics failed" + + for metric in expected_metrics: + assert re.search(f"^{metric}", response.text, re.MULTILINE) is not None, ( + f"Metric '{metric}' not found for {role}" + ) + + +def check_namenode_metrics( + namespace: str, + product_version: str, +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}', + # Counter suffixed with "_total" + # The metric attributes can change so use .* for them. 
+ # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}', + "fs_namesystem_files_total.*", + # Metric suffixed with "_created" + 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', + # Boolean metric + # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}', + # Non-special metric + 'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', + ] + + check_metrics(namespace, "namenode", 9870, expected_metrics) + + +def check_datanode_metrics( + namespace: str, + product_version: str, +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', + # Kind "FSDatasetState" suffixed with "_total" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*", + # Kind "FSDatasetState" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity.*", + # Kind "DataNodeActivity" suffixed with "_info" + 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + # Kind "DataNodeActivity" + 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', + # Counter suffixed with "_total" + # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*", + # Boolean metric + #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}', + # Non-special metric + 'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}', + ] + + check_metrics(namespace, "datanode", 9864, expected_metrics) + + +def check_journalnode_metrics( + namespace: str, + product_version: str, +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}', + # Non-special metric + 'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}', + # There is no boolean metric in JournalNode. 
+ ] + + check_metrics(namespace, "journalnode", 8480, expected_metrics) + + +if __name__ == "__main__": + namespace_arg: str = sys.argv[1] + product_version_arg: str = sys.argv[2] + + logging.basicConfig( + level="DEBUG", + format="%(asctime)s %(levelname)s: %(message)s", + stream=sys.stdout, + ) + + check_namenode_metrics(namespace_arg, product_version_arg) + check_datanode_metrics(namespace_arg, product_version_arg) + check_journalnode_metrics(namespace_arg, product_version_arg) + + print("All expected metrics found") From 43f15299a3f6e1a2e1292ca7cb841ae4b1cc848a Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 16 Jun 2025 12:08:47 +0200 Subject: [PATCH 5/7] update changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a7b0fef..6d3d3839 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file. - Use `--file-log-rotation-period` (or `FILE_LOG_ROTATION_PERIOD`) to configure the frequency of rotation. - Use `--console-log-format` (or `CONSOLE_LOG_FORMAT`) to set the format to `plain` (default) or `json`. - The operator now defaults to `AES/CTR/NoPadding` for `dfs.encrypt.data.transfer.cipher.suite` to improve security and performance ([#693]). +- The built-in Prometheus servlet is now enabled and metrics are exposed under the `/prom` path of all UI services ([#695]). ### Changed @@ -21,7 +22,7 @@ All notable changes to this project will be documented in this file. by `FILE_LOG_DIRECTORY` (or via `--file-log-directory `). - Replace stackable-operator `print_startup_string` with `tracing::info!` with fields. - BREAKING: Inject the vector aggregator address into the vector config using the env var `VECTOR_AGGREGATOR_ADDRESS` instead - of having the operator write it to the vector config ([#671]). + of having the operator write it to the vector config ([#671]). - test: Bump to Vector `0.46.1` ([#677]). - BREAKING: Previously this operator would hardcode the UID and GID of the Pods being created to 1000/0, this has changed now ([#683]) - The `runAsUser` and `runAsGroup` fields will not be set anymore by the operator @@ -48,6 +49,7 @@ All notable changes to this project will be documented in this file. [#683]: https://github.com/stackabletech/hdfs-operator/pull/683 [#684]: https://github.com/stackabletech/hdfs-operator/pull/684 [#693]: https://github.com/stackabletech/hdfs-operator/pull/693 +[#695]: https://github.com/stackabletech/hdfs-operator/pull/695 ## [25.3.0] - 2025-03-21 From 3d14a50218dc626d095d647f5078967ea30c44cb Mon Sep 17 00:00:00 2001 From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Date: Mon, 16 Jun 2025 13:33:49 +0200 Subject: [PATCH 6/7] Update docs/modules/hdfs/pages/usage-guide/monitoring.adoc Co-authored-by: Lukas Krug --- docs/modules/hdfs/pages/usage-guide/monitoring.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc index ef1bf6de..53c52956 100644 --- a/docs/modules/hdfs/pages/usage-guide/monitoring.adoc +++ b/docs/modules/hdfs/pages/usage-guide/monitoring.adoc @@ -8,7 +8,7 @@ This port is available from the container level up to the NodePort services. [IMPORTANT] ==== -Starting with Stackable Data Platform 25.7, the bult-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services. 
+Starting with Stackable Data Platform 25.7, the built-in Prometheus metrics are also available at the `/prom` endpoint of all the UI services.
 The JMX exporter metrics are now deprecated and will be removed in a future release.
 ====
 

From 9bc3d3bf9cb51a6b7f0bdec817c9971eed01df6c Mon Sep 17 00:00:00 2001
From: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com>
Date: Mon, 16 Jun 2025 13:33:26 +0200
Subject: [PATCH 7/7] review feedback

---
 .../kuttl/smoke/test_prometheus_metrics.py     | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/templates/kuttl/smoke/test_prometheus_metrics.py b/tests/templates/kuttl/smoke/test_prometheus_metrics.py
index 35fd0199..fb19d908 100644
--- a/tests/templates/kuttl/smoke/test_prometheus_metrics.py
+++ b/tests/templates/kuttl/smoke/test_prometheus_metrics.py
@@ -1,7 +1,6 @@
 # Fetch metrics from the built-in Prometheus endpoint of HDFS components.
 
 import logging
-import re
 import sys
 
 import requests
@@ -16,8 +15,13 @@ def check_metrics(
     )
     assert response.ok, "Requesting metrics failed"
 
+    # Split the response into lines to check for metric names at the beginning of each line.
+    # This is a bit slower than using a regex, but it allows using special characters like "{}" in metric names
+    # without needing to escape them.
+    response_lines = response.text.splitlines()
     for metric in expected_metrics:
+        # Use any() with a generator to stop early if the metric is found.
-        assert re.search(f"^{metric}", response.text, re.MULTILINE) is not None, (
+        assert any(line.startswith(metric) for line in response_lines), (
             f"Metric '{metric}' not found for {role}"
         )
 
@@ -30,9 +34,9 @@ def check_namenode_metrics(
         # Kind "MetricsSystem"
         'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
         # Counter suffixed with "_total"
-        # The metric attributes can change so use .* for them.
+        # The metric attributes can change, so we remove them from the expected metric.
# The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}', - "fs_namesystem_files_total.*", + "fs_namesystem_files_total", # Metric suffixed with "_created" 'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}', # Boolean metric @@ -53,17 +57,17 @@ def check_datanode_metrics( 'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}', # Kind "FSDatasetState" suffixed with "_total" # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*", + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", # Kind "FSDatasetState" # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity.*", + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity", # Kind "DataNodeActivity" suffixed with "_info" 'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', # Kind "DataNodeActivity" 'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}', # Counter suffixed with "_total" # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}', - "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total.*", + "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total", # Boolean metric #'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}', # Non-special metric
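
For readers who want to poke at the new `/prom` endpoint by hand, here is a minimal standalone sketch in the same style as the test script above. The service DNS names, the `default` role group, and the NameNode web UI port 9870 are assumptions carried over from the smoke test, so adjust them for a real cluster; the endpoint only exists once `hadoop.prometheus.endpoint.enabled` is set to `true` in core-site.xml, which is what `enable_prometheus_endpoint()` does in patch 2.

# Sketch: scrape the built-in Prometheus servlet of an HDFS role and print
# matching samples. Hostnames, namespace, and port are assumptions based on
# the smoke-test naming above.
import sys

import requests


def scrape_prom(namespace: str, role: str, port: int, prefix: str = "") -> list[str]:
    url = (
        f"http://hdfs-{role}-default-0.hdfs-{role}-default."
        f"{namespace}.svc.cluster.local:{port}/prom"
    )
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Skip '# HELP'/'# TYPE' comment lines and keep samples matching the prefix.
    return [
        line
        for line in response.text.splitlines()
        if not line.startswith("#") and line.startswith(prefix)
    ]


if __name__ == "__main__":
    # Example: python scrape_prom.py my-namespace namenode 9870 fs_namesystem
    namespace, role, port = sys.argv[1], sys.argv[2], int(sys.argv[3])
    prefix = sys.argv[4] if len(sys.argv) > 4 else ""
    for sample in scrape_prom(namespace, role, port, prefix):
        print(sample)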