# Fetch metrics from the built-in Prometheus endpoint of HDFS components.
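#
# Usage (argument order as parsed in the __main__ block below; the script file
# name itself is illustrative):
#   python metrics.py <namespace> <product-version>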

import logging
import sys

import requests


def check_metrics(
    namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
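    # The URL resolves pod 0 of the "default" role group through its headless
    # service, following the usual Kubernetes DNS scheme
    # (<pod>.<service>.<namespace>.svc.cluster.local). The ports passed in by
    # the callers below are the Hadoop 3 default HTTP ports (9870 for the
    # NameNode, 9864 for the DataNode, 8480 for the JournalNode).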
    response: requests.Response = requests.get(
        f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/prom",
        timeout=10,
    )
    assert response.ok, (
        f"Requesting metrics from the {role} failed with status {response.status_code}"
    )

    # Split the response into lines and check for metric names at the beginning
    # of each line. This is a bit slower than a regex but allows metric names to
    # contain special characters like "{}" without escaping them.
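    # Illustrative only: a scraped line in the Prometheus exposition format
    # looks roughly like
    #   fs_namesystem_files_total{context="dfs",hostname="..."} 123
    # so a prefix match on the metric name, with or without its label set, is
    # enough to detect it.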
    response_lines = response.text.splitlines()
    for metric in expected_metrics:
        # any() with a generator stops early once the metric is found.
        assert any(line.startswith(metric) for line in response_lines), (
            f"Metric '{metric}' not found for {role}"
        )


def check_namenode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-namenode-default-0"}',
        # Counter suffixed with "_total"
        # The metric labels can change, so they are omitted from the expected metric.
        # The full name looks like: 'fs_namesystem_files_total{context="dfs",enabledecpolicies="RS-6-3-1024k",hastate="active",totalsynctimes="4 7 ",hostname="hdfs-namenode-default-0"}'
        "fs_namesystem_files_total",
        # Metric suffixed with "_created"
        'namenode_files_created{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
        # Boolean metric
        # 'hadoop_namenode_security_enabled{kind="NameNodeStatus",role="NameNode",service="HDFS"}',
        # Non-special metric
        'namenode_files_deleted{processname="NameNode",sessionid="null",context="dfs",hostname="hdfs-namenode-default-0"}',
    ]

    check_metrics(namespace, "namenode", 9870, expected_metrics)


def check_datanode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-datanode-default-0"}',
        # Kind "FSDatasetState" counter suffixed with "_total"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode,/stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_estimated_capacity_lost_total",
        # Kind "FSDatasetState"
        # 'org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity{context="FSDatasetState",storageinfo="FSDataset{dirpath=\'[/stackable/data/hdd/datanode, /stackable/data/hdd-1/datanode, /stackable/data/ssd/datanode]\'}",hostname="hdfs-datanode-default-0"}',
        "org_apache_hadoop_hdfs_server_datanode_fsdataset_impl_fs_dataset_impl_capacity",
        # Kind "DataNodeActivity" suffixed with "_info"
        'datanode_blocks_get_local_path_info{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Kind "DataNodeActivity"
        'datanode_blocks_read{sessionid="null",context="dfs",hostname="hdfs-datanode-default-0"}',
        # Boolean metric
        # 'hadoop_datanode_security_enabled{kind="DataNodeInfo",role="DataNode",service="HDFS"}',
        # Non-special metric
        'jvm_metrics_gc_count{context="jvm",processname="DataNode",sessionid="null",hostname="hdfs-datanode-default-0"}',
    ]

    check_metrics(namespace, "datanode", 9864, expected_metrics)


def check_journalnode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'metrics_system_num_active_sources{context="metricssystem",hostname="hdfs-journalnode-default-0"}',
        # Non-special metric
        'journal_node_bytes_written{context="dfs",journalid="hdfs",hostname="hdfs-journalnode-default-0"}',
        # There is no boolean metric in the JournalNode.
    ]

    check_metrics(namespace, "journalnode", 8480, expected_metrics)


if __name__ == "__main__":
    namespace_arg: str = sys.argv[1]
    product_version_arg: str = sys.argv[2]

    logging.basicConfig(
        level="DEBUG",
        format="%(asctime)s %(levelname)s: %(message)s",
        stream=sys.stdout,
    )

    check_namenode_metrics(namespace_arg, product_version_arg)
    check_datanode_metrics(namespace_arg, product_version_arg)
    check_journalnode_metrics(namespace_arg, product_version_arg)

    logging.info("All expected metrics found")