From d150dd668a6dbf37e9e87469f26f8dff5a67e473 Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Tue, 17 Dec 2024 17:08:00 +0100
Subject: [PATCH 1/6] test: Fix metrics in smoke test

---
 .../kuttl/smoke/30-install-hdfs.yaml.j2       | 312 ++++++++++++++++++
 1 file changed, 312 insertions(+)

diff --git a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2 b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
index ab0fdc0c..e23a69aa 100644
--- a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
@@ -28,6 +28,26 @@ spec:
     envOverrides:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
+      HDFS_NAMENODE_OPTS: >
+        -Djava.security.properties=/stackable/config/namenode/security.properties
+        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx-exporter-config/namenode.yaml
+        -Xmx838860k
+    podOverrides:
+      spec:
+        initContainers:
+          - name: format-namenodes
+            volumeMounts:
+              - name: jmx-exporter-config
+                mountPath: /stackable/jmx-exporter-config
+        containers:
+          - name: namenode
+            volumeMounts:
+              - name: jmx-exporter-config
+                mountPath: /stackable/jmx-exporter-config
+        volumes:
+          - name: jmx-exporter-config
+            configMap:
+              name: jmx-exporter-config
     config:
       listenerClass: {{ test_scenario['values']['listener-class'] }}
       logging:
@@ -42,6 +62,21 @@ spec:
     envOverrides:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
+      HDFS_DATANODE_OPTS: >
+        -Djava.security.properties=/stackable/config/datanode/security.properties
+        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8082:/stackable/jmx-exporter-config/datanode.yaml
+        -Xmx419430k
+    podOverrides:
+      spec:
+        containers:
+          - name: datanode
+            volumeMounts:
+              - name: jmx-exporter-config
+                mountPath: /stackable/jmx-exporter-config
+        volumes:
+          - name: jmx-exporter-config
+            configMap:
+              name: jmx-exporter-config
     config:
       listenerClass: {{ test_scenario['values']['listener-class'] }}
       logging:
@@ -71,6 +106,21 @@ spec:
     envOverrides:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
+      HDFS_JOURNALNODE_OPTS: >
+        -Djava.security.properties=/stackable/config/journalnode/security.properties
+        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8081:/stackable/jmx-exporter-config/journalnode.yaml
+        -Xmx419430k
+    podOverrides:
+      spec:
+        containers:
+          - name: journalnode
+            volumeMounts:
+              - name: jmx-exporter-config
+                mountPath: /stackable/jmx-exporter-config
+        volumes:
+          - name: jmx-exporter-config
+            configMap:
+              name: jmx-exporter-config
     config:
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
@@ -93,3 +143,265 @@ spec:
         # https://github.com/stackabletech/hdfs-operator/issues/514
         - name: dashed-port
           containerPort: 1234
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: jmx-exporter-config
+data:
+  namenode.yaml: |-
+    ---
+    startDelaySeconds: 10
+    ssl: false
+    lowercaseOutputName: true
+    lowercaseOutputLabelNames: true
+    whitelistObjectNames:
+      - 'Hadoop:service=NameNode,name=*'
+      - 'Hadoop:service=NameNode,name=MetricsSystem,sub=*'
+    blacklistObjectNames:
+      - 'Hadoop:service=NameNode,name=RetryCache.NameNodeRetryCache'
+      - 'Hadoop:service=NameNode,name=RpcActivity*'
+      - 'Hadoop:service=NameNode,name=RpcDetailedActivity*'
+      - 'Hadoop:service=NameNode,name=UgiMetrics'
+    rules:
+      # MetricsSystem
+      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: 'MetricsSystem'
+          sub: $2
+        type: GAUGE
+      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
+      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
+      # The suffix _info is reserved for static information, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # All other Hadoop metrics
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+  datanode.yaml: |-
+    ---
+    startDelaySeconds: 10
+    ssl: false
+    lowercaseOutputName: true
+    lowercaseOutputLabelNames: true
+    whitelistObjectNames:
+      - 'Hadoop:service=DataNode,name=*'
+      - 'Hadoop:service=DataNode,name=MetricsSystem,sub=*'
+    blacklistObjectNames:
+      - 'Hadoop:service=DataNode,name=RpcActivity*'
+      - 'Hadoop:service=DataNode,name=RpcDetailedActivity*'
+      - 'Hadoop:service=DataNode,name=UgiMetrics'
+    rules:
+      # MetricsSystem
+      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: 'MetricsSystem'
+          sub: $2
+        type: GAUGE
+      # FSDatasetState (also extracts the FSDataset ID)
+      - pattern: 'Hadoop<service=(.+), name=FSDatasetState-(.+)><>(.*): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          fsdatasetid: $2
+          kind: 'FSDatasetState'
+        type: GAUGE
+      # DataNodeActivity (also extracts hostname and port)
+      - pattern: 'Hadoop<service=(.+), name=DataNodeActivity-(.+)-(\d+)><>(.*): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$4
+        value: $5
+        labels:
+          service: HDFS
+          role: $1
+          host: $2
+          port: $3
+          kind: 'DataNodeActivity'
+        type: GAUGE
+      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
+      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
+      # The suffix _info is reserved for static information, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # All other Hadoop metrics
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+  journalnode.yaml: |-
+    ---
+    startDelaySeconds: 10
+    ssl: false
+    lowercaseOutputName: true
+    lowercaseOutputLabelNames: true
+    whitelistObjectNames:
+      - 'Hadoop:service=JournalNode,name=*'
+      - 'Hadoop:service=JournalNode,name=MetricsSystem,sub=*'
+    blacklistObjectNames:
+      - 'Hadoop:service=JournalNode,name=RetryCache.JournalNodeRetryCache'
+      - 'Hadoop:service=JournalNode,name=RpcActivity*'
+      - 'Hadoop:service=JournalNode,name=RpcDetailedActivity*'
+      - 'Hadoop:service=JournalNode,name=UgiMetrics'
+    rules:
+      # MetricsSystem
+      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: 'MetricsSystem'
+          sub: $2
+        type: GAUGE
+      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: COUNTER
+      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
+      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
+      # The suffix _info is reserved for static information, therefore an underscore is appended.
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3_
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE
+      # All other Hadoop metrics
+      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          kind: $2
+        type: GAUGE

From e9cf012ba86bf825870e2530d491540d66d63f92 Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Tue, 17 Dec 2024 17:41:21 +0100
Subject: [PATCH 2/6] test: Update metrics configuration in smoke test

---
 .../kuttl/smoke/30-install-hdfs.yaml.j2       | 54 +------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2 b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
index e23a69aa..c0496c58 100644
--- a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
@@ -275,17 +275,7 @@ data:
           port: $3
           kind: 'DataNodeActivity'
         type: GAUGE
-      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
+      # Generic counter, e.g. Hadoop:name=FSDatasetState,attribute=EstimatedCapacityLostTotal
       - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
         attrNameSnakeCase: true
         name: hadoop_$1_$3
         value: $4
         labels:
           service: HDFS
           role: $1
           kind: $2
         type: COUNTER
-      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
-      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
       # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
       # The suffix _info is reserved for static information, therefore an underscore is appended.
       - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
@@ -353,37 +332,6 @@ data:
           kind: 'MetricsSystem'
           sub: $2
         type: GAUGE
-      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
-      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
       # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
       # The suffix _info is reserved for static information, therefore an underscore is appended.
       - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'

From 91340d57699bbf22b0c1ea73f81af0bf3b5120d3 Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Wed, 18 Dec 2024 14:07:12 +0100
Subject: [PATCH 3/6] test: Check metrics in smoke test

---
 .../kuttl/smoke/30-install-hdfs.yaml.j2       | 26 ++++-
 tests/templates/kuttl/smoke/40-assert.yaml    |  2 +-
 ...bhdfs.yaml => 40-install-test-runner.yaml} | 10 +-
 tests/templates/kuttl/smoke/50-assert.yaml    |  2 +-
 .../templates/kuttl/smoke/50-create-file.yaml |  6 +-
 tests/templates/kuttl/smoke/51-assert.yaml    |  5 +
 .../smoke/51-copy-metrics-test-script.yaml    |  5 +
 tests/templates/kuttl/smoke/metrics-test.py   | 96 +++++++++++++++
 8 files changed, 141 insertions(+), 11 deletions(-)
 rename tests/templates/kuttl/smoke/{40-webhdfs.yaml => 40-install-test-runner.yaml} (80%)
 create mode 100644 tests/templates/kuttl/smoke/51-assert.yaml
 create mode 100644 tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
 create mode 100755 tests/templates/kuttl/smoke/metrics-test.py

diff --git a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2 b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
index c0496c58..ae88e825 100644
--- a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
@@ -252,6 +252,18 @@ data:
           kind: 'MetricsSystem'
           sub: $2
         type: GAUGE
+      # FSDatasetState with _total suffix (also extracts the FSDataset ID),
+      # e.g. Hadoop:name=FSDatasetState,attribute=EstimatedCapacityLostTotal
+      - pattern: 'Hadoop<service=(.+), name=FSDatasetState-(.+)><>(.*_total): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$3
+        value: $4
+        labels:
+          service: HDFS
+          role: $1
+          fsdatasetid: $2
+          kind: 'FSDatasetState'
+        type: COUNTER
       # FSDatasetState (also extracts the FSDataset ID)
       - pattern: 'Hadoop<service=(.+), name=FSDatasetState-(.+)><>(.*): (\d+)'
         attrNameSnakeCase: true
         name: hadoop_$1_$3
         value: $4
         labels:
           service: HDFS
           role: $1
           fsdatasetid: $2
           kind: 'FSDatasetState'
         type: GAUGE
-      # DataNodeActivity (also extracts hostname and port)
+      # DataNodeActivity with _info suffix (also extracts hostname and port),
+      # e.g. Hadoop:name=DataNodeActivity-hdfs-datanode-default-0-9866,attribute=BlocksGetLocalPathInfo
+      - pattern: 'Hadoop<service=(.+), name=DataNodeActivity-(.+)-(\d+)><>(.*_info): (\d+)'
+        attrNameSnakeCase: true
+        name: hadoop_$1_$4_
+        value: $5
+        labels:
+          service: HDFS
+          role: $1
+          host: $2
+          port: $3
+          kind: 'DataNodeActivity'
+        type: GAUGE
       - pattern: 'Hadoop<service=(.+), name=DataNodeActivity-(.+)-(\d+)><>(.*): (\d+)'
         attrNameSnakeCase: true
         name: hadoop_$1_$4
         value: $5
         labels:
           service: HDFS
           role: $1
           host: $2
           port: $3
           kind: 'DataNodeActivity'
         type: GAUGE
diff --git a/tests/templates/kuttl/smoke/40-assert.yaml b/tests/templates/kuttl/smoke/40-assert.yaml
index 6237bcac..64d967e4 100644
--- a/tests/templates/kuttl/smoke/40-assert.yaml
+++ b/tests/templates/kuttl/smoke/40-assert.yaml
@@ -6,7 +6,7 @@ timeout: 300
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: webhdfs
+  name: test-runner
 status:
   readyReplicas: 1
   replicas: 1
diff --git a/tests/templates/kuttl/smoke/40-webhdfs.yaml b/tests/templates/kuttl/smoke/40-install-test-runner.yaml
similarity index 80%
rename from tests/templates/kuttl/smoke/40-webhdfs.yaml
rename to tests/templates/kuttl/smoke/40-install-test-runner.yaml
index 2975ba72..43c18d74 100644
--- a/tests/templates/kuttl/smoke/40-webhdfs.yaml
+++ b/tests/templates/kuttl/smoke/40-install-test-runner.yaml
@@ -2,21 +2,21 @@
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: webhdfs
+  name: test-runner
   labels:
-    app: webhdfs
+    app: test-runner
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: webhdfs
+      app: test-runner
   template:
     metadata:
       labels:
-        app: webhdfs
+        app: test-runner
     spec:
       containers:
-        - name: webhdfs
+        - name: test-runner
           image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev
           stdin: true
           tty: true
diff --git a/tests/templates/kuttl/smoke/50-assert.yaml b/tests/templates/kuttl/smoke/50-assert.yaml
index 1c4860b9..166e0e21 100644
--- a/tests/templates/kuttl/smoke/50-assert.yaml
+++ b/tests/templates/kuttl/smoke/50-assert.yaml
@@ -2,4 +2,4 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestAssert
 commands:
-  - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE ls
+  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE ls
diff --git a/tests/templates/kuttl/smoke/50-create-file.yaml b/tests/templates/kuttl/smoke/50-create-file.yaml
index d72fb348..80a710a7 100644
--- a/tests/templates/kuttl/smoke/50-create-file.yaml
+++ b/tests/templates/kuttl/smoke/50-create-file.yaml
@@ -2,6 +2,6 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestStep
 commands:
-  - script: kubectl cp -n $NAMESPACE ./webhdfs.py webhdfs-0:/tmp
-  - script: kubectl cp -n $NAMESPACE ./testdata.txt webhdfs-0:/tmp
-  - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE create
+  - script: kubectl cp -n $NAMESPACE ./webhdfs.py test-runner-0:/tmp
+  - script: kubectl cp -n $NAMESPACE ./testdata.txt test-runner-0:/tmp
+  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE create
diff --git a/tests/templates/kuttl/smoke/51-assert.yaml b/tests/templates/kuttl/smoke/51-assert.yaml
new file mode 100644
index 00000000..d2a0709d
--- /dev/null
+++ b/tests/templates/kuttl/smoke/51-assert.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+commands:
+  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/metrics-test.py $NAMESPACE
diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
new file mode 100644
index 00000000..5cf50409
--- /dev/null
+++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+commands:
+  - script: kubectl cp -n $NAMESPACE ./metrics-test.py test-runner-0:/tmp
diff --git a/tests/templates/kuttl/smoke/metrics-test.py b/tests/templates/kuttl/smoke/metrics-test.py
new file mode 100755
index 00000000..bda07005
--- /dev/null
+++ b/tests/templates/kuttl/smoke/metrics-test.py
@@ -0,0 +1,96 @@
+from requests import Response
+import re
+import requests
+import sys
+import logging
+
+
+def check_metrics(
+    namespace: str,
+    role: str,
+    port: int,
+    expected_metrics: list[str]
+    ) -> None:
+        response: Response = requests.get(
+            f'http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics'
+        )
+        assert response.ok, "Requesting metrics failed"
+
+        for metric in expected_metrics:
+            assert re.search(f'^{metric}', response.text, re.MULTILINE) is not None, \
+                f"Metric '{metric}' not found for {role}"
+
+
+def check_namenode_metrics(
+    namespace: str,
+    ) -> None:
+        expected_metrics = [
+            # Kind "MetricsSystem"
+            'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
+            # Attribute "Total"
+            'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
+            # Counter suffixed with "_total"
+            'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
+            # Metric suffixed with "_created"
+            'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+            # Metric suffixed with "_info"
+            'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
+            # Non-special metric
+            'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+        ]
+        check_metrics(namespace, 'namenode', 8183, expected_metrics)
+
+
+def check_datanode_metrics(
+    namespace: str,
+    ) -> None:
+        expected_metrics = [
+            # Kind "MetricsSystem"
+            'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
+            # Kind "FSDatasetState" suffixed with "_total"
+            'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Kind "FSDatasetState"
+            'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Kind "DataNodeActivity" suffixed with "_info"
+            'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
+            # Kind "DataNodeActivity"
+            'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
+            # Counter suffixed with "_total"
+            'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Metric suffixed with "_info"
+            'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}',
+            # Non-special metric
+            'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
+        ]
+        check_metrics(namespace, 'datanode', 8082, expected_metrics)
+
+
+def check_journalnode_metrics(
+    namespace: str,
+    ) -> None:
+        expected_metrics = [
+            # Kind "MetricsSystem"
+            'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}',
+            # Metric suffixed with "_info"
+            'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}',
+            # Non-special metric
+            'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}',
+        ]
+        check_metrics(namespace, 'journalnode', 8081, expected_metrics)
+
+
+if __name__ == "__main__":
+        namespace: str = sys.argv[1]
+
+        log_level = "DEBUG"
+        logging.basicConfig(
+            level=log_level,
+            format="%(asctime)s %(levelname)s: %(message)s",
+            stream=sys.stdout,
+        )
+
+        check_namenode_metrics(namespace)
+        check_datanode_metrics(namespace)
+        check_journalnode_metrics(namespace)
+
+        print("All expected metrics found")

From 632274f0642404b47fa7f5fd261a86578e848c42 Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Wed, 18 Dec 2024 16:49:23 +0100
Subject: [PATCH 4/6] test: Remove the custom JMX exporter config

---
 .../kuttl/smoke/30-install-hdfs.yaml.j2       | 284 ------------------
 1 file changed, 284 deletions(-)

diff --git a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2 b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
index ae88e825..ab0fdc0c 100644
--- a/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
+++ b/tests/templates/kuttl/smoke/30-install-hdfs.yaml.j2
@@ -28,26 +28,6 @@ spec:
     envOverrides:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
-      HDFS_NAMENODE_OPTS: >
-        -Djava.security.properties=/stackable/config/namenode/security.properties
-        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8183:/stackable/jmx-exporter-config/namenode.yaml
-        -Xmx838860k
-    podOverrides:
-      spec:
-        initContainers:
-          - name: format-namenodes
-            volumeMounts:
-              - name: jmx-exporter-config
-                mountPath: /stackable/jmx-exporter-config
-        containers:
-          - name: namenode
-            volumeMounts:
-              - name: jmx-exporter-config
-                mountPath: /stackable/jmx-exporter-config
-        volumes:
-          - name: jmx-exporter-config
-            configMap:
-              name: jmx-exporter-config
     config:
       listenerClass: {{ test_scenario['values']['listener-class'] }}
       logging:
@@ -62,21 +42,6 @@ spec:
     envOverrides:
       COMMON_VAR: role-value # overridden by role group below
       ROLE_VAR: role-value # only defined here at role level
-      HDFS_DATANODE_OPTS: >
-        -Djava.security.properties=/stackable/config/datanode/security.properties
-        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8082:/stackable/jmx-exporter-config/datanode.yaml
-        -Xmx419430k
-    podOverrides:
-      spec:
-        containers:
-          - name: datanode
-            volumeMounts:
-              - name: jmx-exporter-config
-                mountPath: /stackable/jmx-exporter-config
-        volumes:
-          - name: jmx-exporter-config
-            configMap:
-              name: jmx-exporter-config
     config:
       listenerClass: {{ test_scenario['values']['listener-class'] }}
       logging:
@@ -106,21 +71,6 @@ spec:
     envOverrides:
      COMMON_VAR: role-value # overridden by role group below
      ROLE_VAR: role-value # only defined here at role level
-      HDFS_JOURNALNODE_OPTS: >
-        -Djava.security.properties=/stackable/config/journalnode/security.properties
-        -javaagent:/stackable/jmx/jmx_prometheus_javaagent.jar=8081:/stackable/jmx-exporter-config/journalnode.yaml
-        -Xmx419430k
-    podOverrides:
-      spec:
-        containers:
-          - name: journalnode
-            volumeMounts:
-              - name: jmx-exporter-config
-                mountPath: /stackable/jmx-exporter-config
-        volumes:
-          - name: jmx-exporter-config
-            configMap:
-              name: jmx-exporter-config
     config:
       logging:
         enableVectorAgent: {{ lookup('env', 'VECTOR_AGGREGATOR') | length > 0 }}
@@ -143,237 +93,3 @@ spec:
         # https://github.com/stackabletech/hdfs-operator/issues/514
         - name: dashed-port
           containerPort: 1234
----
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: jmx-exporter-config
-data:
-  namenode.yaml: |-
-    ---
-    startDelaySeconds: 10
-    ssl: false
-    lowercaseOutputName: true
-    lowercaseOutputLabelNames: true
-    whitelistObjectNames:
-      - 'Hadoop:service=NameNode,name=*'
-      - 'Hadoop:service=NameNode,name=MetricsSystem,sub=*'
-    blacklistObjectNames:
-      - 'Hadoop:service=NameNode,name=RetryCache.NameNodeRetryCache'
-      - 'Hadoop:service=NameNode,name=RpcActivity*'
-      - 'Hadoop:service=NameNode,name=RpcDetailedActivity*'
-      - 'Hadoop:service=NameNode,name=UgiMetrics'
-    rules:
-      # MetricsSystem
-      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: 'MetricsSystem'
-          sub: $2
-        type: GAUGE
-      # Total raw capacity in bytes, e.g. Hadoop:name=NameNodeInfo,attribute=Total
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Generic counter, e.g. Hadoop:name=FSNamesystem,attribute=FilesTotal
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Metrics suffixed with _created, e.g. Hadoop:name=NameNodeActivity,attribute=FilesCreated
-      # The suffix _created is reserved for timestamps, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_created): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
-      # The suffix _info is reserved for static information, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-      # All other Hadoop metrics
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-  datanode.yaml: |-
-    ---
-    startDelaySeconds: 10
-    ssl: false
-    lowercaseOutputName: true
-    lowercaseOutputLabelNames: true
-    whitelistObjectNames:
-      - 'Hadoop:service=DataNode,name=*'
-      - 'Hadoop:service=DataNode,name=MetricsSystem,sub=*'
-    blacklistObjectNames:
-      - 'Hadoop:service=DataNode,name=RpcActivity*'
-      - 'Hadoop:service=DataNode,name=RpcDetailedActivity*'
-      - 'Hadoop:service=DataNode,name=UgiMetrics'
-    rules:
-      # MetricsSystem
-      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: 'MetricsSystem'
-          sub: $2
-        type: GAUGE
-      # FSDatasetState with _total suffix (also extracts the FSDataset ID),
-      # e.g. Hadoop:name=FSDatasetState,attribute=EstimatedCapacityLostTotal
-      - pattern: 'Hadoop<service=(.+), name=FSDatasetState-(.+)><>(.*_total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          fsdatasetid: $2
-          kind: 'FSDatasetState'
-        type: COUNTER
-      # FSDatasetState (also extracts the FSDataset ID)
-      - pattern: 'Hadoop<service=(.+), name=FSDatasetState-(.+)><>(.*): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          fsdatasetid: $2
-          kind: 'FSDatasetState'
-        type: GAUGE
-      # DataNodeActivity with _info suffix (also extracts hostname and port),
-      # e.g. Hadoop:name=DataNodeActivity-hdfs-datanode-default-0-9866,attribute=BlocksGetLocalPathInfo
-      - pattern: 'Hadoop<service=(.+), name=DataNodeActivity-(.+)-(\d+)><>(.*_info): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$4_
-        value: $5
-        labels:
-          service: HDFS
-          role: $1
-          host: $2
-          port: $3
-          kind: 'DataNodeActivity'
-        type: GAUGE
-      - pattern: 'Hadoop<service=(.+), name=DataNodeActivity-(.+)-(\d+)><>(.*): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$4
-        value: $5
-        labels:
-          service: HDFS
-          role: $1
-          host: $2
-          port: $3
-          kind: 'DataNodeActivity'
-        type: GAUGE
-      # Generic counter, e.g. Hadoop:name=FSDatasetState,attribute=EstimatedCapacityLostTotal
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_total): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: COUNTER
-      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
-      # The suffix _info is reserved for static information, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-      # All other Hadoop metrics
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-  journalnode.yaml: |-
-    ---
-    startDelaySeconds: 10
-    ssl: false
-    lowercaseOutputName: true
-    lowercaseOutputLabelNames: true
-    whitelistObjectNames:
-      - 'Hadoop:service=JournalNode,name=*'
-      - 'Hadoop:service=JournalNode,name=MetricsSystem,sub=*'
-    blacklistObjectNames:
-      - 'Hadoop:service=JournalNode,name=RetryCache.JournalNodeRetryCache'
-      - 'Hadoop:service=JournalNode,name=RpcActivity*'
-      - 'Hadoop:service=JournalNode,name=RpcDetailedActivity*'
-      - 'Hadoop:service=JournalNode,name=UgiMetrics'
-    rules:
-      # MetricsSystem
-      - pattern: 'Hadoop<service=(.+), name=MetricsSystem, sub=(.+)><>(.*): (\d+)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: 'MetricsSystem'
-          sub: $2
-        type: GAUGE
-      # Metrics suffixed with _info, e.g. Hadoop:name=JvmMetrics,attribute=LogInfo
-      # The suffix _info is reserved for static information, therefore an underscore is appended.
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*_info): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3_
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE
-      # All other Hadoop metrics
-      - pattern: 'Hadoop<service=(.+), name=(.+)><>(.*): (.*)'
-        attrNameSnakeCase: true
-        name: hadoop_$1_$3
-        value: $4
-        labels:
-          service: HDFS
-          role: $1
-          kind: $2
-        type: GAUGE

From a4de9330e363c46b53f67b79b8af93cd2b8105b9 Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Wed, 18 Dec 2024 23:05:32 +0100
Subject: [PATCH 5/6] test: Fix smoke test for Hadoop 3.4.0

---
 tests/templates/kuttl/smoke/51-assert.yaml    |   5 -
 tests/templates/kuttl/smoke/51-assert.yaml.j2 |  12 ++
 .../smoke/51-copy-metrics-test-script.yaml    |   2 +-
 tests/templates/kuttl/smoke/metrics-test.py   |  96 --------
 tests/templates/kuttl/smoke/test_metrics.py   | 119 ++++++++++
 5 files changed, 132 insertions(+), 102 deletions(-)
 delete mode 100644 tests/templates/kuttl/smoke/51-assert.yaml
 create mode 100644 tests/templates/kuttl/smoke/51-assert.yaml.j2
 delete mode 100755 tests/templates/kuttl/smoke/metrics-test.py
 create mode 100755 tests/templates/kuttl/smoke/test_metrics.py

diff --git a/tests/templates/kuttl/smoke/51-assert.yaml b/tests/templates/kuttl/smoke/51-assert.yaml
deleted file mode 100644
index d2a0709d..00000000
--- a/tests/templates/kuttl/smoke/51-assert.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
----
-apiVersion: kuttl.dev/v1beta1
-kind: TestAssert
-commands:
-  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/metrics-test.py $NAMESPACE
diff --git a/tests/templates/kuttl/smoke/51-assert.yaml.j2 b/tests/templates/kuttl/smoke/51-assert.yaml.j2
new file mode 100644
index 00000000..4a20065d
--- /dev/null
+++ b/tests/templates/kuttl/smoke/51-assert.yaml.j2
@@ -0,0 +1,12 @@
+---
+apiVersion: kuttl.dev/v1beta1
+kind: TestAssert
+commands:
+  - script: |
+{% if test_scenario['values']['hadoop'].find(",") > 0 %}
+      PRODUCT_VERSION={{ test_scenario['values']['hadoop'].split(',')[0] }}
+{% else %}
+      PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }}
+{% endif %}
+      kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
+        python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION
diff --git a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
index 5cf50409..fa17cd19 100644
--- a/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
+++ b/tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
@@ -2,4 +2,4 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestStep
 commands:
-  - script: kubectl cp -n $NAMESPACE ./metrics-test.py test-runner-0:/tmp
+  - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp
diff --git a/tests/templates/kuttl/smoke/metrics-test.py b/tests/templates/kuttl/smoke/metrics-test.py
deleted file mode 100755
index bda07005..00000000
--- a/tests/templates/kuttl/smoke/metrics-test.py
+++ /dev/null
@@ -1,96 +0,0 @@
-from requests import Response
-import re
-import requests
-import sys
-import logging
-
-
-def check_metrics(
-    namespace: str,
-    role: str,
-    port: int,
-    expected_metrics: list[str]
-    ) -> None:
-        response: Response = requests.get(
-            f'http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics'
-        )
-        assert response.ok, "Requesting metrics failed"
-
-        for metric in expected_metrics:
-            assert re.search(f'^{metric}', response.text, re.MULTILINE) is not None, \
-                f"Metric '{metric}' not found for {role}"
-
-
-def check_namenode_metrics(
-    namespace: str,
-    ) -> None:
-        expected_metrics = [
-            # Kind "MetricsSystem"
-            'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
-            # Attribute "Total"
-            'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
-            # Counter suffixed with "_total"
-            'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
-            # Metric suffixed with "_created"
-            'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
-            # Metric suffixed with "_info"
-            'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
-            # Non-special metric
-            'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
-        ]
-        check_metrics(namespace, 'namenode', 8183, expected_metrics)
-
-
-def check_datanode_metrics(
-    namespace: str,
-    ) -> None:
-        expected_metrics = [
-            # Kind "MetricsSystem"
-            'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
-            # Kind "FSDatasetState" suffixed with "_total"
-            'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Kind "FSDatasetState"
-            'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Kind "DataNodeActivity" suffixed with "_info"
-            'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
-            # Kind "DataNodeActivity"
-            'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
-            # Counter suffixed with "_total"
-            'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Metric suffixed with "_info"
-            'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}',
-            # Non-special metric
-            'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
-        ]
-        check_metrics(namespace, 'datanode', 8082, expected_metrics)
-
-
-def check_journalnode_metrics(
-    namespace: str,
-    ) -> None:
-        expected_metrics = [
-            # Kind "MetricsSystem"
-            'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}',
-            # Metric suffixed with "_info"
-            'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}',
-            # Non-special metric
-            'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}',
-        ]
-        check_metrics(namespace, 'journalnode', 8081, expected_metrics)
-
-
-if __name__ == "__main__":
-        namespace: str = sys.argv[1]
-
-        log_level = "DEBUG"
-        logging.basicConfig(
-            level=log_level,
-            format="%(asctime)s %(levelname)s: %(message)s",
-            stream=sys.stdout,
-        )
-
-        check_namenode_metrics(namespace)
-        check_datanode_metrics(namespace)
-        check_journalnode_metrics(namespace)
-
-        print("All expected metrics found")
diff --git a/tests/templates/kuttl/smoke/test_metrics.py b/tests/templates/kuttl/smoke/test_metrics.py
new file mode 100755
index 00000000..6132fde7
--- /dev/null
+++ b/tests/templates/kuttl/smoke/test_metrics.py
@@ -0,0 +1,119 @@
+# Every rule in the JMX configuration is covered by one expected metric.
+
+import re
+import sys
+import logging
+
+import requests
+
+def check_metrics(
+    namespace: str,
+    role: str,
+    port: int,
+    expected_metrics: list[str]
+    ) -> None:
+        response: requests.Response = requests.get(
+            f'http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics',
+            timeout=10
+        )
+        assert response.ok, "Requesting metrics failed"
+
+        for metric in expected_metrics:
+            assert re.search(f'^{metric}', response.text, re.MULTILINE) is not None, \
+                f"Metric '{metric}' not found for {role}"
+
+
+def check_namenode_metrics(
+    namespace: str,
+    product_version: str,
+    ) -> None:
+        expected_metrics: list[str] = [
+            # Kind "MetricsSystem"
+            'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
+            # Attribute "Total"
+            'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
+            # Counter suffixed with "_total"
+            'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
+            # Metric suffixed with "_created"
+            'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+            # Non-special metric
+            'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+        ]
+
+        if product_version in ["3.3.4", "3.3.6"]:
+            # Log counters were removed in 3.4.0 (HADOOP-17524).
+            expected_metrics.extend([
+                # Metric suffixed with "_info"
+                'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
+            ])
+
+        check_metrics(namespace, 'namenode', 8183, expected_metrics)
+
+
+def check_datanode_metrics(
+    namespace: str,
+    product_version: str,
+    ) -> None:
+        expected_metrics: list[str] = [
+            # Kind "MetricsSystem"
+            'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
+            # Kind "FSDatasetState" suffixed with "_total"
+            'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Kind "FSDatasetState"
+            'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Kind "DataNodeActivity" suffixed with "_info"
+            'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
+            # Kind "DataNodeActivity"
+            'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
+            # Counter suffixed with "_total"
+            'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
+            # Non-special metric
+            'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
+        ]
+
+        if product_version in ["3.3.4", "3.3.6"]:
+            # Log counters were removed in 3.4.0 (HADOOP-17524).
+            expected_metrics.extend([
+                # Metric suffixed with "_info"
+                'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}',
+            ])
+
+        check_metrics(namespace, 'datanode', 8082, expected_metrics)
+
+
+def check_journalnode_metrics(
+    namespace: str,
+    product_version: str,
+    ) -> None:
+        expected_metrics: list[str] = [
+            # Kind "MetricsSystem"
+            'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}',
+            # Non-special metric
+            'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}',
+        ]
+
+        if product_version in ["3.3.4", "3.3.6"]:
+            # Log counters were removed in 3.4.0 (HADOOP-17524).
+            expected_metrics.extend([
+                # Metric suffixed with "_info"
+                'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}',
+            ])
+
+        check_metrics(namespace, 'journalnode', 8081, expected_metrics)
+
+
+if __name__ == "__main__":
+        namespace_arg: str = sys.argv[1]
+        product_version_arg: str = sys.argv[2]
+
+        logging.basicConfig(
+            level="DEBUG",
+            format="%(asctime)s %(levelname)s: %(message)s",
+            stream=sys.stdout,
+        )
+
+        check_namenode_metrics(namespace_arg, product_version_arg)
+        check_datanode_metrics(namespace_arg, product_version_arg)
+        check_journalnode_metrics(namespace_arg, product_version_arg)
+
+        print("All expected metrics found")

From 43d493315ab354673577e50e8b11ef4e1c0ca38b Mon Sep 17 00:00:00 2001
From: Siegfried Weber
Date: Thu, 19 Dec 2024 11:14:09 +0100
Subject: [PATCH 6/6] chore: Format Python code in the smoke test

---
 tests/templates/kuttl/smoke/test_metrics.py | 181 ++++++++++----------
 1 file changed, 93 insertions(+), 88 deletions(-)

diff --git a/tests/templates/kuttl/smoke/test_metrics.py b/tests/templates/kuttl/smoke/test_metrics.py
index 6132fde7..5129e2dd 100755
--- a/tests/templates/kuttl/smoke/test_metrics.py
+++ b/tests/templates/kuttl/smoke/test_metrics.py
@@ -6,114 +6,119 @@
 
 import requests
 
+
 def check_metrics(
-    namespace: str,
-    role: str,
-    port: int,
-    expected_metrics: list[str]
-    ) -> None:
-        response: requests.Response = requests.get(
-            f'http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics',
-            timeout=10
-        )
-        assert response.ok, "Requesting metrics failed"
+    namespace: str, role: str, port: int, expected_metrics: list[str]
+) -> None:
+    response: requests.Response = requests.get(
+        f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics",
+        timeout=10,
+    )
+    assert response.ok, "Requesting metrics failed"
 
-        for metric in expected_metrics:
-            assert re.search(f'^{metric}', response.text, re.MULTILINE) is not None, \
-                f"Metric '{metric}' not found for {role}"
+    for metric in expected_metrics:
+        assert (
+            re.search(f"^{metric}", response.text, re.MULTILINE) is not None
+        ), f"Metric '{metric}' not found for {role}"
 
 
 def check_namenode_metrics(
     namespace: str,
     product_version: str,
-    ) -> None:
-        expected_metrics: list[str] = [
-            # Kind "MetricsSystem"
-            'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
-            # Attribute "Total"
-            'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
-            # Counter suffixed with "_total"
-            'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
-            # Metric suffixed with "_created"
-            'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
-            # Non-special metric
-            'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
-        ]
-
-        if product_version in ["3.3.4", "3.3.6"]:
-            # Log counters were removed in 3.4.0 (HADOOP-17524).
-            expected_metrics.extend([
-                # Metric suffixed with "_info"
-                'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
-            ])
-
-        check_metrics(namespace, 'namenode', 8183, expected_metrics)
+) -> None:
+    expected_metrics: list[str] = [
+        # Kind "MetricsSystem"
+        'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
+        # Attribute "Total"
+        'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
+        # Counter suffixed with "_total"
+        'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
+        # Metric suffixed with "_created"
+        'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+        # Non-special metric
+        'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
+    ]
+
+    if product_version in ["3.3.4", "3.3.6"]:
+        # Log counters were removed in 3.4.0 (HADOOP-17524).
+        expected_metrics.extend(
+            [
+                # Metric suffixed with "_info"
+                'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
+            ]
+        )
+
+    check_metrics(namespace, "namenode", 8183, expected_metrics)
 
 
 def check_datanode_metrics(
     namespace: str,
     product_version: str,
-    ) -> None:
-        expected_metrics: list[str] = [
-            # Kind "MetricsSystem"
-            'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
-            # Kind "FSDatasetState" suffixed with "_total"
-            'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Kind "FSDatasetState"
-            'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Kind "DataNodeActivity" suffixed with "_info"
-            'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
-            # Kind "DataNodeActivity"
-            'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
-            # Counter suffixed with "_total"
-            'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
-            # Non-special metric
-            'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
-        ]
-
-        if product_version in ["3.3.4", "3.3.6"]:
-            # Log counters were removed in 3.4.0 (HADOOP-17524).
- expected_metrics.extend([ - # Metric suffixed with "_info" - 'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}', - ]) - - check_metrics(namespace, 'datanode', 8082, expected_metrics) +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}', + # Kind "FSDatasetState" suffixed with "_total" + 'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}', + # Kind "FSDatasetState" + 'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}', + # Kind "DataNodeActivity" suffixed with "_info" + 'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + # Kind "DataNodeActivity" + 'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}', + # Counter suffixed with "_total" + 'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}', + # Non-special metric + 'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}', + ] + + if product_version in ["3.3.4", "3.3.6"]: + # Log counters were removed in 3.4.0 (HADOOP-17524). + expected_metrics.extend( + [ + # Metric suffixed with "_info" + 'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}', + ] + ) + + check_metrics(namespace, "datanode", 8082, expected_metrics) def check_journalnode_metrics( namespace: str, product_version: str, - ) -> None: - expected_metrics: list[str] = [ - # Kind "MetricsSystem" - 'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}', - # Non-special metric - 'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}', - ] - - if product_version in ["3.3.4", "3.3.6"]: - # Log counters were removed in 3.4.0 (HADOOP-17524). - expected_metrics.extend([ - # Metric suffixed with "_info" - 'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}', - ]) - - check_metrics(namespace, 'journalnode', 8081, expected_metrics) +) -> None: + expected_metrics: list[str] = [ + # Kind "MetricsSystem" + 'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}', + # Non-special metric + 'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}', + ] + + if product_version in ["3.3.4", "3.3.6"]: + # Log counters were removed in 3.4.0 (HADOOP-17524). 
+        expected_metrics.extend(
+            [
+                # Metric suffixed with "_info"
+                'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}',
+            ]
+        )
+
+    check_metrics(namespace, "journalnode", 8081, expected_metrics)
 
 
 if __name__ == "__main__":
-        namespace_arg: str = sys.argv[1]
-        product_version_arg: str = sys.argv[2]
+    namespace_arg: str = sys.argv[1]
+    product_version_arg: str = sys.argv[2]
 
-        logging.basicConfig(
-            level="DEBUG",
-            format="%(asctime)s %(levelname)s: %(message)s",
-            stream=sys.stdout,
-        )
+    logging.basicConfig(
+        level="DEBUG",
+        format="%(asctime)s %(levelname)s: %(message)s",
+        stream=sys.stdout,
+    )
 
-        check_namenode_metrics(namespace_arg, product_version_arg)
-        check_datanode_metrics(namespace_arg, product_version_arg)
-        check_journalnode_metrics(namespace_arg, product_version_arg)
+    check_namenode_metrics(namespace_arg, product_version_arg)
+    check_datanode_metrics(namespace_arg, product_version_arg)
+    check_journalnode_metrics(namespace_arg, product_version_arg)
 
-        print("All expected metrics found")
+    print("All expected metrics found")