Test metrics in the smoke test #635

Merged 7 commits on Dec 20, 2024

2 changes: 1 addition & 1 deletion tests/templates/kuttl/smoke/40-assert.yaml
@@ -6,7 +6,7 @@ timeout: 300
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: webhdfs
+  name: test-runner
 status:
   readyReplicas: 1
   replicas: 1
@@ -2,22 +2,22 @@
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: webhdfs
+  name: test-runner
   labels:
-    app: webhdfs
+    app: test-runner
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: webhdfs
+      app: test-runner
   template:
     metadata:
       labels:
-        app: webhdfs
+        app: test-runner
     spec:
       shareProcessNamespace: true
       containers:
-        - name: webhdfs
+        - name: test-runner
           image: docker.stackable.tech/stackable/testing-tools:0.2.0-stackable0.0.0-dev
           args: [sleep, infinity]
           stdin: true
2 changes: 1 addition & 1 deletion tests/templates/kuttl/smoke/50-assert.yaml
@@ -2,4 +2,4 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestAssert
 commands:
-  - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE ls
+  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE ls
6 changes: 3 additions & 3 deletions tests/templates/kuttl/smoke/50-create-file.yaml
@@ -2,6 +2,6 @@
 apiVersion: kuttl.dev/v1beta1
 kind: TestStep
 commands:
-  - script: kubectl cp -n $NAMESPACE ./webhdfs.py webhdfs-0:/tmp
-  - script: kubectl cp -n $NAMESPACE ./testdata.txt webhdfs-0:/tmp
-  - script: kubectl exec -n $NAMESPACE webhdfs-0 -- python /tmp/webhdfs.py $NAMESPACE create
+  - script: kubectl cp -n $NAMESPACE ./webhdfs.py test-runner-0:/tmp
+  - script: kubectl cp -n $NAMESPACE ./testdata.txt test-runner-0:/tmp
+  - script: kubectl exec -n $NAMESPACE test-runner-0 -- python /tmp/webhdfs.py $NAMESPACE create
12 changes: 12 additions & 0 deletions tests/templates/kuttl/smoke/51-assert.yaml.j2
@@ -0,0 +1,12 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestAssert
commands:
  - script: |
{% if test_scenario['values']['hadoop'].find(",") > 0 %}
      PRODUCT_VERSION={{ test_scenario['values']['hadoop'].split(',')[0] }}
{% else %}
      PRODUCT_VERSION={{ test_scenario['values']['hadoop'] }}
{% endif %}
      kubectl exec --namespace=$NAMESPACE test-runner-0 -- \
        python /tmp/test_metrics.py $NAMESPACE $PRODUCT_VERSION
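The Jinja block above pins PRODUCT_VERSION before invoking the metrics test: when the hadoop test dimension holds a comma-separated list (assumed here to be a scenario value such as an upgrade pair), the rendered script takes the first entry; otherwise the value is used as-is. A minimal Python sketch of that selection logic, with invented example values:

# Sketch only: mirrors the version selection performed by the Jinja block above.
# The example values are invented, not taken from the test suite.
def select_product_version(hadoop_value: str) -> str:
    # A comma-separated value yields its first entry; a plain value passes through.
    return hadoop_value.split(",")[0] if hadoop_value.find(",") > 0 else hadoop_value

assert select_product_version("3.4.0") == "3.4.0"
assert select_product_version("3.3.6,3.4.0") == "3.3.6"
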
5 changes: 5 additions & 0 deletions tests/templates/kuttl/smoke/51-copy-metrics-test-script.yaml
@@ -0,0 +1,5 @@
---
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
  - script: kubectl cp -n $NAMESPACE ./test_metrics.py test-runner-0:/tmp
124 changes: 124 additions & 0 deletions tests/templates/kuttl/smoke/test_metrics.py
@@ -0,0 +1,124 @@
# Every rule in the JMX configuration is covered by one expected metric.

import re
import sys
import logging

import requests


def check_metrics(
    namespace: str, role: str, port: int, expected_metrics: list[str]
) -> None:
    # Scrape the metrics endpoint of the role's first pod, addressed by its
    # stable DNS name behind the headless service.
    response: requests.Response = requests.get(
        f"http://hdfs-{role}-default-0.hdfs-{role}-default.{namespace}.svc.cluster.local:{port}/metrics",
        timeout=10,
    )
    assert response.ok, "Requesting metrics failed"

    # Every expected metric is a regular expression that must match at the
    # beginning of a line in the scraped output.
    for metric in expected_metrics:
        assert (
            re.search(f"^{metric}", response.text, re.MULTILINE) is not None
        ), f"Metric '{metric}' not found for {role}"


def check_namenode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_namenode_num_active_sources{kind="MetricsSystem",role="NameNode",service="HDFS",sub="Stats"}',
        # Attribute "Total"
        'hadoop_namenode_total{kind="NameNodeInfo",role="NameNode",service="HDFS"}',
        # Counter suffixed with "_total"
        'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}',
        # Metric suffixed with "_created"
        'hadoop_namenode_files_created_{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
        # Non-special metric
        'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"}',
    ]

    if product_version in ["3.3.4", "3.3.6"]:
        # Log counters were removed in 3.4.0 (HADOOP-17524).
        expected_metrics.extend(
            [
                # Metric suffixed with "_info"
                'hadoop_namenode_log_info_{kind="JvmMetrics",role="NameNode",service="HDFS"}',
            ]
        )

    check_metrics(namespace, "namenode", 8183, expected_metrics)


def check_datanode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_datanode_num_active_sources{kind="MetricsSystem",role="DataNode",service="HDFS",sub="Stats"}',
        # Kind "FSDatasetState" suffixed with "_total"
        'hadoop_datanode_estimated_capacity_lost_total{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Kind "FSDatasetState"
        'hadoop_datanode_capacity{fsdatasetid=".+",kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Kind "DataNodeActivity" suffixed with "_info"
        'hadoop_datanode_blocks_get_local_path_info_{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
        # Kind "DataNodeActivity"
        'hadoop_datanode_blocks_read{host="hdfs-datanode-default-0\\.hdfs-datanode-default\\..+\\.svc\\.cluster\\.local",kind="DataNodeActivity",port="9866",role="DataNode",service="HDFS"}',
        # Counter suffixed with "_total"
        'hadoop_datanode_estimated_capacity_lost_total{kind="FSDatasetState",role="DataNode",service="HDFS"}',
        # Non-special metric
        'hadoop_datanode_gc_count{kind="JvmMetrics",role="DataNode",service="HDFS"}',
    ]

    if product_version in ["3.3.4", "3.3.6"]:
        # Log counters were removed in 3.4.0 (HADOOP-17524).
        expected_metrics.extend(
            [
                # Metric suffixed with "_info"
                'hadoop_datanode_log_info_{kind="JvmMetrics",role="DataNode",service="HDFS"}',
            ]
        )

    check_metrics(namespace, "datanode", 8082, expected_metrics)


def check_journalnode_metrics(
    namespace: str,
    product_version: str,
) -> None:
    expected_metrics: list[str] = [
        # Kind "MetricsSystem"
        'hadoop_journalnode_num_active_sources{kind="MetricsSystem",role="JournalNode",service="HDFS",sub="Stats"}',
        # Non-special metric
        'hadoop_journalnode_bytes_written{kind="Journal-hdfs",role="JournalNode",service="HDFS"}',
    ]

    if product_version in ["3.3.4", "3.3.6"]:
        # Log counters were removed in 3.4.0 (HADOOP-17524).
        expected_metrics.extend(
            [
                # Metric suffixed with "_info"
                'hadoop_journalnode_log_info_{kind="JvmMetrics",role="JournalNode",service="HDFS"}',
            ]
        )

    check_metrics(namespace, "journalnode", 8081, expected_metrics)


if __name__ == "__main__":
    namespace_arg: str = sys.argv[1]
    product_version_arg: str = sys.argv[2]

    logging.basicConfig(
        level="DEBUG",
        format="%(asctime)s %(levelname)s: %(message)s",
        stream=sys.stdout,
    )

    check_namenode_metrics(namespace_arg, product_version_arg)
    check_datanode_metrics(namespace_arg, product_version_arg)
    check_journalnode_metrics(namespace_arg, product_version_arg)

    print("All expected metrics found")
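
Note that the expected metric strings double as regular expressions (some label values use patterns such as ".+") and are matched at the start of a line via re.MULTILINE. A self-contained sketch of that matching against an invented sample of scrape output:

import re

# Invented sample of Prometheus exposition output, for illustration only.
sample_scrape = (
    'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"} 42\n'
    'hadoop_namenode_files_deleted{kind="NameNodeActivity",role="NameNode",service="HDFS"} 7\n'
)

# The braces match literally: Python's re treats "{" as a literal character
# when it cannot be parsed as a quantifier.
pattern = 'hadoop_namenode_files_total{kind="FSNamesystem",role="NameNode",service="HDFS"}'
assert re.search(f"^{pattern}", sample_scrape, re.MULTILINE) is not None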