Skip to content

Commit 9a6920b

Browse files
authored
Check node errors in perftests, support multislot (#17474)
1 parent 79d13dd commit 9a6920b

File tree

4 files changed

+145
-60
lines changed

4 files changed

+145
-60
lines changed

ydb/tests/olap/lib/allure_utils.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@
88
from pytz import timezone
99

1010

11+
class NodeErrors:
    """Problems recorded for one cluster node.

    A fresh instance carries only the node and a short status message; the
    coredump list starts empty and the OOM flag starts cleared, and both are
    expected to be filled in by the caller afterwards.
    """

    def __init__(self, node: YdbCluster.Node, message: str):
        """Remember ``node`` and ``message``; no coredumps and no OOM yet."""
        self.node: YdbCluster.Node = node
        # Pairs of (core id, aggregated hash).
        self.core_hashes: list[tuple[str, str]] = []
        self.was_oom: bool = False
        self.message: str = message
17+
18+
1119
def _set_monitoring(test_info: dict[str, str], start_time: float, end_time: float) -> None:
1220
monitoring_start = int((start_time) * 1000)
1321
monitoring_end = int((end_time) * 1000)
@@ -29,12 +37,32 @@ def _set_monitoring(test_info: dict[str, str], start_time: float, end_time: floa
2937
])
3038

3139

32-
def _set_coredumps(test_info: dict[str, str]) -> None:
33-
if test_info['name'].startswith('ydb-k8s'):
34-
core_link = f"https://coredumps.n.yandex-team.ru/index?itype=kikimr&host_list={test_info['nodes_wilcard']}&show_fixed=True"
35-
else:
36-
core_link = f"https://kikimr-cores.n.yandex-team.ru/show?server={test_info['nodes_wilcard']}%2A"
37-
test_info['coredumps'] = f"<a target='_blank' href='{core_link}'>link</a>"
40+
def _set_coredumps(test_info: dict[str, str], start_time: float, end_time: float) -> None:
    """Put a link to the coredump viewer into ``test_info['coredumps']``.

    The link filters cores by ``program_type=kikimr`` and by the cluster name
    taken from ``test_info['name']``, and limits them to the interval between
    ``start_time`` and ``end_time`` (POSIX timestamps rendered as ISO strings
    in the Europe/Moscow time zone).
    """
    cluster_filter = f'program_type=kikimr; @cluster_name={test_info["name"]}'
    moscow = timezone('Europe/Moscow')
    since = datetime.fromtimestamp(start_time, moscow).isoformat()
    till = datetime.fromtimestamp(end_time, moscow).isoformat()
    query = urlencode([
        ('filter', cluster_filter),
        ('since_ts', since),
        ('till_ts', till),
    ])
    test_info['coredumps'] = f"<a target='_blank' href='https://coredumps.yandex-team.ru/v3/cores?{query}'>link</a>"
48+
49+
50+
def _set_node_errors(test_info: dict[str, str], node_errors: list[NodeErrors]) -> None:
    """Render ``node_errors`` as an HTML list and store it in ``test_info``.

    Nothing is added when ``node_errors`` is empty. Otherwise one ``<li>`` is
    produced per entry, headed by ``<ic_port>@<host>`` and followed by the
    node message (if any), an OOM notice (if ``was_oom`` is set) and one link
    per ``(core_id, core_hash)`` pair in ``core_hashes``.

    Each coredump link gets a background colour derived from its hash so equal
    hashes are easy to spot. The colour is computed with CRC32 rather than the
    built-in ``hash()``, because ``hash()`` of a string is salted per process
    and would colour the same hash differently on every run.
    """
    # Local import keeps this function self-contained.
    import zlib

    if not node_errors:
        return
    parts = ['<ul>']
    for err in node_errors:
        parts.append(f'<li>{err.node.ic_port}@{err.node.host}')
        if err.message:
            parts.append(f'<p>Node {err.message}</p>')
        if err.was_oom:
            parts.append('<p>Node was killed by OOM</p>')
        for core_id, core_hash in err.core_hashes:
            # Red channel is fixed at FF; the other two depend on the hash.
            color = f'{0xFF0000 + zlib.crc32(str(core_hash).encode("utf-8")) % 0xFFFF:06x}'
            parts.append(
                f'<p>There was coredump <a target="_blank" href="https://coredumps.yandex-team.ru/v3/cores/{core_id}" '
                f'style="background-color: #{color}">{core_hash}</a></p>'
            )
        parts.append('</li>')
    parts.append('</ul>')
    test_info['<span style="background-color: #FF8888">node errors</span>'] = ''.join(parts)
3866

3967

4068
def _set_results_plot(test_info: dict[str, str], suite: str, test: str, refference_set: str) -> None:
@@ -81,6 +109,7 @@ def allure_test_description(
81109
addition_table_strings: dict[str, any] = {},
82110
attachments: tuple[str, str, allure.attachment_type] = [],
83111
refference_set: str = '',
112+
node_errors: list[NodeErrors] = [],
84113
):
85114
def _pretty_str(s):
86115
return ' '.join(s.split('_')).capitalize()
@@ -93,7 +122,8 @@ def _pretty_str(s):
93122
test_info.update(addition_table_strings)
94123

95124
_set_monitoring(test_info, start_time, end_time)
96-
_set_coredumps(test_info)
125+
_set_coredumps(test_info, start_time, end_time)
126+
_set_node_errors(test_info, node_errors)
97127
_set_results_plot(test_info, suite, test, refference_set)
98128
_set_logs_command(test_info, start_time, end_time)
99129

@@ -110,7 +140,6 @@ def _pretty_str(s):
110140
'time': f"{datetime.fromtimestamp(start_time).strftime('%a %d %b %y %H:%M:%S')} - {datetime.fromtimestamp(end_time).strftime('%H:%M:%S')}",
111141
}
112142
)
113-
del test_info['nodes_wilcard']
114143
table_strings = '\n'.join([f'<tr><td>{_pretty_str(k)}</td><td>{v}</td></tr>' for k, v in test_info.items()])
115144
allure.dynamic.description_html(
116145
f'''<table border='1' cellpadding='4px'><tbody>

ydb/tests/olap/lib/ydb_cli.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,6 @@ def _get_output_path(ext: str) -> str:
148148
self.query_syntax = query_syntax
149149
self.scale = scale
150150
self.query_prefix = query_prefix
151-
self._nodes_info: dict[str, dict[str, int]] = {}
152151
self._plan_path = _get_output_path('plan')
153152
self._query_output_path = _get_output_path('out')
154153
self._json_path = _get_output_path('json')
@@ -231,27 +230,6 @@ def _load_query_out(self) -> None:
231230
with open(self._query_output_path, 'r') as r:
232231
self.result.query_out = r.read()
233232

234-
@staticmethod
235-
def _get_nodes_info() -> dict[str, dict[str, Any]]:
236-
return {
237-
n.host: {
238-
'start_time': n.start_time
239-
}
240-
for n in YdbCluster.get_cluster_nodes(db_only=True)
241-
}
242-
243-
def _check_nodes(self):
244-
node_errors = []
245-
for node, info in self._get_nodes_info().items():
246-
if node in self._nodes_info:
247-
if info['start_time'] > self._nodes_info[node]['start_time']:
248-
node_errors.append(f'Node {node} was restarted')
249-
self._nodes_info[node]['processed'] = True
250-
for node, info in self._nodes_info.items():
251-
if not info.get('processed', False):
252-
node_errors.append(f'Node {node} is down')
253-
self.result.add_error('\n'.join(node_errors))
254-
255233
def _parse_stdout(self, stdout: str) -> None:
256234
self.result.stdout = stdout
257235
for line in self.result.stdout.splitlines():
@@ -289,11 +267,9 @@ def process(self) -> YdbCliHelper.WorkloadRunResult:
289267
if wait_error is not None:
290268
self.result.error_message = wait_error
291269
else:
292-
self._nodes_info = self._get_nodes_info()
293270
process = yatest.common.process.execute(self._get_cmd(), check_exit_code=False)
294271
self._parse_stderr(process.stderr.decode('utf-8', 'replace'))
295272
self._parse_stdout(process.stdout.decode('utf-8', 'replace'))
296-
self._check_nodes()
297273
self._load_stats()
298274
self._load_query_out()
299275
self._load_plans()

ydb/tests/olap/lib/ydb_cluster.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ def __init__(self, desc: dict):
3737
def __init__(self, desc: dict):
3838
ss = desc.get('SystemState', {})
3939
self.host: str = ss.get('Host', '')
40+
ports = {e.get('Name', ''): int(e.get('Address', '0').split(':')[-1]) for e in ss.get('Endpoints', [])}
41+
self.ic_port: int = ports.get('ic', 0)
4042
self.disconnected: bool = desc.get('Disconnected', False)
41-
self.cluster_name: str = ss.get('ClusterName', '')
4243
self.version: str = ss.get('Version', '')
4344
self.start_time: float = 0.001 * int(ss.get('StartTime', time() * 1000))
4445
if 'Storage' in ss.get('Roles', []):
@@ -110,21 +111,14 @@ def get_cluster_nodes(cls, path: Optional[str] = None, db_only: bool = False) ->
110111
def get_cluster_info(cls):
111112
if cls._cluster_info is None:
112113
version = ''
113-
cluster_name = ''
114-
nodes_wilcard = ''
115114
nodes = cls.get_cluster_nodes(db_only=True)
116115
for node in nodes:
117-
if not cluster_name:
118-
cluster_name = node.cluster_name
119116
if not version:
120117
version = node.version
121-
if not nodes_wilcard and node.role == YdbCluster.Node.Role.COMPUTE:
122-
nodes_wilcard = node.host.split('.')[0].rstrip('0123456789')
123118
cls._cluster_info = {
124119
'database': cls.ydb_database,
125120
'version': version,
126-
'name': cluster_name,
127-
'nodes_wilcard': nodes_wilcard,
121+
'name': cls.ydb_database.strip('/').split('/')[0],
128122
}
129123
return deepcopy(cls._cluster_info)
130124

ydb/tests/olap/load/lib/conftest.py

Lines changed: 105 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from typing import Optional
1515
from ydb.tests.olap.lib.ydb_cli import YdbCliHelper, WorkloadType, CheckCanonicalPolicy
1616
from ydb.tests.olap.lib.ydb_cluster import YdbCluster
17-
from ydb.tests.olap.lib.allure_utils import allure_test_description
17+
from ydb.tests.olap.lib.allure_utils import allure_test_description, NodeErrors
1818
from ydb.tests.olap.lib.results_processor import ResultsProcessor
1919
from ydb.tests.olap.lib.utils import get_external_param
2020
from ydb.tests.olap.scenario.helpers.scenario_tests_helper import ScenarioTestHelper
@@ -37,6 +37,7 @@ def __init__(self, iterations: Optional[int] = None, timeout: Optional[float] =
3737
scale: Optional[int] = None
3838
query_prefix: str = get_external_param('query-prefix', '')
3939
verify_data: bool = True
40+
__nodes_state: Optional[dict[tuple[str, int], YdbCluster.Node]] = None
4041

4142
@classmethod
4243
def suite(cls) -> str:
@@ -69,7 +70,7 @@ def _test_name(cls, query_num: int) -> str:
6970
@allure.step('check tables size')
7071
def check_tables_size(cls, folder: Optional[str], tables: dict[str, int]):
7172
wait_error = YdbCluster.wait_ydb_alive(
72-
20 * 60, (
73+
int(os.getenv('WAIT_CLUSTER_ALIVE_TIMEOUT', 20 * 60)), (
7374
f'{YdbCluster.tables_path}/{folder}'
7475
if folder is not None
7576
else [f'{YdbCluster.tables_path}/{t}' for t in tables.keys()]
@@ -92,8 +93,19 @@ def check_tables_size(cls, folder: Optional[str], tables: dict[str, int]):
9293
msg = "\n".join(errors)
9394
pytest.fail(f'Unexpected tables size in `{folder}`:\n {msg}')
9495

96+
@staticmethod
def __execute_ssh(host: str, cmd: str):
    """Start ``cmd`` on ``host`` over ssh without waiting for it to finish.

    Host key checking is disabled and no known-hosts file is kept. When the
    ``SSH_USER`` / ``SSH_KEY_FILE`` environment variables are set they are
    passed to ssh as ``-l`` / ``-i`` respectively.

    Returns the object produced by ``yatest.common.execute(..., wait=False)``.
    """
    argv = ['ssh', "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
    # Optional flags, each driven by one environment variable.
    for flag, env_name in (('-l', 'SSH_USER'), ('-i', 'SSH_KEY_FILE')):
        value = os.getenv(env_name)
        if value is not None:
            argv.extend((flag, value))
    argv.extend((host, cmd))
    return yatest.common.execute(argv, wait=False)
106+
95107
@classmethod
96-
def _attach_logs(cls, start_time, attach_name):
108+
def __attach_logs(cls, start_time, attach_name):
97109
hosts = [node.host for node in filter(lambda x: x.role == YdbCluster.Node.Role.STORAGE, YdbCluster.get_cluster_nodes())]
98110
tz = timezone('Europe/Moscow')
99111
start = datetime.fromtimestamp(start_time, tz).isoformat()
@@ -102,28 +114,20 @@ def _attach_logs(cls, start_time, attach_name):
102114
'': {},
103115
}
104116
exec_start = deepcopy(exec_kikimr)
105-
ssh_cmd = ['ssh', "-o", "StrictHostKeyChecking=no", "-o", "UserKnownHostsFile=/dev/null"]
106-
ssh_user = os.getenv('SSH_USER')
107-
if ssh_user is not None:
108-
ssh_cmd += ['-l', ssh_user]
109-
ssh_key_file = os.getenv('SSH_KEY_FILE')
110-
if ssh_key_file is not None:
111-
ssh_cmd += ['-i', ssh_key_file]
112117
for host in hosts:
113118
for c in exec_kikimr.keys():
114119
try:
115-
exec_kikimr[c][host] = yatest.common.execute(ssh_cmd + [host, cmd.format(
120+
exec_kikimr[c][host] = cls.__execute_ssh(host, cmd.format(
116121
storage='kikimr',
117-
container=f' -m k8s_container:{c}' if c else '')],
118-
wait=False)
122+
container=f' -m k8s_container:{c}' if c else ''
123+
))
119124
except BaseException as e:
120125
logging.error(e)
121126
for c in exec_start.keys():
122127
try:
123-
exec_start[c][host] = yatest.common.execute(ssh_cmd + [host, cmd.format(
128+
exec_start[c][host] = cls.__execute_ssh(host, cmd.format(
124129
storage='kikimr-start',
125-
container=f' -m k8s_container:{c}' if c else '')],
126-
wait=False)
130+
container=f' -m k8s_container:{c}' if c else ''))
127131
except BaseException as e:
128132
logging.error(e)
129133

@@ -146,6 +150,83 @@ def _attach_logs(cls, start_time, attach_name):
146150
yatest.common.execute(['tar', '-C', dir, '-czf', archive, '.'])
147151
allure.attach.file(archive, f'{attach_name}_{c}_logs', extension='tar.gz')
148152

153+
@classmethod
def save_nodes_state(cls) -> None:
    """Snapshot the nodes reported by ``YdbCluster.get_cluster_nodes(db_only=True)``.

    The snapshot is kept on the class, keyed by ``(host, ic_port)``.
    """
    snapshot = {}
    for node in YdbCluster.get_cluster_nodes(db_only=True):
        snapshot[(node.host, node.ic_port)] = node
    cls.__nodes_state = snapshot
156+
157+
@classmethod
def __get_core_hashes_by_pod(cls, hosts: set[str], start_time: float, end_time: float) -> dict[str, list[tuple[str, str]]]:
    """Collect coredump ids and hashes from ``hosts``, grouped by pod name.

    On every host the breakpad analyzer script is run first; then the
    ``sended_*.json`` files under ``/coredumps/`` whose modification time
    falls into the window derived from ``start_time`` / ``end_time`` are read
    over ssh and parsed as JSON objects.

    Returns a dict mapping the ``pod`` field of each core description to a
    list of ``(core_uuid, core_hash)`` pairs. The collection is best-effort:
    ssh failures and unparsable output are logged and the host is skipped.
    """
    # Launch the analyzer on all hosts before waiting on any of them
    # (the ssh helper starts the process without waiting).
    analyzers = {
        h: cls.__execute_ssh(h, 'sudo flock /tmp/brk_pad /Berkanavt/breakpad/bin/kikimr_breakpad_analizer.sh')
        for h in hosts
    }

    core_hashes: dict[str, list[tuple[str, str]]] = {}
    for h, analyzer in analyzers.items():
        analyzer.wait(check_exit_code=False)
        if analyzer.returncode != 0:
            # Not fatal: the search below is still attempted.
            logging.error(f'Error while process coredumps on host {h}: {analyzer.stderr.decode("utf-8", "replace")}')
        # NOTE(review): the 10 is added to a value in seconds before dividing
        # by 60 - confirm whether a 10 second or 10 minute margin is intended.
        finder = cls.__execute_ssh(h, ('find /coredumps/ -name "sended_*.json" '
                                       f'-mmin -{(10 + time() - start_time) / 60} -mmin +{(-10 + time() - end_time) / 60}'
                                       ' | while read FILE; do cat $FILE; echo -n ","; done'))
        finder.wait(check_exit_code=False)
        if finder.returncode != 0:
            logging.error(f'Error while search coredumps on host {h}: {finder.stderr.decode("utf-8", "replace")}')
            continue
        # The remote loop prints the files separated by commas with a trailing
        # comma; stripping it and wrapping in brackets gives a JSON array.
        try:
            cores = json.loads(f'[{finder.stdout.decode("utf-8", "replace").strip(",")}]')
        except json.JSONDecodeError as e:
            # A broken or half-written file must not abort the whole report.
            logging.error(f'Error while parse coredumps info on host {h}: {e}')
            continue
        for core in cores:
            core_hashes.setdefault(core.get('pod', ''), []).append(
                (core.get('core_uuid', ''), core.get('core_hash', ''))
            )
    return core_hashes
181+
182+
@classmethod
def __get_hosts_with_omms(cls, hosts: set[str], start_time: float, end_time: float) -> set[str]:
    """Return the hosts from ``hosts`` whose kernel log shows an OOM kill.

    Every host is queried over ssh with ``journalctl -k`` restricted to the
    interval between ``start_time`` and ``end_time`` (POSIX timestamps) and
    filtered by the text ``Out of memory: Kill process``. A host is reported
    when the command succeeds and prints anything. Failures are logged and
    the host is treated as having no OOM.
    """
    tz = timezone('Europe/Moscow')
    time_format = "%Y-%m-%d %H:%M:%S"
    start = datetime.fromtimestamp(start_time, tz).strftime(time_format)
    end = datetime.fromtimestamp(end_time, tz).strftime(time_format)
    # The timestamps contain a space and the command is interpreted by the
    # remote shell, so they must be quoted; otherwise the time-of-day part is
    # split off and handed to journalctl as a separate argument.
    # NOTE(review): the times are rendered in Europe/Moscow - presumably the
    # hosts use that local time zone; confirm.
    oom_cmd = f'sudo journalctl -k -q --no-pager -S "{start}" -U "{end}" --grep "Out of memory: Kill process"'
    ooms = set()
    for h in hosts:
        proc = cls.__execute_ssh(h, oom_cmd)
        proc.wait(check_exit_code=False)
        if proc.returncode != 0:
            logging.error(f'Error while search OOMs on host {h}: {proc.stderr.decode("utf-8", "replace")}')
            continue
        if proc.stdout.decode('utf-8', 'replace'):
            ooms.add(h)
    return ooms
198+
199+
@classmethod
def check_nodes(cls, result: YdbCliHelper.WorkloadRunResult, end_time: float) -> list[NodeErrors]:
    """Compare the current nodes with the snapshot made by ``save_nodes_state``.

    A node present in both whose ``start_time`` increased is reported as
    'was restarted'; a node present only in the snapshot is reported as
    'is down'. The snapshot is consumed: it is reset to ``None`` afterwards,
    and without a snapshot an empty list is returned.

    For the hosts of the reported nodes, coredump hashes and OOM kills are
    looked up for the interval from ``result.start_time`` to ``end_time`` and
    attached to the entries. Every entry is also added to ``result`` as an
    error line.
    """
    saved_state = cls.__nodes_state
    if saved_state is None:
        return []

    problems = []
    failed_hosts = set()

    for current in YdbCluster.get_cluster_nodes(db_only=True):
        # Removing matched nodes leaves only the vanished ones in the snapshot.
        previous = saved_state.pop((current.host, current.ic_port), None)
        if previous is None:
            continue
        if current.start_time > previous.start_time:
            problems.append(NodeErrors(current, 'was restarted'))
            failed_hosts.add(current.host)

    for missing in saved_state.values():
        problems.append(NodeErrors(missing, 'is down'))
        failed_hosts.add(missing.host)
    cls.__nodes_state = None

    if not problems:
        return []

    hashes_by_pod = cls.__get_core_hashes_by_pod(failed_hosts, result.start_time, end_time)
    oom_hosts = cls.__get_hosts_with_omms(failed_hosts, result.start_time, end_time)
    for problem in problems:
        pod = f'{problem.node.ic_port}@{problem.node.host}'
        problem.core_hashes = hashes_by_pod.get(pod, [])
        problem.was_oom = problem.node.host in oom_hosts

    for problem in problems:
        result.add_error(f'Node {problem.node.ic_port}@{problem.node.host} {problem.message}')
    return problems
229+
149230
@classmethod
150231
def process_query_result(cls, result: YdbCliHelper.WorkloadRunResult, query_num: int, iterations: int, upload: bool):
151232
def _get_duraton(stats, field):
@@ -210,15 +291,20 @@ def _attach_plans(plan: YdbCliHelper.QueryPlan, name: str) -> None:
210291
for p in ['Mean']:
211292
if p in stats:
212293
allure.dynamic.parameter(p, _duration_text(stats[p] / 1000.))
294+
end_time = time()
213295
if os.getenv('NO_KUBER_LOGS') is None and not result.success:
214-
cls._attach_logs(start_time=result.start_time, attach_name='kikimr')
296+
cls.__attach_logs(start_time=result.start_time, attach_name='kikimr')
215297
allure.attach(json.dumps(stats, indent=2), 'Stats', attachment_type=allure.attachment_type.JSON)
298+
allure_test_description(
299+
cls.suite(), test, refference_set=cls.refference,
300+
start_time=result.start_time, end_time=end_time, node_errors=cls.check_nodes(result, end_time)
301+
)
216302
if upload:
217303
ResultsProcessor.upload_results(
218304
kind='Load',
219305
suite=cls.suite(),
220306
test=test,
221-
timestamp=time(),
307+
timestamp=end_time,
222308
is_successful=result.success,
223309
min_duration=_get_duraton(stats, 'Min'),
224310
max_duration=_get_duraton(stats, 'Max'),
@@ -263,6 +349,7 @@ def run_workload_test(self, path: str, query_num: int) -> None:
263349
if param.name == 'query_num':
264350
param.mode = allure.parameter_mode.HIDDEN.value
265351
qparams = self._get_query_settings(query_num)
352+
self.save_nodes_state()
266353
result = YdbCliHelper.workload_run(
267354
path=path,
268355
query_num=query_num,
@@ -274,5 +361,4 @@ def run_workload_test(self, path: str, query_num: int) -> None:
274361
scale=self.scale,
275362
query_prefix=qparams.query_prefix
276363
)
277-
allure_test_description(self.suite(), self._test_name(query_num), refference_set=self.refference, start_time=result.start_time, end_time=time())
278364
self.process_query_result(result, query_num, qparams.iterations, True)

0 commit comments

Comments
 (0)