Skip to content

Commit 09771d9

Browse files
authored
fix node checking (#11578)
1 parent 0ef4835 commit 09771d9

File tree

4 files changed

+38
-48
lines changed

4 files changed

+38
-48
lines changed

ydb/tests/olap/lib/allure_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def _set_results_plot(test_info: dict[str, str], suite: str, test: str, refferen
5151

5252
def _set_logs_command(test_info: dict[str, str], start_time: float, end_time: float):
5353
hosts = []
54-
for node in YdbCluster.get_cluster_nodes()[0]:
54+
for node in YdbCluster.get_cluster_nodes():
5555
ss = node.get('SystemState', {})
5656
if 'Storage' in ss.get('Roles', []):
5757
hosts.append(ss.get('Host'))

ydb/tests/olap/lib/ydb_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def _load_query_out(self) -> None:
153153

154154
@staticmethod
155155
def _get_nodes_info() -> dict[str, dict[str, int]]:
156-
nodes, _ = YdbCluster.get_cluster_nodes()
156+
nodes = YdbCluster.get_cluster_nodes(db_only=True)
157157
return {
158158
n['SystemState']['Host']: {
159159
'start_time': int(int(n['SystemState'].get('StartTime', time() * 1000)) / 1000)
@@ -213,7 +213,7 @@ def _exec_cli(self) -> None:
213213

214214
def process(self) -> YdbCliHelper.WorkloadRunResult:
215215
try:
216-
wait_error = YdbCluster.wait_ydb_alive(300, self.db_path)
216+
wait_error = YdbCluster.wait_ydb_alive(20 * 60, self.db_path)
217217
if wait_error is not None:
218218
self.result.error_message = wait_error
219219
else:

ydb/tests/olap/lib/ydb_cluster.py

Lines changed: 34 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import ydb
88
from copy import deepcopy
99
from time import sleep, time
10-
from typing import List
10+
from typing import List, Optional
1111

1212
LOGGER = logging.getLogger()
1313

@@ -53,9 +53,11 @@ def _get_service_url(cls):
5353
return f'http://{host}:{port}'
5454

5555
@classmethod
56-
def get_cluster_nodes(cls, path=None):
56+
def get_cluster_nodes(cls, path: Optional[str] = None, db_only: bool = False) -> list[dict[str:any]]:
5757
try:
58-
url = f'{cls._get_service_url()}/viewer/json/nodes?database=/{cls.ydb_database}'
58+
url = f'{cls._get_service_url()}/viewer/json/nodes?'
59+
if db_only or path is not None:
60+
url += f'database=/{cls.ydb_database}'
5961
if path is not None:
6062
url += f'&path={path}&tablets=true'
6163
headers = {}
@@ -64,8 +66,7 @@ def get_cluster_nodes(cls, path=None):
6466
# headers['Authorization'] = token
6567
data = requests.get(url, headers=headers).json()
6668
nodes = data.get('Nodes', [])
67-
nodes_count = int(data.get('TotalNodes', len(nodes)))
68-
return nodes, nodes_count
69+
return nodes
6970
except Exception as e:
7071
LOGGER.error(e)
7172
return [], 0
@@ -76,7 +77,7 @@ def get_cluster_info(cls):
7677
version = ''
7778
cluster_name = ''
7879
nodes_wilcard = ''
79-
nodes, node_count = cls.get_cluster_nodes()
80+
nodes = cls.get_cluster_nodes()
8081
for node in nodes:
8182
n = node.get('SystemState', {})
8283
cluster_name = n.get('ClusterName', cluster_name)
@@ -86,7 +87,6 @@ def get_cluster_info(cls):
8687
nodes_wilcard = n.get('Host', nodes_wilcard).split('.')[0].rstrip('0123456789')
8788
cls._cluster_info = {
8889
'database': cls.ydb_database,
89-
'node_count': node_count,
9090
'version': version,
9191
'name': cluster_name,
9292
'nodes_wilcard': nodes_wilcard,
@@ -183,63 +183,53 @@ def check_if_ydb_alive(cls, timeout=10, balanced_paths=None):
183183
def _check_node(n):
    """Check one node record and report whether it looks healthy.

    Args:
        n: A single node entry from the list returned by
            ``get_cluster_nodes``. It is expected to be a dict with an
            optional ``SystemState`` sub-dict holding ``Host`` and
            ``StartTime`` (milliseconds since the epoch).

    Returns:
        ``None`` when the node passes the check, otherwise an error string.
        The same string is also written to ``LOGGER.error``.
    """
    name = 'UnknownNode'
    error = None
    try:
        ss = n.get('SystemState', {})
        # Keep the 'UnknownNode' placeholder when Host is absent or empty,
        # so messages never read "Node None ...".
        name = ss.get("Host") or name
        # A missing StartTime defaults to "now", i.e. zero uptime, so such
        # a node is reported as too young.
        start_time = int(ss.get('StartTime', int(time()) * 1000)) / 1000
        uptime = int(time()) - start_time
        if uptime < 15:
            error = f'Node {name} too yong: {uptime}'
    except Exception as ex:
        # Malformed records (non-dict entries, non-numeric StartTime, ...)
        # become an error string instead of aborting the whole check.
        # Exception is enough here; KeyboardInterrupt/SystemExit propagate.
        error = f"Error while process node {name}: {ex}"
    if error:
        LOGGER.error(error)
    return error
204198

205199
errors = []
206200
try:
207-
nodes, node_count = cls.get_cluster_nodes()
208-
if node_count == 0:
209-
errors.append('nodes_count == 0')
210-
if len(nodes) < node_count:
211-
errors.append(f"{node_count - len(nodes)} nodes from {node_count} don't live")
212-
ok_by_role = {'Tenant': 0, 'Storage': 0, 'Unknown': 0}
213-
nodes_by_role = deepcopy(ok_by_role)
214-
node_errors = {'Tenant': [], 'Storage': [], 'Unknown': []}
201+
nodes = cls.get_cluster_nodes(db_only=True)
202+
expected_nodes_count = os.getenv('EXPECTED_DYN_NODES_COUNT')
203+
nodes_count = len(nodes)
204+
if expected_nodes_count:
205+
LOGGER.debug(f'Expected nodes count: {expected_nodes_count}')
206+
expected_nodes_count = int(expected_nodes_count)
207+
if nodes_count < expected_nodes_count:
208+
errors.append(f"{expected_nodes_count - nodes_count} nodes from {expected_nodes_count} don't alive")
209+
ok_node_count = 0
210+
node_errors = []
215211
for n in nodes:
216-
error, role = _check_node(n)
212+
error = _check_node(n)
217213
if error:
218-
node_errors[role].append(error)
214+
node_errors.append(error)
219215
else:
220-
ok_by_role[role] += 1
221-
nodes_by_role[role] += 1
222-
dynnodes_count = nodes_by_role['Tenant']
223-
ok_dynnodes_count = ok_by_role['Tenant']
224-
if ok_dynnodes_count < dynnodes_count:
225-
dynnodes_errors = ','.join(node_errors['Tenant'])
226-
errors.append(f'Only {ok_dynnodes_count} from {dynnodes_count} dynnodes are ok: {dynnodes_errors}')
227-
storage_nodes_count = nodes_by_role['Storage']
228-
ok_storage_nodes_count = ok_by_role['Storage']
229-
if ok_storage_nodes_count < storage_nodes_count:
230-
storage_nodes_errors = ','.join(node_errors['Tenant'])
231-
errors.append(f'Only {ok_storage_nodes_count} from {storage_nodes_count} storage nodes are ok. {storage_nodes_errors}')
232-
paths_to_balance = []
233-
if isinstance(balanced_paths, str):
234-
paths_to_balance += cls._get_tables(balanced_paths)
235-
elif isinstance(balanced_paths, list):
236-
for path in balanced_paths:
237-
paths_to_balance += cls._get_tables(path)
216+
ok_node_count += 1
217+
if ok_node_count < nodes_count:
218+
errors.append(f'Only {ok_node_count} from {ok_node_count} dynnodes are ok: {",".join(node_errors)}')
238219
if os.getenv('TEST_CHECK_BALANCING', 'no') == 'yes':
220+
paths_to_balance = []
221+
if isinstance(balanced_paths, str):
222+
paths_to_balance += cls._get_tables(balanced_paths)
223+
elif isinstance(balanced_paths, list):
224+
for path in balanced_paths:
225+
paths_to_balance += cls._get_tables(path)
239226
for p in paths_to_balance:
240-
table_nodes, _ = cls.get_cluster_nodes(p)
227+
table_nodes = cls.get_cluster_nodes(p)
241228
min = None
242229
max = None
230+
if expected_nodes_count:
231+
if len(table_nodes) < expected_nodes_count:
232+
min = 0
243233
for tn in table_nodes:
244234
tablet_count = 0
245235
for tablet in tn.get("Tablets", []):
@@ -256,7 +246,7 @@ def _check_node(n):
256246
errors.append(f'Table {p} has no tablets')
257247
elif max - min > 1:
258248
errors.append(f'Table {p} is not balanced: {min}-{max} shards.')
259-
LOGGER.info(f'Table {p} is balanced: {min}-{max} shards.')
249+
LOGGER.info(f'Table {p} balance: {min}-{max} shards.')
260250

261251
cls.execute_single_result_query("select 1", timeout)
262252
except BaseException as ex:

ydb/tests/olap/load/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def _test_name(cls, query_num: int) -> str:
5353
@allure.step('check tables size')
5454
def check_tables_size(cls, folder: Optional[str], tables: dict[str, int]):
5555
wait_error = YdbCluster.wait_ydb_alive(
56-
300, (
56+
20 * 60, (
5757
f'{YdbCluster.tables_path}/{folder}'
5858
if folder is not None
5959
else [f'{YdbCluster.tables_path}/{t}' for t in tables.keys()]

0 commit comments

Comments
 (0)