Skip to content

Commit c2d2d5a

Browse files
authored
Fix error processing (#8599)
1 parent ebfe3cd commit c2d2d5a

File tree

2 files changed

+54
-25
lines changed

2 files changed

+54
-25
lines changed

ydb/tests/olap/lib/ydb_cluster.py

Lines changed: 53 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -157,27 +157,52 @@ def execute_single_result_query(cls, query, timeout=10):
157157
@classmethod
158158
@allure.step('Check if YDB alive')
159159
def check_if_ydb_alive(cls, timeout=10, balanced_paths=None):
160-
try:
161-
nodes, node_count = cls._get_cluster_nodes()
162-
if node_count == 0:
163-
return False
164-
if len(nodes) < node_count:
165-
LOGGER.error(f"{node_count - len(nodes)} nodes from {node_count} don't live")
166-
return False
167-
for n in nodes:
160+
def _check_node(n):
161+
name = 'UnknownNode'
162+
error = None
163+
role = 'Unknown'
164+
try:
168165
ss = n.get('SystemState', {})
169166
name = ss.get("Host")
170167
start_time = int(ss.get('StartTime', int(time()) * 1000)) / 1000
171168
uptime = int(time()) - start_time
169+
r = ss.get('Roles', [])
170+
role = r[0] if len(r) > 0 else role
172171
if uptime < 15:
173-
LOGGER.error(f'Node {name} too yong: {uptime}')
174-
return False
175-
if 'MemoryUsed' in ss and 'MemoryLimit' in ss:
176-
used = int(ss['MemoryUsed'])
177-
limit = int(ss['MemoryLimit'])
178-
if used > 0.9 * limit:
179-
LOGGER.error(f'Node {name} use too many rss: {used} from {limit}')
180-
return False
172+
error = f'Node {name} too yong: {uptime}'
173+
except BaseException as ex:
174+
error = f"Error while process node {name}: {ex}"
175+
if error:
176+
LOGGER.error(error)
177+
return error, role
178+
179+
errors = []
180+
try:
181+
nodes, node_count = cls._get_cluster_nodes()
182+
if node_count == 0:
183+
errors.append('nodes_count == 0')
184+
if len(nodes) < node_count:
185+
errors.append(f"{node_count - len(nodes)} nodes from {node_count} don't live")
186+
ok_by_role = {'Tenant': 0, 'Storage': 0, 'Unknown': 0}
187+
nodes_by_role = deepcopy(ok_by_role)
188+
node_errors = {'Tenant': [], 'Storage': [], 'Unknown': []}
189+
for n in nodes:
190+
error, role = _check_node(n)
191+
if error:
192+
node_errors[role].append(error)
193+
else:
194+
ok_by_role[role] += 1
195+
nodes_by_role[role] += 1
196+
dynnodes_count = nodes_by_role['Tenant']
197+
ok_dynnodes_count = ok_by_role['Tenant']
198+
if ok_dynnodes_count < dynnodes_count:
199+
dynnodes_errors = ','.join(node_errors['Tenant'])
200+
errors.append(f'Only {ok_dynnodes_count} from {dynnodes_count} dynnodes are ok: {dynnodes_errors}')
201+
storage_nodes_count = nodes_by_role['Storage']
202+
ok_storage_nodes_count = ok_by_role['Storage']
203+
if ok_storage_nodes_count < dynnodes_count:
204+
storage_nodes_errors = ','.join(node_errors['Tenant'])
205+
errors.append(f'Only {ok_storage_nodes_count} from {storage_nodes_count} storage nodes are ok, but {dynnodes_count} need. {storage_nodes_errors}')
181206
paths_to_balance = []
182207
if isinstance(balanced_paths, str):
183208
paths_to_balance += cls._get_tables(balanced_paths)
@@ -198,22 +223,26 @@ def check_if_ydb_alive(cls, timeout=10, balanced_paths=None):
198223
if max is None or tablet_count > max:
199224
max = tablet_count
200225
if min is not None and max - min > 1:
201-
LOGGER.error(f'Table {p} is not balanced: {min}-{max} shards.')
202-
return False
226+
errors.append(f'Table {p} is not balanced: {min}-{max} shards.')
203227
LOGGER.info(f'Table {p} is balanced: {min}-{max} shards.')
204228

205229
cls.execute_single_result_query("select 1", timeout)
206-
return True
207230
except BaseException as ex:
208-
LOGGER.error(f"Cannot connect to YDB {ex}")
209-
return False
231+
errors.append(f"Cannot connect to YDB: {ex}")
232+
if len(errors) == 0:
233+
return None
234+
error = ', '.join(errors)
235+
LOGGER.error(error)
236+
return error
210237

211238
@classmethod
212239
@allure.step('Wait YDB alive')
213240
def wait_ydb_alive(cls, timeout=10, balanced_paths=None):
214241
deadline = time() + timeout
242+
error = None
215243
while time() < deadline:
216-
if cls.check_if_ydb_alive(deadline - time(), balanced_paths=balanced_paths):
217-
return True
244+
error = cls.check_if_ydb_alive(deadline - time(), balanced_paths=balanced_paths)
245+
if error is None:
246+
break
218247
sleep(1)
219-
return False
248+
return error

ydb/tests/olap/scenario/helpers/scenario_tests_helper.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -320,7 +320,7 @@ def check_if_ydb_alive(timeout: float = 10) -> bool:
320320
assert sth.check_if_ydb_alive()
321321
"""
322322

323-
return YdbCluster.check_if_ydb_alive(timeout)
323+
return YdbCluster.check_if_ydb_alive(timeout) is None
324324

325325
def execute_scheme_query(
326326
self,

0 commit comments

Comments
 (0)