@@ -157,27 +157,52 @@ def execute_single_result_query(cls, query, timeout=10):
157
157
@classmethod
158
158
@allure .step ('Check if YDB alive' )
159
159
def check_if_ydb_alive (cls , timeout = 10 , balanced_paths = None ):
160
- try :
161
- nodes , node_count = cls ._get_cluster_nodes ()
162
- if node_count == 0 :
163
- return False
164
- if len (nodes ) < node_count :
165
- LOGGER .error (f"{ node_count - len (nodes )} nodes from { node_count } don't live" )
166
- return False
167
- for n in nodes :
160
def _check_node(n):
    """Inspect one node description and return ``(error, role)``.

    ``error`` is ``None`` when no problem was found, otherwise a message
    that has already been written to ``LOGGER``.
    ``role`` is always one of 'Tenant', 'Storage' or 'Unknown', so the
    caller can use it directly as a key of its per-role dictionaries.
    """
    known_roles = ('Tenant', 'Storage')
    name = 'UnknownNode'
    error = None
    role = 'Unknown'
    try:
        ss = n.get('SystemState', {})
        # Keep the placeholder when the host is absent, so that messages
        # never read "Node None".
        name = ss.get('Host') or name
        # StartTime is handled as milliseconds; when it is missing the
        # default is "now", which makes the node count as just started.
        start_time = int(ss.get('StartTime', int(time()) * 1000)) / 1000
        uptime = int(time()) - start_time
        # Report the first recognised role only: an arbitrary role string
        # would raise KeyError in the caller's per-role counters.
        role = next((r for r in ss.get('Roles') or [] if r in known_roles), role)
        if uptime < 15:
            error = f'Node {name} too yong: {uptime}'
    except Exception as ex:
        # Exception rather than BaseException: KeyboardInterrupt and
        # SystemExit must not be turned into a per-node error string.
        error = f"Error while process node {name}: {ex}"
    if error:
        LOGGER.error(error)
    return error, role
178
+
179
+ errors = []
180
+ try :
181
+ nodes , node_count = cls ._get_cluster_nodes ()
182
+ if node_count == 0 :
183
+ errors .append ('nodes_count == 0' )
184
+ if len (nodes ) < node_count :
185
+ errors .append (f"{ node_count - len (nodes )} nodes from { node_count } don't live" )
186
+ ok_by_role = {'Tenant' : 0 , 'Storage' : 0 , 'Unknown' : 0 }
187
+ nodes_by_role = deepcopy (ok_by_role )
188
+ node_errors = {'Tenant' : [], 'Storage' : [], 'Unknown' : []}
189
+ for n in nodes :
190
+ error , role = _check_node (n )
191
+ if error :
192
+ node_errors [role ].append (error )
193
+ else :
194
+ ok_by_role [role ] += 1
195
+ nodes_by_role [role ] += 1
196
+ dynnodes_count = nodes_by_role ['Tenant' ]
197
+ ok_dynnodes_count = ok_by_role ['Tenant' ]
198
+ if ok_dynnodes_count < dynnodes_count :
199
+ dynnodes_errors = ',' .join (node_errors ['Tenant' ])
200
+ errors .append (f'Only { ok_dynnodes_count } from { dynnodes_count } dynnodes are ok: { dynnodes_errors } ' )
201
+ storage_nodes_count = nodes_by_role ['Storage' ]
202
+ ok_storage_nodes_count = ok_by_role ['Storage' ]
203
+ if ok_storage_nodes_count < dynnodes_count :
204
+ storage_nodes_errors = ',' .join (node_errors ['Tenant' ])
205
+ errors .append (f'Only { ok_storage_nodes_count } from { storage_nodes_count } storage nodes are ok, but { dynnodes_count } need. { storage_nodes_errors } ' )
181
206
paths_to_balance = []
182
207
if isinstance (balanced_paths , str ):
183
208
paths_to_balance += cls ._get_tables (balanced_paths )
@@ -198,22 +223,26 @@ def check_if_ydb_alive(cls, timeout=10, balanced_paths=None):
198
223
if max is None or tablet_count > max :
199
224
max = tablet_count
200
225
if min is not None and max - min > 1 :
201
- LOGGER .error (f'Table { p } is not balanced: { min } -{ max } shards.' )
202
- return False
226
+ errors .append (f'Table { p } is not balanced: { min } -{ max } shards.' )
203
227
LOGGER .info (f'Table { p } is balanced: { min } -{ max } shards.' )
204
228
205
229
cls .execute_single_result_query ("select 1" , timeout )
206
- return True
207
230
except BaseException as ex :
208
- LOGGER .error (f"Cannot connect to YDB { ex } " )
209
- return False
231
+ errors .append (f"Cannot connect to YDB: { ex } " )
232
+ if len (errors ) == 0 :
233
+ return None
234
+ error = ', ' .join (errors )
235
+ LOGGER .error (error )
236
+ return error
210
237
211
238
@classmethod
@allure.step('Wait YDB alive')
def wait_ydb_alive(cls, timeout=10, balanced_paths=None):
    """Poll ``check_if_ydb_alive`` until it succeeds or ``timeout`` seconds pass.

    Args:
        timeout: overall time budget in seconds; each attempt receives the
            time still remaining.
        balanced_paths: passed through unchanged to ``check_if_ydb_alive``.

    Returns:
        ``None`` as soon as one check reports no problems, otherwise the
        error text of the last failed check (or a notice that no check
        could be started within the timeout).
    """
    deadline = time() + timeout
    # Start with a non-None value: when the deadline has already passed
    # (timeout <= 0) the loop body never runs, and "never checked" must not
    # be reported as "alive" (None).
    error = f'YDB was not checked: timeout {timeout} expired before the first attempt'
    while time() < deadline:
        error = cls.check_if_ydb_alive(deadline - time(), balanced_paths=balanced_paths)
        if error is None:
            break
        sleep(1)
    return error
0 commit comments