@@ -105,7 +105,7 @@ class bcolors:
105
105
106
106
107
107
class StabilityCluster :
108
- def __init__ (self , ssh_username , cluster_path , ydbd_path , ydbd_next_path = None ):
108
+ def __init__ (self , ssh_username , cluster_path , ydbd_path = None , ydbd_next_path = None ):
109
109
self .working_dir = os .path .join (tempfile .gettempdir (), "ydb_stability" )
110
110
os .makedirs (self .working_dir , exist_ok = True )
111
111
self .ssh_username = ssh_username
@@ -145,7 +145,7 @@ def clean_trace(self, traces):
145
145
for line in traces .split ('\n ' ):
146
146
line = re .sub (r' @ 0x[a-fA-F0-9]+' , '' , line )
147
147
# Убираем все до текста ошибки или указателя на строку кода
148
- match_verify = re .search (r'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception' , line )
148
+ match_verify = re .search (r'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception|ERROR: AddressSanitizer|SIG ' , line )
149
149
match_code_file_line = re .search (r'\s+(\S+\.cpp:\d+).*' , line )
150
150
151
151
if match_verify :
@@ -198,12 +198,11 @@ def process_lines(self, text):
198
198
trace = trace + line + '\n '
199
199
return traces
200
200
201
- def get_all_errors (self ):
202
- logging .getLogger ().setLevel (logging .WARNING )
201
+ def get_all_errors (self , mode = 'all' ):
203
202
all_results = []
204
- for node in self . kikimr_cluster . nodes . values () :
205
- result = node . ssh_command ( """
206
- ls -ltr /Berkanavt/kikimr */logs/kikimr* |
203
+ if mode == 'all' or mode == 'raw' or mode == 'aggr' :
204
+ command = """
205
+ ls -ltr /Berkanavt/kikim */logs/kikimr* |
207
206
awk '{print $NF}' |
208
207
while read file; do
209
208
case "$file" in
@@ -212,27 +211,53 @@ def get_all_errors(self):
212
211
*) cat "$file" ;;
213
212
esac
214
213
done |
215
- grep -E 'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception' -A 20
216
- """ , raise_on_error = False )
214
+ grep -E 'VERIFY|FAIL |signal 11|signal 6|signal 15|uncaught exception|ERROR: AddressSanitizer|SIG' -A 40 -B 20
215
+ """
216
+ elif mode == 'last' :
217
+ command = """
218
+ ls -ltr /Berkanavt/kikim*/logs/kikimr |
219
+ awk '{print $NF}' |
220
+ while read file; do
221
+ cat "$file" | grep -E 'VERIFY|FAIL |signal 11|signal 6|signal 15|uncaught exception|ERROR: AddressSanitizer|SIG' -A 40 -B 20 | tail -120
222
+ echo "--"
223
+ done
224
+ """
225
+ for node in self .kikimr_cluster .nodes .values ():
226
+ result = node .ssh_command (command , raise_on_error = False )
217
227
if result :
218
228
all_results .append (result .decode ('utf-8' ))
219
229
all_results = self .process_lines (all_results )
220
230
return all_results
221
231
222
- def get_errors (self ):
223
- errors = self .get_all_errors ()
224
- unique_traces = self .find_unique_traces_with_counts (errors )
225
- for trace in unique_traces :
226
- print (f"Trace (Occurrences: { len (unique_traces [trace ])} ):\n { trace } \n { '-' * 60 } " )
232
+ def get_errors (self , mode = 'raw' ):
233
+ errors = self .get_all_errors (mode = mode )
234
+ if mode == 'raw' or mode == 'last' :
235
+ print ('Traces:' )
236
+ for trace in errors :
237
+ print (f"{ trace } \n { '-' * 60 } " )
238
+ else :
239
+ unique_traces = self .find_unique_traces_with_counts (errors )
240
+ for trace in unique_traces :
241
+ print (f"Trace (Occurrences: { len (unique_traces [trace ])} ):\n { trace } \n { '-' * 60 } " )
227
242
228
243
def perform_checks (self ):
229
244
230
- safety_violations = safety_warden_factory (self .kikimr_cluster , self .ssh_username , lines_after = 20 , cut = False ).list_of_safety_violations ()
245
+ safety_violations = safety_warden_factory (self .kikimr_cluster , self .ssh_username , lines_after = 20 , cut = False , modification_days = 3 ).list_of_safety_violations ()
231
246
liveness_violations = liveness_warden_factory (self .kikimr_cluster , self .ssh_username ).list_of_liveness_violations
232
247
coredumps_search_results = {}
233
248
for node in self .kikimr_cluster .nodes .values ():
234
249
result = node .ssh_command ('find /coredumps/ -type f | wc -l' , raise_on_error = False )
235
250
coredumps_search_results [node .host .split (':' )[0 ]] = int (result .decode ('utf-8' ))
251
+ minidumps_search_results = {}
252
+ for node in self .kikimr_cluster .nodes .values ():
253
+ result = node .ssh_command ('''
254
+ if [ -d "/Berkanavt/minidumps/" ]; then
255
+ find /Berkanavt/minidumps/ -type f | wc -l
256
+ else
257
+ echo 0
258
+ fi
259
+ ''' , raise_on_error = False )
260
+ minidumps_search_results [node .host .split (':' )[0 ]] = int (result .decode ('utf-8' ))
236
261
237
262
print ("SAFETY WARDEN:" )
238
263
for i , violation in enumerate (safety_violations ):
@@ -249,6 +274,9 @@ def perform_checks(self):
249
274
print ("COREDUMPS:" )
250
275
for node in coredumps_search_results :
251
276
print (f' { node } : { coredumps_search_results [node ]} ' )
277
+ print ("MINIDUMPS:" )
278
+ for node in coredumps_search_results :
279
+ print (f' { node } : { minidumps_search_results [node ]} ' )
252
280
253
281
def start_nemesis (self ):
254
282
for node in self .kikimr_cluster .nodes .values ():
@@ -258,7 +286,7 @@ def stop_workloads(self):
258
286
for node in self .kikimr_cluster .nodes .values ():
259
287
node .ssh_command (
260
288
'sudo pkill screen' ,
261
- raise_on_error = True
289
+ raise_on_error = False
262
290
)
263
291
264
292
def stop_nemesis (self ):
@@ -284,17 +312,14 @@ def get_state(self):
284
312
print (f'\t { state_object } :\t { status } ' )
285
313
286
314
def cleanup (self , mode = 'all' ):
287
- if mode in ['all' , 'logs' ]:
288
- self .kikimr_cluster .cleanup_logs ()
289
315
for node in self .kikimr_cluster .nodes .values ():
290
316
if mode in ['all' , 'dumps' ]:
291
317
node .ssh_command ('sudo rm -rf /coredumps/*' , raise_on_error = False )
292
318
if mode in ['all' , 'logs' ]:
319
+ node .ssh_command ('sudo find /Berkanavt/kikimr*/logs/kikimr* -type f -exec rm -f {} +' , raise_on_error = False )
293
320
node .ssh_command ('sudo rm -rf /Berkanavt/nemesis/log/*' , raise_on_error = False )
294
- if mode == 'all' :
295
- self .stop_nemesis ()
296
- node .ssh_command ('sudo pkill screen' , raise_on_error = False )
297
- node .ssh_command ('sudo rm -rf /Berkanavt/kikimr/bin/*' , raise_on_error = False )
321
+ if mode in ['all' , 'logs' ]:
322
+ self .kikimr_cluster .cleanup_logs ()
298
323
299
324
def deploy_ydb (self ):
300
325
self .cleanup ()
@@ -309,6 +334,7 @@ def deploy_ydb(self):
309
334
node .ssh_command ("/Berkanavt/kikimr/bin/kikimr admin console validator disable bootstrap" , raise_on_error = True )
310
335
311
336
self .deploy_tools ()
337
+ self .get_state ()
312
338
313
339
def deploy_tools (self ):
314
340
for node in self .kikimr_cluster .nodes .values ():
@@ -348,7 +374,7 @@ def parse_args():
348
374
)
349
375
parser .add_argument (
350
376
"--ydbd_path" ,
351
- required = True ,
377
+ required = False ,
352
378
type = path_type ,
353
379
help = "Path to ydbd" ,
354
380
)
@@ -371,32 +397,56 @@ def parse_args():
371
397
nargs = "+" ,
372
398
choices = [
373
399
"get_errors" ,
400
+ "get_errors_aggr" ,
401
+ "get_errors_last" ,
374
402
"get_state" ,
403
+ "clean_workload" ,
375
404
"cleanup" ,
376
405
"cleanup_logs" ,
377
406
"cleanup_dumps" ,
378
407
"deploy_ydb" ,
379
408
"deploy_tools" ,
380
409
"start_nemesis" ,
381
410
"stop_nemesis" ,
382
- "start_all_workloads " ,
411
+ "start_default_workloads " ,
383
412
"start_workload_simple_queue_row" ,
384
413
"start_workload_simple_queue_column" ,
385
414
"start_workload_olap_workload" ,
386
415
"start_workload_log" ,
387
416
"start_workload_log_column" ,
388
417
"start_workload_log_row" ,
389
418
"stop_workloads" ,
419
+ "stop_workload" ,
390
420
"perform_checks" ,
391
421
],
392
422
help = "actions to execute" ,
393
423
)
424
+ args , unknown = parser .parse_known_args ()
425
+ if "stop_workload" in args .actions :
426
+ parser .add_argument (
427
+ "--name" ,
428
+ type = str ,
429
+ required = True ,
430
+ help = "Name of the workload to stop" ,
431
+ choices = list (DICT_OF_PROCESSES .keys ())
432
+ )
433
+
434
+ if "clean_workload" in args .actions :
435
+ parser .add_argument (
436
+ "--name" ,
437
+ type = str ,
438
+ required = True ,
439
+ help = "Name of the workload to stop" ,
440
+ choices = list (DICT_OF_PROCESSES .keys ())
441
+ )
442
+
394
443
return parser .parse_args ()
395
444
396
445
397
446
def main ():
398
447
args = parse_args ()
399
448
ssh_username = args .ssh_user
449
+ print ('Initing cluster info' )
400
450
stability_cluster = StabilityCluster (
401
451
ssh_username = ssh_username ,
402
452
cluster_path = args .cluster_path ,
@@ -405,8 +455,13 @@ def main():
405
455
)
406
456
407
457
for action in args .actions :
458
+ print (f'Start action { action } ' )
408
459
if action == "get_errors" :
409
- stability_cluster .get_errors ()
460
+ stability_cluster .get_errors (mode = 'raw' )
461
+ if action == "get_errors_aggr" :
462
+ stability_cluster .get_errors (mode = 'aggr' )
463
+ if action == "get_errors_last" :
464
+ stability_cluster .get_errors (mode = 'last' )
410
465
if action == "get_state" :
411
466
stability_cluster .get_state ()
412
467
if action == "deploy_ydb" :
@@ -419,7 +474,7 @@ def main():
419
474
stability_cluster .cleanup ('dumps' )
420
475
if action == "deploy_tools" :
421
476
stability_cluster .deploy_tools ()
422
- if action == "start_all_workloads " :
477
+ if action == "start_default_workloads " :
423
478
for node_id , node in enumerate (stability_cluster .kikimr_cluster .nodes .values ()):
424
479
node .ssh_command (
425
480
'screen -s simple_queue_row -d -m bash -c "while true; do /Berkanavt/nemesis/bin/simple_queue --database /Root/db1 --mode row; done"' ,
@@ -434,6 +489,43 @@ def main():
434
489
raise_on_error = True
435
490
)
436
491
stability_cluster .get_state ()
492
+ if action == "stop_workload" :
493
+ workload_name = args .name
494
+ if DICT_OF_PROCESSES .get (workload_name ):
495
+ for node_id , node in enumerate (stability_cluster .kikimr_cluster .nodes .values ()):
496
+ node .ssh_command (
497
+ f"ps aux | grep { workload_name } | grep -v grep | awk '{{print $2}}' | xargs kill -9" ,
498
+ raise_on_error = True )
499
+ else :
500
+ print (f"Unknown workload { workload_name } " )
501
+ stability_cluster .get_state ()
502
+ if "clean_workload" in action :
503
+ workload_name = args .name
504
+ if DICT_OF_PROCESSES .get (workload_name ):
505
+ store_type_list = []
506
+ if 'column' in workload_name :
507
+ store_type_list .append ('column' )
508
+ elif 'row' in workload_name :
509
+ store_type_list .append ('row' )
510
+ else :
511
+ store_type_list = ['column' , 'row' ]
512
+ if 'log_' in workload_name :
513
+ first_node = stability_cluster .kikimr_cluster .nodes [1 ]
514
+ for store_type in store_type_list :
515
+ first_node .ssh_command ([
516
+ '/Berkanavt/nemesis/bin/ydb_cli' ,
517
+ '--endpoint' , f'grpc://localhost:{ first_node .grpc_port } ' ,
518
+ '--database' , '/Root/db1' ,
519
+ 'workload' , 'log' , 'clean' ,
520
+ '--path' , f'log_workload_{ store_type } ' ,
521
+ ],
522
+ raise_on_error = True
523
+ )
524
+ else :
525
+ print (f"Not supported workload clean command for { workload_name } " )
526
+ else :
527
+ print (f"Unknown workload { workload_name } " )
528
+ stability_cluster .get_state ()
437
529
if "start_workload_log" in action :
438
530
store_type_list = []
439
531
if action == 'start_workload_log_column' :
@@ -444,31 +536,22 @@ def main():
444
536
store_type_list = ['column' , 'row' ]
445
537
first_node = stability_cluster .kikimr_cluster .nodes [1 ]
446
538
for store_type in store_type_list :
447
- first_node .ssh_command ([
448
- '/Berkanavt/nemesis/bin/ydb_cli' ,
449
- '--endpoint' , f'grpc://localhost:{ first_node .grpc_port } ' ,
450
- '--database' , '/Root/db1' ,
451
- 'workload' , 'log' , 'clean' ,
452
- '--path' , f'log_workload_{ store_type } ' ,
453
- ],
454
- raise_on_error = True
455
- )
456
539
first_node .ssh_command ([
457
540
'/Berkanavt/nemesis/bin/ydb_cli' ,
458
541
'--endpoint' , f'grpc://localhost:{ first_node .grpc_port } ' ,
459
542
'--database' , '/Root/db1' ,
460
543
'workload' , 'log' , 'init' ,
461
544
'--len' , '1000' ,
462
- '--int-cols' , '20 ' ,
463
- '--key-cols' , '20 ' ,
545
+ '--int-cols' , '18 ' ,
546
+ '--key-cols' , '18 ' ,
464
547
'--min-partitions' , '100' ,
465
548
'--partition-size' , '10' ,
466
549
'--auto-partition' , '0' ,
467
550
'--store' , store_type ,
468
551
'--path' , f'log_workload_{ store_type } ' ,
469
- '--ttl' , '3600 '
552
+ '--ttl' , '20160 '
470
553
],
471
- raise_on_error = True
554
+ raise_on_error = False
472
555
)
473
556
for node_id , node in enumerate (stability_cluster .kikimr_cluster .nodes .values ()):
474
557
node .ssh_command ([
@@ -478,16 +561,31 @@ def main():
478
561
'--database' , '/Root/db1' ,
479
562
'workload' , 'log' , 'run' , 'bulk_upsert' ,
480
563
'--len' , '1000' ,
481
- '--int-cols' , '20 ' ,
482
- '--key-cols' , '20 ' ,
483
- '--threads' , '20 ' ,
564
+ '--int-cols' , '18 ' ,
565
+ '--key-cols' , '18 ' ,
566
+ '--threads' , '1 ' ,
484
567
'--timestamp_deviation' , '180' ,
485
568
'--seconds' , '86400' ,
486
569
'--path' , f'log_workload_{ store_type } ' ,
487
570
'; done"'
488
571
],
489
572
raise_on_error = True
490
573
)
574
+ node .ssh_command ([
575
+ f'screen -s workload_log_{ store_type } _select -d -m bash -c "while true; do' ,
576
+ '/Berkanavt/nemesis/bin/ydb_cli' ,
577
+ '--verbose' ,
578
+ '--endpoint' , f'grpc://localhost:{ node .grpc_port } ' ,
579
+ '--database' , '/Root/db1' ,
580
+ 'workload' , 'log' , 'run' , 'select' ,
581
+ '--client-timeout' , '1800000' ,
582
+ '--threads' , '1' ,
583
+ '--seconds' , '86400' ,
584
+ '--path' , f'log_workload_{ store_type } ' ,
585
+ '; done"'
586
+ ],
587
+ raise_on_error = True
588
+ )
491
589
stability_cluster .get_state ()
492
590
if action == "start_workload_simple_queue_row" :
493
591
for node_id , node in enumerate (stability_cluster .kikimr_cluster .nodes .values ()):
0 commit comments