@@ -26,7 +26,13 @@ use std::{
26
26
time:: { Duration , SystemTime , SystemTimeError , UNIX_EPOCH } ,
27
27
} ;
28
28
29
- use futures:: { channel:: oneshot, future, select, FutureExt } ;
29
+ use futures:: {
30
+ channel:: {
31
+ mpsc:: { channel, Receiver as MpscReceiver , Sender as MpscSender } ,
32
+ oneshot,
33
+ } ,
34
+ future, select, FutureExt , SinkExt , StreamExt ,
35
+ } ;
30
36
use futures_timer:: Delay ;
31
37
use parity_scale_codec:: { Decode , Encode , Error as CodecError , Input } ;
32
38
use polkadot_node_subsystem_util:: database:: { DBTransaction , Database } ;
@@ -540,9 +546,17 @@ impl<Context> AvailabilityStoreSubsystem {
540
546
#[ overseer:: contextbounds( AvailabilityStore , prefix = self :: overseer) ]
541
547
async fn run < Context > ( mut subsystem : AvailabilityStoreSubsystem , mut ctx : Context ) {
542
548
let mut next_pruning = Delay :: new ( subsystem. pruning_config . pruning_interval ) . fuse ( ) ;
543
-
549
+ // Pruning interval is in the order of minutes so we shouldn't have more than one task running
550
+ // at one moment in time, so 10 should be more than enough.
551
+ let ( mut pruning_result_tx, mut pruning_result_rx) = channel ( 10 ) ;
544
552
loop {
545
- let res = run_iteration ( & mut ctx, & mut subsystem, & mut next_pruning) . await ;
553
+ let res = run_iteration (
554
+ & mut ctx,
555
+ & mut subsystem,
556
+ & mut next_pruning,
557
+ ( & mut pruning_result_tx, & mut pruning_result_rx) ,
558
+ )
559
+ . await ;
546
560
match res {
547
561
Err ( e) => {
548
562
e. trace ( ) ;
@@ -564,6 +578,10 @@ async fn run_iteration<Context>(
564
578
ctx : & mut Context ,
565
579
subsystem : & mut AvailabilityStoreSubsystem ,
566
580
mut next_pruning : & mut future:: Fuse < Delay > ,
581
+ ( pruning_result_tx, pruning_result_rx) : (
582
+ & mut MpscSender < Result < ( ) , Error > > ,
583
+ & mut MpscReceiver < Result < ( ) , Error > > ,
584
+ ) ,
567
585
) -> Result < bool , Error > {
568
586
select ! {
569
587
incoming = ctx. recv( ) . fuse( ) => {
@@ -612,15 +630,51 @@ async fn run_iteration<Context>(
612
630
// It's important to set the delay before calling `prune_all` because an error in `prune_all`
613
631
// could lead to the delay not being set again. Then we would never prune anything anymore.
614
632
* next_pruning = Delay :: new( subsystem. pruning_config. pruning_interval) . fuse( ) ;
615
-
616
- let _timer = subsystem. metrics. time_pruning( ) ;
617
- prune_all( & subsystem. db, & subsystem. config, & * subsystem. clock) ?;
618
- }
633
+ start_prune_all( ctx, subsystem, pruning_result_tx. clone( ) ) . await ?;
634
+ } ,
635
+ // Received the prune result and propagate the errors, so that in case of a fatal error
636
+ // the main loop of the subsystem can exit graciously.
637
+ result = pruning_result_rx. next( ) => {
638
+ if let Some ( result) = result {
639
+ result?;
640
+ }
641
+ } ,
619
642
}
620
643
621
644
Ok ( false )
622
645
}
623
646
647
+ // Start prune-all on a separate thread, so that in the case when the operation takes
648
+ // longer than expected we don't keep the whole subsystem blocked.
649
+ // See: https://github.com/paritytech/polkadot/issues/7237 for more details.
650
+ #[ overseer:: contextbounds( AvailabilityStore , prefix = self :: overseer) ]
651
+ async fn start_prune_all < Context > (
652
+ ctx : & mut Context ,
653
+ subsystem : & mut AvailabilityStoreSubsystem ,
654
+ mut pruning_result_tx : MpscSender < Result < ( ) , Error > > ,
655
+ ) -> Result < ( ) , Error > {
656
+ let metrics = subsystem. metrics . clone ( ) ;
657
+ let db = subsystem. db . clone ( ) ;
658
+ let config = subsystem. config ;
659
+ let time_now = subsystem. clock . now ( ) ?;
660
+
661
+ ctx. spawn_blocking (
662
+ "av-store-prunning" ,
663
+ Box :: pin ( async move {
664
+ let _timer = metrics. time_pruning ( ) ;
665
+
666
+ gum:: debug!( target: LOG_TARGET , "Prunning started" ) ;
667
+ let result = prune_all ( & db, & config, time_now) ;
668
+
669
+ if let Err ( err) = pruning_result_tx. send ( result) . await {
670
+ // This usually means that the node is closing down, log it just in case
671
+ gum:: debug!( target: LOG_TARGET , ?err, "Failed to send prune_all result" , ) ;
672
+ }
673
+ } ) ,
674
+ ) ?;
675
+ Ok ( ( ) )
676
+ }
677
+
624
678
#[ overseer:: contextbounds( AvailabilityStore , prefix = self :: overseer) ]
625
679
async fn process_block_activated < Context > (
626
680
ctx : & mut Context ,
@@ -1250,8 +1304,7 @@ fn store_available_data(
1250
1304
Ok ( ( ) )
1251
1305
}
1252
1306
1253
- fn prune_all ( db : & Arc < dyn Database > , config : & Config , clock : & dyn Clock ) -> Result < ( ) , Error > {
1254
- let now = clock. now ( ) ?;
1307
+ fn prune_all ( db : & Arc < dyn Database > , config : & Config , now : Duration ) -> Result < ( ) , Error > {
1255
1308
let ( range_start, range_end) = pruning_range ( now) ;
1256
1309
1257
1310
let mut tx = DBTransaction :: new ( ) ;
0 commit comments