This repository was archived by the owner on Nov 15, 2023. It is now read-only.

Commit b1cc6fa

Authored by Alexandru Gheorghe <alexandru.gheorghe@parity.io>
av-store: Move prune on a separate thread (#7263)
* av-store: Move prune on a separate thread

  There are situations where pruning of the data could take more than a few seconds,
  and that might make the whole subsystem unresponsive. To avoid this, just move the
  prune process to a separate thread.

  See #7237 for more details.

  Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>

* av-store: Add log that pruning started

  Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>

* av-store: Modify log severity

  Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>

---------

Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io>
1 parent 6debcdb commit b1cc6fa
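
Editor's note: the pattern this commit describes, offloading a slow blocking job to its own thread and reporting its outcome back to an async loop over a bounded channel, can be sketched in isolation. The snippet below is an illustrative stand-alone example using the futures crate; the names (slow_blocking_job, result_tx) are made up and not part of the av-store code.

use futures::{channel::mpsc, executor::block_on, SinkExt, StreamExt};
use std::{thread, time::Duration};

// Stand-in for the expensive pruning work.
fn slow_blocking_job() -> Result<(), String> {
	thread::sleep(Duration::from_millis(100));
	Ok(())
}

fn main() {
	// Bounded channel; jobs are minutes apart, so a small buffer suffices.
	let (mut result_tx, mut result_rx) = mpsc::channel::<Result<(), String>>(10);

	// The blocking job runs on its own thread, so the async loop never stalls.
	thread::spawn(move || {
		let result = slow_blocking_job();
		// If the receiver is gone the program is shutting down; ignore the error.
		let _ = block_on(result_tx.send(result));
	});

	// The async side awaits the outcome and reacts to it, mirroring the
	// `pruning_result_rx.next()` arm added to `run_iteration` below.
	block_on(async {
		if let Some(result) = result_rx.next().await {
			if let Err(e) = result {
				eprintln!("pruning failed: {e}");
			}
		}
	});
}

The subsystem version in the diff does the same thing, except the worker is spawned through the overseer's spawn_blocking and the received error is propagated with `?` so a fatal error ends the main loop.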

File tree

1 file changed: 62 additions, 9 deletions


node/core/av-store/src/lib.rs

Lines changed: 62 additions & 9 deletions
@@ -26,7 +26,13 @@ use std::{
 	time::{Duration, SystemTime, SystemTimeError, UNIX_EPOCH},
 };
 
-use futures::{channel::oneshot, future, select, FutureExt};
+use futures::{
+	channel::{
+		mpsc::{channel, Receiver as MpscReceiver, Sender as MpscSender},
+		oneshot,
+	},
+	future, select, FutureExt, SinkExt, StreamExt,
+};
 use futures_timer::Delay;
 use parity_scale_codec::{Decode, Encode, Error as CodecError, Input};
 use polkadot_node_subsystem_util::database::{DBTransaction, Database};
@@ -540,9 +546,17 @@ impl<Context> AvailabilityStoreSubsystem {
 #[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)]
 async fn run<Context>(mut subsystem: AvailabilityStoreSubsystem, mut ctx: Context) {
 	let mut next_pruning = Delay::new(subsystem.pruning_config.pruning_interval).fuse();
-
+	// Pruning interval is in the order of minutes so we shouldn't have more than one task running
+	// at one moment in time, so 10 should be more than enough.
+	let (mut pruning_result_tx, mut pruning_result_rx) = channel(10);
 	loop {
-		let res = run_iteration(&mut ctx, &mut subsystem, &mut next_pruning).await;
+		let res = run_iteration(
+			&mut ctx,
+			&mut subsystem,
+			&mut next_pruning,
+			(&mut pruning_result_tx, &mut pruning_result_rx),
+		)
+		.await;
 		match res {
 			Err(e) => {
 				e.trace();
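
Editor's note on the `channel(10)` bound: futures::channel::mpsc channels are bounded, and each sender gets one slot on top of the configured buffer, so a full channel makes further sends wait (or fail, for try_send) until the receiver drains it. Since at most one pruning task runs every few minutes, a bound of 10 is never reached in practice. A minimal illustration with an assumed buffer of 2:

use futures::channel::mpsc;

fn main() {
	// Buffer of 2 for the demonstration; the subsystem uses 10.
	let (mut tx, _rx) = mpsc::channel::<u32>(2);

	// A sender may queue `buffer` messages plus one extra slot of its own.
	assert!(tx.try_send(1).is_ok());
	assert!(tx.try_send(2).is_ok());
	assert!(tx.try_send(3).is_ok());
	// The channel is now full: the next attempt is rejected until the
	// receiver drains a message.
	assert!(tx.try_send(4).is_err());
}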
@@ -564,6 +578,10 @@ async fn run_iteration<Context>(
 	ctx: &mut Context,
 	subsystem: &mut AvailabilityStoreSubsystem,
 	mut next_pruning: &mut future::Fuse<Delay>,
+	(pruning_result_tx, pruning_result_rx): (
+		&mut MpscSender<Result<(), Error>>,
+		&mut MpscReceiver<Result<(), Error>>,
+	),
 ) -> Result<bool, Error> {
 	select! {
 		incoming = ctx.recv().fuse() => {
@@ -612,15 +630,51 @@ async fn run_iteration<Context>(
 			// It's important to set the delay before calling `prune_all` because an error in `prune_all`
 			// could lead to the delay not being set again. Then we would never prune anything anymore.
 			*next_pruning = Delay::new(subsystem.pruning_config.pruning_interval).fuse();
-
-			let _timer = subsystem.metrics.time_pruning();
-			prune_all(&subsystem.db, &subsystem.config, &*subsystem.clock)?;
-		}
+			start_prune_all(ctx, subsystem, pruning_result_tx.clone()).await?;
+		},
+		// Received the prune result and propagate the errors, so that in case of a fatal error
+		// the main loop of the subsystem can exit graciously.
+		result = pruning_result_rx.next() => {
+			if let Some(result) = result {
+				result?;
+			}
+		},
 	}
 
 	Ok(false)
 }
 
+// Start prune-all on a separate thread, so that in the case when the operation takes
+// longer than expected we don't keep the whole subsystem blocked.
+// See: https://github.com/paritytech/polkadot/issues/7237 for more details.
+#[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)]
+async fn start_prune_all<Context>(
+	ctx: &mut Context,
+	subsystem: &mut AvailabilityStoreSubsystem,
+	mut pruning_result_tx: MpscSender<Result<(), Error>>,
+) -> Result<(), Error> {
+	let metrics = subsystem.metrics.clone();
+	let db = subsystem.db.clone();
+	let config = subsystem.config;
+	let time_now = subsystem.clock.now()?;
+
+	ctx.spawn_blocking(
+		"av-store-prunning",
+		Box::pin(async move {
+			let _timer = metrics.time_pruning();
+
+			gum::debug!(target: LOG_TARGET, "Prunning started");
+			let result = prune_all(&db, &config, time_now);
+
+			if let Err(err) = pruning_result_tx.send(result).await {
+				// This usually means that the node is closing down, log it just in case
+				gum::debug!(target: LOG_TARGET, ?err, "Failed to send prune_all result",);
+			}
+		}),
+	)?;
+	Ok(())
+}
+
 #[overseer::contextbounds(AvailabilityStore, prefix = self::overseer)]
 async fn process_block_activated<Context>(
 	ctx: &mut Context,
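
Editor's note on why `start_prune_all` clones `metrics`, `db`, and `config` and reads the clock before spawning: the task handed to `spawn_blocking` must own everything it touches, since it may outlive the current borrow of the subsystem. The sketch below shows the same ownership discipline with plain std::thread::spawn and made-up types; it is not the subsystem's API.

use std::{sync::Arc, thread, time::Duration};

// Made-up stand-ins for the subsystem's Arc<dyn Database> and Config.
struct Subsystem {
	db: Arc<Vec<u8>>,
	interval: Duration,
}

// The closure given to the spawner must be 'static, so it cannot borrow from
// `subsystem`; everything it uses is cloned (a cheap Arc clone) or copied first.
fn start_background_job(subsystem: &Subsystem) -> thread::JoinHandle<usize> {
	let db = subsystem.db.clone();
	let interval = subsystem.interval;
	thread::spawn(move || {
		// Only owned values are used here; no reference to `subsystem` escapes.
		db.len() + interval.as_secs() as usize
	})
}

fn main() {
	let subsystem = Subsystem { db: Arc::new(vec![1, 2, 3]), interval: Duration::from_secs(300) };
	let job = start_background_job(&subsystem);
	assert_eq!(job.join().unwrap(), 303);
}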
@@ -1250,8 +1304,7 @@ fn store_available_data(
 	Ok(())
 }
 
-fn prune_all(db: &Arc<dyn Database>, config: &Config, clock: &dyn Clock) -> Result<(), Error> {
-	let now = clock.now()?;
+fn prune_all(db: &Arc<dyn Database>, config: &Config, now: Duration) -> Result<(), Error> {
 	let (range_start, range_end) = pruning_range(now);
 
 	let mut tx = DBTransaction::new();
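
The signature change in this last hunk follows from the move to a background task: the worker cannot carry the subsystem's `&dyn Clock`, so the caller reads the clock once and passes the resulting `Duration`. A simplified, hypothetical sketch of that hand-off (the cutoff value is illustrative, not the real pruning window):

use std::{
	thread,
	time::{Duration, SystemTime, UNIX_EPOCH},
};

// Stand-in for `Clock::now()`: the time is read on the caller's side.
fn now() -> Duration {
	SystemTime::now().duration_since(UNIX_EPOCH).unwrap_or_default()
}

// Stand-in for `prune_all`: it only needs the timestamp, not a clock handle.
fn prune_all(now: Duration) {
	// Illustrative cutoff; the real range comes from `pruning_range(now)`.
	let _cutoff = now.saturating_sub(Duration::from_secs(60 * 60));
}

fn main() {
	let time_now = now(); // queried before the task is spawned
	thread::spawn(move || prune_all(time_now)).join().unwrap();
}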

0 commit comments
