
Commit 4ba1329

Merge tag 'rcu-urgent.2022.07.21a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu
Pull RCU fix from Paul McKenney:

 "This contains a pair of commits that fix 282d899 ("srcu: Prevent
  expedited GPs and blocking readers from consuming CPU"), which was
  itself a fix to an SRCU expedited grace-period problem that could
  prevent kernel live patching (KLP) from completing.

  That SRCU fix for KLP introduced large (as in minutes) boot-time
  delays to embedded Linux kernels running on qemu/KVM.  These delays
  were due to the emulation of certain MMIO operations controlling
  memory layout, which were emulated with one expedited grace period
  per access.  Common configurations required thousands of boot-time
  MMIO accesses, and thus thousands of boot-time expedited SRCU grace
  periods.

  In these configurations, the occasional sleeps that allowed KLP to
  proceed caused excessive boot delays.  These commits preserve enough
  sleeps to permit KLP to proceed, but few enough that the virtual
  embedded kernels still boot reasonably quickly.

  This represents a regression introduced in the v5.19 merge window,
  and the bug is causing significant inconvenience"

* tag 'rcu-urgent.2022.07.21a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu:
  srcu: Make expedited RCU grace periods block even less frequently
  srcu: Block less aggressively for expedited grace periods
2 parents 7fb5e50 + 4f2bfd9 commit 4ba1329

2 files changed: +92 −24 lines

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 18 additions & 0 deletions
@@ -5796,6 +5796,24 @@
 			expediting.  Set to zero to disable automatic
 			expediting.
 
+	srcutree.srcu_max_nodelay [KNL]
+			Specifies the number of no-delay instances
+			per jiffy for which the SRCU grace period
+			worker thread will be rescheduled with zero
+			delay. Beyond this limit, worker thread will
+			be rescheduled with a sleep delay of one jiffy.
+
+	srcutree.srcu_max_nodelay_phase [KNL]
+			Specifies the per-grace-period phase, number of
+			non-sleeping polls of readers. Beyond this limit,
+			grace period worker thread will be rescheduled
+			with a sleep delay of one jiffy, between each
+			rescan of the readers, for a grace period phase.
+
+	srcutree.srcu_retry_check_delay [KNL]
+			Specifies number of microseconds of non-sleeping
+			delay between each non-sleeping poll of readers.
+
 	srcutree.small_contention_lim [KNL]
 			Specifies the number of update-side contention
 			events per jiffy will be tolerated before
kernel/rcu/srcutree.c

Lines changed: 74 additions & 24 deletions
@@ -511,27 +511,75 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
 	return sum;
 }
 
-#define SRCU_INTERVAL		1	// Base delay if no expedited GPs pending.
-#define SRCU_MAX_INTERVAL	10	// Maximum incremental delay from slow readers.
-#define SRCU_MAX_NODELAY_PHASE	1	// Maximum per-GP-phase consecutive no-delay instances.
-#define SRCU_MAX_NODELAY	100	// Maximum consecutive no-delay instances.
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below, boot time configurable) to allow SRCU readers to exit
+ * their read-side critical sections.  If there are still some readers
+ * after one jiffy, we repeatedly block for one jiffy time periods.
+ * The blocking time is increased as the grace-period age increases,
+ * with max blocking time capped at 10 jiffies.
+ */
+#define SRCU_DEFAULT_RETRY_CHECK_DELAY		5
+
+static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
+module_param(srcu_retry_check_delay, ulong, 0444);
+
+#define SRCU_INTERVAL		1		// Base delay if no expedited GPs pending.
+#define SRCU_MAX_INTERVAL	10		// Maximum incremental delay from slow readers.
+
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO	3UL	// Lowmark on default per-GP-phase
+							// no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI	1000UL	// Highmark on default per-GP-phase
+							// no-delay instances.
+
+#define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
+#define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
+#define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
+// per-GP-phase no-delay instances adjusted to allow non-sleeping poll upto
+// one jiffies time duration. Mult by 2 is done to factor in the srcu_get_delay()
+// called from process_srcu().
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED	\
+	(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)
+
+// Maximum per-GP-phase consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY_PHASE	\
+	SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,	\
+		      SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,	\
+		      SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)
+
+static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
+module_param(srcu_max_nodelay_phase, ulong, 0444);
+
+// Maximum consecutive no-delay instances.
+#define SRCU_DEFAULT_MAX_NODELAY	(SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ?	\
+					 SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)
+
+static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
+module_param(srcu_max_nodelay, ulong, 0444);
 
 /*
  * Return grace-period delay, zero if there are expedited grace
  * periods pending, SRCU_INTERVAL otherwise.
  */
 static unsigned long srcu_get_delay(struct srcu_struct *ssp)
 {
+	unsigned long gpstart;
+	unsigned long j;
 	unsigned long jbase = SRCU_INTERVAL;
 
 	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
 		jbase = 0;
-	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)))
-		jbase += jiffies - READ_ONCE(ssp->srcu_gp_start);
-	if (!jbase) {
-		WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
-		if (READ_ONCE(ssp->srcu_n_exp_nodelay) > SRCU_MAX_NODELAY_PHASE)
-			jbase = 1;
+	if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+		j = jiffies - 1;
+		gpstart = READ_ONCE(ssp->srcu_gp_start);
+		if (time_after(j, gpstart))
+			jbase += j - gpstart;
+		if (!jbase) {
+			WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
+			if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+				jbase = 1;
+		}
 	}
 	return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
 }
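As a sanity check on the SRCU_DEFAULT_MAX_NODELAY_PHASE arithmetic above, the following standalone userspace sketch (not part of the commit) evaluates the same clamp for a few HZ values. HZ is fixed at kernel build time; 100, 250, and 1000 are merely common configurations assumed here:

    #include <stdio.h>

    #define USEC_PER_SEC				1000000UL
    #define SRCU_DEFAULT_RETRY_CHECK_DELAY		5
    #define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO	3UL
    #define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI	1000UL

    #define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
    #define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
    #define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))

    /* Same formula as SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED, with HZ as a
     * runtime argument; the factor of 2 covers the extra srcu_get_delay()
     * call from process_srcu(), per the comment in the hunk above. */
    static unsigned long max_nodelay_phase(unsigned long hz)
    {
    	unsigned long adjusted = 2UL * USEC_PER_SEC / hz / SRCU_DEFAULT_RETRY_CHECK_DELAY;

    	return SRCU_UL_CLAMP(adjusted, SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,
    			     SRCU_DEFAULT_MAX_NODELAY_PHASE_HI);
    }

    int main(void)
    {
    	printf("HZ=1000 -> %lu\n", max_nodelay_phase(1000)); /* 400, within [3, 1000] */
    	printf("HZ=250  -> %lu\n", max_nodelay_phase(250));  /* 1600 clamps to 1000 */
    	printf("HZ=100  -> %lu\n", max_nodelay_phase(100));  /* 4000 clamps to 1000 */
    	return 0;
    }

So on an HZ=1000 kernel the default phase limit works out to 400 non-sleeping polls, while slower-tick kernels hit the 1000-instance highmark.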
@@ -606,15 +654,6 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
-/*
- * We use an adaptive strategy for synchronize_srcu() and especially for
- * synchronize_srcu_expedited().  We spin for a fixed time period
- * (defined below) to allow SRCU readers to exit their read-side critical
- * sections.  If there are still some readers after a few microseconds,
- * we repeatedly block for 1-millisecond time periods.
- */
-#define SRCU_RETRY_CHECK_DELAY	5
-
 /*
  * Start an SRCU grace period.
  */
@@ -700,7 +739,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp
  */
 static void srcu_gp_end(struct srcu_struct *ssp)
 {
-	unsigned long cbdelay;
+	unsigned long cbdelay = 1;
 	bool cbs;
 	bool last_lvl;
 	int cpu;
@@ -720,7 +759,9 @@ static void srcu_gp_end(struct srcu_struct *ssp)
 	spin_lock_irq_rcu_node(ssp);
 	idx = rcu_seq_state(ssp->srcu_gp_seq);
 	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
-	cbdelay = !!srcu_get_delay(ssp);
+	if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+		cbdelay = 0;
+
 	WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
 	rcu_seq_end(&ssp->srcu_gp_seq);
 	gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
@@ -921,12 +962,16 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
  */
 static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
 {
+	unsigned long curdelay;
+
+	curdelay = !srcu_get_delay(ssp);
+
 	for (;;) {
 		if (srcu_readers_active_idx_check(ssp, idx))
 			return true;
-		if (--trycount + !srcu_get_delay(ssp) <= 0)
+		if ((--trycount + curdelay) <= 0)
 			return false;
-		udelay(SRCU_RETRY_CHECK_DELAY);
+		udelay(srcu_retry_check_delay);
 	}
 }
 
@@ -1582,7 +1627,7 @@ static void process_srcu(struct work_struct *work)
 	j = jiffies;
 	if (READ_ONCE(ssp->reschedule_jiffies) == j) {
 		WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
-		if (READ_ONCE(ssp->reschedule_count) > SRCU_MAX_NODELAY)
+		if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
 			curdelay = 1;
 	} else {
 		WRITE_ONCE(ssp->reschedule_count, 1);
@@ -1674,6 +1719,11 @@ static int __init srcu_bootup_announce(void)
 	pr_info("Hierarchical SRCU implementation.\n");
 	if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
 		pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
+	if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
+		pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
+	if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
+		pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
+	pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
 	return 0;
 }
 early_initcall(srcu_bootup_announce);
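For illustration only, the pr_info() format strings above imply boot messages of the following shape when srcutree.srcu_retry_check_delay=10 is set on an HZ=1000 kernel (the HZ value is an assumption here; it determines the default phase limit of 400):

    Hierarchical SRCU implementation.
    	Non-default retry check delay of 10 us.
    	Max phase no-delay instances is 400.

The srcu_max_nodelay line is absent from this hypothetical log because it prints only when that parameter differs from its default.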
