Commit 23da2ad

Frederic Weisbecker authored and fbq committed
rcu/exp: Remove rcu_par_gp_wq
TREE04 running on short iterations can produce writer stalls of the following kind:

  ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0
  task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000
  Call Trace:
   <TASK>
   __schedule+0x2de/0x850
   ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0
   schedule+0x4f/0x90
   synchronize_rcu_expedited+0x430/0x670
   ? __pfx_autoremove_wake_function+0x10/0x10
   ? __pfx_synchronize_rcu_expedited+0x10/0x10
   do_rtws_sync.constprop.0+0xde/0x230
   rcu_torture_writer+0x4b4/0xcd0
   ? __pfx_rcu_torture_writer+0x10/0x10
   kthread+0xc7/0xf0
   ? __pfx_kthread+0x10/0x10
   ret_from_fork+0x2f/0x50
   ? __pfx_kthread+0x10/0x10
   ret_from_fork_asm+0x1b/0x30
   </TASK>

Waiting for an expedited grace period and polling for one are both operations that internally rely on the same workqueue to perform the necessary asynchronous work.

However, a dependency chain is involved between those two operations, as depicted below:

   ====== CPU 0 =======                    ====== CPU 1 =======

                                           synchronize_rcu_expedited()
                                               exp_funnel_lock()
                                                   mutex_lock(&rcu_state.exp_mutex);
   start_poll_synchronize_rcu_expedited
       queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
                                           synchronize_rcu_expedited_queue_work()
                                               queue_work(rcu_gp_wq, &rew->rew_work);
                                           wait_event() // A, wait for &rew->rew_work completion
                                           mutex_unlock() // B
   //======> switch to kworker

   sync_rcu_do_polled_gp() {
       synchronize_rcu_expedited()
           exp_funnel_lock()
               mutex_lock(&rcu_state.exp_mutex); // C, wait B
               ....
   } // D

Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time, because A then doesn't depend on D. But under memory stress, a single kworker may end up handling all the work items alone, in a serialized way. In that case the above layout becomes a problem, because A then waits for D, closing a circular dependency:

   A -> D -> C -> B -> A

This, however, only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed, synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on the rcu_gp_wq workqueue, breaking the above circular dependency chain.

Fix this by making the expedited grace period always rely on kthread workers. The workqueue-based implementation is essentially a duplicate anyway, now that the per-node initialization is performed by per-node kthread workers.

Meanwhile, the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers.

Reported-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Joel Fernandes <joel@joelfernandes.org>
Suggested-by: Paul E. McKenney <paulmck@kernel.org>
Suggested-by: Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
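For illustration only (not part of the commit), below is a minimal sketch of the dedicated kthread-worker pattern the fix converges on: a per-purpose kthread worker is created once, optionally promoted to SCHED_FIFO depending on CONFIG_RCU_EXP_KTHREAD, and work items are queued and flushed on it rather than on a shared workqueue whose kworkers may get serialized under memory pressure. The names exp_gp_kworker, exp_gp_handler and queue_and_wait_exp_gp, as well as the priority value, are hypothetical stand-ins, not the kernel's actual symbols; only the kthread-worker and scheduler API calls are real.

#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

/* Hypothetical stand-in for a dedicated expedited-GP worker. */
static struct kthread_worker *exp_gp_kworker;

/* Handler driving one expedited grace period (placeholder body). */
static void exp_gp_handler(struct kthread_work *work)
{
        /* ... drive the expedited grace period forward ... */
}

static int __init exp_gp_worker_init(void)
{
        struct sched_param param = { .sched_priority = 1 }; /* illustrative priority */

        /* A dedicated worker: its kthread serves only this queue. */
        exp_gp_kworker = kthread_create_worker(0, "exp_gp_kworker");
        if (IS_ERR(exp_gp_kworker))
                return PTR_ERR(exp_gp_kworker);

        /* The (kept) CONFIG_RCU_EXP_KTHREAD switch only decides the policy. */
        if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
                sched_setscheduler_nocheck(exp_gp_kworker->task, SCHED_FIFO, &param);
        return 0;
}

/* Queue a work item on the dedicated worker and wait for its completion. */
static void queue_and_wait_exp_gp(struct kthread_work *work)
{
        kthread_init_work(work, exp_gp_handler);
        kthread_queue_work(exp_gp_kworker, work);
        kthread_flush_work(work);
}

Because the worker's kthread serves only this queue, waiting on one of its work items can never be blocked behind unrelated queued work, which is what breaks the A -> D -> C -> B -> A cycle described above.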
1 parent b67cffc commit 23da2ad

4 files changed: +8 −115 lines

kernel/rcu/rcu.h

Lines changed: 0 additions & 4 deletions
@@ -623,11 +623,7 @@ int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
-#ifdef CONFIG_RCU_EXP_KTHREAD
 extern struct kthread_worker *rcu_exp_gp_kworker;
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-extern struct workqueue_struct *rcu_par_gp_wq;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 void rcu_gp_slow_register(atomic_t *rgssp);
 void rcu_gp_slow_unregister(atomic_t *rgssp);
 #endif /* #else #ifdef CONFIG_TINY_RCU */

kernel/rcu/tree.c

Lines changed: 6 additions & 34 deletions
@@ -4394,7 +4394,6 @@ rcu_boot_init_percpu_data(int cpu)
         rcu_boot_init_nocb_percpu_data(rdp);
 }
 
-#ifdef CONFIG_RCU_EXP_KTHREAD
 struct kthread_worker *rcu_exp_gp_kworker;
 
 static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4414,7 +4413,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
                 return;
         }
         WRITE_ONCE(rnp->exp_kworker, kworker);
-        sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
+
+        if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+                sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
 }
 
 static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
@@ -4438,39 +4439,14 @@ static void __init rcu_start_exp_gp_kworker(void)
                 rcu_exp_gp_kworker = NULL;
                 return;
         }
-        sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
-}
-
-static inline void rcu_alloc_par_gp_wq(void)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-struct workqueue_struct *rcu_par_gp_wq;
-
-static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
-{
-}
-
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
-        return NULL;
-}
-
-static void __init rcu_start_exp_gp_kworker(void)
-{
-}
 
-static inline void rcu_alloc_par_gp_wq(void)
-{
-        rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-        WARN_ON(!rcu_par_gp_wq);
+        if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
+                sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
 }
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 
 static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp)
 {
-        if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) ||
-             IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) {
+        if (rcu_scheduler_fully_active) {
                 mutex_lock(&rnp->kthread_mutex);
                 rcu_spawn_one_boost_kthread(rnp);
                 rcu_spawn_exp_par_gp_kworker(rnp);
@@ -4554,9 +4530,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
         struct rcu_node *rnp;
         struct task_struct *task_boost, *task_exp;
 
-        if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST))
-                return;
-
         rdp = per_cpu_ptr(&rcu_data, cpu);
         rnp = rdp->mynode;
 
@@ -5245,7 +5218,6 @@ void __init rcu_init(void)
         /* Create workqueue for Tree SRCU and for expedited GPs. */
         rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
         WARN_ON(!rcu_gp_wq);
-        rcu_alloc_par_gp_wq();
 
         /* Fill in default value for rcutree.qovld boot parameter. */
         /* -After- the rcu_node ->lock fields are initialized! */

kernel/rcu/tree.h

Lines changed: 1 addition & 5 deletions
@@ -21,14 +21,10 @@
 
 #include "rcu_segcblist.h"
 
-/* Communicate arguments to a workqueue handler. */
+/* Communicate arguments to a kthread worker handler. */
 struct rcu_exp_work {
         unsigned long rew_s;
-#ifdef CONFIG_RCU_EXP_KTHREAD
         struct kthread_work rew_work;
-#else
-        struct work_struct rew_work;
-#endif /* CONFIG_RCU_EXP_KTHREAD */
 };
 
 /* RCU's kthread states for tracing. */

kernel/rcu/tree_exp.h

Lines changed: 1 addition & 72 deletions
@@ -418,7 +418,6 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
 
 static void rcu_exp_sel_wait_wake(unsigned long s);
 
-#ifdef CONFIG_RCU_EXP_KTHREAD
 static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
 {
         struct rcu_exp_work *rewp =
@@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew
         kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work);
 }
 
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-}
-#else /* !CONFIG_RCU_EXP_KTHREAD */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
-{
-        struct rcu_exp_work *rewp =
-                container_of(wp, struct rcu_exp_work, rew_work);
-
-        __sync_rcu_exp_select_node_cpus(rewp);
-}
-
-static inline bool rcu_exp_worker_started(void)
-{
-        return !!READ_ONCE(rcu_gp_wq);
-}
-
-static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp)
-{
-        return !!READ_ONCE(rcu_par_gp_wq);
-}
-
-static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp)
-{
-        int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-
-        INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
-        /* If all offline, queue the work on an unbound CPU. */
-        if (unlikely(cpu > rnp->grphi - rnp->grplo))
-                cpu = WORK_CPU_UNBOUND;
-        else
-                cpu += rnp->grplo;
-        queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
-}
-
-static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp)
-{
-        flush_work(&rnp->rew.rew_work);
-}
-
-/*
- * Work-queue handler to drive an expedited grace period forward.
- */
-static void wait_rcu_exp_gp(struct work_struct *wp)
-{
-        struct rcu_exp_work *rewp;
-
-        rewp = container_of(wp, struct rcu_exp_work, rew_work);
-        rcu_exp_sel_wait_wake(rewp->rew_s);
-}
-
-static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew)
-{
-        INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp);
-        queue_work(rcu_gp_wq, &rew->rew_work);
-}
-
-static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew)
-{
-        destroy_work_on_stack(&rew->rew_work);
-}
-#endif /* CONFIG_RCU_EXP_KTHREAD */
-
 /*
  * Select the nodes that the upcoming expedited grace period needs
  * to wait for.
@@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
  */
 void synchronize_rcu_expedited(void)
 {
-        bool use_worker;
         unsigned long flags;
         struct rcu_exp_work rew;
         struct rcu_node *rnp;
@@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void)
                          lock_is_held(&rcu_sched_lock_map),
                          "Illegal synchronize_rcu_expedited() in RCU read-side critical section");
 
-        use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) &&
-                      rcu_exp_worker_started();
-
         /* Is the state is such that the call is a grace period? */
         if (rcu_blocking_is_gp()) {
                 // Note well that this code runs with !PREEMPT && !SMP.
@@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void)
                 return;  /* Someone else did our work for us. */
 
         /* Ensure that load happens before action based on it. */
-        if (unlikely(!use_worker)) {
+        if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) {
                 /* Direct call during scheduler init and early_initcalls(). */
                 rcu_exp_sel_wait_wake(s);
         } else {
@@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void)
 
         /* Let the next expedited grace period start. */
         mutex_unlock(&rcu_state.exp_mutex);
-
-        if (likely(use_worker))
-                synchronize_rcu_expedited_destroy_work(&rew);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 