
Commit 1acd92d

workqueue: Drain BH work items on hot-unplugged CPUs
Boqun pointed out that workqueues aren't handling BH work items on offlined CPUs. Unlike tasklet which transfers out the pending tasks from CPUHP_SOFTIRQ_DEAD, BH workqueue would just leave them pending which is problematic. Note that this behavior is specific to BH workqueues as the non-BH per-CPU workers just become unbound when the CPU goes offline.

This patch fixes the issue by draining the pending BH work items from an offlined CPU from CPUHP_SOFTIRQ_DEAD. Because work items carry more context, it's not as easy to transfer the pending work items from one pool to another. Instead, run BH work items which execute the offlined pools on an online CPU.

Note that this assumes that no further BH work items will be queued on the offlined CPUs. This assumption is shared with tasklet and should be fine for conversions. However, this issue also exists for per-CPU workqueues which will just keep executing work items queued after CPU offline on unbound workers and workqueue should reject per-CPU and BH work items queued on offline CPUs. This will be addressed separately later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Link: http://lkml.kernel.org/r/Zdvw0HdSXcU3JZ4g@boqun-archlinux
Parent: 60b2ebf
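
For readers unfamiliar with the BH workqueues the commit message refers to, below is a minimal, hypothetical sketch of how a driver would queue a work item on system_bh_wq. INIT_WORK(), queue_work(), container_of() and system_bh_wq are the real kernel interfaces; the my_bh_* names and the value field are invented for illustration. The point of the sketch is only that BH work items execute in softirq context on the CPU they were queued on, which is why items still pending when that CPU is hot-unplugged need the draining this commit adds; it is not part of the commit itself.

/*
 * Hypothetical illustration (not part of this commit): queueing a BH
 * work item. The my_bh_* identifiers are made up for this sketch.
 */
#include <linux/printk.h>
#include <linux/workqueue.h>

struct my_bh_data {
        struct work_struct work;
        int value;
};

static struct my_bh_data my_data = { .value = 42 };

static void my_bh_workfn(struct work_struct *work)
{
        struct my_bh_data *data = container_of(work, struct my_bh_data, work);

        /* Runs in BH (softirq) context; must not sleep. */
        pr_info("BH work ran, value=%d\n", data->value);
}

static void my_bh_example(void)
{
        INIT_WORK(&my_data.work, my_bh_workfn);

        /* Queued on the local CPU's BH pool; executed from softirq. */
        queue_work(system_bh_wq, &my_data.work);
}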

File tree: 3 files changed, +91 −3 lines

include/linux/workqueue.h

Lines changed: 1 addition & 0 deletions

@@ -458,6 +458,7 @@ extern struct workqueue_struct *system_bh_wq;
 extern struct workqueue_struct *system_bh_highpri_wq;
 
 void workqueue_softirq_action(bool highpri);
+void workqueue_softirq_dead(unsigned int cpu);
 
 /**
  * alloc_workqueue - allocate a workqueue

kernel/softirq.c

Lines changed: 2 additions & 0 deletions

@@ -932,6 +932,8 @@ static void run_ksoftirqd(unsigned int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 static int takeover_tasklets(unsigned int cpu)
 {
+        workqueue_softirq_dead(cpu);
+
         /* CPU is dead, so no lock needed. */
         local_irq_disable();
 

kernel/workqueue.c

Lines changed: 88 additions & 3 deletions

@@ -81,6 +81,7 @@ enum worker_pool_flags {
         POOL_BH             = 1 << 0,   /* is a BH pool */
         POOL_MANAGER_ACTIVE = 1 << 1,   /* being managed */
         POOL_DISASSOCIATED  = 1 << 2,   /* cpu can't serve workers */
+        POOL_BH_DRAINING    = 1 << 3,   /* draining after CPU offline */
 };
 
 enum worker_flags {
@@ -1218,7 +1219,9 @@ static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
 static void kick_bh_pool(struct worker_pool *pool)
 {
 #ifdef CONFIG_SMP
-        if (unlikely(pool->cpu != smp_processor_id())) {
+        /* see drain_dead_softirq_workfn() for BH_DRAINING */
+        if (unlikely(pool->cpu != smp_processor_id() &&
+                     !(pool->flags & POOL_BH_DRAINING))) {
                 irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                 return;
         }
@@ -3155,6 +3158,7 @@ __acquires(&pool->lock)
         struct worker_pool *pool = worker->pool;
         unsigned long work_data;
         int lockdep_start_depth, rcu_start_depth;
+        bool bh_draining = pool->flags & POOL_BH_DRAINING;
 #ifdef CONFIG_LOCKDEP
         /*
          * It is permissible to free the struct work_struct from
@@ -3220,7 +3224,9 @@ __acquires(&pool->lock)
 
         rcu_start_depth = rcu_preempt_depth();
         lockdep_start_depth = lockdep_depth(current);
-        lock_map_acquire(&pwq->wq->lockdep_map);
+        /* see drain_dead_softirq_workfn() */
+        if (!bh_draining)
+                lock_map_acquire(&pwq->wq->lockdep_map);
         lock_map_acquire(&lockdep_map);
         /*
          * Strictly speaking we should mark the invariant state without holding
@@ -3253,7 +3259,8 @@ __acquires(&pool->lock)
         trace_workqueue_execute_end(work, worker->current_func);
         pwq->stats[PWQ_STAT_COMPLETED]++;
         lock_map_release(&lockdep_map);
-        lock_map_release(&pwq->wq->lockdep_map);
+        if (!bh_draining)
+                lock_map_release(&pwq->wq->lockdep_map);
 
         if (unlikely((worker->task && in_atomic()) ||
                      lockdep_depth(current) != lockdep_start_depth ||
@@ -3615,6 +3622,84 @@ void workqueue_softirq_action(bool highpri)
                 bh_worker(list_first_entry(&pool->workers, struct worker, node));
 }
 
+struct wq_drain_dead_softirq_work {
+        struct work_struct      work;
+        struct worker_pool      *pool;
+        struct completion       done;
+};
+
+static void drain_dead_softirq_workfn(struct work_struct *work)
+{
+        struct wq_drain_dead_softirq_work *dead_work =
+                container_of(work, struct wq_drain_dead_softirq_work, work);
+        struct worker_pool *pool = dead_work->pool;
+        bool repeat;
+
+        /*
+         * @pool's CPU is dead and we want to execute its still pending work
+         * items from this BH work item which is running on a different CPU. As
+         * its CPU is dead, @pool can't be kicked and, as work execution path
+         * will be nested, a lockdep annotation needs to be suppressed. Mark
+         * @pool with %POOL_BH_DRAINING for the special treatments.
+         */
+        raw_spin_lock_irq(&pool->lock);
+        pool->flags |= POOL_BH_DRAINING;
+        raw_spin_unlock_irq(&pool->lock);
+
+        bh_worker(list_first_entry(&pool->workers, struct worker, node));
+
+        raw_spin_lock_irq(&pool->lock);
+        pool->flags &= ~POOL_BH_DRAINING;
+        repeat = need_more_worker(pool);
+        raw_spin_unlock_irq(&pool->lock);
+
+        /*
+         * bh_worker() might hit consecutive execution limit and bail. If there
+         * still are pending work items, reschedule self and return so that we
+         * don't hog this CPU's BH.
+         */
+        if (repeat) {
+                if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                        queue_work(system_bh_highpri_wq, work);
+                else
+                        queue_work(system_bh_wq, work);
+        } else {
+                complete(&dead_work->done);
+        }
+}
+
+/*
+ * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
+ * possible to allocate dead_work per CPU and avoid flushing. However, then we
+ * have to worry about draining overlapping with CPU coming back online or
+ * nesting (one CPU's dead_work queued on another CPU which is also dead and so
+ * on). Let's keep it simple and drain them synchronously. These are BH work
+ * items which shouldn't be requeued on the same pool. Shouldn't take long.
+ */
+void workqueue_softirq_dead(unsigned int cpu)
+{
+        int i;
+
+        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+                struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
+                struct wq_drain_dead_softirq_work dead_work;
+
+                if (!need_more_worker(pool))
+                        continue;
+
+                INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
+                dead_work.pool = pool;
+                init_completion(&dead_work.done);
+
+                if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                        queue_work(system_bh_highpri_wq, &dead_work.work);
+                else
+                        queue_work(system_bh_wq, &dead_work.work);
+
+                wait_for_completion(&dead_work.done);
+        }
+}
+
 /**
  * check_flush_dependency - check for flush dependency sanity
  * @target_wq: workqueue being flushed
