
Commit 33e83ff

Merge tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:

 - Plug a race between pick_next_task_fair() and try_to_wake_up() where both try to write to the same task, even though both paths hold a runqueue lock, obviously from different runqueues. The problem is that the store to task::on_rq in __block_task() is visible to try_to_wake_up(), which assumes the task is not queued; both sides then operate on the same task. Cure it by rearranging __block_task() so that the store to task::on_rq is the last operation on the task (a userspace sketch of this ordering pattern follows below).

 - Prevent a potential NULL pointer dereference in task_numa_work(). task_numa_work() iterates the VMAs of a process; a concurrent unmap of the address space can result in a NULL pointer return from vma_next(), which is unchecked. Add the missing NULL pointer check to prevent this.

 - Operate on the correct scheduler policy in task_should_scx(). task_should_scx() returns true when a task should be handled by sched EXT, based on the task's scheduling policy. This fails when the check is done before a policy has been set. Cure it by handing the policy into task_should_scx() so it operates on the requested value.

 - Add the missing handling of sched EXT in the delayed dequeue mechanism. This was simply forgotten.

* tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/ext: Fix scx vs sched_delayed
  sched: Pass correct scheduling policy to __setscheduler_class
  sched/numa: Fix the potential null pointer dereference in task_numa_work()
  sched: Fix pick_next_task_fair() vs try_to_wake_up() race
2 parents: 68f05b2 + 69d5e72
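The __block_task() change in kernel/sched/sched.h below works by making the release-store to task::on_rq the last access to the task, so a waker that observes on_rq == 0 also sees every earlier update. What follows is a rough, self-contained userspace sketch of that release/acquire pattern only; it is not kernel code, and the names fake_task, blocker and waker are invented for illustration. It uses C11 atomics with POSIX threads and builds with cc -pthread.

/*
 * Illustrative userspace analogue (not kernel code) of the ordering fix:
 * the blocking side finishes all bookkeeping on the task first and only
 * then publishes on_rq = 0 with release semantics; the waking side pairs
 * that with an acquire load, so once it sees on_rq == 0 the earlier
 * bookkeeping is guaranteed to be visible as well.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct fake_task {
        int bookkeeping_done;   /* stands in for the per-task state updated before the store */
        atomic_int on_rq;       /* stands in for task_struct::on_rq */
};

static struct fake_task task = { .bookkeeping_done = 0, .on_rq = 1 };

/* __block_task() analogue: the on_rq store is the last operation on the task. */
static void *blocker(void *arg)
{
        (void)arg;
        task.bookkeeping_done = 1;                                    /* plain write */
        atomic_store_explicit(&task.on_rq, 0, memory_order_release);  /* publish last */
        return NULL;
}

/* try_to_wake_up() analogue: only touch the task once on_rq == 0 is observed. */
static void *waker(void *arg)
{
        (void)arg;
        while (atomic_load_explicit(&task.on_rq, memory_order_acquire) != 0)
                ;       /* spin until the task is fully blocked */
        printf("bookkeeping_done = %d\n", task.bookkeeping_done);     /* always prints 1 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, blocker, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

In the kernel itself the release store is smp_store_release(&p->on_rq, 0), and the waker's acquire ordering comes from the control dependency described in the comment added to kernel/sched/sched.h.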


6 files changed, +69 -22 lines


kernel/sched/core.c

Lines changed: 4 additions & 4 deletions
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	if (rt_prio(p->prio)) {
 		p->sched_class = &rt_sched_class;
 #ifdef CONFIG_SCHED_CLASS_EXT
-	} else if (task_should_scx(p)) {
+	} else if (task_should_scx(p->policy)) {
 		p->sched_class = &ext_sched_class;
 #endif
 	} else {
@@ -7025,7 +7025,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);

-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
 {
 	if (dl_prio(prio))
 		return &dl_sched_class;
@@ -7034,7 +7034,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
 		return &rt_sched_class;

 #ifdef CONFIG_SCHED_CLASS_EXT
-	if (task_should_scx(p))
+	if (task_should_scx(policy))
 		return &ext_sched_class;
 #endif

@@ -7142,7 +7142,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;

 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, prio);
+	next_class = __setscheduler_class(p->policy, prio);

 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

kernel/sched/ext.c

Lines changed: 14 additions & 4 deletions
@@ -4257,14 +4257,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
 	if (!scx_enabled() ||
 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
 		return false;
 	if (READ_ONCE(scx_switching_all))
 		return true;
-	return p->policy == SCHED_EXT;
+	return policy == SCHED_EXT;
 }

 /**
@@ -4494,11 +4494,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;

+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);

 		sched_enq_and_set_task(&ctx);
@@ -5204,12 +5209,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		const struct sched_class *old_class = p->sched_class;
+		const struct sched_class *new_class =
+			__setscheduler_class(p->policy, p->prio);
 		struct sched_enq_and_set_ctx ctx;

+		if (old_class != new_class && p->se.sched_delayed)
+			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

 		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = __setscheduler_class(p, p->prio);
+		p->sched_class = new_class;
 		check_class_changing(task_rq(p), p, old_class);

 		sched_enq_and_set_task(&ctx);

kernel/sched/ext.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
 void scx_rq_activate(struct rq *rq);
 void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
 void init_sched_ext_class(void);

 static inline u32 scx_cpuperf_target(s32 cpu)

kernel/sched/fair.c

Lines changed: 16 additions & 9 deletions
@@ -3369,7 +3369,7 @@ static void task_numa_work(struct callback_head *work)
 		vma = vma_next(&vmi);
 	}

-	do {
+	for (; vma; vma = vma_next(&vmi)) {
 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
 			trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
@@ -3491,7 +3491,7 @@ static void task_numa_work(struct callback_head *work)
 		 */
 		if (vma_pids_forced)
 			break;
-	} for_each_vma(vmi, vma);
+	}

 	/*
 	 * If no VMAs are remaining and VMAs were skipped due to the PID
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
 	struct sched_entity *se = pick_eevdf(cfs_rq);
 	if (se->sched_delayed) {
 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-		SCHED_WARN_ON(se->sched_delayed);
-		SCHED_WARN_ON(se->on_rq);
+		/*
+		 * Must not reference @se again, see __block_task().
+		 */
 		return NULL;
 	}
 	return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 		/* Fix-up what dequeue_task_fair() skipped */
 		hrtick_update(rq);

-		/* Fix-up what block_task() skipped. */
+		/*
+		 * Fix-up what block_task() skipped.
+		 *
+		 * Must be last, @p might not be valid after this.
+		 */
 		__block_task(rq, p);
 	}

@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
 		util_est_dequeue(&rq->cfs, p);

-	if (dequeue_entities(rq, &p->se, flags) < 0) {
-		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
+	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	if (dequeue_entities(rq, &p->se, flags) < 0)
 		return false;
-	}

-	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+	/*
+	 * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+	 */
+
 	hrtick_update(rq);
 	return true;
 }

kernel/sched/sched.h

Lines changed: 33 additions & 3 deletions
@@ -2769,15 +2769,45 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)

 static inline void __block_task(struct rq *rq, struct task_struct *p)
 {
-	WRITE_ONCE(p->on_rq, 0);
-	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible++;

 	if (p->in_iowait) {
 		atomic_inc(&rq->nr_iowait);
 		delayacct_blkio_start();
 	}
+
+	ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+	/*
+	 * The moment this write goes through, ttwu() can swoop in and migrate
+	 * this task, rendering our rq->__lock ineffective.
+	 *
+	 * __schedule()                          try_to_wake_up()
+	 *   LOCK rq->__lock                       LOCK p->pi_lock
+	 *   pick_next_task()
+	 *     pick_next_task_fair()
+	 *       pick_next_entity()
+	 *         dequeue_entities()
+	 *           __block_task()
+	 *             RELEASE p->on_rq = 0        if (p->on_rq && ...)
+	 *                                           break;
+	 *
+	 *                                         ACQUIRE (after ctrl-dep)
+	 *
+	 *                                         cpu = select_task_rq();
+	 *                                         set_task_cpu(p, cpu);
+	 *                                         ttwu_queue()
+	 *                                           ttwu_do_activate()
+	 *                                             LOCK rq->__lock
+	 *                                             activate_task()
+	 *                                               STORE p->on_rq = 1
+	 *                                             UNLOCK rq->__lock
+	 *
+	 * Callers must ensure to not reference @p after this -- we no longer
+	 * own it.
+	 */
+	smp_store_release(&p->on_rq, 0);
 }

 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3800,7 +3830,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)

 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);

kernel/sched/syscalls.c

Lines changed: 1 addition & 1 deletion
@@ -707,7 +707,7 @@ int __sched_setscheduler(struct task_struct *p,
 	}

 	prev_class = p->sched_class;
-	next_class = __setscheduler_class(p, newprio);
+	next_class = __setscheduler_class(policy, newprio);

 	if (prev_class != next_class && p->se.sched_delayed)
 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
