Commit 2b4d250

Merge tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduling fixes from Borislav Petkov:

 - Add PREEMPT_RT maintainers

 - Fix another aspect of delayed dequeued tasks wrt determining their
   state, i.e., whether they're runnable or blocked

 - Handle delayed dequeued tasks and their migration wrt PSI properly

 - Fix the situation where a delayed dequeue task gets enqueued into a
   new class, which should not happen

 - Fix a case where memory allocation would happen while the runqueue
   lock is held, which is a no-no

 - Do not over-schedule when tasks with shorter slices preempt the
   currently running task

 - Make sure delayed-dequeue entities are properly handled before
   unthrottling

 - Other smaller cleanups and improvements

* tag 'sched_urgent_for_v6.12_rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  MAINTAINERS: Add an entry for PREEMPT_RT.
  sched/fair: Fix external p->on_rq users
  sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
  sched/core: Dequeue PSI signals for blocked tasks that are delayed
  sched: Fix delayed_dequeue vs switched_from_fair()
  sched/core: Disable page allocation in task_tick_mm_cid()
  sched/deadline: Use hrtick_enabled_dl() before start_hrtick_dl()
  sched/eevdf: Fix wakeup-preempt by checking cfs_rq->nr_running
  sched: Fix sched_delayed vs cfs_bandwidth
2 parents: a5ee44c + 5ec36fe

17 files changed: +146 -72 lines

MAINTAINERS

Lines changed: 8 additions & 0 deletions
@@ -19527,6 +19527,14 @@ S:	Maintained
 F:	Documentation/tools/rtla/
 F:	tools/tracing/rtla/
 
+Real-time Linux (PREEMPT_RT)
+M:	Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+M:	Clark Williams <clrkwllms@kernel.org>
+M:	Steven Rostedt <rostedt@goodmis.org>
+L:	linux-rt-devel@lists.linux.dev
+S:	Supported
+K:	PREEMPT_RT
+
 REALTEK AUDIO CODECS
 M:	Oder Chiou <oder_chiou@realtek.com>
 S:	Maintained

include/linux/sched.h

Lines changed: 5 additions & 0 deletions
@@ -2133,6 +2133,11 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+static inline bool task_is_runnable(struct task_struct *p)
+{
+	return p->on_rq && !p->se.sched_delayed;
+}
+
 extern bool sched_task_on_rq(struct task_struct *p);
 extern unsigned long get_wchan(struct task_struct *p);
 extern struct task_struct *cpu_curr_snapshot(int cpu);
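
For code outside the scheduler that used to test p->on_rq directly, the point of this helper is that runnability is now checked through task_is_runnable(): a delay-dequeued task is still on the runqueue but no longer logically runnable. A minimal sketch of such a caller, with a hypothetical out-of-tree check (only task_is_runnable() itself comes from this commit):

	#include <linux/sched.h>

	/* Hypothetical external user of the new helper. */
	static bool my_task_wants_cpu(struct task_struct *p)
	{
		/*
		 * p->on_rq alone is no longer sufficient: a sched_delayed
		 * task sits on the runqueue only until it is picked again,
		 * at which point it is dequeued. Treat it as blocked here.
		 */
		return task_is_runnable(p);
	}

The perf_event_switch() and __set_task_frozen() hunks below follow exactly this pattern.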

include/linux/task_work.h

Lines changed: 4 additions & 1 deletion
@@ -14,11 +14,14 @@ init_task_work(struct callback_head *twork, task_work_func_t func)
 }
 
 enum task_work_notify_mode {
-	TWA_NONE,
+	TWA_NONE = 0,
 	TWA_RESUME,
 	TWA_SIGNAL,
 	TWA_SIGNAL_NO_IPI,
 	TWA_NMI_CURRENT,
+
+	TWA_FLAGS = 0xff00,
+	TWAF_NO_ALLOC = 0x0100,
 };
 
 static inline bool task_work_pending(struct task_struct *task)
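
The new high bits turn the notify-mode argument into a mode plus optional flags: TWA_FLAGS masks the flag bits, and TWAF_NO_ALLOC asks task_work_add() to avoid memory allocation, which matters when the caller holds the runqueue lock (the diff's own comment later: "No page allocation under rq lock"). A minimal sketch of the intended usage, mirroring the task_tick_mm_cid() change in this series (the callback and wrapper names are illustrative only):

	#include <linux/task_work.h>

	/* Illustrative callback; runs when the task returns to user space. */
	static void my_deferred_work(struct callback_head *head)
	{
	}

	/* Called with rq->lock held, so allocation must be avoided. */
	static void queue_deferred_work(struct task_struct *curr,
					struct callback_head *work)
	{
		init_task_work(work, my_deferred_work);
		task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
	}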

kernel/events/core.c

Lines changed: 1 addition & 1 deletion
@@ -9251,7 +9251,7 @@ static void perf_event_switch(struct task_struct *task,
 		},
 	};
 
-	if (!sched_in && task->on_rq) {
+	if (!sched_in && task_is_runnable(task)) {
 		switch_event.event_id.header.misc |=
 				PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
 	}

kernel/freezer.c

Lines changed: 6 additions & 1 deletion
@@ -109,7 +109,12 @@ static int __set_task_frozen(struct task_struct *p, void *arg)
 {
 	unsigned int state = READ_ONCE(p->__state);
 
-	if (p->on_rq)
+	/*
+	 * Allow freezing the sched_delayed tasks; they will not execute until
+	 * ttwu() fixes them up, so it is safe to swap their state now, instead
+	 * of waiting for them to get fully dequeued.
+	 */
+	if (task_is_runnable(p))
 		return 0;
 
 	if (p != current && task_curr(p))

kernel/rcu/tasks.h

Lines changed: 9 additions & 0 deletions
@@ -985,6 +985,15 @@ static bool rcu_tasks_is_holdout(struct task_struct *t)
 	if (!READ_ONCE(t->on_rq))
 		return false;
 
+	/*
+	 * t->on_rq && !t->se.sched_delayed *could* be considered sleeping but
+	 * since it is a spurious state (it will transition into the
+	 * traditional blocked state or get woken up without outside
+	 * dependencies), not considering it such should only affect timing.
+	 *
+	 * Be conservative for now and not include it.
+	 */
+
 	/*
 	 * Idle tasks (or idle injection) within the idle loop are RCU-tasks
 	 * quiescent states. But CPU boot code performed by the idle task

kernel/sched/core.c

Lines changed: 39 additions & 22 deletions
@@ -548,6 +548,11 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }
  * ON_RQ_MIGRATING state is used for migration without holding both
  * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
  *
+ * Additionally it is possible to be ->on_rq but still be considered not
+ * runnable when p->se.sched_delayed is true. These tasks are on the runqueue
+ * but will be dequeued as soon as they get picked again. See the
+ * task_is_runnable() helper.
+ *
  * p->on_cpu <- { 0, 1 }:
  *
  * is set by prepare_task() and cleared by finish_task() such that it will be
@@ -2012,18 +2017,18 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
-	if (!(flags & ENQUEUE_RESTORE)) {
-		sched_info_enqueue(rq, p);
-		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
-	}
-
 	p->sched_class->enqueue_task(rq, p, flags);
 	/*
 	 * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
 	 * ->sched_delayed.
 	 */
 	uclamp_rq_inc(rq, p);
 
+	if (!(flags & ENQUEUE_RESTORE)) {
+		sched_info_enqueue(rq, p);
+		psi_enqueue(p, flags & ENQUEUE_MIGRATED);
+	}
+
 	if (sched_core_enabled(rq))
 		sched_core_enqueue(rq, p);
 }
@@ -2041,7 +2046,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 	if (!(flags & DEQUEUE_SAVE)) {
 		sched_info_dequeue(rq, p);
-		psi_dequeue(p, flags & DEQUEUE_SLEEP);
+		psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
 	}
 
 	/*
@@ -4323,9 +4328,10 @@ static bool __task_needs_rq_lock(struct task_struct *p)
  * @arg: Argument to function.
  *
  * Fix the task in it's current state by avoiding wakeups and or rq operations
- * and call @func(@arg) on it. This function can use ->on_rq and task_curr()
- * to work out what the state is, if required. Given that @func can be invoked
- * with a runqueue lock held, it had better be quite lightweight.
+ * and call @func(@arg) on it. This function can use task_is_runnable() and
+ * task_curr() to work out what the state is, if required. Given that @func
+ * can be invoked with a runqueue lock held, it had better be quite
+ * lightweight.
 *
 * Returns:
 *   Whatever @func returns
@@ -6544,6 +6550,7 @@ static void __sched notrace __schedule(int sched_mode)
 	 * as a preemption by schedule_debug() and RCU.
 	 */
 	bool preempt = sched_mode > SM_NONE;
+	bool block = false;
 	unsigned long *switch_count;
 	unsigned long prev_state;
 	struct rq_flags rf;
@@ -6629,6 +6636,7 @@ static void __sched notrace __schedule(int sched_mode)
 			 * After this, schedule() must not care about p->state any more.
 			 */
 			block_task(rq, prev, flags);
+			block = true;
 		}
 		switch_count = &prev->nvcsw;
 	}
@@ -6674,7 +6682,7 @@ static void __sched notrace __schedule(int sched_mode)
 
 		migrate_disable_switch(rq, prev);
 		psi_account_irqtime(rq, prev, next);
-		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
+		psi_sched_switch(prev, next, block);
 
 		trace_sched_switch(preempt, prev, next, prev_state);
 
@@ -7017,20 +7025,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-void __setscheduler_prio(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
 {
 	if (dl_prio(prio))
-		p->sched_class = &dl_sched_class;
-	else if (rt_prio(prio))
-		p->sched_class = &rt_sched_class;
+		return &dl_sched_class;
+
+	if (rt_prio(prio))
+		return &rt_sched_class;
+
 #ifdef CONFIG_SCHED_CLASS_EXT
-	else if (task_should_scx(p))
-		p->sched_class = &ext_sched_class;
+	if (task_should_scx(p))
+		return &ext_sched_class;
 #endif
-	else
-		p->sched_class = &fair_sched_class;
 
-	p->prio = prio;
+	return &fair_sched_class;
 }
 
 #ifdef CONFIG_RT_MUTEXES
@@ -7076,7 +7084,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
 	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *next_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -7134,6 +7142,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
+	next_class = __setscheduler_class(p, prio);
+
+	if (prev_class != next_class && p->se.sched_delayed)
+		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7171,7 +7184,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		p->rt.timeout = 0;
 	}
 
-	__setscheduler_prio(p, prio);
+	p->sched_class = next_class;
+	p->prio = prio;
+
 	check_class_changing(rq, p, prev_class);
 
 	if (queued)
@@ -10465,7 +10480,9 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
 		return;
 	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
 		return;
-	task_work_add(curr, work, TWA_RESUME);
+
+	/* No page allocation under rq lock */
+	task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
 }
 
 void sched_mm_cid_exit_signals(struct task_struct *t)
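
The net effect of splitting __setscheduler_prio() is that the class lookup is now side-effect free, so a caller can compare the old and new class and flush a DELAY_DEQUEUE task out of the fair class before switching. A condensed sketch of the pattern the rt_mutex_setprio() hunks above follow (locking and the surrounding queued/running bookkeeping are omitted):

	const struct sched_class *prev_class = p->sched_class;
	const struct sched_class *next_class = __setscheduler_class(p, prio);

	/* A delayed-dequeue task must not be carried into a new class. */
	if (prev_class != next_class && p->se.sched_delayed)
		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);

	/* ...dequeue/put the task as before, then: */
	p->sched_class = next_class;
	p->prio = prio;

The sched_ext hunks below pick up the same helper, assigning p->sched_class from __setscheduler_class() at their existing call sites.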

kernel/sched/deadline.c

Lines changed: 1 addition & 1 deletion
@@ -2385,7 +2385,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
 
 	deadline_queue_push_tasks(rq);
 
-	if (hrtick_enabled(rq))
+	if (hrtick_enabled_dl(rq))
 		start_hrtick_dl(rq, &p->dl);
 }
 
kernel/sched/ext.c

Lines changed: 2 additions & 2 deletions
@@ -4493,7 +4493,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		__setscheduler_prio(p, p->prio);
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -5204,7 +5204,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = SCX_SLICE_DFL;
-		__setscheduler_prio(p, p->prio);
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);

kernel/sched/fair.c

Lines changed: 7 additions & 20 deletions
@@ -1247,7 +1247,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
 
-	if (rq->nr_running == 1)
+	if (cfs_rq->nr_running == 1)
 		return;
 
 	if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -6058,10 +6058,13 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 
-		if (se->on_rq) {
-			SCHED_WARN_ON(se->sched_delayed);
+		/* Handle any unfinished DELAY_DEQUEUE business first. */
+		if (se->sched_delayed) {
+			int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED;
+
+			dequeue_entity(qcfs_rq, se, flags);
+		} else if (se->on_rq)
 			break;
-		}
 		enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
 
 		if (cfs_rq_is_idle(group_cfs_rq(se)))
@@ -13174,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
-	/*
-	 * Since this is called after changing class, this is a little weird
-	 * and we cannot use DEQUEUE_DELAYED.
-	 */
-	if (p->se.sched_delayed) {
-		/* First, dequeue it from its new class' structures */
-		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
-		/*
-		 * Now, clean up the fair_sched_class side of things
-		 * related to sched_delayed being true and that wasn't done
-		 * due to the generic dequeue not using DEQUEUE_DELAYED.
-		 */
-		finish_delayed_dequeue_entity(&p->se);
-		p->se.rel_deadline = 0;
-		__block_task(rq, p);
-	}
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
