Commit eaed94d

Merge tag 'sched-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 "Core & fair scheduler changes:

   - Tweak wait_task_inactive() to force dequeue sched_delayed tasks
     (John Stultz)

   - Adhere to place_entity() constraints (Peter Zijlstra)

   - Allow decaying util_est when util_avg > CPU capacity (Pierre Gondois)

   - Fix up wake_up_sync() vs DELAYED_DEQUEUE (Xuewen Yan)

  Energy management:

   - Introduce sched_update_asym_prefer_cpu() (K Prateek Nayak)

   - cpufreq/amd-pstate: Update asym_prefer_cpu when core rankings change
     (K Prateek Nayak)

   - Align uclamp and util_est and call before freq update (Xuewen Yan)

  CPU isolation:

   - Make use of more than one housekeeping CPU (Phil Auld)

  RT scheduler:

   - Fix race in push_rt_task() (Harshit Agarwal)

   - Add kernel cmdline option for rt_group_sched (Michal Koutný)

  Scheduler topology support:

   - Improve topology_span_sane speed (Steve Wahl)

  Scheduler debugging:

   - Move and extend the sched_process_exit() tracepoint (Andrii Nakryiko)

   - Add RT_GROUP WARN checks for non-root task_groups (Michal Koutný)

   - Fix trace_sched_switch(.prev_state) (Peter Zijlstra)

   - Untangle cond_resched() and live-patching (Peter Zijlstra)

  Fixes and cleanups:

   - Misc fixes and cleanups (K Prateek Nayak, Michal Koutný, Peter
     Zijlstra, Xuewen Yan)"

* tag 'sched-core-2025-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  sched/uclamp: Align uclamp and util_est and call before freq update
  sched/util_est: Simplify condition for util_est_{en,de}queue()
  sched/fair: Fixup wake_up_sync() vs DELAYED_DEQUEUE
  sched,livepatch: Untangle cond_resched() and live-patching
  sched/core: Tweak wait_task_inactive() to force dequeue sched_delayed tasks
  sched/fair: Adhere to place_entity() constraints
  sched/debug: Print the local group's asym_prefer_cpu
  cpufreq/amd-pstate: Update asym_prefer_cpu when core rankings change
  sched/topology: Introduce sched_update_asym_prefer_cpu()
  sched/fair: Use READ_ONCE() to read sg->asym_prefer_cpu
  sched/isolation: Make use of more than one housekeeping cpu
  sched/rt: Fix race in push_rt_task
  sched: Add annotations to RT_GROUP_SCHED fields
  sched: Add RT_GROUP WARN checks for non-root task_groups
  sched: Do not construct nor expose RT_GROUP_SCHED structures if disabled
  sched: Bypass bandwitdh checks with runtime disabled RT_GROUP_SCHED
  sched: Skip non-root task_groups with disabled RT_GROUP_SCHED
  sched: Add commadline option for RT_GROUP_SCHED toggling
  sched: Always initialize rt_rq's task_group
  sched: Remove unneeed macro wrap
  ...
2 parents: 3ba121c + 90ca941

File tree: 17 files changed, +377 lines, -214 lines

Documentation/admin-guide/kernel-parameters.txt
5 additions & 0 deletions

@@ -6320,6 +6320,11 @@
 			Memory area to be used by remote processor image,
 			managed by CMA.
 
+	rt_group_sched=	[KNL] Enable or disable SCHED_RR/FIFO group scheduling
+			when CONFIG_RT_GROUP_SCHED=y. Defaults to
+			!CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.
+			Format: <bool>
+
 	rw		[KNL] Mount root device read-write on boot
 
 	S		[KNL] Run init in single mode
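
As an example (values illustrative, not part of the patch): on a kernel built with CONFIG_RT_GROUP_SCHED=y and CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED=y, RT group scheduling stays off unless the boot command line opts back in:

	rt_group_sched=1

Conversely, passing rt_group_sched=0 keeps it disabled even when the compile-time default leaves it enabled.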

drivers/cpufreq/amd-pstate.c
3 additions & 1 deletion

@@ -831,8 +831,10 @@ static void amd_pstate_update_limits(unsigned int cpu)
 	if (highest_perf_changed) {
 		WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
 
-		if (cur_high < CPPC_MAX_PERF)
+		if (cur_high < CPPC_MAX_PERF) {
 			sched_set_itmt_core_prio((int)cur_high, cpu);
+			sched_update_asym_prefer_cpu(cpu, prev_high, cur_high);
+		}
 	}
 }

include/linux/livepatch_sched.h
5 additions & 9 deletions

@@ -3,27 +3,23 @@
 #define _LINUX_LIVEPATCH_SCHED_H_
 
 #include <linux/jump_label.h>
-#include <linux/static_call_types.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_LIVEPATCH
 
 void __klp_sched_try_switch(void);
 
-#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-
 DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
 
-static __always_inline void klp_sched_try_switch(void)
+static __always_inline void klp_sched_try_switch(struct task_struct *curr)
 {
-	if (static_branch_unlikely(&klp_sched_try_switch_key))
+	if (static_branch_unlikely(&klp_sched_try_switch_key) &&
+	    READ_ONCE(curr->__state) & TASK_FREEZABLE)
 		__klp_sched_try_switch();
 }
 
-#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-
 #else /* !CONFIG_LIVEPATCH */
-static inline void klp_sched_try_switch(void) {}
-static inline void __klp_sched_try_switch(void) {}
+static inline void klp_sched_try_switch(struct task_struct *curr) {}
 #endif /* CONFIG_LIVEPATCH */
 
 #endif /* _LINUX_LIVEPATCH_SCHED_H_ */

include/linux/sched.h
0 additions & 6 deletions

@@ -44,7 +44,6 @@
 #include <linux/seqlock_types.h>
 #include <linux/kcsan.h>
 #include <linux/rv.h>
-#include <linux/livepatch_sched.h>
 #include <linux/uidgid_types.h>
 #include <linux/tracepoint-defs.h>
 #include <asm/kmap_size.h>
@@ -2089,9 +2088,6 @@ extern int __cond_resched(void);
 
 #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
 
-void sched_dynamic_klp_enable(void);
-void sched_dynamic_klp_disable(void);
-
 DECLARE_STATIC_CALL(cond_resched, __cond_resched);
 
 static __always_inline int _cond_resched(void)
@@ -2112,7 +2108,6 @@ static __always_inline int _cond_resched(void)
 
 static inline int _cond_resched(void)
 {
-	klp_sched_try_switch();
 	return __cond_resched();
 }
 
@@ -2122,7 +2117,6 @@ static inline int _cond_resched(void)
 
 static inline int _cond_resched(void)
 {
-	klp_sched_try_switch();
 	return 0;
 }
 

include/linux/sched/topology.h
6 additions & 0 deletions

@@ -195,6 +195,8 @@ struct sched_domain_topology_level {
 };
 
 extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
+extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio);
+
 
 # define SD_INIT_NAME(type)		.name = #type
 
@@ -223,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
+{
+}
+
 #endif /* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
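
sched_update_asym_prefer_cpu() gives cpufreq drivers such as amd-pstate (see the hunk above) a way to tell the scheduler that a CPU's asym-packing priority has changed, so that each sched group's cached asym_prefer_cpu can be re-evaluated. The following is an illustrative user-space model of that idea only, not the kernel implementation (kernel/sched/topology.c is not among the hunks shown here); all names in it are local to the example.

/*
 * Illustrative user-space model -- NOT the kernel's
 * sched_update_asym_prefer_cpu(). It only shows why a priority change on
 * one CPU forces the cached "preferred CPU" of its group to be revisited.
 */
#include <stdio.h>

#define NR_CPUS 4

static int prio[NR_CPUS] = { 10, 30, 20, 25 };	/* per-CPU ranking */
static int asym_prefer_cpu = 1;			/* CPU with the highest ranking */

static void update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
{
	int i;

	(void)old_prio;		/* kept only to mirror the kernel signature */
	prio[cpu] = new_prio;

	if (cpu != asym_prefer_cpu) {
		/* Some other CPU got faster than the cached choice: switch. */
		if (new_prio > prio[asym_prefer_cpu])
			asym_prefer_cpu = cpu;
		return;
	}

	/* The preferred CPU itself changed ranking; rescan the whole group. */
	for (i = 0; i < NR_CPUS; i++)
		if (prio[i] > prio[asym_prefer_cpu])
			asym_prefer_cpu = i;
}

int main(void)
{
	update_asym_prefer_cpu(1, 30, 5);	/* demote the current favourite */
	update_asym_prefer_cpu(3, 25, 40);	/* promote another CPU */
	printf("preferred CPU is now %d\n", asym_prefer_cpu);	/* -> 3 */
	return 0;
}

The two cases hint at why the interface takes both the old and the new priority: demoting the currently preferred CPU forces a rescan of the group, while promoting some other CPU only needs a single comparison against the cached choice.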

include/trace/events/sched.h
30 additions & 4 deletions

@@ -326,11 +326,37 @@ DEFINE_EVENT(sched_process_template, sched_process_free,
 	     TP_ARGS(p));
 
 /*
- * Tracepoint for a task exiting:
+ * Tracepoint for a task exiting.
+ * Note, it's a superset of sched_process_template and should be kept
+ * compatible as much as possible. sched_process_exits has an extra
+ * `group_dead` argument, so sched_process_template can't be used,
+ * unfortunately, just like sched_migrate_task above.
  */
-DEFINE_EVENT(sched_process_template, sched_process_exit,
-	     TP_PROTO(struct task_struct *p),
-	     TP_ARGS(p));
+TRACE_EVENT(sched_process_exit,
+
+	TP_PROTO(struct task_struct *p, bool group_dead),
+
+	TP_ARGS(p, group_dead),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( int,	prio			)
+		__field( bool,	group_dead		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio; /* XXX SCHED_DEADLINE */
+		__entry->group_dead	= group_dead;
+	),
+
+	TP_printk("comm=%s pid=%d prio=%d group_dead=%s",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->group_dead ? "true" : "false"
+	)
+);
 
 /*
  * Tracepoint for waiting on task to unschedule:
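
With the extra field, a rendered sched_process_exit event would look roughly like this (illustrative values, following the TP_printk() format above):

	comm=bash pid=1234 prio=120 group_dead=true

The existing comm/pid/prio fields keep the same layout as sched_process_template and only group_dead is appended, which is what the comment above means by staying compatible with the template.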

init/Kconfig
11 additions & 0 deletions

@@ -1075,6 +1075,17 @@ config RT_GROUP_SCHED
 	  realtime bandwidth for them.
 	  See Documentation/scheduler/sched-rt-group.rst for more information.
 
+config RT_GROUP_SCHED_DEFAULT_DISABLED
+	bool "Require boot parameter to enable group scheduling for SCHED_RR/FIFO"
+	depends on RT_GROUP_SCHED
+	default n
+	help
+	  When set, the RT group scheduling is disabled by default. The option
+	  is in inverted form so that mere RT_GROUP_SCHED enables the group
+	  scheduling.
+
+	  Say N if unsure.
+
 config EXT_GROUP_SCHED
 	bool
 	depends on SCHED_CLASS_EXT && CGROUP_SCHED
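
How the Kconfig default and the rt_group_sched= boot parameter fit together lives in kernel/sched/, which is not among the hunks shown on this page. A rough, hypothetical sketch only (names and placement are illustrative, not the actual patch):

/*
 * Hypothetical sketch: compile-time default from the Kconfig option,
 * overridable in either direction from the kernel command line.
 */
static bool rt_group_sched_enabled __read_mostly =
	!IS_ENABLED(CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED);

static int __init setup_rt_group_sched(char *str)
{
	bool val;

	/* "rt_group_sched=<bool>": the usual kernel boolean spellings */
	if (kstrtobool(str, &val))
		return 0;	/* malformed value, option not consumed */

	rt_group_sched_enabled = val;
	return 1;		/* option consumed */
}
__setup("rt_group_sched=", setup_rt_group_sched);

The point is simply that CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED only flips the default; the boot parameter documented in kernel-parameters.txt above can override it either way.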

kernel/exit.c
1 addition & 1 deletion

@@ -942,12 +942,12 @@ void __noreturn do_exit(long code)
 
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
+	trace_sched_process_exit(tsk, group_dead);
 
 	exit_mm();
 
 	if (group_dead)
 		acct_process();
-	trace_sched_process_exit(tsk);
 
 	exit_sem(tsk);
 	exit_shm(tsk);

kernel/livepatch/transition.c
14 additions & 35 deletions

@@ -29,22 +29,13 @@ static unsigned int klp_signals_cnt;
 
 /*
  * When a livepatch is in progress, enable klp stack checking in
- * cond_resched(). This helps CPU-bound kthreads get patched.
+ * schedule(). This helps CPU-bound kthreads get patched.
  */
-#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
-
-#define klp_cond_resched_enable() sched_dynamic_klp_enable()
-#define klp_cond_resched_disable() sched_dynamic_klp_disable()
-
-#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
 
 DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
-EXPORT_SYMBOL(klp_sched_try_switch_key);
 
-#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
-#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
-
-#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+#define klp_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
 
 /*
  * This work can be performed periodically to finish patching or unpatching any
@@ -365,26 +356,18 @@ static bool klp_try_switch_task(struct task_struct *task)
 
 void __klp_sched_try_switch(void)
 {
-	if (likely(!klp_patch_pending(current)))
-		return;
-
 	/*
-	 * This function is called from cond_resched() which is called in many
-	 * places throughout the kernel. Using the klp_mutex here might
-	 * deadlock.
-	 *
-	 * Instead, disable preemption to prevent racing with other callers of
-	 * klp_try_switch_task(). Thanks to task_call_func() they won't be
-	 * able to switch this task while it's running.
+	 * This function is called from __schedule() while a context switch is
+	 * about to happen. Preemption is already disabled and klp_mutex
+	 * can't be acquired.
+	 * Disabled preemption is used to prevent racing with other callers of
+	 * klp_try_switch_task(). Thanks to task_call_func() they won't be
+	 * able to switch to this task while it's running.
	 */
-	preempt_disable();
+	lockdep_assert_preemption_disabled();
 
-	/*
-	 * Make sure current didn't get patched between the above check and
-	 * preempt_disable().
-	 */
-	if (unlikely(!klp_patch_pending(current)))
-		goto out;
+	if (likely(!klp_patch_pending(current)))
+		return;
 
 	/*
 	 * Enforce the order of the TIF_PATCH_PENDING read above and the
@@ -395,11 +378,7 @@ void __klp_sched_try_switch(void)
 	smp_rmb();
 
 	klp_try_switch_task(current);
-
-out:
-	preempt_enable();
 }
-EXPORT_SYMBOL(__klp_sched_try_switch);
 
 /*
  * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
@@ -508,7 +487,7 @@ void klp_try_complete_transition(void)
 	}
 
 	/* Done! Now cleanup the data structures. */
-	klp_cond_resched_disable();
+	klp_resched_disable();
 	patch = klp_transition_patch;
 	klp_complete_transition();
 
@@ -560,7 +539,7 @@ void klp_start_transition(void)
 		set_tsk_thread_flag(task, TIF_PATCH_PENDING);
 	}
 
-	klp_cond_resched_enable();
+	klp_resched_enable();
 
 	klp_signals_cnt = 0;
 }
