Skip to content

Commit a2a3374

Browse files
arighi
authored and tejun committed
sched_ext: idle: Refresh idle masks during idle-to-idle transitions
With the consolidation of put_prev_task/set_next_task(), see commit 436f3ee ("sched: Combine the last put_prev_task() and the first set_next_task()"), we are now skipping the transition between these two functions when the previous and the next tasks are the same. As a result, the scx idle state of a CPU is updated only when transitioning to or from the idle thread. While this is generally correct, it can lead to uneven and inefficient core utilization in certain scenarios [1]. A typical scenario involves proactive wake-ups: scx_bpf_pick_idle_cpu() selects and marks an idle CPU as busy, followed by a wake-up via scx_bpf_kick_cpu(), without dispatching any tasks. In this case, the CPU continues running the idle thread, returns to idle, but remains marked as busy, preventing it from being selected again as an idle CPU (until a task eventually runs on it and releases the CPU). For example, running a workload that uses 20% of each CPU, combined with an scx scheduler using proactive wake-ups, results in the following core utilization: CPU 0: 25.7% CPU 1: 29.3% CPU 2: 26.5% CPU 3: 25.5% CPU 4: 0.0% CPU 5: 25.5% CPU 6: 0.0% CPU 7: 10.5% To address this, refresh the idle state also in pick_task_idle(), during idle-to-idle transitions, but only trigger ops.update_idle() on actual state changes to prevent unnecessary updates to the scx scheduler and maintain balanced state transitions. With this change in place, the core utilization in the previous example becomes the following: CPU 0: 18.8% CPU 1: 19.4% CPU 2: 18.0% CPU 3: 18.7% CPU 4: 19.3% CPU 5: 18.9% CPU 6: 18.7% CPU 7: 19.3% [1] sched-ext/scx#1139 Fixes: 7c65ae8 ("sched_ext: Don't call put_prev_task_scx() before picking the next task") Signed-off-by: Andrea Righi <arighi@nvidia.com> Signed-off-by: Tejun Heo <tj@kernel.org>
1 parent 68e449d commit a2a3374

File tree

3 files changed

+59
-15
lines changed

3 files changed

+59
-15
lines changed

kernel/sched/ext.c

Lines changed: 52 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3590,16 +3590,8 @@ static void reset_idle_masks(void)
35903590
cpumask_copy(idle_masks.smt, cpu_online_mask);
35913591
}
35923592

3593-
void __scx_update_idle(struct rq *rq, bool idle)
3593+
static void update_builtin_idle(int cpu, bool idle)
35943594
{
3595-
int cpu = cpu_of(rq);
3596-
3597-
if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
3598-
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3599-
if (!static_branch_unlikely(&scx_builtin_idle_enabled))
3600-
return;
3601-
}
3602-
36033595
if (idle)
36043596
cpumask_set_cpu(cpu, idle_masks.cpu);
36053597
else
@@ -3626,6 +3618,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
36263618
#endif
36273619
}
36283620

3621+
/*
3622+
* Update the idle state of a CPU to @idle.
3623+
*
3624+
* If @do_notify is true, ops.update_idle() is invoked to notify the scx
3625+
* scheduler of an actual idle state transition (idle to busy or vice
3626+
* versa). If @do_notify is false, only the idle state in the idle masks is
3627+
* refreshed without invoking ops.update_idle().
3628+
*
3629+
* This distinction is necessary, because an idle CPU can be "reserved" and
3630+
* awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
3631+
* busy even if no tasks are dispatched. In this case, the CPU may return
3632+
* to idle without a true state transition. Refreshing the idle masks
3633+
* without invoking ops.update_idle() ensures accurate idle state tracking
3634+
* while avoiding unnecessary updates and maintaining balanced state
3635+
* transitions.
3636+
*/
3637+
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
3638+
{
3639+
int cpu = cpu_of(rq);
3640+
3641+
lockdep_assert_rq_held(rq);
3642+
3643+
/*
3644+
* Trigger ops.update_idle() only when transitioning from a task to
3645+
* the idle thread and vice versa.
3646+
*
3647+
* Idle transitions are indicated by do_notify being set to true,
3648+
* managed by put_prev_task_idle()/set_next_task_idle().
3649+
*/
3650+
if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
3651+
SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
3652+
3653+
/*
3654+
* Update the idle masks:
3655+
* - for real idle transitions (do_notify == true)
3656+
* - for idle-to-idle transitions (indicated by the previous task
3657+
* being the idle thread, managed by pick_task_idle())
3658+
*
3659+
* Skip updating idle masks if the previous task is not the idle
3660+
* thread, since set_next_task_idle() has already handled it when
3661+
* transitioning from a task to the idle thread (calling this
3662+
* function with do_notify == true).
3663+
*
3664+
* In this way we can avoid updating the idle masks twice,
3665+
* unnecessarily.
3666+
*/
3667+
if (static_branch_likely(&scx_builtin_idle_enabled))
3668+
if (do_notify || is_idle_task(rq->curr))
3669+
update_builtin_idle(cpu, idle);
3670+
}
3671+
36293672
static void handle_hotplug(struct rq *rq, bool online)
36303673
{
36313674
int cpu = cpu_of(rq);

kernel/sched/ext.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
5757
#endif /* CONFIG_SCHED_CLASS_EXT */
5858

5959
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
60-
void __scx_update_idle(struct rq *rq, bool idle);
60+
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
6161

62-
static inline void scx_update_idle(struct rq *rq, bool idle)
62+
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
6363
{
6464
if (scx_enabled())
65-
__scx_update_idle(rq, idle);
65+
__scx_update_idle(rq, idle, do_notify);
6666
}
6767
#else
68-
static inline void scx_update_idle(struct rq *rq, bool idle) {}
68+
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
6969
#endif
7070

7171
#ifdef CONFIG_CGROUP_SCHED

kernel/sched/idle.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
452452
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
453453
{
454454
dl_server_update_idle_time(rq, prev);
455-
scx_update_idle(rq, false);
455+
scx_update_idle(rq, false, true);
456456
}
457457

458458
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
459459
{
460460
update_idle_core(rq);
461-
scx_update_idle(rq, true);
461+
scx_update_idle(rq, true, true);
462462
schedstat_inc(rq->sched_goidle);
463463
next->se.exec_start = rq_clock_task(rq);
464464
}
465465

466466
struct task_struct *pick_task_idle(struct rq *rq)
467467
{
468+
scx_update_idle(rq, true, false);
468469
return rq->idle;
469470
}
470471

0 commit comments

Comments
 (0)