Skip to content

Commit 2a0adc4

Browse files
committed
Merge tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov:

 - Fix virtual runtime calculation when recomputing a sched entity's
   weights

 - Fix wrongly rejected unprivileged poll requests to the cgroup psi
   pressure files

 - Make sure the load balancing is done by only one CPU

* tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix the decision for load balance
  sched: psi: fix unprivileged polling against cgroups
  sched/eevdf: Fix vruntime adjustment on reweight
2 parents 2f84f82 + 6d7e478 commit 2a0adc4

File tree

2 files changed: 135 additions (+135) and 38 deletions (-38).

kernel/cgroup/cgroup.c

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -3885,14 +3885,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
38853885
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
38863886
}
38873887

3888-
static int cgroup_pressure_open(struct kernfs_open_file *of)
3889-
{
3890-
if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
3891-
return -EPERM;
3892-
3893-
return 0;
3894-
}
3895-
38963888
static void cgroup_pressure_release(struct kernfs_open_file *of)
38973889
{
38983890
struct cgroup_file_ctx *ctx = of->priv;
@@ -5299,7 +5291,6 @@ static struct cftype cgroup_psi_files[] = {
52995291
{
53005292
.name = "io.pressure",
53015293
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
5302-
.open = cgroup_pressure_open,
53035294
.seq_show = cgroup_io_pressure_show,
53045295
.write = cgroup_io_pressure_write,
53055296
.poll = cgroup_pressure_poll,
@@ -5308,7 +5299,6 @@ static struct cftype cgroup_psi_files[] = {
53085299
{
53095300
.name = "memory.pressure",
53105301
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
5311-
.open = cgroup_pressure_open,
53125302
.seq_show = cgroup_memory_pressure_show,
53135303
.write = cgroup_memory_pressure_write,
53145304
.poll = cgroup_pressure_poll,
@@ -5317,7 +5307,6 @@ static struct cftype cgroup_psi_files[] = {
53175307
{
53185308
.name = "cpu.pressure",
53195309
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
5320-
.open = cgroup_pressure_open,
53215310
.seq_show = cgroup_cpu_pressure_show,
53225311
.write = cgroup_cpu_pressure_write,
53235312
.poll = cgroup_pressure_poll,
@@ -5327,7 +5316,6 @@ static struct cftype cgroup_psi_files[] = {
53275316
{
53285317
.name = "irq.pressure",
53295318
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
5330-
.open = cgroup_pressure_open,
53315319
.seq_show = cgroup_irq_pressure_show,
53325320
.write = cgroup_irq_pressure_write,
53335321
.poll = cgroup_pressure_poll,

kernel/sched/fair.c

Lines changed: 135 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -3666,41 +3666,140 @@ static inline void
36663666
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
36673667
#endif
36683668

3669+
static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
3670+
unsigned long weight)
3671+
{
3672+
unsigned long old_weight = se->load.weight;
3673+
u64 avruntime = avg_vruntime(cfs_rq);
3674+
s64 vlag, vslice;
3675+
3676+
/*
3677+
* VRUNTIME
3678+
* ========
3679+
*
3680+
* COROLLARY #1: The virtual runtime of the entity needs to be
3681+
* adjusted if re-weight at !0-lag point.
3682+
*
3683+
* Proof: For contradiction assume this is not true, so we can
3684+
* re-weight without changing vruntime at !0-lag point.
3685+
*
3686+
* Weight VRuntime Avg-VRuntime
3687+
* before w v V
3688+
* after w' v' V'
3689+
*
3690+
* Since lag needs to be preserved through re-weight:
3691+
*
3692+
* lag = (V - v)*w = (V'- v')*w', where v = v'
3693+
* ==> V' = (V - v)*w/w' + v (1)
3694+
*
3695+
* Let W be the total weight of the entities before reweight,
3696+
* since V' is the new weighted average of entities:
3697+
*
3698+
* V' = (WV + w'v - wv) / (W + w' - w) (2)
3699+
*
3700+
* by using (1) & (2) we obtain:
3701+
*
3702+
* (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
3703+
* ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
3704+
* ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
3705+
* ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
3706+
*
3707+
* Since we are doing at !0-lag point which means V != v, we
3708+
* can simplify (3):
3709+
*
3710+
* ==> W / (W + w' - w) = w / w'
3711+
* ==> Ww' = Ww + ww' - ww
3712+
* ==> W * (w' - w) = w * (w' - w)
3713+
* ==> W = w (re-weight indicates w' != w)
3714+
*
3715+
* So the cfs_rq contains only one entity, hence vruntime of
3716+
* the entity @v should always equal to the cfs_rq's weighted
3717+
* average vruntime @V, which means we will always re-weight
3718+
* at 0-lag point, thus breach assumption. Proof completed.
3719+
*
3720+
*
3721+
* COROLLARY #2: Re-weight does NOT affect weighted average
3722+
* vruntime of all the entities.
3723+
*
3724+
* Proof: According to corollary #1, Eq. (1) should be:
3725+
*
3726+
* (V - v)*w = (V' - v')*w'
3727+
* ==> v' = V' - (V - v)*w/w' (4)
3728+
*
3729+
* According to the weighted average formula, we have:
3730+
*
3731+
* V' = (WV - wv + w'v') / (W - w + w')
3732+
* = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
3733+
* = (WV - wv + w'V' - Vw + wv) / (W - w + w')
3734+
* = (WV + w'V' - Vw) / (W - w + w')
3735+
*
3736+
* ==> V'*(W - w + w') = WV + w'V' - Vw
3737+
* ==> V' * (W - w) = (W - w) * V (5)
3738+
*
3739+
* If the entity is the only one in the cfs_rq, then reweight
3740+
* always occurs at 0-lag point, so V won't change. Or else
3741+
* there are other entities, hence W != w, then Eq. (5) turns
3742+
* into V' = V. So V won't change in either case, proof done.
3743+
*
3744+
*
3745+
* So according to corollary #1 & #2, the effect of re-weight
3746+
* on vruntime should be:
3747+
*
3748+
* v' = V' - (V - v) * w / w' (4)
3749+
* = V - (V - v) * w / w'
3750+
* = V - vl * w / w'
3751+
* = V - vl'
3752+
*/
3753+
if (avruntime != se->vruntime) {
3754+
vlag = (s64)(avruntime - se->vruntime);
3755+
vlag = div_s64(vlag * old_weight, weight);
3756+
se->vruntime = avruntime - vlag;
3757+
}
3758+
3759+
/*
3760+
* DEADLINE
3761+
* ========
3762+
*
3763+
* When the weight changes, the virtual time slope changes and
3764+
* we should adjust the relative virtual deadline accordingly.
3765+
*
3766+
* d' = v' + (d - v)*w/w'
3767+
* = V' - (V - v)*w/w' + (d - v)*w/w'
3768+
* = V - (V - v)*w/w' + (d - v)*w/w'
3769+
* = V + (d - V)*w/w'
3770+
*/
3771+
vslice = (s64)(se->deadline - avruntime);
3772+
vslice = div_s64(vslice * old_weight, weight);
3773+
se->deadline = avruntime + vslice;
3774+
}
3775+
36693776
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
36703777
unsigned long weight)
36713778
{
3672-
unsigned long old_weight = se->load.weight;
3779+
bool curr = cfs_rq->curr == se;
36733780

36743781
if (se->on_rq) {
36753782
/* commit outstanding execution time */
3676-
if (cfs_rq->curr == se)
3783+
if (curr)
36773784
update_curr(cfs_rq);
36783785
else
3679-
avg_vruntime_sub(cfs_rq, se);
3786+
__dequeue_entity(cfs_rq, se);
36803787
update_load_sub(&cfs_rq->load, se->load.weight);
36813788
}
36823789
dequeue_load_avg(cfs_rq, se);
36833790

3684-
update_load_set(&se->load, weight);
3685-
36863791
if (!se->on_rq) {
36873792
/*
36883793
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
36893794
* we need to scale se->vlag when w_i changes.
36903795
*/
3691-
se->vlag = div_s64(se->vlag * old_weight, weight);
3796+
se->vlag = div_s64(se->vlag * se->load.weight, weight);
36923797
} else {
3693-
s64 deadline = se->deadline - se->vruntime;
3694-
/*
3695-
* When the weight changes, the virtual time slope changes and
3696-
* we should adjust the relative virtual deadline accordingly.
3697-
*/
3698-
deadline = div_s64(deadline * old_weight, weight);
3699-
se->deadline = se->vruntime + deadline;
3700-
if (se != cfs_rq->curr)
3701-
min_deadline_cb_propagate(&se->run_node, NULL);
3798+
reweight_eevdf(cfs_rq, se, weight);
37023799
}
37033800

3801+
update_load_set(&se->load, weight);
3802+
37043803
#ifdef CONFIG_SMP
37053804
do {
37063805
u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
37123811
enqueue_load_avg(cfs_rq, se);
37133812
if (se->on_rq) {
37143813
update_load_add(&cfs_rq->load, se->load.weight);
3715-
if (cfs_rq->curr != se)
3716-
avg_vruntime_add(cfs_rq, se);
3814+
if (!curr) {
3815+
/*
3816+
* The entity's vruntime has been adjusted, so let's check
3817+
* whether the rq-wide min_vruntime needs updated too. Since
3818+
* the calculations above require stable min_vruntime rather
3819+
* than up-to-date one, we do the update at the end of the
3820+
* reweight process.
3821+
*/
3822+
__enqueue_entity(cfs_rq, se);
3823+
update_min_vruntime(cfs_rq);
3824+
}
37173825
}
37183826
}
37193827

@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
38573965

38583966
#ifndef CONFIG_SMP
38593967
shares = READ_ONCE(gcfs_rq->tg->shares);
3860-
3861-
if (likely(se->load.weight == shares))
3862-
return;
38633968
#else
3864-
shares = calc_group_shares(gcfs_rq);
3969+
shares = calc_group_shares(gcfs_rq);
38653970
#endif
3866-
3867-
reweight_entity(cfs_rq_of(se), se, shares);
3971+
if (unlikely(se->load.weight != shares))
3972+
reweight_entity(cfs_rq_of(se), se, shares);
38683973
}
38693974

38703975
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -11079,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
1107911184
continue;
1108011185
}
1108111186

11082-
/* Are we the first idle CPU? */
11187+
/*
11188+
* Are we the first idle core in a non-SMT domain or higher,
11189+
* or the first idle CPU in a SMT domain?
11190+
*/
1108311191
return cpu == env->dst_cpu;
1108411192
}
1108511193

11086-
if (idle_smt == env->dst_cpu)
11087-
return true;
11194+
/* Are we the first idle CPU with busy siblings? */
11195+
if (idle_smt != -1)
11196+
return idle_smt == env->dst_cpu;
1108811197

1108911198
/* Are we the first CPU of this group ? */
1109011199
return group_balance_cpu(sg) == env->dst_cpu;

0 commit comments

Comments
 (0)