
Commit 11137d3

vingu-linaro (Vincent Guittot) authored and Ingo Molnar committed
sched/fair: Simplify util_est
With UTIL_EST_FASTUP now being permanent, we can take advantage of the fact that the ewma jumps directly to a higher utilization at dequeue to simplify util_est and remove the enqueued field.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Hongyan Xia <hongyan.xia2@arm.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Link: https://lore.kernel.org/r/20231201161652.1241695-3-vincent.guittot@linaro.org
1 parent 7736ae5 commit 11137d3

File tree: 4 files changed (+48, -94 lines changed)

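The shape of the change, condensed from the include/linux/sched.h hunks below (an abridged sketch for orientation, not the full definitions): the two-field struct util_est goes away and struct sched_avg carries a single unsigned int holding the EWMA, with UTIL_AVG_UNCHANGED still packed into its most significant bit.

/* Before: two values per sched_avg (abridged from the removed struct). */
struct util_est {
	unsigned int		enqueued;	/* util_avg at last dequeue */
	unsigned int		ewma;		/* smoothed utilization */
#define UTIL_EST_WEIGHT_SHIFT	2
#define UTIL_AVG_UNCHANGED	0x80000000
} __attribute__((__aligned__(sizeof(u64))));

/* After: the EWMA jumps straight to util_avg whenever utilization rises,
 * so the separate 'enqueued' copy is redundant and one word suffices. */
struct sched_avg {
	/* ... load_avg, runnable_avg, util_avg ... */
	unsigned int		util_est;	/* EWMA; MSB = UTIL_AVG_UNCHANGED */
} ____cacheline_aligned;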

include/linux/sched.h

Lines changed: 12 additions & 37 deletions
@@ -415,42 +415,6 @@ struct load_weight {
 	u32			inv_weight;
 };
 
-/**
- * struct util_est - Estimation utilization of FAIR tasks
- * @enqueued: instantaneous estimated utilization of a task/cpu
- * @ewma:     the Exponential Weighted Moving Average (EWMA)
- *            utilization of a task
- *
- * Support data structure to track an Exponential Weighted Moving Average
- * (EWMA) of a FAIR task's utilization. New samples are added to the moving
- * average each time a task completes an activation. Sample's weight is chosen
- * so that the EWMA will be relatively insensitive to transient changes to the
- * task's workload.
- *
- * The enqueued attribute has a slightly different meaning for tasks and cpus:
- * - task:   the task's util_avg at last task dequeue time
- * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU
- * Thus, the util_est.enqueued of a task represents the contribution on the
- * estimated utilization of the CPU where that task is currently enqueued.
- *
- * Only for tasks we track a moving average of the past instantaneous
- * estimated utilization. This allows to absorb sporadic drops in utilization
- * of an otherwise almost periodic task.
- *
- * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
- * updates. When a task is dequeued, its util_est should not be updated if its
- * util_avg has not been updated in the meantime.
- * This information is mapped into the MSB bit of util_est.enqueued at dequeue
- * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
- * for a task) it is safe to use MSB.
- */
-struct util_est {
-	unsigned int			enqueued;
-	unsigned int			ewma;
-#define UTIL_EST_WEIGHT_SHIFT		2
-#define UTIL_AVG_UNCHANGED		0x80000000
-} __attribute__((__aligned__(sizeof(u64))));
-
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -505,9 +469,20 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
-	struct util_est			util_est;
+	unsigned int			util_est;
 } ____cacheline_aligned;
 
+/*
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est at dequeue time.
+ * Since max value of util_est for a task is 1024 (PELT util_avg for a task)
+ * it is safe to use MSB.
+ */
+#define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
+
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
 	u64				wait_start;

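Why borrowing the MSB is safe, per the new comment above: a task's util_est value is capped at 1024 (the PELT util_avg ceiling), nowhere near bit 31. A standalone user-space sketch of the packing; the helper names are illustrative and not kernel APIs:

#include <assert.h>

#define UTIL_AVG_UNCHANGED	0x80000000u

/* Illustrative helpers: set and strip the flag around the utilization value. */
static inline unsigned int util_est_mark_unchanged(unsigned int util_est)
{
	return util_est | UTIL_AVG_UNCHANGED;
}

static inline unsigned int util_est_value(unsigned int util_est)
{
	return util_est & ~UTIL_AVG_UNCHANGED;
}

int main(void)
{
	/* 1024 is the maximum task utilization, well below bit 31. */
	unsigned int ue = util_est_mark_unchanged(1024);

	assert(ue & UTIL_AVG_UNCHANGED);	/* flag is set */
	assert(util_est_value(ue) == 1024);	/* value is preserved intact */
	return 0;
}
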
kernel/sched/debug.c

Lines changed: 3 additions & 4 deletions
@@ -684,8 +684,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->avg.runnable_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, "  .%-30s: %u\n", "util_est_enqueued",
-			cfs_rq->avg.util_est.enqueued);
+	SEQ_printf(m, "  .%-30s: %u\n", "util_est",
+			cfs_rq->avg.util_est);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
@@ -1075,8 +1075,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.runnable_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
-	P(se.avg.util_est.ewma);
-	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
+	PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);

kernel/sched/fair.c

Lines changed: 31 additions & 51 deletions
@@ -4781,9 +4781,7 @@ static inline unsigned long task_runnable(struct task_struct *p)
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
-	struct util_est ue = READ_ONCE(p->se.avg.util_est);
-
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED;
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4800,9 +4798,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued += _task_util_est(p);
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
@@ -4816,34 +4814,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 		return;
 
 	/* Update root cfs_rq's estimated utilization */
-	enqueued  = cfs_rq->avg.util_est.enqueued;
+	enqueued  = cfs_rq->avg.util_est;
 	enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
-	WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+	WRITE_ONCE(cfs_rq->avg.util_est, enqueued);
 
 	trace_sched_util_est_cfs_tp(cfs_rq);
 }
 
 #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
 
-/*
- * Check if a (signed) value is within a specified (unsigned) margin,
- * based on the observation that:
- *
- *	abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
- *
- * NOTE: this only works when value + margin < INT_MAX.
- */
-static inline bool within_margin(int value, int margin)
-{
-	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
-}
-
 static inline void util_est_update(struct cfs_rq *cfs_rq,
 				   struct task_struct *p,
 				   bool task_sleep)
 {
-	long last_ewma_diff, last_enqueued_diff;
-	struct util_est ue;
+	unsigned int ewma, dequeued, last_ewma_diff;
 
 	if (!sched_feat(UTIL_EST))
 		return;
@@ -4855,77 +4839,73 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	if (!task_sleep)
 		return;
 
+	/* Get current estimate of utilization */
+	ewma = READ_ONCE(p->se.avg.util_est);
+
 	/*
 	 * If the PELT values haven't changed since enqueue time,
 	 * skip the util_est update.
 	 */
-	ue = p->se.avg.util_est;
-	if (ue.enqueued & UTIL_AVG_UNCHANGED)
+	if (ewma & UTIL_AVG_UNCHANGED)
 		return;
 
-	last_enqueued_diff = ue.enqueued;
+	/* Get utilization at dequeue */
+	dequeued = task_util(p);
 
 	/*
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = task_util(p);
-	if (ue.ewma < ue.enqueued) {
-		ue.ewma = ue.enqueued;
+	if (ewma <= dequeued) {
+		ewma = dequeued;
 		goto done;
 	}
 
 	/*
 	 * Skip update of task's estimated utilization when its members are
 	 * already ~1% close to its last activation value.
 	 */
-	last_ewma_diff = ue.enqueued - ue.ewma;
-	last_enqueued_diff -= ue.enqueued;
-	if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
-		if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
-			goto done;
-
-		return;
-	}
+	last_ewma_diff = ewma - dequeued;
+	if (last_ewma_diff < UTIL_EST_MARGIN)
+		goto done;
 
 	/*
 	 * To avoid overestimation of actual task utilization, skip updates if
 	 * we cannot grant there is idle time in this CPU.
 	 */
-	if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
+	if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))))
 		return;
 
 	/*
 	 * To avoid underestimate of task utilization, skip updates of EWMA if
 	 * we cannot grant that thread got all CPU time it wanted.
 	 */
-	if ((ue.enqueued + UTIL_EST_MARGIN) < task_runnable(p))
+	if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p))
 		goto done;
 
 
 	/*
 	 * Update Task's estimated utilization
 	 *
 	 * When *p completes an activation we can consolidate another sample
-	 * of the task size. This is done by storing the current PELT value
-	 * as ue.enqueued and by using this value to update the Exponential
-	 * Weighted Moving Average (EWMA):
+	 * of the task size. This is done by using this value to update the
+	 * Exponential Weighted Moving Average (EWMA):
 	 *
 	 *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
 	 *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
 	 *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
-	 *          = w * (      last_ewma_diff            ) +     ewma(t-1)
-	 *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+	 *          = w * (     -last_ewma_diff            ) +     ewma(t-1)
+	 *          = w * (-last_ewma_diff +  ewma(t-1) / w)
 	 *
 	 * Where 'w' is the weight of new samples, which is configured to be
 	 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
 	 */
-	ue.ewma  <<= UTIL_EST_WEIGHT_SHIFT;
-	ue.ewma   += last_ewma_diff;
-	ue.ewma  >>= UTIL_EST_WEIGHT_SHIFT;
+	ewma <<= UTIL_EST_WEIGHT_SHIFT;
+	ewma  -= last_ewma_diff;
+	ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
-	ue.enqueued |= UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(p->se.avg.util_est, ue);
+	ewma |= UTIL_AVG_UNCHANGED;
+	WRITE_ONCE(p->se.avg.util_est, ewma);
 
 	trace_sched_util_est_se_tp(&p->se);
 }
@@ -7653,16 +7633,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
 	if (sched_feat(UTIL_EST)) {
 		unsigned long util_est;
 
-		util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+		util_est = READ_ONCE(cfs_rq->avg.util_est);
 
 		/*
 		 * During wake-up @p isn't enqueued yet and doesn't contribute
-		 * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+		 * to any cpu_rq(cpu)->cfs.avg.util_est.
		 * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
 		 * has been enqueued.
 		 *
 		 * During exec (@dst_cpu = -1) @p is enqueued and does
-		 * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+		 * contribute to cpu_rq(cpu)->cfs.util_est.
 		 * Remove it to "simulate" cpu_util without @p's contribution.
 		 *
 		 * Despite the task_on_rq_queued(@p) check there is still a

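With the enqueued field gone, the EWMA step in util_est_update() reduces to a fixed-point decay with weight w = 1/4, applied only when utilization dropped: increases reset the EWMA directly, and drops within roughly 1% of capacity are skipped. A small self-contained example of the arithmetic, with made-up numbers:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2	/* w = 1/4 */

/* Decay-only EWMA step mirroring the rewritten util_est_update(); this sketch
 * assumes the caller already handled the reset (ewma <= dequeued) and the
 * ~1% margin cases, so last_ewma_diff is non-negative. */
static unsigned int ewma_decay(unsigned int ewma, unsigned int dequeued)
{
	unsigned int last_ewma_diff = ewma - dequeued;

	ewma <<= UTIL_EST_WEIGHT_SHIFT;
	ewma  -= last_ewma_diff;
	ewma >>= UTIL_EST_WEIGHT_SHIFT;
	return ewma;
}

int main(void)
{
	/* Example values, not taken from the patch: EWMA 400, dequeue at 300. */
	unsigned int ewma = ewma_decay(400, 300);

	/* (4 * 400 - 100) / 4 = 375, i.e. 1/4 * 300 + 3/4 * 400. */
	printf("new ewma = %u\n", ewma);
	return 0;
}
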
kernel/sched/pelt.h

Lines changed: 2 additions & 2 deletions
@@ -52,13 +52,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 		return;
 
 	/* Avoid store if the flag has been already reset */
-	enqueued = avg->util_est.enqueued;
+	enqueued = avg->util_est;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
 
 	/* Reset flag to report util_avg has been updated */
 	enqueued &= ~UTIL_AVG_UNCHANGED;
-	WRITE_ONCE(avg->util_est.enqueued, enqueued);
+	WRITE_ONCE(avg->util_est, enqueued);
 }
 
 static inline u64 rq_clock_pelt(struct rq *rq)

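Taken together with the fair.c hunk above, UTIL_AVG_UNCHANGED implements a small handshake: dequeue publishes an estimate and sets the flag, a later util_avg update clears it in cfs_se_util_change(), and the next dequeue folds in a new sample only if the flag was cleared in between. A user-space simulation of that flow; the function names and the driver in main() are illustrative only:

#include <stdio.h>

#define UTIL_AVG_UNCHANGED	0x80000000u

static unsigned int util_est;	/* stands in for se.avg.util_est */

/* Dequeue side (mirrors util_est_update()): skip if util_avg has not moved
 * since the last dequeue, otherwise take the sample and re-arm the flag. */
static void dequeue_update(unsigned int sample)
{
	if (util_est & UTIL_AVG_UNCHANGED) {
		printf("skip: util_avg unchanged since last dequeue\n");
		return;
	}
	util_est = sample | UTIL_AVG_UNCHANGED;
}

/* PELT side (mirrors cfs_se_util_change()): a util_avg update clears the flag. */
static void pelt_update(void)
{
	util_est &= ~UTIL_AVG_UNCHANGED;
}

int main(void)
{
	dequeue_update(300);	/* taken: flag was clear */
	dequeue_update(500);	/* skipped: no PELT update in between */
	pelt_update();		/* util_avg moved, flag cleared */
	dequeue_update(500);	/* taken again */
	printf("util_est value = %u\n", util_est & ~UTIL_AVG_UNCHANGED);
	return 0;
}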