Skip to content

Commit ddae0ca

Browse files
johnstultz-work authored and Peter Zijlstra committed
sched: Move psi_account_irqtime() out of update_rq_clock_task() hotpath
It was reported that in moving to 6.1, a larger than 10% regression was seen in the performance of clock_gettime(CLOCK_THREAD_CPUTIME_ID,...). Using a simple reproducer, I found: 5.10: 100000000 calls in 24345994193 ns => 243.460 ns per call 100000000 calls in 24288172050 ns => 242.882 ns per call 100000000 calls in 24289135225 ns => 242.891 ns per call 6.1: 100000000 calls in 28248646742 ns => 282.486 ns per call 100000000 calls in 28227055067 ns => 282.271 ns per call 100000000 calls in 28177471287 ns => 281.775 ns per call The cause of this was finally narrowed down to the addition of psi_account_irqtime() in update_rq_clock_task(), in commit 52b1364 ("sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure"). In my initial attempt to resolve this, I leaned towards moving all accounting work out of the clock_gettime() call path, but it wasn't very pretty, so it will have to wait for a later deeper rework. Instead, Peter shared this approach: Rework psi_account_irqtime() to use its own psi_irq_time base for accounting, and move it out of the hotpath, calling it instead from sched_tick() and __schedule(). In testing this, we found the importance of ensuring psi_account_irqtime() is run under the rq_lock, which Johannes Weiner helpfully explained, so also add some lockdep annotations to make that requirement clear. With this change the performance is back in-line with 5.10: 6.1+fix: 100000000 calls in 24297324597 ns => 242.973 ns per call 100000000 calls in 24318869234 ns => 243.189 ns per call 100000000 calls in 24291564588 ns => 242.916 ns per call Reported-by: Jimmy Shiu <jimmyshiu@google.com> Originally-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: John Stultz <jstultz@google.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev> Reviewed-by: Qais Yousef <qyousef@layalina.io> Link: https://lore.kernel.org/r/20240618215909.4099720-1-jstultz@google.com
1 parent b58652d commit ddae0ca

File tree

4 files changed

+30
-10
lines changed

4 files changed

+30
-10
lines changed

kernel/sched/core.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
723723

724724
rq->prev_irq_time += irq_delta;
725725
delta -= irq_delta;
726-
psi_account_irqtime(rq->curr, irq_delta);
727726
delayacct_irq(rq->curr, irq_delta);
728727
#endif
729728
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -5665,7 +5664,7 @@ void sched_tick(void)
56655664
{
56665665
int cpu = smp_processor_id();
56675666
struct rq *rq = cpu_rq(cpu);
5668-
struct task_struct *curr = rq->curr;
5667+
struct task_struct *curr;
56695668
struct rq_flags rf;
56705669
unsigned long hw_pressure;
56715670
u64 resched_latency;
@@ -5677,6 +5676,9 @@ void sched_tick(void)
56775676

56785677
rq_lock(rq, &rf);
56795678

5679+
curr = rq->curr;
5680+
psi_account_irqtime(rq, curr, NULL);
5681+
56805682
update_rq_clock(rq);
56815683
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
56825684
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
67376739
++*switch_count;
67386740

67396741
migrate_disable_switch(rq, prev);
6742+
psi_account_irqtime(rq, prev, next);
67406743
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
67416744

67426745
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);

kernel/sched/psi.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
773773
enum psi_states s;
774774
u32 state_mask;
775775

776+
lockdep_assert_rq_held(cpu_rq(cpu));
776777
groupc = per_cpu_ptr(group->pcpu, cpu);
777778

778779
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
991992
}
992993

993994
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
994-
void psi_account_irqtime(struct task_struct *task, u32 delta)
995+
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
995996
{
996-
int cpu = task_cpu(task);
997+
int cpu = task_cpu(curr);
997998
struct psi_group *group;
998999
struct psi_group_cpu *groupc;
999-
u64 now;
1000+
u64 now, irq;
1001+
s64 delta;
10001002

10011003
if (static_branch_likely(&psi_disabled))
10021004
return;
10031005

1004-
if (!task->pid)
1006+
if (!curr->pid)
1007+
return;
1008+
1009+
lockdep_assert_rq_held(rq);
1010+
group = task_psi_group(curr);
1011+
if (prev && task_psi_group(prev) == group)
10051012
return;
10061013

10071014
now = cpu_clock(cpu);
1015+
irq = irq_time_read(cpu);
1016+
delta = (s64)(irq - rq->psi_irq_time);
1017+
if (delta < 0)
1018+
return;
1019+
rq->psi_irq_time = irq;
10081020

1009-
group = task_psi_group(task);
10101021
do {
10111022
if (!group->enabled)
10121023
continue;

kernel/sched/sched.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,6 +1126,7 @@ struct rq {
11261126

11271127
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
11281128
u64 prev_irq_time;
1129+
u64 psi_irq_time;
11291130
#endif
11301131
#ifdef CONFIG_PARAVIRT
11311132
u64 prev_steal_time;

kernel/sched/stats.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
110110
void psi_task_change(struct task_struct *task, int clear, int set);
111111
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
112112
bool sleep);
113-
void psi_account_irqtime(struct task_struct *task, u32 delta);
114-
113+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
114+
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
115+
#else
116+
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
117+
struct task_struct *prev) {}
118+
#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
115119
/*
116120
* PSI tracks state that persists across sleeps, such as iowaits and
117121
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
192196
static inline void psi_sched_switch(struct task_struct *prev,
193197
struct task_struct *next,
194198
bool sleep) {}
195-
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
199+
static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
200+
struct task_struct *prev) {}
196201
#endif /* CONFIG_PSI */
197202

198203
#ifdef CONFIG_SCHED_INFO

0 commit comments

Comments
 (0)