Skip to content

Commit bd9a3db

Browse files
committed
Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
2 parents 1df046a + 34f26a1 commit bd9a3db

File tree

9 files changed

+362
-103
lines changed

9 files changed

+362
-103
lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
976976
killing cgroups is a process directed operation, i.e. it affects
977977
the whole thread-group.
978978

979+
cgroup.pressure
980+
A read-write single value file whose allowed values are "0" and "1".
981+
The default is "1".
982+
983+
Writing "0" to the file will disable the cgroup PSI accounting.
984+
Writing "1" to the file will re-enable the cgroup PSI accounting.
985+
986+
This control attribute is not hierarchical, so disabling or enabling PSI
987+
accounting in a cgroup does not affect PSI accounting in descendants
988+
and does not require enablement to be passed down through ancestors from the root.
989+
990+
The reason this control attribute exists is that PSI accounts stalls for
991+
each cgroup separately and aggregates it at each level of the hierarchy.
992+
This may cause non-negligible overhead for some workloads that run
993+
deep in the hierarchy, in which case this control attribute can
994+
be used to disable PSI accounting in the non-leaf cgroups.
995+
996+
irq.pressure
997+
A read-write nested-keyed file.
998+
999+
Shows pressure stall information for IRQ/SOFTIRQ. See
1000+
:ref:`Documentation/accounting/psi.rst <psi>` for details.
1001+
9791002
Controllers
9801003
===========
9811004

include/linux/cgroup-defs.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,9 @@ struct cgroup {
428428
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
429429
struct cgroup_file events_file; /* handle for "cgroup.events" */
430430

431+
/* handles for "{cpu,memory,io,irq}.pressure" */
432+
struct cgroup_file psi_files[NR_PSI_RESOURCES];
433+
431434
/*
432435
* The bitmask of subsystems enabled on the child cgroups.
433436
* ->subtree_control is the one configured through

include/linux/cgroup.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
682682
pr_cont_kernfs_path(cgrp->kn);
683683
}
684684

685-
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
686-
{
687-
return cgrp->psi;
688-
}
689-
690685
bool cgroup_psi_enabled(void);
691686

692687
static inline void cgroup_init_kthreadd(void)

include/linux/psi.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <linux/sched.h>
88
#include <linux/poll.h>
99
#include <linux/cgroup-defs.h>
10+
#include <linux/cgroup.h>
1011

1112
struct seq_file;
1213
struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;
1819

1920
void psi_init(void);
2021

21-
void psi_task_change(struct task_struct *task, int clear, int set);
22-
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
23-
bool sleep);
24-
2522
void psi_memstall_enter(unsigned long *flags);
2623
void psi_memstall_leave(unsigned long *flags);
2724

@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
3431
poll_table *wait);
3532

3633
#ifdef CONFIG_CGROUPS
34+
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
35+
{
36+
return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
37+
}
38+
3739
int psi_cgroup_alloc(struct cgroup *cgrp);
3840
void psi_cgroup_free(struct cgroup *cgrp);
3941
void cgroup_move_task(struct task_struct *p, struct css_set *to);
42+
void psi_cgroup_restart(struct psi_group *group);
4043
#endif
4144

4245
#else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
5861
{
5962
rcu_assign_pointer(p->cgroups, to);
6063
}
64+
static inline void psi_cgroup_restart(struct psi_group *group) {}
6165
#endif
6266

6367
#endif /* CONFIG_PSI */

include/linux/psi_types.h

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,6 @@ enum psi_task_count {
1515
NR_IOWAIT,
1616
NR_MEMSTALL,
1717
NR_RUNNING,
18-
/*
19-
* This can't have values other than 0 or 1 and could be
20-
* implemented as a bit flag. But for now we still have room
21-
* in the first cacheline of psi_group_cpu, and this way we
22-
* don't have to special case any state tracking for it.
23-
*/
24-
NR_ONCPU,
2518
/*
2619
* For IO and CPU stalls the presence of running/oncpu tasks
2720
* in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
3225
* threads and memstall ones.
3326
*/
3427
NR_MEMSTALL_RUNNING,
35-
NR_PSI_TASK_COUNTS = 5,
28+
NR_PSI_TASK_COUNTS = 4,
3629
};
3730

3831
/* Task state bitmasks */
3932
#define TSK_IOWAIT (1 << NR_IOWAIT)
4033
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
4134
#define TSK_RUNNING (1 << NR_RUNNING)
42-
#define TSK_ONCPU (1 << NR_ONCPU)
4335
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
4436

37+
/* Only one task can be scheduled, no corresponding task count */
38+
#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS)
39+
4540
/* Resources that workloads could be stalled on */
4641
enum psi_res {
4742
PSI_IO,
4843
PSI_MEM,
4944
PSI_CPU,
50-
NR_PSI_RESOURCES = 3,
45+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
46+
PSI_IRQ,
47+
#endif
48+
NR_PSI_RESOURCES,
5149
};
5250

5351
/*
@@ -63,11 +61,17 @@ enum psi_states {
6361
PSI_MEM_FULL,
6462
PSI_CPU_SOME,
6563
PSI_CPU_FULL,
64+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
65+
PSI_IRQ_FULL,
66+
#endif
6667
/* Only per-CPU, to weigh the CPU in the global average: */
6768
PSI_NONIDLE,
68-
NR_PSI_STATES = 7,
69+
NR_PSI_STATES,
6970
};
7071

72+
/* Use one bit in the state mask to track TSK_ONCPU */
73+
#define PSI_ONCPU (1 << NR_PSI_STATES)
74+
7175
enum psi_aggregators {
7276
PSI_AVGS = 0,
7377
PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
147151
};
148152

149153
struct psi_group {
154+
struct psi_group *parent;
155+
bool enabled;
156+
150157
/* Protects data used by the aggregator */
151158
struct mutex avgs_lock;
152159

@@ -188,6 +195,8 @@ struct psi_group {
188195

189196
#else /* CONFIG_PSI */
190197

198+
#define NR_PSI_RESOURCES 0
199+
191200
struct psi_group { };
192201

193202
#endif /* CONFIG_PSI */

kernel/cgroup/cgroup.c

Lines changed: 95 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
36983698
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
36993699
{
37003700
struct cgroup *cgrp = seq_css(seq)->cgroup;
3701-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3701+
struct psi_group *psi = cgroup_psi(cgrp);
37023702

37033703
return psi_show(seq, psi, PSI_IO);
37043704
}
37053705
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
37063706
{
37073707
struct cgroup *cgrp = seq_css(seq)->cgroup;
3708-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3708+
struct psi_group *psi = cgroup_psi(cgrp);
37093709

37103710
return psi_show(seq, psi, PSI_MEM);
37113711
}
37123712
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
37133713
{
37143714
struct cgroup *cgrp = seq_css(seq)->cgroup;
3715-
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3715+
struct psi_group *psi = cgroup_psi(cgrp);
37163716

37173717
return psi_show(seq, psi, PSI_CPU);
37183718
}
37193719

3720-
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
3721-
size_t nbytes, enum psi_res res)
3720+
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
3721+
size_t nbytes, enum psi_res res)
37223722
{
37233723
struct cgroup_file_ctx *ctx = of->priv;
37243724
struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
37383738
return -EBUSY;
37393739
}
37403740

3741-
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
3741+
psi = cgroup_psi(cgrp);
37423742
new = psi_trigger_create(psi, buf, res);
37433743
if (IS_ERR(new)) {
37443744
cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
37553755
char *buf, size_t nbytes,
37563756
loff_t off)
37573757
{
3758-
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
3758+
return pressure_write(of, buf, nbytes, PSI_IO);
37593759
}
37603760

37613761
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
37623762
char *buf, size_t nbytes,
37633763
loff_t off)
37643764
{
3765-
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
3765+
return pressure_write(of, buf, nbytes, PSI_MEM);
37663766
}
37673767

37683768
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
37693769
char *buf, size_t nbytes,
37703770
loff_t off)
37713771
{
3772-
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
3772+
return pressure_write(of, buf, nbytes, PSI_CPU);
3773+
}
3774+
3775+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
3776+
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
3777+
{
3778+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3779+
struct psi_group *psi = cgroup_psi(cgrp);
3780+
3781+
return psi_show(seq, psi, PSI_IRQ);
3782+
}
3783+
3784+
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
3785+
char *buf, size_t nbytes,
3786+
loff_t off)
3787+
{
3788+
return pressure_write(of, buf, nbytes, PSI_IRQ);
3789+
}
3790+
#endif
3791+
3792+
static int cgroup_pressure_show(struct seq_file *seq, void *v)
3793+
{
3794+
struct cgroup *cgrp = seq_css(seq)->cgroup;
3795+
struct psi_group *psi = cgroup_psi(cgrp);
3796+
3797+
seq_printf(seq, "%d\n", psi->enabled);
3798+
3799+
return 0;
3800+
}
3801+
3802+
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
3803+
char *buf, size_t nbytes,
3804+
loff_t off)
3805+
{
3806+
ssize_t ret;
3807+
int enable;
3808+
struct cgroup *cgrp;
3809+
struct psi_group *psi;
3810+
3811+
ret = kstrtoint(strstrip(buf), 0, &enable);
3812+
if (ret)
3813+
return ret;
3814+
3815+
if (enable < 0 || enable > 1)
3816+
return -ERANGE;
3817+
3818+
cgrp = cgroup_kn_lock_live(of->kn, false);
3819+
if (!cgrp)
3820+
return -ENOENT;
3821+
3822+
psi = cgroup_psi(cgrp);
3823+
if (psi->enabled != enable) {
3824+
int i;
3825+
3826+
/* show or hide {cpu,memory,io,irq}.pressure files */
3827+
for (i = 0; i < NR_PSI_RESOURCES; i++)
3828+
cgroup_file_show(&cgrp->psi_files[i], enable);
3829+
3830+
psi->enabled = enable;
3831+
if (enable)
3832+
psi_cgroup_restart(psi);
3833+
}
3834+
3835+
cgroup_kn_unlock(of->kn);
3836+
3837+
return nbytes;
37733838
}
37743839

37753840
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
37893854

37903855
bool cgroup_psi_enabled(void)
37913856
{
3857+
if (static_branch_likely(&psi_disabled))
3858+
return false;
3859+
37923860
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
37933861
}
37943862

@@ -5175,25 +5243,43 @@ static struct cftype cgroup_psi_files[] = {
51755243
#ifdef CONFIG_PSI
51765244
{
51775245
.name = "io.pressure",
5246+
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
51785247
.seq_show = cgroup_io_pressure_show,
51795248
.write = cgroup_io_pressure_write,
51805249
.poll = cgroup_pressure_poll,
51815250
.release = cgroup_pressure_release,
51825251
},
51835252
{
51845253
.name = "memory.pressure",
5254+
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
51855255
.seq_show = cgroup_memory_pressure_show,
51865256
.write = cgroup_memory_pressure_write,
51875257
.poll = cgroup_pressure_poll,
51885258
.release = cgroup_pressure_release,
51895259
},
51905260
{
51915261
.name = "cpu.pressure",
5262+
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
51925263
.seq_show = cgroup_cpu_pressure_show,
51935264
.write = cgroup_cpu_pressure_write,
51945265
.poll = cgroup_pressure_poll,
51955266
.release = cgroup_pressure_release,
51965267
},
5268+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
5269+
{
5270+
.name = "irq.pressure",
5271+
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
5272+
.seq_show = cgroup_irq_pressure_show,
5273+
.write = cgroup_irq_pressure_write,
5274+
.poll = cgroup_pressure_poll,
5275+
.release = cgroup_pressure_release,
5276+
},
5277+
#endif
5278+
{
5279+
.name = "cgroup.pressure",
5280+
.seq_show = cgroup_pressure_show,
5281+
.write = cgroup_pressure_write,
5282+
},
51975283
#endif /* CONFIG_PSI */
51985284
{ } /* terminate */
51995285
};

kernel/sched/core.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
701701

702702
rq->prev_irq_time += irq_delta;
703703
delta -= irq_delta;
704+
psi_account_irqtime(rq->curr, irq_delta);
704705
#endif
705706
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
706707
if (static_key_false((&paravirt_steal_rq_enabled))) {

0 commit comments

Comments
 (0)