
Commit 96492a6

Chengming Zhou authored and Peter Zijlstra committed
perf/core: Fix perf_cgroup_switch()
There is a race problem that can trigger WARN_ON_ONCE(cpuctx->cgrp) in
perf_cgroup_switch().

CPU1                                    CPU2
perf_cgroup_sched_out(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(prev, PERF_CGROUP_SWOUT)
                                        cgroup_migrate_execute()
                                          task->cgroups = ?
                                          perf_cgroup_attach()
                                            task_function_call(task, __perf_cgroup_move)
perf_cgroup_sched_in(prev, next)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    perf_cgroup_switch(next, PERF_CGROUP_SWIN)
                                        __perf_cgroup_move()
                                          perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN)

Commit a8d757e ("perf events: Fix slow and broken cgroup context switch
code") wanted to skip perf_cgroup_switch() when the perf_cgroup of "prev"
and "next" are the same.

But task->cgroups can change concurrently with context_switch(), in
cgroup_migrate_execute(). If cgrp1 == cgrp2 in sched_out(), cpuctx won't
do sched_out. If task->cgroups then changes so that cgrp1 != cgrp2 in
sched_in(), cpuctx will do sched_in and trigger WARN_ON_ONCE(cpuctx->cgrp).

Even though __perf_cgroup_move() is serialized against the context switch
by the disabled interrupts, context_switch() can still observe
task->cgroups changing in the middle, since task->cgroups is changed
before the IPI is sent.

So we have to fold perf_cgroup_sched_in() into perf_cgroup_sched_out(),
unified into perf_cgroup_switch(), to fix the inconsistency between
perf_cgroup_sched_out() and perf_cgroup_sched_in().

But we can't just compare prev->cgroups with next->cgroups to decide
whether to skip the cpuctx sched_out/in, since prev->cgroups is changing
too. For example:

CPU1                                    CPU2
                                        cgroup_migrate_execute()
                                          prev->cgroups = ?
                                          perf_cgroup_attach()
                                            task_function_call(task, __perf_cgroup_move)
perf_cgroup_switch(task)
  cgrp1 = perf_cgroup_from_task(prev)
  cgrp2 = perf_cgroup_from_task(next)
  if (cgrp1 != cgrp2)
    cpuctx sched_out/in ...
                                        task_function_call() will return -ESRCH

In the above example, prev->cgroups changing causes (cgrp1 == cgrp2) to be
true, so the cpuctx sched_out/in is skipped. Later, task_function_call()
returns -ESRCH since the prev task is no longer running on the CPU. We
would thus leave the perf_events of the old prev->cgroups still scheduled
on the CPU, which is wrong.

The solution is to compare cpuctx->cgrp with the next task's perf_cgroup.
Since cpuctx->cgrp can only be changed on the local CPU, and we have irqs
disabled, we can read cpuctx->cgrp for the comparison without holding the
ctx lock.

Fixes: a8d757e ("perf events: Fix slow and broken cgroup context switch code")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220329154523.86438-4-zhouchengming@bytedance.com
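
For readability, below is the reworked perf_cgroup_switch() as it reads after this
patch, reassembled from the diff hunks further down; context lines that fall outside
the hunks (such as the existing comment above local_irq_save()) are omitted, and the
two short annotations near the top of the loop are editorial, not part of the commit.
The SWOUT/SWIN mode split is gone; instead the function bails out early when the
per-CPU context is already tracking the task's cgroup:

static void perf_cgroup_switch(struct task_struct *task)
{
	struct perf_cgroup *cgrp;
	struct perf_cpu_context *cpuctx, *tmp;
	struct list_head *list;
	unsigned long flags;

	local_irq_save(flags);

	cgrp = perf_cgroup_from_task(task, NULL);

	list = this_cpu_ptr(&cgrp_cpuctx_list);
	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
		/*
		 * cpuctx->cgrp is only written on the local CPU and irqs are
		 * off, so it can be compared without holding the ctx lock.
		 * Nothing to do if we already run this cgroup's events.
		 */
		if (READ_ONCE(cpuctx->cgrp) == cgrp)
			continue;

		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
		perf_pmu_disable(cpuctx->ctx.pmu);

		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
		/*
		 * must not be done before ctxswout due
		 * to update_cgrp_time_from_cpuctx() in
		 * ctx_sched_out()
		 */
		cpuctx->cgrp = cgrp;
		/*
		 * set cgrp before ctxsw in to allow
		 * perf_cgroup_set_timestamp() in ctx_sched_in()
		 * to not have to pass task around
		 */
		cpu_ctx_sched_in(cpuctx, EVENT_ALL);

		perf_pmu_enable(cpuctx->ctx.pmu);
		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
	}

	local_irq_restore(flags);
}

Both __perf_event_task_sched_out() and __perf_cgroup_move() now funnel into this one
function, so sched-out and sched-in can no longer disagree about which cgroup the CPU
context is on.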
1 parent 6875186 commit 96492a6

1 file changed: +25 -107 lines

kernel/events/core.c

Lines changed: 25 additions & 107 deletions
@@ -824,17 +824,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 
 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 
-#define PERF_CGROUP_SWOUT	0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN	0x2 /* cgroup switch in events based on task */
-
 /*
  * reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
  */
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
 {
+	struct perf_cgroup *cgrp;
 	struct perf_cpu_context *cpuctx, *tmp;
 	struct list_head *list;
 	unsigned long flags;
@@ -845,94 +840,38 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
 	 */
 	local_irq_save(flags);
 
+	cgrp = perf_cgroup_from_task(task, NULL);
+
 	list = this_cpu_ptr(&cgrp_cpuctx_list);
 	list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
 		WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+		if (READ_ONCE(cpuctx->cgrp) == cgrp)
+			continue;
 
 		perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 		perf_pmu_disable(cpuctx->ctx.pmu);
 
-		if (mode & PERF_CGROUP_SWOUT) {
-			cpu_ctx_sched_out(cpuctx, EVENT_ALL);
-			/*
-			 * must not be done before ctxswout due
-			 * to event_filter_match() in event_sched_out()
-			 */
-			cpuctx->cgrp = NULL;
-		}
+		cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+		/*
+		 * must not be done before ctxswout due
+		 * to update_cgrp_time_from_cpuctx() in
+		 * ctx_sched_out()
+		 */
+		cpuctx->cgrp = cgrp;
+		/*
+		 * set cgrp before ctxsw in to allow
+		 * perf_cgroup_set_timestamp() in ctx_sched_in()
+		 * to not have to pass task around
+		 */
+		cpu_ctx_sched_in(cpuctx, EVENT_ALL);
 
-		if (mode & PERF_CGROUP_SWIN) {
-			WARN_ON_ONCE(cpuctx->cgrp);
-			/*
-			 * set cgrp before ctxsw in to allow
-			 * perf_cgroup_set_timestamp() in ctx_sched_in()
-			 * to not have to pass task around
-			 * we pass the cpuctx->ctx to perf_cgroup_from_task()
-			 * because cgorup events are only per-cpu
-			 */
-			cpuctx->cgrp = perf_cgroup_from_task(task,
-							     &cpuctx->ctx);
-			cpu_ctx_sched_in(cpuctx, EVENT_ALL);
-		}
 		perf_pmu_enable(cpuctx->ctx.pmu);
 		perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 	}
 
 	local_irq_restore(flags);
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(next, NULL);
-
-	/*
-	 * only schedule out current cgroup events if we know
-	 * that we are switching to a different cgroup. Otherwise,
-	 * do no touch the cgroup events.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
-	rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-	struct perf_cgroup *cgrp1;
-	struct perf_cgroup *cgrp2 = NULL;
-
-	rcu_read_lock();
-	/*
-	 * we come here when we know perf_cgroup_events > 0
-	 * we do not need to pass the ctx here because we know
-	 * we are holding the rcu lock
-	 */
-	cgrp1 = perf_cgroup_from_task(task, NULL);
-	cgrp2 = perf_cgroup_from_task(prev, NULL);
-
-	/*
-	 * only need to schedule in cgroup events if we are changing
-	 * cgroup during ctxsw. Cgroup events were not scheduled
-	 * out of ctxsw out if that was not the case.
-	 */
-	if (cgrp1 != cgrp2)
-		perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
-	rcu_read_unlock();
-}
-
 static int perf_cgroup_ensure_storage(struct perf_event *event,
 				      struct cgroup_subsys_state *css)
 {
@@ -1096,16 +1035,6 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
 {
 }
 
-static inline void perf_cgroup_sched_out(struct task_struct *task,
-					 struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
-					struct task_struct *task)
-{
-}
-
 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
 				      struct perf_event_attr *attr,
 				      struct perf_event *group_leader)
@@ -1118,11 +1047,6 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
 {
 }
 
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
-{
-}
-
 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 {
 	return 0;
@@ -1142,6 +1066,10 @@ static inline void
 perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
 {
 }
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
 #endif
 
 /*
@@ -3661,7 +3589,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 	 * cgroup event are system-wide mode only
 	 */
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_out(task, next);
+		perf_cgroup_switch(next);
 }
 
 /*
@@ -3975,16 +3903,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	struct perf_event_context *ctx;
 	int ctxn;
 
-	/*
-	 * If cgroup events exist on this CPU, then we need to check if we have
-	 * to switch in PMU state; cgroup event are system-wide mode only.
-	 *
-	 * Since cgroup events are CPU events, we must schedule these in before
-	 * we schedule in the task events.
-	 */
-	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
-		perf_cgroup_sched_in(prev, task);
-
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (likely(!ctx))
@@ -13556,7 +13474,7 @@ static int __perf_cgroup_move(void *info)
 {
 	struct task_struct *task = info;
 	rcu_read_lock();
-	perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+	perf_cgroup_switch(task);
 	rcu_read_unlock();
 	return 0;
 }
