Skip to content

Commit f841b68

Browse files
Chengming Zhou authored and Peter Zijlstra committed
perf/core: Fix cgroup events tracking
We encounter perf warnings when using cgroup events like:

  cd /sys/fs/cgroup
  mkdir test
  perf stat -e cycles -a -G test

Which then triggers:

  WARNING: CPU: 0 PID: 690 at kernel/events/core.c:849 perf_cgroup_switch+0xb2/0xc0
  Call Trace:
   <TASK>
   __schedule+0x4ae/0x9f0
   ? _raw_spin_unlock_irqrestore+0x23/0x40
   ? __cond_resched+0x18/0x20
   preempt_schedule_common+0x2d/0x70
   __cond_resched+0x18/0x20
   wait_for_completion+0x2f/0x160
   ? cpu_stop_queue_work+0x9e/0x130
   affine_move_task+0x18a/0x4f0

  WARNING: CPU: 0 PID: 690 at kernel/events/core.c:829 ctx_sched_in+0x1cf/0x1e0
  Call Trace:
   <TASK>
   ? ctx_sched_out+0xb7/0x1b0
   perf_cgroup_switch+0x88/0xc0
   __schedule+0x4ae/0x9f0
   ? _raw_spin_unlock_irqrestore+0x23/0x40
   ? __cond_resched+0x18/0x20
   preempt_schedule_common+0x2d/0x70
   __cond_resched+0x18/0x20
   wait_for_completion+0x2f/0x160
   ? cpu_stop_queue_work+0x9e/0x130
   affine_move_task+0x18a/0x4f0

The above two warnings are not complete here since I removed other unimportant information.

The problem is caused by the perf cgroup events tracking:

  CPU0                                  CPU1
  perf_event_open()
    perf_event_alloc()
      account_event()
        account_event_cpu()
          atomic_inc(perf_cgroup_events)
                                        __perf_event_task_sched_out()
                                          if (atomic_read(perf_cgroup_events))
                                            perf_cgroup_switch()
                                              // kernel/events/core.c:849
                                              WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0)
                                              if (READ_ONCE(cpuctx->cgrp) == cgrp) // false
                                                return
                                              perf_ctx_lock()
                                              ctx_sched_out()
                                              cpuctx->cgrp = cgrp
                                              ctx_sched_in()
                                                perf_cgroup_set_timestamp()
                                                  // kernel/events/core.c:829
                                                  WARN_ON_ONCE(!ctx->nr_cgroups)
                                              perf_ctx_unlock()
    perf_install_in_context()
      cpu_function_call()
                                        __perf_install_in_context()
                                          add_event_to_ctx()
                                            list_add_event()
                                              perf_cgroup_event_enable()
                                                ctx->nr_cgroups++
                                                cpuctx->cgrp = X

We can see from above that we wrongly use percpu atomic perf_cgroup_events to check if we need to perf_cgroup_switch(), which should only be used when we know this CPU has cgroup events enabled.
The commit bd27568 ("perf: Rewrite core context handling") changed to have only one context per-CPU, so we can just use cpuctx->cgrp to check if this CPU has cgroup events enabled. So percpu atomic perf_cgroup_events is not needed.

Fixes: bd27568 ("perf: Rewrite core context handling")
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Ravi Bangoria <ravi.bangoria@amd.com>
Link: https://lkml.kernel.org/r/20221207124023.66252-1-zhouchengming@bytedance.com
1 parent e2d3714 commit f841b68

File tree

1 file changed

+10
-32
lines changed

1 file changed

+10
-32
lines changed

kernel/events/core.c

Lines changed: 10 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,6 @@ enum event_type_t {
380380

381381
/*
382382
* perf_sched_events : >0 events exist
383-
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
384383
*/
385384

386385
static void perf_sched_delayed(struct work_struct *work);
@@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
389388
static DEFINE_MUTEX(perf_sched_mutex);
390389
static atomic_t perf_sched_count;
391390

392-
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
393391
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
394392

395393
static atomic_t nr_mmap_events __read_mostly;
@@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task)
844842
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
845843
struct perf_cgroup *cgrp;
846844

847-
cgrp = perf_cgroup_from_task(task, NULL);
845+
/*
846+
* cpuctx->cgrp is set when the first cgroup event enabled,
847+
* and is cleared when the last cgroup event disabled.
848+
*/
849+
if (READ_ONCE(cpuctx->cgrp) == NULL)
850+
return;
848851

849852
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
853+
854+
cgrp = perf_cgroup_from_task(task, NULL);
850855
if (READ_ONCE(cpuctx->cgrp) == cgrp)
851856
return;
852857

@@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
36313636
* to check if we have to switch out PMU state.
36323637
* cgroup event are system-wide mode only
36333638
*/
3634-
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3635-
perf_cgroup_switch(next);
3639+
perf_cgroup_switch(next);
36363640
}
36373641

36383642
static bool perf_less_group_idx(const void *l, const void *r)
@@ -4974,15 +4978,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event)
49744978
detach_sb_event(event);
49754979
}
49764980

4977-
static void unaccount_event_cpu(struct perf_event *event, int cpu)
4978-
{
4979-
if (event->parent)
4980-
return;
4981-
4982-
if (is_cgroup_event(event))
4983-
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4984-
}
4985-
49864981
#ifdef CONFIG_NO_HZ_FULL
49874982
static DEFINE_SPINLOCK(nr_freq_lock);
49884983
#endif
@@ -5048,8 +5043,6 @@ static void unaccount_event(struct perf_event *event)
50485043
schedule_delayed_work(&perf_sched_work, HZ);
50495044
}
50505045

5051-
unaccount_event_cpu(event, event->cpu);
5052-
50535046
unaccount_pmu_sb_event(event);
50545047
}
50555048

@@ -11679,15 +11672,6 @@ static void account_pmu_sb_event(struct perf_event *event)
1167911672
attach_sb_event(event);
1168011673
}
1168111674

11682-
static void account_event_cpu(struct perf_event *event, int cpu)
11683-
{
11684-
if (event->parent)
11685-
return;
11686-
11687-
if (is_cgroup_event(event))
11688-
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
11689-
}
11690-
1169111675
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
1169211676
static void account_freq_event_nohz(void)
1169311677
{
@@ -11775,8 +11759,6 @@ static void account_event(struct perf_event *event)
1177511759
}
1177611760
enabled:
1177711761

11778-
account_event_cpu(event, event->cpu);
11779-
1178011762
account_pmu_sb_event(event);
1178111763
}
1178211764

@@ -12822,13 +12804,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx,
1282212804

1282312805
perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
1282412806
perf_remove_from_context(event, 0);
12825-
unaccount_event_cpu(event, cpu);
1282612807
put_pmu_ctx(event->pmu_ctx);
1282712808
list_add(&event->migrate_entry, events);
1282812809

1282912810
for_each_sibling_event(sibling, event) {
1283012811
perf_remove_from_context(sibling, 0);
12831-
unaccount_event_cpu(sibling, cpu);
1283212812
put_pmu_ctx(sibling->pmu_ctx);
1283312813
list_add(&sibling->migrate_entry, events);
1283412814
}
@@ -12847,7 +12827,6 @@ static void __perf_pmu_install_event(struct pmu *pmu,
1284712827

1284812828
if (event->state >= PERF_EVENT_STATE_OFF)
1284912829
event->state = PERF_EVENT_STATE_INACTIVE;
12850-
account_event_cpu(event, cpu);
1285112830
perf_install_in_context(ctx, event, cpu);
1285212831
}
1285312832

@@ -13742,8 +13721,7 @@ static int __perf_cgroup_move(void *info)
1374213721
struct task_struct *task = info;
1374313722

1374413723
preempt_disable();
13745-
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
13746-
perf_cgroup_switch(task);
13724+
perf_cgroup_switch(task);
1374713725
preempt_enable();
1374813726

1374913727
return 0;

0 commit comments

Comments
 (0)