
Commit 1a7c611

Merge tag 'perf-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf event updates from Ingo Molnar:

 - AMD IBS improvements

 - Intel PMU driver updates

 - Extend core perf facilities & the ARM PMU driver to better handle
   ARM big.LITTLE events

 - Micro-optimize software events and the ring-buffer code

 - Misc cleanups & fixes

* tag 'perf-core-2023-08-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/uncore: Remove unnecessary ?: operator around pcibios_err_to_errno() call
  perf/x86/intel: Add Crestmont PMU
  x86/cpu: Update Hybrids
  x86/cpu: Fix Crestmont uarch
  x86/cpu: Fix Gracemont uarch
  perf: Remove unused extern declaration arch_perf_get_page_size()
  perf: Remove unused PERF_PMU_CAP_HETEROGENEOUS_CPUS capability
  arm_pmu: Remove unused PERF_PMU_CAP_HETEROGENEOUS_CPUS capability
  perf/x86: Remove unused PERF_PMU_CAP_HETEROGENEOUS_CPUS capability
  arm_pmu: Add PERF_PMU_CAP_EXTENDED_HW_TYPE capability
  perf/x86/ibs: Set mem_lvl_num, mem_remote and mem_hops for data_src
  perf/mem: Add PERF_MEM_LVLNUM_NA to PERF_MEM_NA
  perf/mem: Introduce PERF_MEM_LVLNUM_UNC
  perf/ring_buffer: Use local_try_cmpxchg in __perf_output_begin
  locking/arch: Avoid variable shadowing in local_try_cmpxchg()
  perf/core: Use local64_try_cmpxchg in perf_swevent_set_period
  perf/x86: Use local64_try_cmpxchg
  perf/amd: Prevent grouping of IBS events

2 parents d637fce + 2c65477 commit 1a7c611
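
Editor's note: several commits in this pull (perf/ring_buffer, perf/core, perf/x86, and the local.h hunks below) replace open-coded cmpxchg retry loops with try_cmpxchg variants. The following is a minimal userspace sketch of that conversion pattern using C11 atomics rather than the kernel's local_t/local64_t API; the function names and the prev_count variable are invented for illustration. The point: try_cmpxchg returns a bool and, on failure, writes the freshly observed value back through its 'old' pointer, so the retry loop no longer needs a separate re-read.

/*
 * Userspace sketch of the cmpxchg -> try_cmpxchg conversion pattern.
 * C11 atomics stand in for the kernel's local64_* API; all names here
 * are invented for the example.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long prev_count;

/* Old style: cmpxchg() reports the race via its return value; the
 * caller jumps back and re-reads the counter before retrying. */
static long update_cmpxchg_style(long new_raw)
{
	long prev, expected;
again:
	prev = atomic_load(&prev_count);
	expected = prev;
	if (!atomic_compare_exchange_strong(&prev_count, &expected, new_raw))
		goto again;
	return new_raw - prev;
}

/* New style: a failed compare-exchange already stored the current
 * value into 'prev', so the loop body needs no separate re-read. */
static long update_try_cmpxchg_style(long new_raw)
{
	long prev = atomic_load(&prev_count);

	while (!atomic_compare_exchange_strong(&prev_count, &prev, new_raw))
		;	/* 'prev' was refreshed by the failed exchange */
	return new_raw - prev;
}

int main(void)
{
	atomic_store(&prev_count, 100);
	printf("delta: %ld\n", update_cmpxchg_style(150));	/* 50 */
	printf("delta: %ld\n", update_try_cmpxchg_style(175));	/* 25 */
	return 0;
}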

File tree

29 files changed, +228 -170 lines changed


arch/loongarch/include/asm/local.h

Lines changed: 2 additions & 2 deletions

@@ -63,8 +63,8 @@ static inline long local_cmpxchg(local_t *l, long old, long new)
 
 static inline bool local_try_cmpxchg(local_t *l, long *old, long new)
 {
-	typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
-	return try_cmpxchg_local(&l->a.counter, __old, new);
+	return try_cmpxchg_local(&l->a.counter,
+				 (typeof(l->a.counter) *) old, new);
 }
 
 #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))

arch/mips/include/asm/local.h

Lines changed: 2 additions & 2 deletions

@@ -101,8 +101,8 @@ static __inline__ long local_cmpxchg(local_t *l, long old, long new)
 
 static __inline__ bool local_try_cmpxchg(local_t *l, long *old, long new)
 {
-	typeof(l->a.counter) *__old = (typeof(l->a.counter) *) old;
-	return try_cmpxchg_local(&l->a.counter, __old, new);
+	return try_cmpxchg_local(&l->a.counter,
+				 (typeof(l->a.counter) *) old, new);
 }
 
 #define local_xchg(l, n) (atomic_long_xchg((&(l)->a), (n)))
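
Editor's note: the two hunks above come from the "locking/arch: Avoid variable shadowing in local_try_cmpxchg()" commit in this pull. The hazard is that local_try_cmpxchg()'s own __old temporary can collide with a like-named temporary declared inside the try_cmpxchg_local() macro expansion. Below is a hedged, self-contained illustration of that failure mode with invented macro names; it is not the kernel's actual macro stack.

/*
 * Self-contained illustration (GNU C, invented macro names) of why the
 * __old temporary was dropped: nesting two macros that each declare
 * __old makes the inner initializer 'typeof(...) *__old = (oldp)' bind
 * to the inner, still-uninitialized __old once (oldp) expands to __old.
 */
#include <stdbool.h>
#include <stdio.h>

/* Inner helper: declares its own __old, as the try_cmpxchg family of
 * macros may do internally. */
#define TRY_CMPXCHG(ptr, oldp, new)				\
({								\
	typeof(*(ptr)) *__old = (oldp);				\
	bool __ok = (*(ptr) == *__old);				\
	if (__ok)						\
		*(ptr) = (new);					\
	else							\
		*__old = *(ptr);				\
	__ok;							\
})

/*
 * Buggy outer wrapper: its __old is what (oldp) expands to inside
 * TRY_CMPXCHG, so the inner declaration shadows it and initializes
 * itself from itself -- -Wshadow flags exactly this.
 */
#define LOCAL_TRY_CMPXCHG_BAD(ptr, oldp, new)			\
({								\
	long *__old = (long *)(oldp);				\
	TRY_CMPXCHG((ptr), __old, (new));			\
})

/* Fixed form, mirroring the hunks above: cast inline, no temporary. */
#define LOCAL_TRY_CMPXCHG(ptr, oldp, new)			\
	TRY_CMPXCHG((ptr), (long *)(oldp), (new))

int main(void)
{
	long counter = 5, old = 5;

	if (LOCAL_TRY_CMPXCHG(&counter, &old, 7))
		printf("exchanged, counter=%ld\n", counter);	/* 7 */
	return 0;
}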

arch/x86/events/amd/ibs.c

Lines changed: 96 additions & 90 deletions

@@ -156,8 +156,8 @@ perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
 	 * count to the generic event atomically:
 	 */
 	prev_raw_count = local64_read(&hwc->prev_count);
-	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-			    new_raw_count) != prev_raw_count)
+	if (!local64_try_cmpxchg(&hwc->prev_count,
+				 &prev_raw_count, new_raw_count))
 		return 0;
 
 	/*
@@ -247,11 +247,33 @@ int forward_event_to_ibs(struct perf_event *event)
 	return -ENOENT;
 }
 
+/*
+ * Grouping of IBS events is not possible since IBS can have only
+ * one event active at any point in time.
+ */
+static int validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling;
+
+	if (event->group_leader == event)
+		return 0;
+
+	if (event->group_leader->pmu == event->pmu)
+		return -EINVAL;
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (sibling->pmu == event->pmu)
+			return -EINVAL;
+	}
+	return 0;
+}
+
 static int perf_ibs_init(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	struct perf_ibs *perf_ibs;
 	u64 max_cnt, config;
+	int ret;
 
 	perf_ibs = get_ibs_pmu(event->attr.type);
 	if (!perf_ibs)
@@ -265,6 +287,10 @@ static int perf_ibs_init(struct perf_event *event)
 	if (config & ~perf_ibs->config_mask)
 		return -EINVAL;
 
+	ret = validate_group(event);
+	if (ret)
+		return ret;
+
 	if (hwc->sample_period) {
 		if (config & perf_ibs->cnt_mask)
 			/* raw max_cnt may not be set */
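
Editor's note: IBS can have only one event active at a time, so a perf event group containing more than one IBS event could never be scheduled as a unit; the new validate_group() above rejects such groups at event-init time. The following is a pared-down, standalone sketch of that rule with a toy event struct invented for the example (the real code walks siblings via for_each_sibling_event()).

/*
 * Toy model of the grouping rule enforced above; struct and names are
 * invented for illustration, not kernel API.
 */
#include <stdio.h>

struct toy_event {
	const char *pmu;		/* stands in for event->pmu */
	struct toy_event *group_leader;
	struct toy_event *next_sibling;	/* leader's sibling list */
};

static int toy_validate_group(struct toy_event *event)
{
	struct toy_event *sibling;

	if (event->group_leader == event)
		return 0;		/* standalone event: always fine */

	if (event->group_leader->pmu == event->pmu)
		return -1;		/* leader already occupies our PMU */

	for (sibling = event->group_leader->next_sibling; sibling;
	     sibling = sibling->next_sibling)
		if (sibling->pmu == event->pmu)
			return -1;	/* a sibling already occupies it */

	return 0;
}

int main(void)
{
	static const char ibs[] = "ibs_op", cycles[] = "cpu_cycles";
	struct toy_event leader   = { cycles, &leader, NULL };
	struct toy_event existing = { ibs, &leader, NULL };
	struct toy_event incoming = { ibs, &leader, NULL };

	leader.next_sibling = &existing;
	/* a second IBS event in the same group is rejected */
	printf("validate: %d\n", toy_validate_group(&incoming));	/* -1 */
	return 0;
}
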
@@ -702,38 +728,63 @@ static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
 	return op_data2->data_src_lo;
 }
 
-static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
-				 union ibs_op_data3 *op_data3,
-				 struct perf_sample_data *data)
+#define	L(x)		(PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
+#define	LN(x)		PERF_MEM_S(LVLNUM, x)
+#define	REM		PERF_MEM_S(REMOTE, REMOTE)
+#define	HOPS(x)		PERF_MEM_S(HOPS, x)
+
+static u64 g_data_src[8] = {
+	[IBS_DATA_SRC_LOC_CACHE]	  = L(L3) | L(REM_CCE1) | LN(ANY_CACHE) | HOPS(0),
+	[IBS_DATA_SRC_DRAM]		  = L(LOC_RAM) | LN(RAM),
+	[IBS_DATA_SRC_REM_CACHE]	  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+	[IBS_DATA_SRC_IO]		  = L(IO) | LN(IO),
+};
+
+#define	RMT_NODE_BITS			(1 << IBS_DATA_SRC_DRAM)
+#define	RMT_NODE_APPLICABLE(x)		(RMT_NODE_BITS & (1 << x))
+
+static u64 g_zen4_data_src[32] = {
+	[IBS_DATA_SRC_EXT_LOC_CACHE]	  = L(L3) | LN(L3),
+	[IBS_DATA_SRC_EXT_NEAR_CCX_CACHE] = L(REM_CCE1) | LN(ANY_CACHE) | REM | HOPS(0),
+	[IBS_DATA_SRC_EXT_DRAM]		  = L(LOC_RAM) | LN(RAM),
+	[IBS_DATA_SRC_EXT_FAR_CCX_CACHE]  = L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1),
+	[IBS_DATA_SRC_EXT_PMEM]		  = LN(PMEM),
+	[IBS_DATA_SRC_EXT_IO]		  = L(IO) | LN(IO),
+	[IBS_DATA_SRC_EXT_EXT_MEM]	  = LN(CXL),
+};
+
+#define	ZEN4_RMT_NODE_BITS		((1 << IBS_DATA_SRC_EXT_DRAM) | \
+					 (1 << IBS_DATA_SRC_EXT_PMEM) | \
+					 (1 << IBS_DATA_SRC_EXT_EXT_MEM))
+#define	ZEN4_RMT_NODE_APPLICABLE(x)	(ZEN4_RMT_NODE_BITS & (1 << x))
+
+static __u64 perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+				  union ibs_op_data3 *op_data3,
+				  struct perf_sample_data *data)
 {
 	union perf_mem_data_src *data_src = &data->data_src;
 	u8 ibs_data_src = perf_ibs_data_src(op_data2);
 
 	data_src->mem_lvl = 0;
+	data_src->mem_lvl_num = 0;
 
 	/*
 	 * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
 	 * memory accesses. So, check DcUcMemAcc bit early.
 	 */
-	if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
-		data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
-		return;
-	}
+	if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO)
+		return L(UNC) | LN(UNC);
 
 	/* L1 Hit */
-	if (op_data3->dc_miss == 0) {
-		data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
-		return;
-	}
+	if (op_data3->dc_miss == 0)
+		return L(L1) | LN(L1);
 
 	/* L2 Hit */
 	if (op_data3->l2_miss == 0) {
 		/* Erratum #1293 */
 		if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
-		    !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
-			data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
-			return;
-		}
+		    !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc))
+			return L(L2) | LN(L2);
 	}
 
 	/*
@@ -743,82 +794,36 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
 	if (data_src->mem_op != PERF_MEM_OP_LOAD)
 		goto check_mab;
 
-	/* L3 Hit */
 	if (ibs_caps & IBS_CAPS_ZEN4) {
-		if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
-			data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
-			return;
-		}
-	} else {
-		if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
-			data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
-					    PERF_MEM_LVL_HIT;
-			return;
-		}
-	}
+		u64 val = g_zen4_data_src[ibs_data_src];
 
-	/* A peer cache in a near CCX */
-	if (ibs_caps & IBS_CAPS_ZEN4 &&
-	    ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
-		data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
-		return;
-	}
+		if (!val)
+			goto check_mab;
 
-	/* A peer cache in a far CCX */
-	if (ibs_caps & IBS_CAPS_ZEN4) {
-		if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
-			data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
-			return;
+		/* HOPS_1 because IBS doesn't provide remote socket detail */
+		if (op_data2->rmt_node && ZEN4_RMT_NODE_APPLICABLE(ibs_data_src)) {
+			if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM)
+				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+			else
+				val |= REM | HOPS(1);
 		}
-	} else {
-		if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
-			data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
-			return;
-		}
-	}
 
-	/* DRAM */
-	if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
-		if (op_data2->rmt_node == 0)
-			data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
-		else
-			data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
-		return;
-	}
+		return val;
+	} else {
+		u64 val = g_data_src[ibs_data_src];
 
-	/* PMEM */
-	if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
-		data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
-		if (op_data2->rmt_node) {
-			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
-			/* IBS doesn't provide Remote socket detail */
-			data_src->mem_hops = PERF_MEM_HOPS_1;
-		}
-		return;
-	}
+		if (!val)
+			goto check_mab;
 
-	/* Extension Memory */
-	if (ibs_caps & IBS_CAPS_ZEN4 &&
-	    ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
-		data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL;
-		if (op_data2->rmt_node) {
-			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
-			/* IBS doesn't provide Remote socket detail */
-			data_src->mem_hops = PERF_MEM_HOPS_1;
+		/* HOPS_1 because IBS doesn't provide remote socket detail */
+		if (op_data2->rmt_node && RMT_NODE_APPLICABLE(ibs_data_src)) {
+			if (ibs_data_src == IBS_DATA_SRC_DRAM)
+				val = L(REM_RAM1) | LN(RAM) | REM | HOPS(1);
+			else
+				val |= REM | HOPS(1);
 		}
-		return;
-	}
 
-	/* IO */
-	if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
-		data_src->mem_lvl = PERF_MEM_LVL_IO;
-		data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
-		if (op_data2->rmt_node) {
-			data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
-			/* IBS doesn't provide Remote socket detail */
-			data_src->mem_hops = PERF_MEM_HOPS_1;
-		}
-		return;
+		return val;
 	}
 
 check_mab:
@@ -829,12 +834,11 @@ static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
 	 * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
 	 * MAB only when IBS fails to provide DataSrc.
 	 */
-	if (op_data3->dc_miss_no_mab_alloc) {
-		data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
-		return;
-	}
+	if (op_data3->dc_miss_no_mab_alloc)
+		return L(LFB) | LN(LFB);
 
-	data_src->mem_lvl = PERF_MEM_LVL_NA;
+	/* Don't set HIT with NA */
+	return PERF_MEM_S(LVL, NA) | LN(NA);
 }
 
 static bool perf_ibs_cache_hit_st_valid(void)
@@ -924,7 +928,9 @@ static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
 				  union ibs_op_data2 *op_data2,
 				  union ibs_op_data3 *op_data3)
 {
-	perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+	union perf_mem_data_src *data_src = &data->data_src;
+
+	data_src->val |= perf_ibs_get_mem_lvl(op_data2, op_data3, data);
 	perf_ibs_get_mem_snoop(op_data2, data);
 	perf_ibs_get_tlb_lvl(op_data3, data);
 	perf_ibs_get_mem_lock(op_data3, data);

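Editor's note: the net effect of the ibs.c hunks above is that perf_ibs_get_mem_lvl() now returns one pre-composed u64 from a lookup table instead of poking individual data_src fields. Below is a minimal userspace sketch of that encoding, using only the uapi <linux/perf_event.h> PERF_MEM_S() helper and the same L()/LN()/REM/HOPS() macros as the diff; it assumes a reasonably recent kernel uapi header that defines the HOPS and LVLNUM fields.

/*
 * Decode one table slot from the diff above to show how PERF_MEM_S()
 * packs perf_mem_data_src bit-fields into a single u64.
 */
#include <linux/perf_event.h>
#include <stdio.h>

#define L(x)	(PERF_MEM_S(LVL, x) | PERF_MEM_S(LVL, HIT))
#define LN(x)	PERF_MEM_S(LVLNUM, x)
#define REM	PERF_MEM_S(REMOTE, REMOTE)
#define HOPS(x)	PERF_MEM_S(HOPS, x)

int main(void)
{
	union perf_mem_data_src src = { .val = 0 };

	/* the IBS_DATA_SRC_EXT_FAR_CCX_CACHE slot of g_zen4_data_src */
	src.val |= L(REM_CCE2) | LN(ANY_CACHE) | REM | HOPS(1);

	printf("mem_lvl=%#x lvl_num=%#x remote=%u hops=%u\n",
	       (unsigned)src.mem_lvl, (unsigned)src.mem_lvl_num,
	       (unsigned)src.mem_remote, (unsigned)src.mem_hops);
	return 0;
}
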
arch/x86/events/core.c

Lines changed: 4 additions & 7 deletions

@@ -129,13 +129,11 @@ u64 x86_perf_event_update(struct perf_event *event)
 	 * exchange a new raw count - then add that new-prev delta
 	 * count to the generic event atomically:
 	 */
-again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	rdpmcl(hwc->event_base_rdpmc, new_raw_count);
-
-	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
-			    new_raw_count) != prev_raw_count)
-		goto again;
+	do {
+		rdpmcl(hwc->event_base_rdpmc, new_raw_count);
+	} while (!local64_try_cmpxchg(&hwc->prev_count,
+				      &prev_raw_count, new_raw_count));
 
 	/*
 	 * Now we have the new raw value and have updated the prev
@@ -2168,7 +2166,6 @@ static int __init init_hw_perf_events(void)
 		hybrid_pmu->pmu = pmu;
 		hybrid_pmu->pmu.type = -1;
 		hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
-		hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
 		hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
 
 		err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
