perf mem/c2c: Add load store event mappings for AMD
The 'perf mem' and 'perf c2c' tools are wrappers around 'perf record' with mem load/store events. IBS tagged load/store sample provides most of the information needed for these tools. Wire in the "ibs_op//" event as mem-ldst event for AMD. There are some limitations though: Only load/store micro-ops provide mem/c2c information. Whereas, IBS does not have a way to choose a particular type of micro-op to tag. This results in many non-LS micro-ops being tagged which appear as N/A in the perf report. IBS, being an uncore pmu from kernel point of view[1], does not support per-process monitoring. Thus, perf mem/c2c on AMD are currently supported in per-cpu mode only. Example: $ sudo perf mem record -- -c 10000 ^C[ perf record: Woken up 227 times to write data ] [ perf record: Captured and wrote 58.760 MB perf.data (836978 samples) ] $ sudo perf mem report -F mem,sample,snoop Samples: 836K of event 'ibs_op//', Event count (approx.): 8418762 Memory access Samples Snoop N/A 700620 N/A L1 hit 126675 N/A L2 hit 424 N/A L3 hit 664 HitM L3 hit 10 N/A Local RAM hit 2 N/A Remote RAM (1 hop) hit 8558 N/A Remote Cache (1 hop) hit 3 N/A Remote Cache (1 hop) hit 2 HitM Remote Cache (2 hops) hit 10 HitM Remote Cache (2 hops) hit 6 N/A Uncached hit 4 N/A $ [1]: https://lore.kernel.org/lkml/20220829113347.295-1-ravi.bangoria@amd.com Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Ali Saidi <alisaidi@amazon.com> Cc: Ananth Narayan <ananth.narayan@amd.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Borislav Petkov <bp@alien8.de> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Ian Rogers <irogers@google.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Joe Mario <jmario@redhat.com> Cc: Kan Liang <kan.liang@linux.intel.com> Cc: Kim Phillips <kim.phillips@amd.com> Cc: Leo Yan <leo.yan@linaro.org> Cc: Mark Rutland <mark.rutland@arm.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sandipan Das <sandipan.das@amd.com> Cc: Santosh Shukla <santosh.shukla@amd.com> Cc: Stephane Eranian <eranian@google.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: x86@kernel.org Link: https://lore.kernel.org/r/20221006153946.7816-6-ravi.bangoria@amd.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
This commit is contained in:
parent
4173cc055d
commit
f7b58cbdb3
|
@ -19,9 +19,10 @@ C2C stands for Cache To Cache.
|
||||||
The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows
|
The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows
|
||||||
you to track down the cacheline contentions.
|
you to track down the cacheline contentions.
|
||||||
|
|
||||||
On x86, the tool is based on load latency and precise store facility events
|
On Intel, the tool is based on load latency and precise store facility events
|
||||||
provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling
|
provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling
|
||||||
with thresholding feature.
|
with thresholding feature. On AMD, the tool uses IBS op pmu (due to hardware
|
||||||
|
limitations, perf c2c is not supported on Zen3 cpus).
|
||||||
|
|
||||||
These events provide:
|
These events provide:
|
||||||
- memory address of the access
|
- memory address of the access
|
||||||
|
@ -49,7 +50,8 @@ RECORD OPTIONS
|
||||||
|
|
||||||
-l::
|
-l::
|
||||||
--ldlat::
|
--ldlat::
|
||||||
Configure mem-loads latency. (x86 only)
|
Configure mem-loads latency. Supported on Intel and Arm64 processors
|
||||||
|
only. Ignored on other archs.
|
||||||
|
|
||||||
-k::
|
-k::
|
||||||
--all-kernel::
|
--all-kernel::
|
||||||
|
@ -135,11 +137,15 @@ Following perf record options are configured by default:
|
||||||
-W,-d,--phys-data,--sample-cpu
|
-W,-d,--phys-data,--sample-cpu
|
||||||
|
|
||||||
Unless specified otherwise with '-e' option, following events are monitored by
|
Unless specified otherwise with '-e' option, following events are monitored by
|
||||||
default on x86:
|
default on Intel:
|
||||||
|
|
||||||
cpu/mem-loads,ldlat=30/P
|
cpu/mem-loads,ldlat=30/P
|
||||||
cpu/mem-stores/P
|
cpu/mem-stores/P
|
||||||
|
|
||||||
|
following on AMD:
|
||||||
|
|
||||||
|
ibs_op//
|
||||||
|
|
||||||
and following on PowerPC:
|
and following on PowerPC:
|
||||||
|
|
||||||
cpu/mem-loads/
|
cpu/mem-loads/
|
||||||
|
|
|
@ -85,7 +85,8 @@ RECORD OPTIONS
|
||||||
Be more verbose (show counter open errors, etc)
|
Be more verbose (show counter open errors, etc)
|
||||||
|
|
||||||
--ldlat <n>::
|
--ldlat <n>::
|
||||||
Specify desired latency for loads event. (x86 only)
|
Specify desired latency for loads event. Supported on Intel and Arm64
|
||||||
|
processors only. Ignored on other archs.
|
||||||
|
|
||||||
In addition, for report all perf report options are valid, and for record
|
In addition, for report all perf report options are valid, and for record
|
||||||
all perf record options.
|
all perf record options.
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
// SPDX-License-Identifier: GPL-2.0
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
#include "util/pmu.h"
|
#include "util/pmu.h"
|
||||||
|
#include "util/env.h"
|
||||||
#include "map_symbol.h"
|
#include "map_symbol.h"
|
||||||
#include "mem-events.h"
|
#include "mem-events.h"
|
||||||
|
#include "linux/string.h"
|
||||||
|
|
||||||
static char mem_loads_name[100];
|
static char mem_loads_name[100];
|
||||||
static bool mem_loads_name__init;
|
static bool mem_loads_name__init;
|
||||||
|
@ -12,18 +14,43 @@ static char mem_stores_name[100];
|
||||||
|
|
||||||
#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
|
#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
|
||||||
|
|
||||||
static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
|
static struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
|
||||||
E("ldlat-loads", "%s/mem-loads,ldlat=%u/P", "%s/events/mem-loads"),
|
E("ldlat-loads", "%s/mem-loads,ldlat=%u/P", "%s/events/mem-loads"),
|
||||||
E("ldlat-stores", "%s/mem-stores/P", "%s/events/mem-stores"),
|
E("ldlat-stores", "%s/mem-stores/P", "%s/events/mem-stores"),
|
||||||
E(NULL, NULL, NULL),
|
E(NULL, NULL, NULL),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
|
||||||
|
E(NULL, NULL, NULL),
|
||||||
|
E(NULL, NULL, NULL),
|
||||||
|
E("mem-ldst", "ibs_op//", "ibs_op"),
|
||||||
|
};
|
||||||
|
|
||||||
|
static int perf_mem_is_amd_cpu(void)
|
||||||
|
{
|
||||||
|
struct perf_env env = { .total_mem = 0, };
|
||||||
|
|
||||||
|
perf_env__cpuid(&env);
|
||||||
|
if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD"))
|
||||||
|
return 1;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
struct perf_mem_event *perf_mem_events__ptr(int i)
|
struct perf_mem_event *perf_mem_events__ptr(int i)
|
||||||
{
|
{
|
||||||
|
/* 0: Uninitialized, 1: Yes, -1: No */
|
||||||
|
static int is_amd;
|
||||||
|
|
||||||
if (i >= PERF_MEM_EVENTS__MAX)
|
if (i >= PERF_MEM_EVENTS__MAX)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
return &perf_mem_events[i];
|
if (!is_amd)
|
||||||
|
is_amd = perf_mem_is_amd_cpu();
|
||||||
|
|
||||||
|
if (is_amd == 1)
|
||||||
|
return &perf_mem_events_amd[i];
|
||||||
|
|
||||||
|
return &perf_mem_events_intel[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_mem_loads_aux_event(struct evsel *leader)
|
bool is_mem_loads_aux_event(struct evsel *leader)
|
||||||
|
|
Loading…
Reference in New Issue