perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/*
|
|
|
|
* Implement support for AMD Fam19h Branch Sampling feature
|
|
|
|
* Based on specifications published in AMD PPR Fam19 Model 01
|
|
|
|
*
|
|
|
|
* Copyright 2021 Google LLC
|
|
|
|
* Contributed by Stephane Eranian <eranian@google.com>
|
|
|
|
*/
|
|
|
|
#include <linux/kernel.h>
|
2022-03-22 15:15:13 -07:00
|
|
|
#include <linux/jump_label.h>
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
#include <asm/msr.h>
|
|
|
|
#include <asm/cpufeature.h>
|
|
|
|
|
|
|
|
#include "../perf_event.h"
|
|
|
|
|
|
|
|
#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */
|
|
|
|
|
|
|
|
/* Debug Extension Configuration register layout */
|
|
|
|
union amd_debug_extn_cfg {
|
|
|
|
__u64 val;
|
|
|
|
struct {
|
|
|
|
__u64 rsvd0:2, /* reserved */
|
|
|
|
brsmen:1, /* branch sample enable */
|
|
|
|
rsvd4_3:2,/* reserved - must be 0x3 */
|
|
|
|
vb:1, /* valid branches recorded */
|
|
|
|
rsvd2:10, /* reserved */
|
|
|
|
msroff:4, /* index of next entry to write */
|
|
|
|
rsvd3:4, /* reserved */
|
|
|
|
pmc:3, /* #PMC holding the sampling event */
|
|
|
|
rsvd4:37; /* reserved */
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline unsigned int brs_from(int idx)
|
|
|
|
{
|
|
|
|
return MSR_AMD_SAMP_BR_FROM + 2 * idx;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int brs_to(int idx)
|
|
|
|
{
|
|
|
|
return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1;
|
|
|
|
}
|
|
|
|
|
2023-01-12 12:43:15 -07:00
|
|
|
static __always_inline void set_debug_extn_cfg(u64 val)
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
{
|
|
|
|
/* bits[4:3] must always be set to 11b */
|
2023-01-12 12:43:15 -07:00
|
|
|
__wrmsr(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3, val >> 32);
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
}
|
|
|
|
|
2023-01-12 12:43:15 -07:00
|
|
|
static __always_inline u64 get_debug_extn_cfg(void)
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
{
|
2023-01-12 12:43:15 -07:00
|
|
|
return __rdmsr(MSR_AMD_DBG_EXTN_CFG);
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool __init amd_brs_detect(void)
|
|
|
|
{
|
2022-05-16 08:48:38 -07:00
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_BRS))
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (boot_cpu_data.x86) {
|
|
|
|
case 0x19: /* AMD Fam19h (Zen3) */
|
|
|
|
x86_pmu.lbr_nr = 16;
|
|
|
|
|
|
|
|
/* No hardware filtering supported */
|
|
|
|
x86_pmu.lbr_sel_map = NULL;
|
|
|
|
x86_pmu.lbr_sel_mask = 0;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Current BRS implementation does not support branch type or privilege level
|
|
|
|
* filtering. Therefore, this function simply enforces these limitations. No need for
|
|
|
|
* a br_sel_map. Software filtering is not supported because it would not correlate well
|
|
|
|
* with a sampling period.
|
|
|
|
*/
|
2022-08-11 05:29:49 -07:00
|
|
|
static int amd_brs_setup_filter(struct perf_event *event)
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
{
|
|
|
|
u64 type = event->attr.branch_sample_type;
|
|
|
|
|
|
|
|
/* No BRS support */
|
|
|
|
if (!x86_pmu.lbr_nr)
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
/* Can only capture all branches, i.e., no filtering */
|
|
|
|
if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2022-08-11 05:29:49 -07:00
|
|
|
static inline int amd_is_brs_event(struct perf_event *e)
|
|
|
|
{
|
|
|
|
return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
int amd_brs_hw_config(struct perf_event *event)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to interrupt holding, BRS is not recommended in
|
|
|
|
* counting mode.
|
|
|
|
*/
|
|
|
|
if (!is_sampling_event(event))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Due to the way BRS operates by holding the interrupt until
|
|
|
|
* lbr_nr entries have been captured, it does not make sense
|
|
|
|
* to allow sampling on BRS with an event that does not match
|
|
|
|
* what BRS is capturing, i.e., retired taken branches.
|
|
|
|
* Otherwise the correlation with the event's period is even
|
|
|
|
* more loose:
|
|
|
|
*
|
|
|
|
* With retired taken branch:
|
|
|
|
* Effective P = P + 16 + X
|
|
|
|
* With any other event:
|
|
|
|
* Effective P = P + Y + X
|
|
|
|
*
|
|
|
|
* Where X is the number of taken branches due to interrupt
|
|
|
|
* skid. Skid is large.
|
|
|
|
*
|
2024-01-02 17:40:11 -07:00
|
|
|
* Where Y is the occurrences of the event while BRS is
|
2022-08-11 05:29:49 -07:00
|
|
|
* capturing the lbr_nr entries.
|
|
|
|
*
|
|
|
|
* By using retired taken branches, we limit the impact on the
|
|
|
|
* Y variable. We know it cannot be more than the depth of
|
|
|
|
* BRS.
|
|
|
|
*/
|
|
|
|
if (!amd_is_brs_event(event))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BRS implementation does not work with frequency mode
|
|
|
|
* reprogramming of the period.
|
|
|
|
*/
|
|
|
|
if (event->attr.freq)
|
|
|
|
return -EINVAL;
|
|
|
|
/*
|
|
|
|
* The kernel subtracts BRS depth from period, so it must
|
|
|
|
* be big enough.
|
|
|
|
*/
|
|
|
|
if (event->attr.sample_period <= x86_pmu.lbr_nr)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if we can allow PERF_SAMPLE_BRANCH_STACK
|
|
|
|
*/
|
|
|
|
ret = amd_brs_setup_filter(event);
|
|
|
|
|
|
|
|
/* only set in case of success */
|
|
|
|
if (!ret)
|
|
|
|
event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
/* tos = top of stack, i.e., last valid entry written */
|
|
|
|
static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* msroff: index of next entry to write so top-of-stack is one off
|
|
|
|
* if BRS is full then msroff is set back to 0.
|
|
|
|
*/
|
|
|
|
return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* make sure we have a sane BRS offset to begin with
|
|
|
|
* especially with kexec
|
|
|
|
*/
|
|
|
|
void amd_brs_reset(void)
|
|
|
|
{
|
2022-05-16 08:48:38 -07:00
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_BRS))
|
|
|
|
return;
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
/*
|
|
|
|
* Reset config
|
|
|
|
*/
|
|
|
|
set_debug_extn_cfg(0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Mark first entry as poisoned
|
|
|
|
*/
|
|
|
|
wrmsrl(brs_to(0), BRS_POISON);
|
|
|
|
}
|
|
|
|
|
|
|
|
int __init amd_brs_init(void)
|
|
|
|
{
|
|
|
|
if (!amd_brs_detect())
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void amd_brs_enable(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
union amd_debug_extn_cfg cfg;
|
|
|
|
|
|
|
|
/* Activate only on first user */
|
|
|
|
if (++cpuc->brs_active > 1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
cfg.val = 0; /* reset all fields */
|
|
|
|
cfg.brsmen = 1; /* enable branch sampling */
|
|
|
|
|
|
|
|
/* Set enable bit */
|
|
|
|
set_debug_extn_cfg(cfg.val);
|
|
|
|
}
|
|
|
|
|
|
|
|
void amd_brs_enable_all(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
if (cpuc->lbr_users)
|
|
|
|
amd_brs_enable();
|
|
|
|
}
|
|
|
|
|
|
|
|
void amd_brs_disable(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
union amd_debug_extn_cfg cfg;
|
|
|
|
|
|
|
|
/* Check if active (could be disabled via x86_pmu_disable_all()) */
|
|
|
|
if (!cpuc->brs_active)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Only disable for last user */
|
|
|
|
if (--cpuc->brs_active)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear the brsmen bit but preserve the others as they contain
|
|
|
|
* useful state such as vb and msroff
|
|
|
|
*/
|
|
|
|
cfg.val = get_debug_extn_cfg();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When coming in on interrupt and BRS is full, then hw will have
|
|
|
|
* already stopped BRS, no need to issue wrmsr again
|
|
|
|
*/
|
|
|
|
if (cfg.brsmen) {
|
|
|
|
cfg.brsmen = 0;
|
|
|
|
set_debug_extn_cfg(cfg.val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void amd_brs_disable_all(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
if (cpuc->lbr_users)
|
|
|
|
amd_brs_disable();
|
|
|
|
}
|
|
|
|
|
2022-03-22 15:15:09 -07:00
|
|
|
static bool amd_brs_match_plm(struct perf_event *event, u64 to)
|
|
|
|
{
|
|
|
|
int type = event->attr.branch_sample_type;
|
|
|
|
int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV;
|
|
|
|
int plm_u = PERF_SAMPLE_BRANCH_USER;
|
|
|
|
|
|
|
|
if (!(type & plm_k) && kernel_ip(to))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
if (!(type & plm_u) && !kernel_ip(to))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
/*
|
|
|
|
* Caller must ensure amd_brs_inuse() is true before calling
|
|
|
|
* return:
|
|
|
|
*/
|
|
|
|
void amd_brs_drain(void)
|
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
struct perf_event *event = cpuc->events[0];
|
|
|
|
struct perf_branch_entry *br = cpuc->lbr_entries;
|
|
|
|
union amd_debug_extn_cfg cfg;
|
|
|
|
u32 i, nr = 0, num, tos, start;
|
|
|
|
u32 shift = 64 - boot_cpu_data.x86_virt_bits;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BRS event forced on PMC0,
|
|
|
|
* so check if there is an event.
|
|
|
|
* It is possible to have lbr_users > 0 but the event
|
|
|
|
* not yet scheduled due to long latency PMU irq
|
|
|
|
*/
|
|
|
|
if (!event)
|
|
|
|
goto empty;
|
|
|
|
|
|
|
|
cfg.val = get_debug_extn_cfg();
|
|
|
|
|
|
|
|
/* Sanity check [0-x86_pmu.lbr_nr] */
|
|
|
|
if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr))
|
|
|
|
goto empty;
|
|
|
|
|
|
|
|
/* No valid branch */
|
|
|
|
if (cfg.vb == 0)
|
|
|
|
goto empty;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* msr.off points to next entry to be written
|
|
|
|
* tos = most recent entry index = msr.off - 1
|
|
|
|
* BRS register buffer saturates, so we know we have
|
|
|
|
* start < tos and that we have to read from start to tos
|
|
|
|
*/
|
|
|
|
start = 0;
|
|
|
|
tos = amd_brs_get_tos(&cfg);
|
|
|
|
|
|
|
|
num = tos - start + 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* BRS is only one pass (saturation) from MSROFF to depth-1
|
|
|
|
* MSROFF wraps to zero when buffer is full
|
|
|
|
*/
|
|
|
|
for (i = 0; i < num; i++) {
|
|
|
|
u32 brs_idx = tos - i;
|
|
|
|
u64 from, to;
|
|
|
|
|
|
|
|
rdmsrl(brs_to(brs_idx), to);
|
|
|
|
|
|
|
|
/* Entry does not belong to us (as marked by kernel) */
|
|
|
|
if (to == BRS_POISON)
|
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved.
|
|
|
|
* Necessary to generate proper virtual addresses suitable for
|
|
|
|
* symbolization
|
|
|
|
*/
|
|
|
|
to = (u64)(((s64)to << shift) >> shift);
|
|
|
|
|
2022-03-22 15:15:09 -07:00
|
|
|
if (!amd_brs_match_plm(event, to))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
rdmsrl(brs_from(brs_idx), from);
|
|
|
|
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
perf_clear_branch_entry_bitfields(br+nr);
|
|
|
|
|
|
|
|
br[nr].from = from;
|
|
|
|
br[nr].to = to;
|
|
|
|
|
|
|
|
nr++;
|
|
|
|
}
|
|
|
|
empty:
|
|
|
|
/* Record number of sampled branches */
|
|
|
|
cpuc->lbr_stack.nr = nr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Poison most recent entry to prevent reuse by next task
|
|
|
|
* required because BRS entry are not tagged by PID
|
|
|
|
*/
|
|
|
|
static void amd_brs_poison_buffer(void)
|
|
|
|
{
|
|
|
|
union amd_debug_extn_cfg cfg;
|
|
|
|
unsigned int idx;
|
|
|
|
|
|
|
|
/* Get current state */
|
|
|
|
cfg.val = get_debug_extn_cfg();
|
|
|
|
|
|
|
|
/* idx is most recently written entry */
|
|
|
|
idx = amd_brs_get_tos(&cfg);
|
|
|
|
|
|
|
|
/* Poison target of entry */
|
|
|
|
wrmsrl(brs_to(idx), BRS_POISON);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On context switch in, we need to make sure no samples from previous user
|
|
|
|
* are left in the BRS.
|
|
|
|
*
|
|
|
|
* On ctxswin, sched_in = true, called after the PMU has started
|
|
|
|
* On ctxswout, sched_in = false, called before the PMU is stopped
|
|
|
|
*/
|
perf: Rewrite core context handling
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).
Notably:
- HW breakpoint PMU
- ARM big.little PMU / Intel ADL PMU
- Intel Branch Monitoring PMU
- AMD IBS PMU
- S390 cpum_cf PMU
- PowerPC trace_imc PMU
*Current design:*
Currently we have a per task and per cpu perf_event_contexts:
task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
^ | ^ | ^
`---------------------------------' | `--> pmu ---'
v ^
perf_event ------'
Each task has an array of pointers to a perf_event_context. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task related perf_event_context's have a
pointer back to that task.
Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.
The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.
Each perf_event is then associated with its PMU and one
perf_event_context.
*Proposed design:*
New design proposed by this patch reduce to a single task context and
a single CPU context but adds some intermediate data-structures:
task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
^ | ^ ^
`---------------------------' | |
| | perf_cpu_pmu_context <--.
| `----. ^ |
| | | |
| v v |
| ,--> perf_event_pmu_context |
| | |
| | |
v v |
perf_event ---> pmu ----------------'
With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This can be achieved
by adding pmu to rbtree key:
{cpu, pmu, cgroup, group_index}
Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.
Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state like hrtimer details to drive the event rotation, a pointer to
perf_event_pmu_context of currently running task and some other
ancillary information.
Each perf_event is associated to it's pmu, perf_event_context and
perf_event_pmu_context.
Further optimizations to current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only single pmu
events.
Much thanks to Ravi for picking this up and pushing it towards
completion.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
2022-10-07 23:24:24 -07:00
|
|
|
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
|
perf/x86/amd: Add AMD Fam19h Branch Sampling support
Add support for the AMD Fam19h 16-deep branch sampling feature as
described in the AMD PPR Fam19h Model 01h Revision B1. This is a model
specific extension. It is not an architected AMD feature.
The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR
registers. There is no branch type filtering. All control flow changes are
captured. BRS relies on specific programming of the core PMU of Fam19h. In
particular, the following requirements must be met:
- the sampling period be greater than 16 (BRS depth)
- the sampling period must use a fixed and not frequency mode
BRS interacts with the NMI interrupt as well. Because enabling BRS is
expensive, it is only activated after P event occurrences, where P is the
desired sampling period. At P occurrences of the event, the counter
overflows, the CPU catches the interrupt, activates BRS for 16 branches until
it saturates, and then delivers the NMI to the kernel. Between the overflow
and the time BRS activates more branches may be executed skewing the period.
All along, the sampling event keeps counting. The skid may be attenuated by
reducing the sampling period by 16 (subsequent patch).
BRS is integrated into perf_events seamlessly via the same
PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry
records in the sampling buffer. No prediction information is supported. The
branches are stored in reverse order of execution. The most recent branch is
the first entry in each record.
No modification to the perf tool is necessary.
BRS can be used with any sampling event. However, it is recommended to use
the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS
captures.
$ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test
$ perf report -D
56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0
... branch stack: nr:16
..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0
..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0
..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0
..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0
..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0
..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0
..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0
..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0
..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0
..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0
..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0
..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0
..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0
..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0
..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0
..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0
... thread: test:18230
...... dso: test
Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com
2022-03-22 15:15:07 -07:00
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
|
|
|
|
/* no active users */
|
|
|
|
if (!cpuc->lbr_users)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* On context switch in, we need to ensure we do not use entries
|
|
|
|
* from previous BRS user on that CPU, so we poison the buffer as
|
|
|
|
* a faster way compared to resetting all entries.
|
|
|
|
*/
|
|
|
|
if (sched_in)
|
|
|
|
amd_brs_poison_buffer();
|
|
|
|
}
|
2022-03-22 15:15:13 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* called from ACPI processor_idle.c or acpi_pad.c
|
|
|
|
* with interrupts disabled
|
|
|
|
*/
|
2023-01-12 12:43:15 -07:00
|
|
|
void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
|
2022-03-22 15:15:13 -07:00
|
|
|
{
|
|
|
|
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
|
|
|
|
union amd_debug_extn_cfg cfg;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* on mwait in, we may end up in non C0 state.
|
|
|
|
* we must disable branch sampling to avoid holding the NMI
|
|
|
|
* for too long. We disable it in hardware but we
|
|
|
|
* keep the state in cpuc, so we can re-enable.
|
|
|
|
*
|
|
|
|
* The hardware will deliver the NMI if needed when brsmen cleared
|
|
|
|
*/
|
|
|
|
if (cpuc->brs_active) {
|
|
|
|
cfg.val = get_debug_extn_cfg();
|
|
|
|
cfg.brsmen = !lopwr_in;
|
|
|
|
set_debug_extn_cfg(cfg.val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
|
|
|
|
EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb);
|
|
|
|
|
|
|
|
void __init amd_brs_lopwr_init(void)
|
|
|
|
{
|
|
|
|
static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
|
|
|
|
}
|