76d3685400
The control knobs set before loading BPF programs should be declared as
'const volatile' so that they can be optimized by the BPF core.

Committer testing:

  root@x1:~# perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1

   Performance counter stats for 'sleep 1':

           2,442,583      cpu_core/cycles/
           2,494,425      cpu_core/instructions/

         1.002687372 seconds time elapsed

         0.001126000 seconds user
         0.001166000 seconds sys

  root@x1:~# perf trace -e bpf --max-events 10 perf stat --bpf-counters -e cpu_core/cycles/,cpu_core/instructions/ sleep 1
       0.000 ( 0.019 ms): perf/2944119 bpf(cmd: OBJ_GET, uattr: 0x7fffdf5cdd40, size: 20)            = 5
       0.021 ( 0.002 ms): perf/2944119 bpf(cmd: OBJ_GET_INFO_BY_FD, uattr: 0x7fffdf5cdcd0, size: 16) = 0
       0.030 ( 0.005 ms): perf/2944119 bpf(cmd: MAP_LOOKUP_ELEM, uattr: 0x7fffdf5ceda0, size: 32)    = 0
       0.037 ( 0.004 ms): perf/2944119 bpf(cmd: LINK_GET_FD_BY_ID, uattr: 0x7fffdf5ced80, size: 12)  = -1 ENOENT (No such file or directory)
       0.189 ( 0.004 ms): perf/2944119 bpf(cmd: 36, uattr: 0x7fffdf5cec10, size: 8)                  = -1 EOPNOTSUPP (Operation not supported)
       0.201 ( 0.095 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5ce940, size: 148)         = 10
       0.305 ( 0.026 ms): perf/2944119 bpf(cmd: PROG_LOAD, uattr: 0x7fffdf5cea00, size: 148)         = 10
       0.347 ( 0.012 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce8e0, size: 40)           = 10
       0.364 ( 0.004 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce950, size: 40)           = 10
       0.376 ( 0.006 ms): perf/2944119 bpf(cmd: BTF_LOAD, uattr: 0x7fffdf5ce730, size: 40)           = 10
  root@x1:~#

   Performance counter stats for 'sleep 1':

             271,221      cpu_core/cycles/
             139,150      cpu_core/instructions/

         1.002881677 seconds time elapsed

         0.001318000 seconds user
         0.001314000 seconds sys

  root@x1:~#

Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20240902200515.2103769-2-namhyung@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
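For context: 'const volatile' globals are placed in the object's .rodata
section, so the generated skeleton exposes them under skel->rodata. They can
only be written between skeleton open and load; once the program is loaded,
the verifier treats them as known constants and can dead-code-eliminate
branches such as 'if (use_cgroup_v2)'. Below is a minimal sketch of how the
tool side might set these knobs, assuming the bpftool-generated skeleton
header and helper names for this object (bperf_cgroup.skel.h and the
bperf_cgroup_bpf__* functions); the actual perf integration differs.

// Hypothetical userspace sketch: set the 'const volatile' knobs between
// skeleton open and load.
#include "bperf_cgroup.skel.h"

static struct bperf_cgroup_bpf *open_and_load(int nr_events, int nr_cpus,
					      int cgroup_v2)
{
	struct bperf_cgroup_bpf *skel;

	skel = bperf_cgroup_bpf__open();
	if (!skel)
		return NULL;

	// .rodata is writable only before load; after load these values
	// are baked in and the verifier can prune dead branches.
	skel->rodata->num_events = nr_events;
	skel->rodata->num_cpus = nr_cpus;
	skel->rodata->use_cgroup_v2 = cgroup_v2;

	if (bperf_cgroup_bpf__load(skel)) {
		bperf_cgroup_bpf__destroy(skel);
		return NULL;
	}
	return skel;
}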
228 lines | 5.5 KiB | C
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#define MAX_LEVELS  10  // max cgroup hierarchy level: arbitrary
#define MAX_EVENTS  32  // max events per cgroup: arbitrary

// NOTE: many of the maps and global data will be modified before loading
//       from the userspace (perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

/* new kernel cgroup definition */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

/* old kernel cgroup definition */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));
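
// Control knobs set by the perf tool before loading: 'const volatile'
// places them in .rodata, so after load the verifier treats them as known
// constants and can drop the branch that is never taken (e.g. the unused
// cgroup v1 or v2 path in bperf_cgroup_count() below).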
const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
const volatile int use_cgroup_v2 = 0;

int enabled = 0;
int perf_subsys_id = -1;

static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
	/* recast pointer to capture new type for compiler */
	struct cgroup___new *cgrp_new = (void *)cgrp;

	if (bpf_core_field_exists(cgrp_new->ancestors)) {
		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
	} else {
		/* recast pointer to capture old type for compiler */
		struct cgroup___old *cgrp_old = (void *)cgrp;

		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
	}
}

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0; // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);

	for ( ; idx < MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
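
			// look the element up again after the update: the
			// verifier only lets us dereference pointers returned
			// by a map lookup, and prev_val must point into the
			// map so that '*prev_val = val' below persists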
			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to cgroup-switches event for each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
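
The aggregated per-cgroup values land in the cgrp_readings map, which is a
per-cpu array: a userspace lookup returns one struct bpf_perf_event_value per
possible CPU for each key, and the reader has to sum them. Below is a sketch
of that read-out with plain libbpf calls; the function name and the 'skel'
argument (from the open/load sketch above) are illustrative, and the key
layout mirrors bperf_cgroup_count(): cgroup_index * num_events + event_index.

// Hypothetical read-out sketch: sum one event's per-CPU deltas for one cgroup.
#include <stdlib.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static int read_cgroup_event(struct bperf_cgroup_bpf *skel,
			     __u32 cgroup_index, __u32 event_index,
			     __u32 num_events,
			     struct bpf_perf_event_value *total)
{
	int nr_cpus = libbpf_num_possible_cpus();
	__u32 key = cgroup_index * num_events + event_index;
	int fd = bpf_map__fd(skel->maps.cgrp_readings);
	struct bpf_perf_event_value *values;
	int cpu, err;

	if (nr_cpus < 0)
		return nr_cpus;

	// a lookup on a per-cpu map fills one value per possible CPU
	values = calloc(nr_cpus, sizeof(*values));
	if (!values)
		return -1;

	err = bpf_map_lookup_elem(fd, &key, values);
	if (err) {
		free(values);
		return err;
	}

	total->counter = total->enabled = total->running = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		total->counter += values[cpu].counter;
		total->enabled += values[cpu].enabled;
		total->running += values[cpu].running;
	}

	free(values);
	return 0;
}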