1

selftests/bpf: add multi-uprobe benchmarks

Add multi-uprobe and multi-uretprobe benchmarks to bench tool.
Multi- and classic uprobes/uretprobes have different low-level
triggering code paths, so it's sometimes important to be able to
benchmark both flavors of uprobes/uretprobes.

Sample examples from my dev machine below. Single-threaded peformance
almost doesn't differ, but with more parallel CPUs triggering the same
uprobe/uretprobe the difference grows. This might be due to [0], but
given the code is slightly different, there could be other sources of
slowdown.

Note, all these numbers will change due to ongoing work to improve
uprobe/uretprobe scalability (e.g., [1]), but having benchmark like this
is useful for measurements and debugging nevertheless.

\#!/bin/bash
set -eufo pipefail
for p in 1 8 16 32; do
    for i in uprobe-nop uretprobe-nop uprobe-multi-nop uretprobe-multi-nop; do
        summary=$(sudo ./bench -w1 -d3 -p$p -a trig-$i | tail -n1)
        total=$(echo "$summary" | cut -d'(' -f1 | cut -d' ' -f3-)
        percpu=$(echo "$summary" | cut -d'(' -f2 | cut -d')' -f1 | cut -d'/' -f1)
        printf "%-21s (%2d cpus): %s (%s/s/cpu)\n" $i $p "$total" "$percpu"
    done
    echo
done

uprobe-nop            ( 1 cpus):    1.020 ± 0.005M/s  (  1.020M/s/cpu)
uretprobe-nop         ( 1 cpus):    0.515 ± 0.009M/s  (  0.515M/s/cpu)
uprobe-multi-nop      ( 1 cpus):    1.036 ± 0.004M/s  (  1.036M/s/cpu)
uretprobe-multi-nop   ( 1 cpus):    0.512 ± 0.005M/s  (  0.512M/s/cpu)

uprobe-nop            ( 8 cpus):    3.481 ± 0.030M/s  (  0.435M/s/cpu)
uretprobe-nop         ( 8 cpus):    2.222 ± 0.008M/s  (  0.278M/s/cpu)
uprobe-multi-nop      ( 8 cpus):    3.769 ± 0.094M/s  (  0.471M/s/cpu)
uretprobe-multi-nop   ( 8 cpus):    2.482 ± 0.007M/s  (  0.310M/s/cpu)

uprobe-nop            (16 cpus):    2.968 ± 0.011M/s  (  0.185M/s/cpu)
uretprobe-nop         (16 cpus):    1.870 ± 0.002M/s  (  0.117M/s/cpu)
uprobe-multi-nop      (16 cpus):    3.541 ± 0.037M/s  (  0.221M/s/cpu)
uretprobe-multi-nop   (16 cpus):    2.123 ± 0.026M/s  (  0.133M/s/cpu)

uprobe-nop            (32 cpus):    2.524 ± 0.026M/s  (  0.079M/s/cpu)
uretprobe-nop         (32 cpus):    1.572 ± 0.003M/s  (  0.049M/s/cpu)
uprobe-multi-nop      (32 cpus):    2.717 ± 0.003M/s  (  0.085M/s/cpu)
uretprobe-multi-nop   (32 cpus):    1.687 ± 0.007M/s  (  0.053M/s/cpu)

  [0] https://lore.kernel.org/linux-trace-kernel/20240805202803.1813090-1-andrii@kernel.org/
  [1] https://lore.kernel.org/linux-trace-kernel/20240731214256.3588718-1-andrii@kernel.org/

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jiri Olsa <jolsa@kernel.org>
Link: https://lore.kernel.org/r/20240806042935.3867862-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Andrii Nakryiko 2024-08-05 21:29:35 -07:00 committed by Alexei Starovoitov
parent 4e9e07603e
commit f727b13dbe
3 changed files with 85 additions and 15 deletions

View File

@ -520,6 +520,12 @@ extern const struct bench bench_trig_uprobe_push;
extern const struct bench bench_trig_uretprobe_push;
extern const struct bench bench_trig_uprobe_ret;
extern const struct bench bench_trig_uretprobe_ret;
extern const struct bench bench_trig_uprobe_multi_nop;
extern const struct bench bench_trig_uretprobe_multi_nop;
extern const struct bench bench_trig_uprobe_multi_push;
extern const struct bench bench_trig_uretprobe_multi_push;
extern const struct bench bench_trig_uprobe_multi_ret;
extern const struct bench bench_trig_uretprobe_multi_ret;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
@ -574,6 +580,12 @@ static const struct bench *benchs[] = {
&bench_trig_uretprobe_push,
&bench_trig_uprobe_ret,
&bench_trig_uretprobe_ret,
&bench_trig_uprobe_multi_nop,
&bench_trig_uretprobe_multi_nop,
&bench_trig_uprobe_multi_push,
&bench_trig_uretprobe_multi_push,
&bench_trig_uprobe_multi_ret,
&bench_trig_uretprobe_multi_ret,
/* ringbuf/perfbuf benchmarks */
&bench_rb_libbpf,
&bench_rb_custom,

View File

@ -332,7 +332,7 @@ static void *uprobe_producer_ret(void *input)
return NULL;
}
static void usetup(bool use_retprobe, void *target_addr)
static void usetup(bool use_retprobe, bool use_multi, void *target_addr)
{
size_t uprobe_offset;
struct bpf_link *link;
@ -346,7 +346,10 @@ static void usetup(bool use_retprobe, void *target_addr)
exit(1);
}
bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);
if (use_multi)
bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe_multi, true);
else
bpf_program__set_autoload(ctx.skel->progs.bench_trigger_uprobe, true);
err = trigger_bench__load(ctx.skel);
if (err) {
@ -355,16 +358,28 @@ static void usetup(bool use_retprobe, void *target_addr)
}
uprobe_offset = get_uprobe_offset(target_addr);
link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
use_retprobe,
-1 /* all PIDs */,
"/proc/self/exe",
uprobe_offset);
if (use_multi) {
LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
.retprobe = use_retprobe,
.cnt = 1,
.offsets = &uprobe_offset,
);
link = bpf_program__attach_uprobe_multi(
ctx.skel->progs.bench_trigger_uprobe_multi,
-1 /* all PIDs */, "/proc/self/exe", NULL, &opts);
ctx.skel->links.bench_trigger_uprobe_multi = link;
} else {
link = bpf_program__attach_uprobe(ctx.skel->progs.bench_trigger_uprobe,
use_retprobe,
-1 /* all PIDs */,
"/proc/self/exe",
uprobe_offset);
ctx.skel->links.bench_trigger_uprobe = link;
}
if (!link) {
fprintf(stderr, "failed to attach uprobe!\n");
fprintf(stderr, "failed to attach %s!\n", use_multi ? "multi-uprobe" : "uprobe");
exit(1);
}
ctx.skel->links.bench_trigger_uprobe = link;
}
static void usermode_count_setup(void)
@ -374,32 +389,62 @@ static void usermode_count_setup(void)
static void uprobe_nop_setup(void)
{
usetup(false, &uprobe_target_nop);
usetup(false, false /* !use_multi */, &uprobe_target_nop);
}
static void uretprobe_nop_setup(void)
{
usetup(true, &uprobe_target_nop);
usetup(true, false /* !use_multi */, &uprobe_target_nop);
}
static void uprobe_push_setup(void)
{
usetup(false, &uprobe_target_push);
usetup(false, false /* !use_multi */, &uprobe_target_push);
}
static void uretprobe_push_setup(void)
{
usetup(true, &uprobe_target_push);
usetup(true, false /* !use_multi */, &uprobe_target_push);
}
static void uprobe_ret_setup(void)
{
usetup(false, &uprobe_target_ret);
usetup(false, false /* !use_multi */, &uprobe_target_ret);
}
static void uretprobe_ret_setup(void)
{
usetup(true, &uprobe_target_ret);
usetup(true, false /* !use_multi */, &uprobe_target_ret);
}
static void uprobe_multi_nop_setup(void)
{
usetup(false, true /* use_multi */, &uprobe_target_nop);
}
static void uretprobe_multi_nop_setup(void)
{
usetup(true, true /* use_multi */, &uprobe_target_nop);
}
static void uprobe_multi_push_setup(void)
{
usetup(false, true /* use_multi */, &uprobe_target_push);
}
static void uretprobe_multi_push_setup(void)
{
usetup(true, true /* use_multi */, &uprobe_target_push);
}
static void uprobe_multi_ret_setup(void)
{
usetup(false, true /* use_multi */, &uprobe_target_ret);
}
static void uretprobe_multi_ret_setup(void)
{
usetup(true, true /* use_multi */, &uprobe_target_ret);
}
const struct bench bench_trig_syscall_count = {
@ -454,3 +499,9 @@ BENCH_TRIG_USERMODE(uprobe_ret, ret, "uprobe-ret");
BENCH_TRIG_USERMODE(uretprobe_nop, nop, "uretprobe-nop");
BENCH_TRIG_USERMODE(uretprobe_push, push, "uretprobe-push");
BENCH_TRIG_USERMODE(uretprobe_ret, ret, "uretprobe-ret");
BENCH_TRIG_USERMODE(uprobe_multi_nop, nop, "uprobe-multi-nop");
BENCH_TRIG_USERMODE(uprobe_multi_push, push, "uprobe-multi-push");
BENCH_TRIG_USERMODE(uprobe_multi_ret, ret, "uprobe-multi-ret");
BENCH_TRIG_USERMODE(uretprobe_multi_nop, nop, "uretprobe-multi-nop");
BENCH_TRIG_USERMODE(uretprobe_multi_push, push, "uretprobe-multi-push");
BENCH_TRIG_USERMODE(uretprobe_multi_ret, ret, "uretprobe-multi-ret");

View File

@ -32,6 +32,13 @@ int bench_trigger_uprobe(void *ctx)
return 0;
}
SEC("?uprobe.multi")
int bench_trigger_uprobe_multi(void *ctx)
{
inc_counter();
return 0;
}
const volatile int batch_iters = 0;
SEC("?raw_tp")