bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing
Salvatore Benedetto reported an issue that when doing syscall tracepoint tracing the kernel stack is empty. For example, using the following command line bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }' bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }' the output for both commands is === Kernel Stack === Further analysis shows that pt_regs used for bpf syscall tracepoint tracing is from the one constructed during user->kernel transition. The call stack looks like perf_syscall_enter+0x88/0x7c0 trace_sys_enter+0x41/0x80 syscall_trace_enter+0x100/0x160 do_syscall_64+0x38/0xf0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The ip address stored in pt_regs is from user space hence no kernel stack is printed. To fix the issue, kernel address from pt_regs is required. In kernel repo, there are already a few cases like this. For example, in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr) instances are used to supply ip address or use ip address to construct call stack. Instead of allocate fake_regs in the stack which may consume a lot of bytes, the function perf_trace_buf_alloc() in perf_syscall_{enter, exit}() is leveraged to create fake_regs, which will be passed to perf_call_bpf_{enter,exit}(). For the above bpftrace script, I got the following output with this patch: for tracepoint:syscalls:sys_enter_read === Kernel Stack syscall_trace_enter+407 syscall_trace_enter+407 do_syscall_64+74 entry_SYSCALL_64_after_hwframe+75 === and for tracepoint:syscalls:sys_exit_read === Kernel Stack syscall_exit_work+185 syscall_exit_work+185 syscall_exit_to_user_mode+305 do_syscall_64+118 entry_SYSCALL_64_after_hwframe+75 === Reported-by: Salvatore Benedetto <salvabenedetto@meta.com> Suggested-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Yonghong Song <yonghong.song@linux.dev> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20240910214037.3663272-1-yonghong.song@linux.dev
This commit is contained in:
parent
2bea33f907
commit
376bd59e2a
@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re
|
||||
BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
|
||||
|
||||
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
|
||||
perf_fetch_caller_regs(regs);
|
||||
*(struct pt_regs **)¶m = regs;
|
||||
param.syscall_nr = rec->nr;
|
||||
for (i = 0; i < sys_data->nb_args; i++)
|
||||
@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
|
||||
{
|
||||
struct syscall_metadata *sys_data;
|
||||
struct syscall_trace_enter *rec;
|
||||
struct pt_regs *fake_regs;
|
||||
struct hlist_head *head;
|
||||
unsigned long args[6];
|
||||
bool valid_prog_array;
|
||||
@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
|
||||
size = ALIGN(size + sizeof(u32), sizeof(u64));
|
||||
size -= sizeof(u32);
|
||||
|
||||
rec = perf_trace_buf_alloc(size, NULL, &rctx);
|
||||
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
|
||||
if (!rec)
|
||||
return;
|
||||
|
||||
@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
|
||||
memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);
|
||||
|
||||
if ((valid_prog_array &&
|
||||
!perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
|
||||
!perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
|
||||
hlist_empty(head)) {
|
||||
perf_swevent_put_recursion_context(rctx);
|
||||
return;
|
||||
@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg
|
||||
} __aligned(8) param;
|
||||
|
||||
/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
|
||||
perf_fetch_caller_regs(regs);
|
||||
*(struct pt_regs **)¶m = regs;
|
||||
param.syscall_nr = rec->nr;
|
||||
param.ret = rec->ret;
|
||||
@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
|
||||
{
|
||||
struct syscall_metadata *sys_data;
|
||||
struct syscall_trace_exit *rec;
|
||||
struct pt_regs *fake_regs;
|
||||
struct hlist_head *head;
|
||||
bool valid_prog_array;
|
||||
int syscall_nr;
|
||||
@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
|
||||
size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
|
||||
size -= sizeof(u32);
|
||||
|
||||
rec = perf_trace_buf_alloc(size, NULL, &rctx);
|
||||
rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
|
||||
if (!rec)
|
||||
return;
|
||||
|
||||
@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
|
||||
rec->ret = syscall_get_return_value(current, regs);
|
||||
|
||||
if ((valid_prog_array &&
|
||||
!perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
|
||||
!perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
|
||||
hlist_empty(head)) {
|
||||
perf_swevent_put_recursion_context(rctx);
|
||||
return;
|
||||
|
Loading…
Reference in New Issue
Block a user