From 376bd59e2a0404b09767cc991cf5aed394cf0cf2 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 10 Sep 2024 14:40:37 -0700 Subject: [PATCH] bpf: Use fake pt_regs when doing bpf syscall tracepoint tracing Salvatore Benedetto reported an issue that when doing syscall tracepoint tracing the kernel stack is empty. For example, using the following command line bpftrace -e 'tracepoint:syscalls:sys_enter_read { print("Kernel Stack\n"); print(kstack()); }' bpftrace -e 'tracepoint:syscalls:sys_exit_read { print("Kernel Stack\n"); print(kstack()); }' the output for both commands is === Kernel Stack === Further analysis shows that pt_regs used for bpf syscall tracepoint tracing is from the one constructed during user->kernel transition. The call stack looks like perf_syscall_enter+0x88/0x7c0 trace_sys_enter+0x41/0x80 syscall_trace_enter+0x100/0x160 do_syscall_64+0x38/0xf0 entry_SYSCALL_64_after_hwframe+0x76/0x7e The ip address stored in pt_regs is from user space hence no kernel stack is printed. To fix the issue, kernel address from pt_regs is required. In kernel repo, there are already a few cases like this. For example, in kernel/trace/bpf_trace.c, several perf_fetch_caller_regs(fake_regs_ptr) instances are used to supply ip address or use ip address to construct call stack. Instead of allocate fake_regs in the stack which may consume a lot of bytes, the function perf_trace_buf_alloc() in perf_syscall_{enter, exit}() is leveraged to create fake_regs, which will be passed to perf_call_bpf_{enter,exit}(). For the above bpftrace script, I got the following output with this patch: for tracepoint:syscalls:sys_enter_read === Kernel Stack syscall_trace_enter+407 syscall_trace_enter+407 do_syscall_64+74 entry_SYSCALL_64_after_hwframe+75 === and for tracepoint:syscalls:sys_exit_read === Kernel Stack syscall_exit_work+185 syscall_exit_work+185 syscall_exit_to_user_mode+305 do_syscall_64+118 entry_SYSCALL_64_after_hwframe+75 === Reported-by: Salvatore Benedetto Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20240910214037.3663272-1-yonghong.song@linux.dev --- kernel/trace/trace_syscalls.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c581d6da843..785733245ead 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -564,6 +564,7 @@ static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *re BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ + perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; for (i = 0; i < sys_data->nb_args; i++) @@ -575,6 +576,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; + struct pt_regs *fake_regs; struct hlist_head *head; unsigned long args[6]; bool valid_prog_array; @@ -602,7 +604,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, NULL, &rctx); + rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; @@ -611,7 +613,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); if ((valid_prog_array && - !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) || + !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return; @@ -666,6 +668,7 @@ static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *reg } __aligned(8) param; /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ + perf_fetch_caller_regs(regs); *(struct pt_regs **)¶m = regs; param.syscall_nr = rec->nr; param.ret = rec->ret; @@ -676,6 +679,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; + struct pt_regs *fake_regs; struct hlist_head *head; bool valid_prog_array; int syscall_nr; @@ -701,7 +705,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - rec = perf_trace_buf_alloc(size, NULL, &rctx); + rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); if (!rec) return; @@ -709,7 +713,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->ret = syscall_get_return_value(current, regs); if ((valid_prog_array && - !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) || + !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || hlist_empty(head)) { perf_swevent_put_recursion_context(rctx); return;