diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index 69cb7776ae99..3b6791c17e9b 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -58,8 +58,9 @@ Synopsis of kprobe_events NAME=FETCHARG : Set NAME as the argument name of FETCHARG. FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types (u8/u16/u32/u64/s8/s16/s32/s64), hexadecimal types - (x8/x16/x32/x64), "char", "string", "ustring", "symbol", "symstr" - and bitfield are supported. + (x8/x16/x32/x64), VFS layer common type(%pd/%pD), "char", + "string", "ustring", "symbol", "symstr" and bitfield are + supported. (\*1) only for the probe on function entry (offs == 0). Note, this argument access is best effort, because depending on the argument type, it may be passed on @@ -122,6 +123,9 @@ With 'symstr' type, you can filter the event with wildcard pattern of the symbols, and you don't need to solve symbol name by yourself. For $comm, the default type is "string"; any other type is invalid. +VFS layer common type(%pd/%pD) is a special type, which fetches dentry's or +file's name from struct dentry's address or struct file's address. + .. _user_mem_access: User Memory Access diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 834cffcfbce3..7ba4b98076de 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -12,6 +12,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; struct pt_regs *regs; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c index 73858c9029cc..bff058317062 100644 --- a/arch/loongarch/kernel/ftrace_dyn.c +++ b/arch/loongarch/kernel/ftrace_dyn.c @@ -287,6 +287,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 621a4b386ae4..c91f9c2e61ed 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -206,6 +206,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 072ebe7f290b..f8208c027148 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct pt_regs *regs; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index 7142ec42e889..a69dfa610aa8 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -11,6 +11,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 798249ef5646..ddf2ee47cb87 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -296,6 +296,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index dd2ec14adb77..15af7e98e161 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; + if (unlikely(kprobe_ftrace_disabled)) + return; + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 3e03758151f4..f39869588117 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,16 @@ #include #include +struct fprobe; + +typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); + +typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, + unsigned long ret_ip, struct pt_regs *regs, + void *entry_data); + /** * struct fprobe - ftrace based probe. * @ops: The ftrace_ops. @@ -34,12 +44,8 @@ struct fprobe { size_t entry_data_size; int nr_maxactive; - int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, - void *entry_data); - void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, - unsigned long ret_ip, struct pt_regs *regs, - void *entry_data); + fprobe_entry_cb entry_handler; + fprobe_exit_cb exit_handler; }; /* This fprobe is soft-disabled. */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 0ff44d6633e3..5fcbc254d186 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -378,11 +378,15 @@ static inline void wait_for_kprobe_optimizer(void) { } extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); +/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */ +extern bool kprobe_ftrace_disabled __read_mostly; +extern void kprobe_ftrace_kill(void); #else static inline int arch_prepare_kprobe_ftrace(struct kprobe *p) { return -EINVAL; } +static inline void kprobe_ftrace_kill(void) {} #endif /* CONFIG_KPROBES_ON_FTRACE */ /* Get the kprobe at this addr (if any) - called with preemption disabled */ @@ -495,6 +499,9 @@ static inline void kprobe_flush_task(struct task_struct *tk) static inline void kprobe_free_init_mem(void) { } +static inline void kprobe_ftrace_kill(void) +{ +} static inline int disable_kprobe(struct kprobe *kp) { return -EOPNOTSUPP; diff --git a/include/linux/objpool.h b/include/linux/objpool.h index 15aff4a17f0c..cb1758eaa2d3 100644 --- a/include/linux/objpool.h +++ b/include/linux/objpool.h @@ -5,6 +5,10 @@ #include #include +#include +#include +#include +#include /* * objpool: ring-array based lockless MPMC queue @@ -69,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); * struct objpool_head - object pooling metadata * @obj_size: object size, aligned to sizeof(void *) * @nr_objs: total objs (to be pre-allocated with objpool) - * @nr_cpus: local copy of nr_cpu_ids + * @nr_possible_cpus: cached value of num_possible_cpus() * @capacity: max objs can be managed by one objpool_slot * @gfp: gfp flags for kmalloc & vmalloc * @ref: refcount of objpool @@ -81,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context); struct objpool_head { int obj_size; int nr_objs; - int nr_cpus; + int nr_possible_cpus; int capacity; gfp_t gfp; refcount_t ref; @@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, gfp_t gfp, void *context, objpool_init_obj_cb objinit, objpool_fini_cb release); +/* try to retrieve object from slot */ +static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu) +{ + struct objpool_slot *slot = pool->cpu_slots[cpu]; + /* load head snapshot, other cpus may change it */ + uint32_t head = smp_load_acquire(&slot->head); + + while (head != READ_ONCE(slot->last)) { + void *obj; + + /* + * data visibility of 'last' and 'head' could be out of + * order since memory updating of 'last' and 'head' are + * performed in push() and pop() independently + * + * before any retrieving attempts, pop() must guarantee + * 'last' is behind 'head', that is to say, there must + * be available objects in slot, which could be ensured + * by condition 'last != head && last - head <= nr_objs' + * that is equivalent to 'last - head - 1 < nr_objs' as + * 'last' and 'head' are both unsigned int32 + */ + if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) { + head = READ_ONCE(slot->head); + continue; + } + + /* obj must be retrieved before moving forward head */ + obj = READ_ONCE(slot->entries[head & slot->mask]); + + /* move head forward to mark it's consumption */ + if (try_cmpxchg_release(&slot->head, &head, head + 1)) + return obj; + } + + return NULL; +} + /** * objpool_pop() - allocate an object from objpool * @pool: object pool * * return value: object ptr or NULL if failed */ -void *objpool_pop(struct objpool_head *pool); +static inline void *objpool_pop(struct objpool_head *pool) +{ + void *obj = NULL; + unsigned long flags; + int i, cpu; + + /* disable local irq to avoid preemption & interruption */ + raw_local_irq_save(flags); + + cpu = raw_smp_processor_id(); + for (i = 0; i < pool->nr_possible_cpus; i++) { + obj = __objpool_try_get_slot(pool, cpu); + if (obj) + break; + cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); + } + raw_local_irq_restore(flags); + + return obj; +} + +/* adding object to slot, abort if the slot was already full */ +static inline int +__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu) +{ + struct objpool_slot *slot = pool->cpu_slots[cpu]; + uint32_t head, tail; + + /* loading tail and head as a local snapshot, tail first */ + tail = READ_ONCE(slot->tail); + + do { + head = READ_ONCE(slot->head); + /* fault caught: something must be wrong */ + WARN_ON_ONCE(tail - head > pool->nr_objs); + } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1)); + + /* now the tail position is reserved for the given obj */ + WRITE_ONCE(slot->entries[tail & slot->mask], obj); + /* update sequence to make this obj available for pop() */ + smp_store_release(&slot->last, tail + 1); + + return 0; +} /** * objpool_push() - reclaim the object and return back to objpool @@ -134,7 +219,19 @@ void *objpool_pop(struct objpool_head *pool); * return: 0 or error code (it fails only when user tries to push * the same object multiple times or wrong "objects" into objpool) */ -int objpool_push(void *obj, struct objpool_head *pool); +static inline int objpool_push(void *obj, struct objpool_head *pool) +{ + unsigned long flags; + int rc; + + /* disable local irq to avoid preemption & interruption */ + raw_local_irq_save(flags); + rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id()); + raw_local_irq_restore(flags); + + return rc; +} + /** * objpool_drop() - discard the object and deref objpool diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index d48cd92d2364..24ea8ac049b4 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -135,7 +135,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif -#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#ifdef CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING # define trace_warn_on_no_rcu(ip) \ ({ \ bool __ret = !rcu_is_watching(); \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 1215bc299390..2c83ba776fc7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -39,7 +39,7 @@ static struct rb_root uprobes_tree = RB_ROOT; */ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree) -static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ +static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ #define UPROBES_HASH_SZ 13 /* serialize uprobe->pending_list */ @@ -669,9 +669,9 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); uprobe = __find_uprobe(inode, offset); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return uprobe; } @@ -701,9 +701,9 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) { struct uprobe *u; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); u = __insert_uprobe(uprobe); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); return u; } @@ -935,9 +935,9 @@ static void delete_uprobe(struct uprobe *uprobe) if (WARN_ON(!uprobe_is_active(uprobe))) return; - spin_lock(&uprobes_treelock); + write_lock(&uprobes_treelock); rb_erase(&uprobe->rb_node, &uprobes_tree); - spin_unlock(&uprobes_treelock); + write_unlock(&uprobes_treelock); RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */ put_uprobe(uprobe); } @@ -1298,7 +1298,7 @@ static void build_probe_list(struct inode *inode, min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) { @@ -1316,7 +1316,7 @@ static void build_probe_list(struct inode *inode, get_uprobe(u); } } - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); } /* @vma contains reference counter, not the probed instruction. */ @@ -1407,9 +1407,9 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; - spin_lock(&uprobes_treelock); + read_lock(&uprobes_treelock); n = find_node_in_range(inode, min, max); - spin_unlock(&uprobes_treelock); + read_unlock(&uprobes_treelock); return !!n; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 48b49925f7ff..f2cea3c80cb5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1067,6 +1067,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = { static int kprobe_ipmodify_enabled; static int kprobe_ftrace_enabled; +bool kprobe_ftrace_disabled; static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops, int *cnt) @@ -1135,6 +1136,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p) ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops, ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled); } + +void kprobe_ftrace_kill() +{ + kprobe_ftrace_disabled = true; +} #else /* !CONFIG_KPROBES_ON_FTRACE */ static inline int arm_kprobe_ftrace(struct kprobe *p) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b3d7f62ac581..166ad5444eea 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config FTRACE_VALIDATE_RCU_IS_WATCHING + bool "Validate RCU is on during ftrace execution" + depends on FUNCTION_TRACER + depends on ARCH_WANTS_NO_INSTR + help + All callbacks that attach to the function tracing have some sort of + protection against recursion. This option is only to verify that + ftrace (and other users of ftrace_test_recursion_trylock()) are not + called outside of RCU, as if they are, it can cause a race. But it + also has a noticeable overhead when enabled. + + If unsure, say N + config RING_BUFFER_RECORD_RECURSION bool "Record functions that recurse in the ring buffer" depends on FTRACE_RECORD_RECURSION diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 50ca4d4f8840..d4efdab463b8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7894,6 +7894,7 @@ void ftrace_kill(void) ftrace_disabled = 1; ftrace_enabled = 0; ftrace_trace_function = ftrace_stub; + kprobe_ftrace_kill(); } /** diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c index fa03094e9e69..30d224946881 100644 --- a/kernel/trace/rethook.c +++ b/kernel/trace/rethook.c @@ -166,6 +166,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) if (unlikely(!handler)) return NULL; +#if defined(CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING) || defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) /* * This expects the caller will set up a rethook on a function entry. * When the function returns, the rethook will eventually be reclaimed @@ -174,6 +175,7 @@ struct rethook_node *rethook_try_get(struct rethook *rh) */ if (unlikely(!rcu_is_watching())) return NULL; +#endif return (struct rethook_node *)objpool_pop(&rh->pool); } @@ -248,7 +250,7 @@ unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame if (WARN_ON_ONCE(!cur)) return 0; - if (WARN_ON_ONCE(tsk != current && task_is_running(tsk))) + if (tsk != current && task_is_running(tsk)) return 0; do { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 233d1af39fff..7fe4b539a423 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5540,7 +5540,7 @@ static const char readme_msg[] = "\t kernel return probes support: $retval, $arg, $comm\n" "\t type: s8/16/32/64, u8/16/32/64, x8/16/32/64, char, string, symbol,\n" "\t b@/, ustring,\n" - "\t symstr, \\[\\]\n" + "\t symstr, %pd/%pD, \\[\\]\n" #ifdef CONFIG_HIST_TRIGGERS "\t field: ;\n" "\t stype: u8/u16/u32/u64, s8/s16/s32/s64, pid_t,\n" diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index 4f4280815522..62e6a8f4aae9 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -994,6 +994,7 @@ static int __trace_fprobe_create(int argc, const char *argv[]) char gbuf[MAX_EVENT_NAME_LEN]; char sbuf[KSYM_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; bool is_tracepoint = false; struct tracepoint *tpoint = NULL; struct traceprobe_parse_context ctx = { @@ -1104,6 +1105,10 @@ static int __trace_fprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tf = alloc_trace_fprobe(group, event, symbol, tpoint, maxactive, argc, is_return); @@ -1154,6 +1159,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 2cb2a3951b4f..16383247bdbf 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -800,6 +800,7 @@ static int __trace_kprobe_create(int argc, const char *argv[]) char buf[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; char abuf[MAX_BTF_ARGS_LEN]; + char *dbuf = NULL; struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL }; switch (argv[0][0]) { @@ -951,6 +952,10 @@ static int __trace_kprobe_create(int argc, const char *argv[]) argv = new_argv; } + ret = traceprobe_expand_dentry_args(argc, argv, &dbuf); + if (ret) + goto out; + /* setup a probe */ tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, argc, is_return); @@ -997,6 +1002,7 @@ out: trace_probe_log_clear(); kfree(new_argv); kfree(symbol); + kfree(dbuf); return ret; parse_error: diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index c3f2937b434a..5e263c141574 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) "trace_probe: " fmt #include +#include #include "trace_btf.h" #include "trace_probe.h" @@ -1737,6 +1738,68 @@ error: return ERR_PTR(ret); } +/* @buf: *buf must be equal to NULL. Caller must to free *buf */ +int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf) +{ + int i, used, ret; + const int bufsize = MAX_DENTRY_ARGS_LEN; + char *tmpbuf = NULL; + + if (*buf) + return -EINVAL; + + used = 0; + for (i = 0; i < argc; i++) { + char *tmp; + char *equal; + size_t arg_len; + + if (!glob_match("*:%p[dD]", argv[i])) + continue; + + if (!tmpbuf) { + tmpbuf = kmalloc(bufsize, GFP_KERNEL); + if (!tmpbuf) + return -ENOMEM; + } + + tmp = kstrdup(argv[i], GFP_KERNEL); + if (!tmp) + goto nomem; + + equal = strchr(tmp, '='); + if (equal) + *equal = '\0'; + arg_len = strlen(argv[i]); + tmp[arg_len - 4] = '\0'; + if (argv[i][arg_len - 1] == 'd') + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(%s)):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + equal ? equal + 1 : tmp); + else + ret = snprintf(tmpbuf + used, bufsize - used, + "%s%s+0x0(+0x%zx(+0x%zx(%s))):string", + equal ? tmp : "", equal ? "=" : "", + offsetof(struct dentry, d_name.name), + offsetof(struct file, f_path.dentry), + equal ? equal + 1 : tmp); + + kfree(tmp); + if (ret >= bufsize - used) + goto nomem; + argv[i] = tmpbuf + used; + used += ret + 1; + } + + *buf = tmpbuf; + return 0; +nomem: + kfree(tmpbuf); + return -ENOMEM; +} + void traceprobe_finish_parse(struct traceprobe_parse_context *ctx) { clear_btf_context(ctx); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index cef3a50628a3..5803e6a41570 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -34,6 +34,7 @@ #define MAX_ARRAY_LEN 64 #define MAX_ARG_NAME_LEN 32 #define MAX_BTF_ARGS_LEN 128 +#define MAX_DENTRY_ARGS_LEN 256 #define MAX_STRING_SIZE PATH_MAX #define MAX_ARG_BUF_LEN (MAX_TRACE_ARGS * MAX_ARG_NAME_LEN) @@ -428,6 +429,7 @@ extern int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, const char **traceprobe_expand_meta_args(int argc, const char *argv[], int *new_argc, char *buf, int bufsize, struct traceprobe_parse_context *ctx); +extern int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf); extern int traceprobe_update_arg(struct probe_arg *arg); extern void traceprobe_free_probe_arg(struct probe_arg *arg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 9e461362450a..8541fa1494ae 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -854,6 +854,7 @@ static const struct file_operations uprobe_profile_ops = { struct uprobe_cpu_buffer { struct mutex mutex; void *buf; + int dsize; }; static struct uprobe_cpu_buffer __percpu *uprobe_cpu_buffer; static int uprobe_buffer_refcnt; @@ -940,30 +941,56 @@ static struct uprobe_cpu_buffer *uprobe_buffer_get(void) static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) { + if (!ucb) + return; mutex_unlock(&ucb->mutex); } +static struct uprobe_cpu_buffer *prepare_uprobe_buffer(struct trace_uprobe *tu, + struct pt_regs *regs, + struct uprobe_cpu_buffer **ucbp) +{ + struct uprobe_cpu_buffer *ucb; + int dsize, esize; + + if (*ucbp) + return *ucbp; + + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + dsize = __get_data_size(&tu->tp, regs, NULL); + + ucb = uprobe_buffer_get(); + ucb->dsize = tu->tp.size + dsize; + + store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); + + *ucbp = ucb; + return ucb; +} + static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize, + struct uprobe_cpu_buffer **ucbp, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; struct trace_event_buffer fbuffer; + struct uprobe_cpu_buffer *ucb; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); WARN_ON(call != trace_file->event_call); - if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + if (WARN_ON_ONCE(ucb->dsize > PAGE_SIZE)) return; if (trace_trigger_soft_disabled(trace_file)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + size = esize + ucb->dsize; entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); if (!entry) return; @@ -977,14 +1004,14 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; @@ -993,7 +1020,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, 0, regs, ucbp, link->file); rcu_read_unlock(); return 0; @@ -1001,13 +1028,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct event_file_link *link; rcu_read_lock(); trace_probe_for_each_link_rcu(link, &tu->tp) - __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + __uprobe_trace_func(tu, func, regs, ucbp, link->file); rcu_read_unlock(); } @@ -1199,9 +1226,6 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) { struct perf_event *event; - if (filter->nr_systemwide) - return true; - list_for_each_entry(event, &filter->perf_events, hw.tp_list) { if (event->hw.target->mm == mm) return true; @@ -1326,6 +1350,13 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, tu = container_of(uc, struct trace_uprobe, consumer); filter = tu->tp.event->filter; + /* + * speculative short-circuiting check to avoid unnecessarily taking + * filter->rwlock below, if the uprobe has system-wide consumer + */ + if (READ_ONCE(filter->nr_systemwide)) + return true; + read_lock(&filter->rwlock); ret = __uprobe_perf_filter(filter, mm); read_unlock(&filter->rwlock); @@ -1335,10 +1366,11 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { struct trace_event_call *call = trace_probe_event_call(&tu->tp); struct uprobe_trace_entry_head *entry; + struct uprobe_cpu_buffer *ucb; struct hlist_head *head; void *data; int size, esize; @@ -1356,7 +1388,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - size = esize + tu->tp.size + dsize; + ucb = prepare_uprobe_buffer(tu, regs, ucbp); + size = esize + ucb->dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -1379,13 +1412,10 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, data = DATAOF_TRACE_ENTRY(entry, false); } - memcpy(data, ucb->buf, tu->tp.size + dsize); + memcpy(data, ucb->buf, ucb->dsize); - if (size - esize > tu->tp.size + dsize) { - int len = tu->tp.size + dsize; - - memset(data + len, 0, size - esize - len); - } + if (size - esize > ucb->dsize) + memset(data + ucb->dsize, 0, size - esize - ucb->dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); @@ -1395,21 +1425,21 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, /* uprobe profile handler */ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs, ucb, dsize); + __uprobe_perf_func(tu, 0, regs, ucbp); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer **ucbp) { - __uprobe_perf_func(tu, func, regs, ucb, dsize); + __uprobe_perf_func(tu, func, regs, ucbp); } int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type, @@ -1474,11 +1504,9 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; int ret = 0; - tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1490,18 +1518,12 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - ret |= uprobe_trace_func(tu, regs, ucb, dsize); + ret |= uprobe_trace_func(tu, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - ret |= uprobe_perf_func(tu, regs, ucb, dsize); + ret |= uprobe_perf_func(tu, regs, &ucb); #endif uprobe_buffer_put(ucb); return ret; @@ -1512,8 +1534,7 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; - struct uprobe_cpu_buffer *ucb; - int dsize, esize; + struct uprobe_cpu_buffer *ucb = NULL; tu = container_of(con, struct trace_uprobe, consumer); @@ -1525,18 +1546,12 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, if (WARN_ON_ONCE(!uprobe_cpu_buffer)) return 0; - dsize = __get_data_size(&tu->tp, regs, NULL); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - ucb = uprobe_buffer_get(); - store_trace_args(ucb->buf, &tu->tp, regs, NULL, esize, dsize); - if (trace_probe_test_flag(&tu->tp, TP_FLAG_TRACE)) - uretprobe_trace_func(tu, func, regs, ucb, dsize); + uretprobe_trace_func(tu, func, regs, &ucb); #ifdef CONFIG_PERF_EVENTS if (trace_probe_test_flag(&tu->tp, TP_FLAG_PROFILE)) - uretprobe_perf_func(tu, func, regs, ucb, dsize); + uretprobe_perf_func(tu, func, regs, &ucb); #endif uprobe_buffer_put(ucb); return 0; diff --git a/lib/objpool.c b/lib/objpool.c index cfdc02420884..234f9d0bd081 100644 --- a/lib/objpool.c +++ b/lib/objpool.c @@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, { int i, cpu_count = 0; - for (i = 0; i < pool->nr_cpus; i++) { + for (i = 0; i < nr_cpu_ids; i++) { struct objpool_slot *slot; int nodes, size, rc; @@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs, continue; /* compute how many objects to be allocated with this slot */ - nodes = nr_objs / num_possible_cpus(); - if (cpu_count < (nr_objs % num_possible_cpus())) + nodes = nr_objs / pool->nr_possible_cpus; + if (cpu_count < (nr_objs % pool->nr_possible_cpus)) nodes++; cpu_count++; @@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool) if (!pool->cpu_slots) return; - for (i = 0; i < pool->nr_cpus; i++) + for (i = 0; i < nr_cpu_ids; i++) kvfree(pool->cpu_slots[i]); kfree(pool->cpu_slots); } @@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, /* initialize objpool pool */ memset(pool, 0, sizeof(struct objpool_head)); - pool->nr_cpus = nr_cpu_ids; + pool->nr_possible_cpus = num_possible_cpus(); pool->obj_size = object_size; pool->capacity = capacity; pool->gfp = gfp & ~__GFP_ZERO; pool->context = context; pool->release = release; - slot_size = pool->nr_cpus * sizeof(struct objpool_slot); + slot_size = nr_cpu_ids * sizeof(struct objpool_slot); pool->cpu_slots = kzalloc(slot_size, pool->gfp); if (!pool->cpu_slots) return -ENOMEM; @@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size, } EXPORT_SYMBOL_GPL(objpool_init); -/* adding object to slot, abort if the slot was already full */ -static inline int -objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - uint32_t head, tail; - - /* loading tail and head as a local snapshot, tail first */ - tail = READ_ONCE(slot->tail); - - do { - head = READ_ONCE(slot->head); - /* fault caught: something must be wrong */ - WARN_ON_ONCE(tail - head > pool->nr_objs); - } while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1)); - - /* now the tail position is reserved for the given obj */ - WRITE_ONCE(slot->entries[tail & slot->mask], obj); - /* update sequence to make this obj available for pop() */ - smp_store_release(&slot->last, tail + 1); - - return 0; -} - -/* reclaim an object to object pool */ -int objpool_push(void *obj, struct objpool_head *pool) -{ - unsigned long flags; - int rc; - - /* disable local irq to avoid preemption & interruption */ - raw_local_irq_save(flags); - rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id()); - raw_local_irq_restore(flags); - - return rc; -} -EXPORT_SYMBOL_GPL(objpool_push); - -/* try to retrieve object from slot */ -static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu) -{ - struct objpool_slot *slot = pool->cpu_slots[cpu]; - /* load head snapshot, other cpus may change it */ - uint32_t head = smp_load_acquire(&slot->head); - - while (head != READ_ONCE(slot->last)) { - void *obj; - - /* - * data visibility of 'last' and 'head' could be out of - * order since memory updating of 'last' and 'head' are - * performed in push() and pop() independently - * - * before any retrieving attempts, pop() must guarantee - * 'last' is behind 'head', that is to say, there must - * be available objects in slot, which could be ensured - * by condition 'last != head && last - head <= nr_objs' - * that is equivalent to 'last - head - 1 < nr_objs' as - * 'last' and 'head' are both unsigned int32 - */ - if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) { - head = READ_ONCE(slot->head); - continue; - } - - /* obj must be retrieved before moving forward head */ - obj = READ_ONCE(slot->entries[head & slot->mask]); - - /* move head forward to mark it's consumption */ - if (try_cmpxchg_release(&slot->head, &head, head + 1)) - return obj; - } - - return NULL; -} - -/* allocate an object from object pool */ -void *objpool_pop(struct objpool_head *pool) -{ - void *obj = NULL; - unsigned long flags; - int i, cpu; - - /* disable local irq to avoid preemption & interruption */ - raw_local_irq_save(flags); - - cpu = raw_smp_processor_id(); - for (i = 0; i < num_possible_cpus(); i++) { - obj = objpool_try_get_slot(pool, cpu); - if (obj) - break; - cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1); - } - raw_local_irq_restore(flags); - - return obj; -} -EXPORT_SYMBOL_GPL(objpool_pop); - /* release whole objpool forcely */ void objpool_free(struct objpool_head *pool) { diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc new file mode 100644 index 000000000000..c6a9d2466a71 --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/dynevent/fprobe_args_vfs.tc @@ -0,0 +1,41 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Fprobe event VFS type argument +# requires: dynamic_events "%pd/%pD":README "f[:[/][]] [%return] []":README + + +: "Test argument %pd with name for fprobe" +echo 'f:testprobe dput name=$arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pd without name for fprobe" +echo 'f:testprobe dput $arg1:%pd' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD with name for fprobe" +echo 'f:testprobe vfs_read name=$arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace + +: "Test argument %pD without name for fprobe" +echo 'f:testprobe vfs_read $arg1:%pD' > dynamic_events +echo 1 > events/fprobes/testprobe/enable +grep -q "1" events/fprobes/testprobe/enable +echo 0 > events/fprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > dynamic_events +echo "" > trace diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc new file mode 100644 index 000000000000..21a54be6894c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_vfs.tc @@ -0,0 +1,40 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: Kprobe event VFS type argument +# requires: kprobe_events "%pd/%pD":README + +: "Test argument %pd with name" +echo 'p:testprobe dput name=$arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pd without name" +echo 'p:testprobe dput $arg1:%pd' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "dput" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD with name" +echo 'p:testprobe vfs_read name=$arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace + +: "Test argument %pD without name" +echo 'p:testprobe vfs_read $arg1:%pD' > kprobe_events +echo 1 > events/kprobes/testprobe/enable +grep -q "1" events/kprobes/testprobe/enable +echo 0 > events/kprobes/testprobe/enable +grep "vfs_read" trace | grep -q "enable" +echo "" > kprobe_events +echo "" > trace