4207b556e6
The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id() which acquires kernfs_idr_lock, which is a non-raw, non-IRQ-safe lock. This can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF program, including e.g. the ones that attach to functions which are holding the scheduler rq lock. Consider the following BPF program: SEC("fentry/__set_cpus_allowed_ptr_locked") int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p, struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf) { struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id); if (cgrp) { bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name); bpf_cgroup_release(cgrp); } return 0; } __set_cpus_allowed_ptr_locked() is called with rq lock held and the above BPF program calls bpf_cgroup_from_id() within, leading to the following lockdep warning: ===================================================== WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected 6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted ----------------------------------------------------- repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70 and this task is already holding: ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0 which would create a new lock dependency: (&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2} ... Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(kernfs_idr_lock); local_irq_disable(); lock(&rq->__lock); lock(kernfs_idr_lock); <Interrupt> lock(&rq->__lock); *** DEADLOCK *** ... 
Call Trace: dump_stack_lvl+0x55/0x70 dump_stack+0x10/0x20 __lock_acquire+0x781/0x2a40 lock_acquire+0xbf/0x1f0 _raw_spin_lock+0x2f/0x40 kernfs_find_and_get_node_by_id+0x1e/0x70 cgroup_get_from_id+0x21/0x240 bpf_cgroup_from_id+0xe/0x20 bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a bpf_trampoline_6442545632+0x4f/0x1000 __set_cpus_allowed_ptr_locked+0x5/0x5a0 sched_setaffinity+0x1b3/0x290 __x64_sys_sched_setaffinity+0x4f/0x60 do_syscall_64+0x40/0xe0 entry_SYSCALL_64_after_hwframe+0x46/0x4e Let's fix it by protecting kernfs_node and kernfs_root with RCU and making kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of kernfs_idr_lock. This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit. Combined with the preceding rearrange patch, the net increase is 8 bytes. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Andrea Righi <andrea.righi@canonical.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
177 lines
4.6 KiB
C
177 lines
4.6 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * fs/kernfs/kernfs-internal.h - kernfs internal header file
 *
 * Copyright (c) 2001-3 Patrick Mochel
 * Copyright (c) 2007 SUSE Linux Products GmbH
 * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
 */
|
|
|
|
#ifndef __KERNFS_INTERNAL_H
|
|
#define __KERNFS_INTERNAL_H
|
|
|
|
#include <linux/lockdep.h>
|
|
#include <linux/fs.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/rwsem.h>
|
|
#include <linux/xattr.h>
|
|
|
|
#include <linux/kernfs.h>
|
|
#include <linux/fs_context.h>
|
|
|
|
struct kernfs_iattrs {
|
|
kuid_t ia_uid;
|
|
kgid_t ia_gid;
|
|
struct timespec64 ia_atime;
|
|
struct timespec64 ia_mtime;
|
|
struct timespec64 ia_ctime;
|
|
|
|
struct simple_xattrs xattrs;
|
|
atomic_t nr_user_xattrs;
|
|
atomic_t user_xattr_size;
|
|
};
|
|
|
|
struct kernfs_root {
|
|
/* published fields */
|
|
struct kernfs_node *kn;
|
|
unsigned int flags; /* KERNFS_ROOT_* flags */
|
|
|
|
/* private fields, do not use outside kernfs proper */
|
|
struct idr ino_idr;
|
|
u32 last_id_lowbits;
|
|
u32 id_highbits;
|
|
struct kernfs_syscall_ops *syscall_ops;
|
|
|
|
/* list of kernfs_super_info of this root, protected by kernfs_rwsem */
|
|
struct list_head supers;
|
|
|
|
wait_queue_head_t deactivate_waitq;
|
|
struct rw_semaphore kernfs_rwsem;
|
|
struct rw_semaphore kernfs_iattr_rwsem;
|
|
struct rw_semaphore kernfs_supers_rwsem;
|
|
|
|
struct rcu_head rcu;
|
|
};
|
|
|
|
/*
 * Bias constant set to one above INT_MIN: negating INT_MIN itself would
 * overflow (and trigger an overflow warning), whereas -(INT_MIN + 1) is
 * exactly INT_MAX.
 */
#define KN_DEACTIVATED_BIAS		(INT_MIN + 1)
|
|
|
|
/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
|
|
|
|
/**
|
|
* kernfs_root - find out the kernfs_root a kernfs_node belongs to
|
|
* @kn: kernfs_node of interest
|
|
*
|
|
* Return: the kernfs_root @kn belongs to.
|
|
*/
|
|
static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
|
|
{
|
|
/* if parent exists, it's always a dir; otherwise, @sd is a dir */
|
|
if (kn->parent)
|
|
kn = kn->parent;
|
|
return kn->dir.root;
|
|
}
|
|
|
|
/*
|
|
* mount.c
|
|
*/
|
|
struct kernfs_super_info {
|
|
struct super_block *sb;
|
|
|
|
/*
|
|
* The root associated with this super_block. Each super_block is
|
|
* identified by the root and ns it's associated with.
|
|
*/
|
|
struct kernfs_root *root;
|
|
|
|
/*
|
|
* Each sb is associated with one namespace tag, currently the
|
|
* network namespace of the task which mounted this kernfs
|
|
* instance. If multiple tags become necessary, make the following
|
|
* an array and compare kernfs_node tag against every entry.
|
|
*/
|
|
const void *ns;
|
|
|
|
/* anchored at kernfs_root->supers, protected by kernfs_rwsem */
|
|
struct list_head node;
|
|
};
|
|
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
|
|
|
|
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
|
|
{
|
|
if (d_really_is_negative(dentry))
|
|
return NULL;
|
|
return d_inode(dentry)->i_private;
|
|
}
|
|
|
|
static inline void kernfs_set_rev(struct kernfs_node *parent,
|
|
struct dentry *dentry)
|
|
{
|
|
dentry->d_time = parent->dir.rev;
|
|
}
|
|
|
|
static inline void kernfs_inc_rev(struct kernfs_node *parent)
|
|
{
|
|
parent->dir.rev++;
|
|
}
|
|
|
|
static inline bool kernfs_dir_changed(struct kernfs_node *parent,
|
|
struct dentry *dentry)
|
|
{
|
|
if (parent->dir.rev != dentry->d_time)
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
/* Objects declared here and defined elsewhere in kernfs. */
extern const struct super_operations kernfs_sops;
extern struct kmem_cache *kernfs_node_cache;
extern struct kmem_cache *kernfs_iattrs_cache;
|
|
|
|
/*
|
|
* inode.c
|
|
*/
|
|
extern const struct xattr_handler * const kernfs_xattr_handlers[];
|
|
void kernfs_evict_inode(struct inode *inode);
|
|
int kernfs_iop_permission(struct mnt_idmap *idmap,
|
|
struct inode *inode, int mask);
|
|
int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
|
|
struct iattr *iattr);
|
|
int kernfs_iop_getattr(struct mnt_idmap *idmap,
|
|
const struct path *path, struct kstat *stat,
|
|
u32 request_mask, unsigned int query_flags);
|
|
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
|
|
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
|
|
|
|
/*
|
|
* dir.c
|
|
*/
|
|
extern const struct dentry_operations kernfs_dops;
|
|
extern const struct file_operations kernfs_dir_fops;
|
|
extern const struct inode_operations kernfs_dir_iops;
|
|
|
|
struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
|
|
void kernfs_put_active(struct kernfs_node *kn);
|
|
int kernfs_add_one(struct kernfs_node *kn);
|
|
struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
|
|
const char *name, umode_t mode,
|
|
kuid_t uid, kgid_t gid,
|
|
unsigned flags);
|
|
|
|
/*
 * file.c
 */
extern const struct file_operations kernfs_file_fops;

bool kernfs_should_drain_open_files(struct kernfs_node *kn);
void kernfs_drain_open_files(struct kernfs_node *kn);
|
|
|
|
/*
 * symlink.c
 */
extern const struct inode_operations kernfs_symlink_iops;
|
|
|
|
/*
 * kernfs locks
 */
extern struct kernfs_global_locks *kernfs_locks;
|
|
#endif /* __KERNFS_INTERNAL_H */
|