1

vfs-6.11.inode

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZpEG2wAKCRCRxhvAZXjc
 ooW/AQDzyY+xNGt4OPMvlyFUHd5RcyiLsMhYrkKc3FaIFjesVgD+PFW5PPW12c0V
 Z4VHg9w1HDDuUn4XvELs7OXZpek7RgU=
 =eDC8
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs inode / dentry updates from Christian Brauner:
 "This contains smaller performance improvements to inodes and dentries:

  inode:

   - Add rcu based inode lookup variants.

     They avoid one inode hash lock acquire in the common case thereby
     significantly reducing contention. We already support RCU-based
     operations but didn't take advantage of them during inode
     insertion.

     Callers of iget_locked() get the improvement without any code
     changes. Callers that need a custom callback can switch to
     iget5_locked_rcu() as e.g., did btrfs.

     With 20 threads each walking a dedicated 1000 dirs * 1000 files
     directory tree to stat(2) on a 32 core + 24GB ram vm:

        before: 3.54s user 892.30s system 1966% cpu 45.549 total
        after:  3.28s user 738.66s system 1955% cpu 37.932 total (-16.7%)

     Long-term we should pick up the effort to introduce more
     fine-grained locking and possibly improve on the currently used
     hash implementation.

   - Start zeroing i_state in inode_init_always() instead of doing it in
     individual filesystems.

     This allows us to remove an unneeded lock acquire in new_inode()
     and not burden individual filesystems with this.

  dcache:

   - Move d_lockref out of the area used by RCU lookup to avoid
     cacheline ping poing because the embedded name is sharing a
     cacheline with d_lockref.

   - Fix dentry size on 32bit with CONFIG_SMP=y so it does actually end
     up with 128 bytes in total"

* tag 'vfs-6.11.inode' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  fs: fix dentry size
  vfs: move d_lockref out of the area used by RCU lookup
  bcachefs: remove now spurious i_state initialization
  xfs: remove now spurious i_state initialization in xfs_inode_alloc
  vfs: partially sanitize i_state zeroing on inode creation
  xfs: preserve i_state around inode_init_always in xfs_reinit_inode
  btrfs: use iget5_locked_rcu
  vfs: add rcu-based find_inode variants for iget ops
This commit is contained in:
Linus Torvalds 2024-07-15 11:39:44 -07:00
commit 2aae1d67fd
6 changed files with 99 additions and 33 deletions

View File

@ -244,7 +244,6 @@ static struct bch_inode_info *__bch2_new_inode(struct bch_fs *c)
inode->ei_flags = 0;
mutex_init(&inode->ei_quota_lock);
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
inode->v.i_state = 0;
if (unlikely(inode_init_always(c->vfs_sb, &inode->v))) {
kmem_cache_free(bch2_inode_cache, inode);

View File

@ -5587,7 +5587,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
args.ino = ino;
args.root = root;
inode = iget5_locked(s, hashval, btrfs_find_actor,
inode = iget5_locked_rcu(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;

View File

@ -162,6 +162,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
inode->i_state = 0;
atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
@ -231,6 +232,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
if (unlikely(security_inode_alloc(inode)))
return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
@ -886,36 +888,45 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
return freed;
}
static void __wait_on_freeing_inode(struct inode *inode);
static void __wait_on_freeing_inode(struct inode *inode, bool locked);
/*
* Called with the inode lock held.
*/
static struct inode *find_inode(struct super_block *sb,
struct hlist_head *head,
int (*test)(struct inode *, void *),
void *data)
void *data, bool locked)
{
struct inode *inode = NULL;
if (locked)
lockdep_assert_held(&inode_hash_lock);
else
lockdep_assert_not_held(&inode_hash_lock);
rcu_read_lock();
repeat:
hlist_for_each_entry(inode, head, i_hash) {
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_sb != sb)
continue;
if (!test(inode, data))
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
__wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
}
rcu_read_unlock();
return NULL;
}
@ -924,29 +935,39 @@ repeat:
* iget_locked for details.
*/
static struct inode *find_inode_fast(struct super_block *sb,
struct hlist_head *head, unsigned long ino)
struct hlist_head *head, unsigned long ino,
bool locked)
{
struct inode *inode = NULL;
if (locked)
lockdep_assert_held(&inode_hash_lock);
else
lockdep_assert_not_held(&inode_hash_lock);
rcu_read_lock();
repeat:
hlist_for_each_entry(inode, head, i_hash) {
hlist_for_each_entry_rcu(inode, head, i_hash) {
if (inode->i_ino != ino)
continue;
if (inode->i_sb != sb)
continue;
spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
__wait_on_freeing_inode(inode, locked);
goto repeat;
}
if (unlikely(inode->i_state & I_CREATING)) {
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return ERR_PTR(-ESTALE);
}
__iget(inode);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
return inode;
}
rcu_read_unlock();
return NULL;
}
@ -1004,14 +1025,7 @@ EXPORT_SYMBOL(get_next_ino);
*/
struct inode *new_inode_pseudo(struct super_block *sb)
{
struct inode *inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode->i_lock);
inode->i_state = 0;
spin_unlock(&inode->i_lock);
}
return inode;
return alloc_inode(sb);
}
/**
@ -1161,7 +1175,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
again:
spin_lock(&inode_hash_lock);
old = find_inode(inode->i_sb, head, test, data);
old = find_inode(inode->i_sb, head, test, data, true);
if (unlikely(old)) {
/*
* Uhhuh, somebody else created the same inode under us.
@ -1235,7 +1249,6 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
struct inode *new = alloc_inode(sb);
if (new) {
new->i_state = 0;
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
@ -1245,6 +1258,47 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
}
EXPORT_SYMBOL(iget5_locked);
/**
* iget5_locked_rcu - obtain an inode from a mounted file system
* @sb: super block of file system
* @hashval: hash value (usually inode number) to get
* @test: callback used for comparisons between inodes
* @set: callback used to initialize a new struct inode
* @data: opaque data pointer to pass to @test and @set
*
* This is equivalent to iget5_locked, except the @test callback must
* tolerate the inode not being stable, including being mid-teardown.
*/
struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *data)
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode, *new;
again:
inode = find_inode(sb, head, test, data, false);
if (inode) {
if (IS_ERR(inode))
return NULL;
wait_on_inode(inode);
if (unlikely(inode_unhashed(inode))) {
iput(inode);
goto again;
}
return inode;
}
new = alloc_inode(sb);
if (new) {
inode = inode_insert5(new, hashval, test, set, data);
if (unlikely(inode != new))
destroy_inode(new);
}
return inode;
}
EXPORT_SYMBOL_GPL(iget5_locked_rcu);
/**
* iget_locked - obtain an inode from a mounted file system
* @sb: super block of file system
@ -1263,9 +1317,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino, false);
if (inode) {
if (IS_ERR(inode))
return NULL;
@ -1283,7 +1335,7 @@ again:
spin_lock(&inode_hash_lock);
/* We released the lock, so.. */
old = find_inode_fast(sb, head, ino);
old = find_inode_fast(sb, head, ino, true);
if (!old) {
inode->i_ino = ino;
spin_lock(&inode->i_lock);
@ -1419,7 +1471,7 @@ struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
struct inode *inode;
spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
inode = find_inode(sb, head, test, data, true);
spin_unlock(&inode_hash_lock);
return IS_ERR(inode) ? NULL : inode;
@ -1474,7 +1526,7 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
struct inode *inode;
again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
inode = find_inode_fast(sb, head, ino, true);
spin_unlock(&inode_hash_lock);
if (inode) {
@ -2235,17 +2287,21 @@ EXPORT_SYMBOL(inode_needs_sync);
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
* will DTRT.
*/
static void __wait_on_freeing_inode(struct inode *inode)
static void __wait_on_freeing_inode(struct inode *inode, bool locked)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
wq = bit_waitqueue(&inode->i_state, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->i_lock);
spin_unlock(&inode_hash_lock);
rcu_read_unlock();
if (locked)
spin_unlock(&inode_hash_lock);
schedule();
finish_wait(wq, &wait.wq_entry);
spin_lock(&inode_hash_lock);
if (locked)
spin_lock(&inode_hash_lock);
rcu_read_lock();
}
static __initdata unsigned long ihash_entries;

View File

@ -86,9 +86,8 @@ xfs_inode_alloc(
return NULL;
}
/* VFS doesn't initialise i_mode or i_state! */
/* VFS doesn't initialise i_mode! */
VFS_I(ip)->i_mode = 0;
VFS_I(ip)->i_state = 0;
mapping_set_large_folios(VFS_I(ip)->i_mapping);
XFS_STATS_INC(mp, vn_active);
@ -314,6 +313,7 @@ xfs_reinit_inode(
dev_t dev = inode->i_rdev;
kuid_t uid = inode->i_uid;
kgid_t gid = inode->i_gid;
unsigned long state = inode->i_state;
error = inode_init_always(mp->m_super, inode);
@ -324,6 +324,7 @@ xfs_reinit_inode(
inode->i_rdev = dev;
inode->i_uid = uid;
inode->i_gid = gid;
inode->i_state = state;
mapping_set_large_folios(inode->i_mapping);
return error;
}

View File

@ -71,7 +71,7 @@ extern const struct qstr dotdot_name;
# define DNAME_INLINE_LEN 40 /* 192 bytes */
#else
# ifdef CONFIG_SMP
# define DNAME_INLINE_LEN 40 /* 128 bytes */
# define DNAME_INLINE_LEN 36 /* 128 bytes */
# else
# define DNAME_INLINE_LEN 44 /* 128 bytes */
# endif
@ -89,13 +89,18 @@ struct dentry {
struct inode *d_inode; /* Where the name belongs to - NULL is
* negative */
unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */
/* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */
/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op;
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
/* --- cacheline 2 boundary (128 bytes) --- */
struct lockref d_lockref; /* per-dentry lock and refcount
* keep separate from RCU lookup area if
* possible!
*/
union {
struct list_head d_lru; /* LRU list */

View File

@ -3047,7 +3047,12 @@ extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *),
void *data);
extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
struct inode *iget5_locked(struct super_block *, unsigned long,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *);
struct inode *iget5_locked_rcu(struct super_block *, unsigned long,
int (*test)(struct inode *, void *),
int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
unsigned long,