From f2bfe7e83765f3bd84382cc75d8ac3ca619de39a Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Sat, 8 Jun 2024 22:32:40 -0400
Subject: [PATCH] bcachefs: Rip out freelists from btree key cache

btree key cache objects are now freed directly, via call_srcu(): a
struct rcu_head is embedded in struct bkey_cached, and
bkey_cached_free() queues the object to be freed once the
btree_trans_barrier SRCU grace period has elapsed, instead of parking
it on per-cpu and per-cache freelists to be reaped later.

This deletes all the freelist bookkeeping (freed_pcpu, freed_nonpcpu,
pcpu_freed) and the shrinker code that drained it. bkey_cached_evict()
now reports whether it actually removed the key from the hash table, so
racing evictions free an object only once; callers that lose the race
just drop their locks.

Signed-off-by: Kent Overstreet
---
 fs/bcachefs/btree_key_cache.c       | 369 +++++-----------------------
 fs/bcachefs/btree_key_cache_types.h |  14 --
 fs/bcachefs/btree_types.h           |   4 +-
 3 files changed, 57 insertions(+), 330 deletions(-)

diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index fda7998734cb..dfaeb0810c5e 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -79,14 +79,24 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
 	return true;
 }
 
-static void bkey_cached_evict(struct btree_key_cache *c,
+static bool bkey_cached_evict(struct btree_key_cache *c,
 			      struct bkey_cached *ck)
 {
-	BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash,
-				      bch2_btree_key_cache_params));
-	memset(&ck->key, ~0, sizeof(ck->key));
+	bool ret = !rhashtable_remove_fast(&c->table, &ck->hash,
+					   bch2_btree_key_cache_params);
+	if (ret) {
+		memset(&ck->key, ~0, sizeof(ck->key));
+		atomic_long_dec(&c->nr_keys);
+	}
 
-	atomic_long_dec(&c->nr_keys);
+	return ret;
+}
+
+static void __bkey_cached_free(struct rcu_head *rcu)
+{
+	struct bkey_cached *ck = container_of(rcu, struct bkey_cached, rcu);
+
+	kmem_cache_free(bch2_key_cache, ck);
 }
 
 static void bkey_cached_free(struct btree_key_cache *bc,
@@ -94,115 +104,14 @@ static void bkey_cached_free(struct btree_key_cache *bc,
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	if (ck->c.lock.readers) {
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-	} else {
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-	}
-	atomic_long_inc(&bc->nr_freed);
-
 	kfree(ck->k);
 	ck->k		= NULL;
 	ck->u64s	= 0;
 
 	six_unlock_write(&ck->c.lock);
 	six_unlock_intent(&ck->c.lock);
-}
 
-#ifdef __KERNEL__
-static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
-						   struct bkey_cached *ck)
-{
-	struct bkey_cached *pos;
-
-	bc->nr_freed_nonpcpu++;
-
-	list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
-		if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
-				 pos->btree_trans_barrier_seq)) {
-			list_move(&ck->list, &pos->list);
-			return;
-		}
-	}
-
-	list_move(&ck->list, &bc->freed_nonpcpu);
-}
-#endif
-
-static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
-					 struct bkey_cached *ck)
-{
-	BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
-
-	if (!ck->c.lock.readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-		bool freed = false;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-
-		if (f->nr < ARRAY_SIZE(f->objs)) {
-			f->objs[f->nr++] = ck;
-			freed = true;
-		}
-		preempt_enable();
-
-		if (!freed) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (f->nr > ARRAY_SIZE(f->objs) / 2) {
-				struct bkey_cached *ck2 = f->objs[--f->nr];
-
-				__bkey_cached_move_to_freelist_ordered(bc, ck2);
-			}
-			preempt_enable();
-
-			__bkey_cached_move_to_freelist_ordered(bc, ck);
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_nonpcpu);
-		bc->nr_freed_nonpcpu++;
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		list_move_tail(&ck->list, &bc->freed_pcpu);
-		bc->nr_freed_pcpu++;
-		mutex_unlock(&bc->lock);
-	}
-}
-
-static void bkey_cached_free_fast(struct btree_key_cache *bc,
-				  struct bkey_cached *ck)
-{
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
-	ck->btree_trans_barrier_seq =
-		start_poll_synchronize_srcu(&c->btree_trans_barrier);
-
-	list_del_init(&ck->list);
-	atomic_long_inc(&bc->nr_freed);
-
-	kfree(ck->k);
-	ck->k		= NULL;
-	ck->u64s	= 0;
-
-	bkey_cached_move_to_freelist(bc, ck);
-
-	six_unlock_write(&ck->c.lock);
-	six_unlock_intent(&ck->c.lock);
+
+	call_srcu(&c->btree_trans_barrier, &ck->rcu, __bkey_cached_free);
 }
 
 static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
@@ -222,78 +131,10 @@ static struct bkey_cached *
 bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned key_u64s)
 {
-	struct bch_fs *c = trans->c;
-	struct btree_key_cache *bc = &c->btree_key_cache;
-	struct bkey_cached *ck = NULL;
 	bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id);
 	int ret;
 
-	if (!pcpu_readers) {
-#ifdef __KERNEL__
-		struct btree_key_cache_freelist *f;
-
-		preempt_disable();
-		f = this_cpu_ptr(bc->pcpu_freed);
-		if (f->nr)
-			ck = f->objs[--f->nr];
-		preempt_enable();
-
-		if (!ck) {
-			mutex_lock(&bc->lock);
-			preempt_disable();
-			f = this_cpu_ptr(bc->pcpu_freed);
-
-			while (!list_empty(&bc->freed_nonpcpu) &&
-			       f->nr < ARRAY_SIZE(f->objs) / 2) {
-				ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-				list_del_init(&ck->list);
-				bc->nr_freed_nonpcpu--;
-				f->objs[f->nr++] = ck;
-			}
-
-			ck = f->nr ? f->objs[--f->nr] : NULL;
-			preempt_enable();
-			mutex_unlock(&bc->lock);
-		}
-#else
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_nonpcpu)) {
-			ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_nonpcpu--;
-		}
-		mutex_unlock(&bc->lock);
-#endif
-	} else {
-		mutex_lock(&bc->lock);
-		if (!list_empty(&bc->freed_pcpu)) {
-			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
-			list_del_init(&ck->list);
-			bc->nr_freed_pcpu--;
-		}
-		mutex_unlock(&bc->lock);
-	}
-
-	if (ck) {
-		ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
-		if (unlikely(ret)) {
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		btree_path_cached_set(trans, path, ck, BTREE_NODE_INTENT_LOCKED);
-
-		ret = bch2_btree_node_lock_write(trans, path, &ck->c);
-		if (unlikely(ret)) {
-			btree_node_unlock(trans, path, 0);
-			bkey_cached_move_to_freelist(bc, ck);
-			return ERR_PTR(ret);
-		}
-
-		return ck;
-	}
-
-	ck = allocate_dropping_locks(trans, ret,
+	struct bkey_cached *ck = allocate_dropping_locks(trans, ret,
 			__bkey_cached_alloc(key_u64s, _gfp));
 	if (ret) {
 		if (ck)
@@ -305,7 +146,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
 	if (!ck)
 		return NULL;
 
-	INIT_LIST_HEAD(&ck->list);
 	bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
 	ck->c.cached = true;
@@ -322,21 +162,21 @@ bkey_cached_reuse(struct btree_key_cache *c)
 	struct bkey_cached *ck;
 	unsigned i;
 
-	mutex_lock(&c->lock);
 	rcu_read_lock();
 	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
 	for (i = 0; i < tbl->size; i++)
 		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
 			if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
 			    bkey_cached_lock_for_evict(ck)) {
-				bkey_cached_evict(c, ck);
-				goto out;
+				if (bkey_cached_evict(c, ck))
+					goto out;
+				six_unlock_write(&ck->c.lock);
+				six_unlock_intent(&ck->c.lock);
 			}
 		}
 	ck = NULL;
 out:
 	rcu_read_unlock();
-	mutex_unlock(&c->lock);
 
 	return ck;
 }
@@ -415,7 +255,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *
 	path->uptodate = BTREE_ITER_UPTODATE;
 	return 0;
 err:
-	bkey_cached_free_fast(bc, ck);
+	bkey_cached_free(bc, ck);
 	mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
 
 	return ret;
@@ -611,8 +451,12 @@ evict:
 		}
 
 		mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
-		bkey_cached_evict(&c->btree_key_cache, ck);
-		bkey_cached_free_fast(&c->btree_key_cache, ck);
+		if (bkey_cached_evict(&c->btree_key_cache, ck)) {
+			bkey_cached_free(&c->btree_key_cache, ck);
+		} else {
+			six_unlock_write(&ck->c.lock);
+			six_unlock_intent(&ck->c.lock);
+		}
 	}
 out:
 	bch2_trans_iter_exit(trans, &b_iter);
@@ -722,7 +566,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
 	}
 
 	bkey_cached_evict(bc, ck);
-	bkey_cached_free_fast(bc, ck);
+	bkey_cached_free(bc, ck);
 
 	mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
 	btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
@@ -735,48 +579,14 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 	struct bch_fs *c = shrink->private_data;
 	struct btree_key_cache *bc = &c->btree_key_cache;
 	struct bucket_table *tbl;
-	struct bkey_cached *ck, *t;
+	struct bkey_cached *ck;
 	size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
-	unsigned start, flags;
+	unsigned iter, start;
 	int srcu_idx;
 
-	mutex_lock(&bc->lock);
-	bc->requested_to_free += sc->nr_to_scan;
-
 	srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-	flags = memalloc_nofs_save();
-
-	/*
-	 * Newest freed entries are at the end of the list - once we hit one
-	 * that's too new to be freed, we can bail out:
-	 */
-	list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		bc->nr_freed_nonpcpu--;
-		bc->freed++;
-	}
-
-	list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
-		if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
-						 ck->btree_trans_barrier_seq))
-			break;
-
-		list_del(&ck->list);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-		atomic_long_dec(&bc->nr_freed);
-		bc->nr_freed_pcpu--;
-		bc->freed++;
-	}
-
 	rcu_read_lock();
+
 	tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
 
 	/*
@@ -792,17 +602,18 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 		return SHRINK_STOP;
 	}
 
-	if (bc->shrink_iter >= tbl->size)
-		bc->shrink_iter = 0;
-	start = bc->shrink_iter;
+	iter = bc->shrink_iter;
+	if (iter >= tbl->size)
+		iter = 0;
+	start = iter;
 
 	do {
 		struct rhash_head *pos, *next;
 
-		pos = rht_ptr_rcu(&tbl->buckets[bc->shrink_iter]);
+		pos = rht_ptr_rcu(&tbl->buckets[iter]);
 
 		while (!rht_is_a_nulls(pos)) {
-			next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
+			next = rht_dereference_bucket_rcu(pos->next, tbl, iter);
 			ck = container_of(pos, struct bkey_cached, hash);
 
 			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -812,29 +623,31 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
 				bc->skipped_accessed++;
 			} else if (!bkey_cached_lock_for_evict(ck)) {
 				bc->skipped_lock_fail++;
-			} else {
-				bkey_cached_evict(bc, ck);
+			} else if (bkey_cached_evict(bc, ck)) {
 				bkey_cached_free(bc, ck);
-				bc->moved_to_freelist++;
+				bc->freed++;
 				freed++;
+			} else {
+				six_unlock_write(&ck->c.lock);
+				six_unlock_intent(&ck->c.lock);
 			}
 
 			scanned++;
 			if (scanned >= nr)
-				break;
+				goto out;
 
 			pos = next;
 		}
 
-		bc->shrink_iter++;
-		if (bc->shrink_iter >= tbl->size)
-			bc->shrink_iter = 0;
-	} while (scanned < nr && bc->shrink_iter != start);
+		iter++;
+		if (iter >= tbl->size)
+			iter = 0;
+	} while (scanned < nr && iter != start);
+out:
+	bc->shrink_iter = iter;
 
 	rcu_read_unlock();
-	memalloc_nofs_restore(flags);
 	srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
-	mutex_unlock(&bc->lock);
 
 	return freed;
 }
@@ -862,18 +675,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 {
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 	struct bucket_table *tbl;
-	struct bkey_cached *ck, *n;
+	struct bkey_cached *ck;
 	struct rhash_head *pos;
 	LIST_HEAD(items);
 	unsigned i;
-#ifdef __KERNEL__
-	int cpu;
-#endif
 
 	shrinker_free(bc->shrink);
 
-	mutex_lock(&bc->lock);
-
 	/*
 	 * The loop is needed to guard against racing with rehash:
 	 */
@@ -892,44 +700,14 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 		for (i = 0; i < tbl->size; i++)
 			while (pos = rht_ptr_rcu(&tbl->buckets[i]), !rht_is_a_nulls(pos)) {
 				ck = container_of(pos, struct bkey_cached, hash);
-				bkey_cached_evict(bc, ck);
-				list_add(&ck->list, &items);
+				BUG_ON(!bkey_cached_evict(bc, ck));
+				kfree(ck->k);
+				kmem_cache_free(bch2_key_cache, ck);
 			}
 		}
 		rcu_read_unlock();
 	}
 
-#ifdef __KERNEL__
-	if (bc->pcpu_freed) {
-		for_each_possible_cpu(cpu) {
-			struct btree_key_cache_freelist *f =
-				per_cpu_ptr(bc->pcpu_freed, cpu);
-
-			for (i = 0; i < f->nr; i++) {
-				ck = f->objs[i];
-				list_add(&ck->list, &items);
-			}
-		}
-	}
-#endif
-
-	BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
-	BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
-
-	list_splice(&bc->freed_pcpu, &items);
-	list_splice(&bc->freed_nonpcpu, &items);
-
-	mutex_unlock(&bc->lock);
-
-	list_for_each_entry_safe(ck, n, &items, list) {
-		cond_resched();
-
-		list_del(&ck->list);
-		kfree(ck->k);
-		six_lock_exit(&ck->c.lock);
-		kmem_cache_free(bch2_key_cache, ck);
-	}
-
 	if (atomic_long_read(&bc->nr_dirty) &&
 	    !bch2_journal_error(&c->journal) &&
 	    test_bit(BCH_FS_was_rw, &c->flags))
@@ -942,15 +720,10 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 
 	if (bc->table_init_done)
 		rhashtable_destroy(&bc->table);
-
-	free_percpu(bc->pcpu_freed);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 {
-	mutex_init(&c->lock);
-	INIT_LIST_HEAD(&c->freed_pcpu);
-	INIT_LIST_HEAD(&c->freed_nonpcpu);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
@@ -958,12 +731,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 	struct shrinker *shrink;
 
-#ifdef __KERNEL__
-	bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
-	if (!bc->pcpu_freed)
-		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
-#endif
-
 	if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
 		return -BCH_ERR_ENOMEM_fs_btree_cache_init;
 
@@ -984,45 +751,19 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *bc)
 {
-	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
-
 	printbuf_tabstop_push(out, 24);
 	printbuf_tabstop_push(out, 12);
 
-	unsigned flags = memalloc_nofs_save();
-	mutex_lock(&bc->lock);
 	prt_printf(out, "keys:\t%lu\r\n",		atomic_long_read(&bc->nr_keys));
 	prt_printf(out, "dirty:\t%lu\r\n",		atomic_long_read(&bc->nr_dirty));
-	prt_printf(out, "freelist:\t%lu\r\n",		atomic_long_read(&bc->nr_freed));
-	prt_printf(out, "nonpcpu freelist:\t%zu\r\n",	bc->nr_freed_nonpcpu);
-	prt_printf(out, "pcpu freelist:\t%zu\r\n",	bc->nr_freed_pcpu);
+	prt_printf(out, "table size:\t%u\r\n",		bc->table.tbl->size);
 
 	prt_printf(out, "\nshrinker:\n");
 	prt_printf(out, "requested_to_free:\t%lu\r\n",	bc->requested_to_free);
 	prt_printf(out, "freed:\t%lu\r\n",		bc->freed);
-	prt_printf(out, "moved_to_freelist:\t%lu\r\n",	bc->moved_to_freelist);
 	prt_printf(out, "skipped_dirty:\t%lu\r\n",	bc->skipped_dirty);
 	prt_printf(out, "skipped_accessed:\t%lu\r\n",	bc->skipped_accessed);
 	prt_printf(out, "skipped_lock_fail:\t%lu\r\n",	bc->skipped_lock_fail);
-
-	prt_printf(out, "srcu seq:\t%lu\r\n",		get_state_synchronize_srcu(&c->btree_trans_barrier));
-
-	struct bkey_cached *ck;
-	unsigned iter = 0;
-	list_for_each_entry(ck, &bc->freed_nonpcpu, list) {
-		prt_printf(out, "freed_nonpcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-
-	iter = 0;
-	list_for_each_entry(ck, &bc->freed_pcpu, list) {
-		prt_printf(out, "freed_pcpu:\t%lu\r\n", ck->btree_trans_barrier_seq);
-		if (++iter > 10)
-			break;
-	}
-	mutex_unlock(&bc->lock);
-	memalloc_flags_restore(flags);
 }
 
 void bch2_btree_key_cache_exit(void)
diff --git a/fs/bcachefs/btree_key_cache_types.h b/fs/bcachefs/btree_key_cache_types.h
index 237e8bb3ac40..e026c65f54e1 100644
--- a/fs/bcachefs/btree_key_cache_types.h
+++ b/fs/bcachefs/btree_key_cache_types.h
@@ -2,33 +2,19 @@
 #ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
 #define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
 
-struct btree_key_cache_freelist {
-	struct bkey_cached	*objs[16];
-	unsigned		nr;
-};
-
 struct btree_key_cache {
-	struct mutex		lock;
 	struct rhashtable	table;
 	bool			table_init_done;
 
-	struct list_head	freed_pcpu;
-	size_t			nr_freed_pcpu;
-	struct list_head	freed_nonpcpu;
-	size_t			nr_freed_nonpcpu;
-
 	struct shrinker		*shrink;
 	unsigned		shrink_iter;
-	struct btree_key_cache_freelist __percpu *pcpu_freed;
 
-	atomic_long_t		nr_freed;
 	atomic_long_t		nr_keys;
 	atomic_long_t		nr_dirty;
 
 	/* shrinker stats */
 	unsigned long		requested_to_free;
 	unsigned long		freed;
-	unsigned long		moved_to_freelist;
 	unsigned long		skipped_dirty;
 	unsigned long		skipped_accessed;
 	unsigned long		skipped_lock_fail;
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index b256b2a20a4f..bca56b6359e7 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -386,17 +386,17 @@ struct bkey_cached {
 	struct btree_bkey_cached_common c;
 
 	unsigned long		flags;
-	unsigned long		btree_trans_barrier_seq;
 	u16			u64s;
 	struct bkey_cached_key	key;
 
 	struct rhash_head	hash;
-	struct list_head	list;
 
 	struct journal_entry_pin journal;
 	u64			seq;
 
 	struct bkey_i		*k;
+
+	struct rcu_head		rcu;
 };
 
 static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
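
For readers unfamiliar with the SRCU API this patch switches to, here is a
minimal, self-contained sketch of the deferred-free pattern that
bkey_cached_free() now uses. It is not part of the patch: cached_obj,
obj_cache, obj_free() and obj_free_rcu() are invented names for
illustration; call_srcu(), container_of() and kmem_cache_free() are the
real kernel APIs involved.

	#include <linux/slab.h>
	#include <linux/srcu.h>

	struct cached_obj {
		struct rcu_head	rcu;	/* links the object onto the SRCU callback list */
	};

	static struct kmem_cache *obj_cache;

	/* Runs after the grace period: no SRCU reader can still see the object */
	static void obj_free_rcu(struct rcu_head *rcu)
	{
		struct cached_obj *obj = container_of(rcu, struct cached_obj, rcu);

		kmem_cache_free(obj_cache, obj);
	}

	/* Hand the object straight to SRCU instead of parking it on a freelist */
	static void obj_free(struct srcu_struct *barrier, struct cached_obj *obj)
	{
		call_srcu(barrier, &obj->rcu, obj_free_rcu);
	}

Letting SRCU drive the free replaces the old hand-rolled scheme, in which
objects sat on freelists tagged with start_poll_synchronize_srcu() sequence
numbers and were only reaped once poll_state_synchronize_srcu() reported
that the grace period had elapsed.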