2017-03-16 23:18:50 -07:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
|
|
|
#include "bcachefs.h"
|
2024-01-22 22:01:07 -07:00
|
|
|
#include "bbpos.h"
|
2020-12-17 13:08:58 -07:00
|
|
|
#include "bkey_buf.h"
|
2017-03-16 23:18:50 -07:00
|
|
|
#include "btree_cache.h"
|
|
|
|
#include "btree_io.h"
|
|
|
|
#include "btree_iter.h"
|
|
|
|
#include "btree_locking.h"
|
|
|
|
#include "debug.h"
|
2022-07-17 20:06:38 -07:00
|
|
|
#include "errcode.h"
|
2021-01-26 18:59:00 -07:00
|
|
|
#include "error.h"
|
2023-11-30 21:32:20 -07:00
|
|
|
#include "journal.h"
|
2017-03-16 23:18:50 -07:00
|
|
|
#include "trace.h"
|
|
|
|
|
|
|
|
#include <linux/prefetch.h>
|
2019-06-11 18:03:23 -07:00
|
|
|
#include <linux/sched/mm.h>
|
2024-09-04 12:30:48 -07:00
|
|
|
#include <linux/swap.h>
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
/*
 * Bump the bc->not_freed[] statistic named by @counter, but only when the
 * enclosing function's `shrinker_counter` flag is set.
 *
 * Deliberately unhygienic: the expansion refers to the local names `bc` and
 * `shrinker_counter`, so both must be in scope at the point of use.
 */
#define BTREE_CACHE_NOT_FREED_INCREMENT(counter)			\
do {									\
	if (shrinker_counter)						\
		++bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter];	\
} while (0)
|
|
|
|
|
2022-02-26 09:10:20 -07:00
|
|
|
const char * const bch2_btree_node_flags[] = {
|
|
|
|
#define x(f) #f,
|
|
|
|
BTREE_FLAGS()
|
|
|
|
#undef x
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
void bch2_recalc_btree_reserve(struct bch_fs *c)
|
|
|
|
{
|
2024-09-05 16:25:01 -07:00
|
|
|
unsigned reserve = 16;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-06-28 19:09:13 -07:00
|
|
|
if (!c->btree_roots_known[0].b)
|
2017-03-16 23:18:50 -07:00
|
|
|
reserve += 8;
|
|
|
|
|
2024-09-05 16:25:01 -07:00
|
|
|
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
|
2023-06-28 19:09:13 -07:00
|
|
|
struct btree_root *r = bch2_btree_id_root(c, i);
|
|
|
|
|
|
|
|
if (r->b)
|
|
|
|
reserve += min_t(unsigned, 1, r->b->c.level) * 8;
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-09-05 16:25:01 -07:00
|
|
|
c->btree_cache.nr_reserve = reserve;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
static inline size_t btree_cache_can_free(struct btree_cache_list *list)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
|
|
|
|
|
|
|
|
size_t can_free = list->nr;
|
|
|
|
if (!list->idx)
|
|
|
|
can_free = max_t(ssize_t, 0, can_free - bc->nr_reserve);
|
|
|
|
return can_free;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
2022-03-04 17:16:04 -07:00
|
|
|
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
|
|
|
|
{
|
|
|
|
if (b->c.lock.readers)
|
|
|
|
list_move(&b->list, &bc->freed_pcpu);
|
|
|
|
else
|
|
|
|
list_move(&b->list, &bc->freed_nonpcpu);
|
|
|
|
}
|
|
|
|
|
2021-04-23 21:38:16 -07:00
|
|
|
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2021-04-23 21:38:16 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
BUG_ON(btree_node_hashed(b));
|
|
|
|
|
2024-09-04 12:30:48 -07:00
|
|
|
/*
|
|
|
|
* This should really be done in slub/vmalloc, but we're using the
|
|
|
|
* kmalloc_large() path, so we're working around a slub bug by doing
|
|
|
|
* this here:
|
|
|
|
*/
|
|
|
|
if (b->data)
|
|
|
|
mm_account_reclaimed_pages(btree_buf_bytes(b) / PAGE_SIZE);
|
|
|
|
if (b->aux_data)
|
|
|
|
mm_account_reclaimed_pages(btree_aux_data_bytes(b) / PAGE_SIZE);
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
EBUG_ON(btree_node_write_in_flight(b));
|
|
|
|
|
bcachefs: Clear btree_node_just_written() when node reused or evicted
This fixes the following bug:
Journal reclaim attempts to flush a node, but races with the node being
evicted from the btree node cache; when we lock the node, the data
buffers have already been freed.
We don't evict a node that's dirty, so calling btree_node_write() is
fine - it's a noop - except that the btree_node_just_written bit causes
bch2_btree_post_write_cleanup() to run (resorting the node), which then
causes a null ptr deref.
00078 Unable to handle kernel NULL pointer dereference at virtual address 000000000000009e
00078 Mem abort info:
00078 ESR = 0x0000000096000005
00078 EC = 0x25: DABT (current EL), IL = 32 bits
00078 SET = 0, FnV = 0
00078 EA = 0, S1PTW = 0
00078 FSC = 0x05: level 1 translation fault
00078 Data abort info:
00078 ISV = 0, ISS = 0x00000005
00078 CM = 0, WnR = 0
00078 user pgtable: 4k pages, 39-bit VAs, pgdp=000000007ed64000
00078 [000000000000009e] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
00078 Internal error: Oops: 0000000096000005 [#1] SMP
00078 Modules linked in:
00078 CPU: 75 PID: 1170 Comm: stress-ng-utime Not tainted 6.3.0-ktest-g5ef5b466e77e #2078
00078 Hardware name: linux,dummy-virt (DT)
00078 pstate: 80001005 (Nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--)
00078 pc : btree_node_sort+0xc4/0x568
00078 lr : bch2_btree_post_write_cleanup+0x6c/0x1c0
00078 sp : ffffff803e30b350
00078 x29: ffffff803e30b350 x28: 0000000000000001 x27: ffffff80076e52a8
00078 x26: 0000000000000002 x25: 0000000000000000 x24: ffffffc00912e000
00078 x23: ffffff80076e52a8 x22: 0000000000000000 x21: ffffff80076e52bc
00078 x20: ffffff80076e5200 x19: 0000000000000000 x18: 0000000000000000
00078 x17: fffffffff8000000 x16: 0000000008000000 x15: 0000000008000000
00078 x14: 0000000000000002 x13: 0000000000000000 x12: 00000000000000a0
00078 x11: ffffff803e30b400 x10: ffffff803e30b408 x9 : 0000000000000001
00078 x8 : 0000000000000000 x7 : ffffff803e480000 x6 : 00000000000000a0
00078 x5 : 0000000000000088 x4 : 0000000000000000 x3 : 0000000000000010
00078 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff80076e52a8
00078 Call trace:
00078 btree_node_sort+0xc4/0x568
00078 bch2_btree_post_write_cleanup+0x6c/0x1c0
00078 bch2_btree_node_write+0x108/0x148
00078 __btree_node_flush+0x104/0x160
00078 bch2_btree_node_flush0+0x1c/0x30
00078 journal_flush_pins.constprop.0+0x184/0x2d0
00078 __bch2_journal_reclaim+0x4d4/0x508
00078 bch2_journal_reclaim+0x1c/0x30
00078 __bch2_journal_preres_get+0x244/0x268
00078 bch2_trans_journal_preres_get_cold+0xa4/0x180
00078 __bch2_trans_commit+0x61c/0x1bb0
00078 bch2_setattr_nonsize+0x254/0x318
00078 bch2_setattr+0x5c/0x78
00078 notify_change+0x2bc/0x408
00078 vfs_utimes+0x11c/0x218
00078 do_utimes+0x84/0x140
00078 __arm64_sys_utimensat+0x68/0xa8
00078 invoke_syscall.constprop.0+0x54/0xf0
00078 do_el0_svc+0x48/0xd8
00078 el0_svc+0x14/0x48
00078 el0t_64_sync_handler+0xb0/0xb8
00078 el0t_64_sync+0x14c/0x150
00078 Code: 8b050265 910020c6 8b060266 910060ac (79402cad)
00078 ---[ end trace 0000000000000000 ]---
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-05-21 21:49:06 -07:00
|
|
|
clear_btree_node_just_written(b);
|
|
|
|
|
2024-02-01 04:35:46 -07:00
|
|
|
kvfree(b->data);
|
2017-03-16 23:18:50 -07:00
|
|
|
b->data = NULL;
|
2021-04-23 21:38:16 -07:00
|
|
|
#ifdef __KERNEL__
|
2020-07-25 12:07:37 -07:00
|
|
|
kvfree(b->aux_data);
|
2021-04-23 21:38:16 -07:00
|
|
|
#else
|
|
|
|
munmap(b->aux_data, btree_aux_data_bytes(b));
|
|
|
|
#endif
|
2020-07-25 12:07:37 -07:00
|
|
|
b->aux_data = NULL;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
bc->nr_freeable--;
|
2022-03-04 17:16:04 -07:00
|
|
|
|
|
|
|
btree_node_to_freedlist(bc, b);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
|
|
|
|
const void *obj)
|
|
|
|
{
|
|
|
|
const struct btree *b = obj;
|
|
|
|
const u64 *v = arg->key;
|
|
|
|
|
2020-02-18 15:15:32 -07:00
|
|
|
return b->hash_val == *v ? 0 : 1;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static const struct rhashtable_params bch_btree_cache_params = {
|
2024-06-06 15:56:59 -07:00
|
|
|
.head_offset = offsetof(struct btree, hash),
|
|
|
|
.key_offset = offsetof(struct btree, hash_val),
|
|
|
|
.key_len = sizeof(u64),
|
|
|
|
.obj_cmpfn = bch2_btree_cache_cmp_fn,
|
|
|
|
.automatic_shrinking = true,
|
2017-03-16 23:18:50 -07:00
|
|
|
};
|
|
|
|
|
2020-07-25 12:07:37 -07:00
|
|
|
/*
 * Allocate the data and aux_data buffers for @b, which must have neither.
 *
 * @gfp is extended with __GFP_ACCOUNT|__GFP_RECLAIMABLE for both allocations.
 *
 * Returns 0 on success. On failure returns
 * -BCH_ERR_ENOMEM_btree_node_mem_alloc with both buffer pointers NULL.
 */
static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
	BUG_ON(b->data || b->aux_data);

	gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;

	b->data = kvmalloc(btree_buf_bytes(b), gfp);
	if (!b->data)
		goto err;

#ifdef __KERNEL__
	b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
	b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
			   PROT_READ|PROT_WRITE|PROT_EXEC,
			   MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
	/* normalise mmap()'s failure value so the check below covers both builds */
	if (b->aux_data == MAP_FAILED)
		b->aux_data = NULL;
#endif
	if (b->aux_data)
		return 0;

	/* second allocation failed: undo the first */
	kvfree(b->data);
	b->data = NULL;
err:
	return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
|
|
|
|
|
2022-09-25 11:49:14 -07:00
|
|
|
/*
 * Allocate and minimally initialise a zeroed struct btree (no data buffers):
 * key, list heads, and byte_order derived from the btree_node_size option.
 *
 * Returns NULL if the allocation fails.
 */
static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(*b), gfp);

	if (!b)
		return NULL;

	bkey_btree_ptr_init(&b->key);
	INIT_LIST_HEAD(&b->list);
	INIT_LIST_HEAD(&b->write_blocked);
	b->byte_order = ilog2(c->opts.btree_node_size);

	return b;
}
|
|
|
|
|
2021-04-20 17:21:12 -07:00
|
|
|
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
|
2020-07-25 12:07:37 -07:00
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
2022-10-22 12:59:53 -07:00
|
|
|
struct btree *b;
|
|
|
|
|
|
|
|
b = __btree_node_mem_alloc(c, GFP_KERNEL);
|
2020-07-25 12:07:37 -07:00
|
|
|
if (!b)
|
|
|
|
return NULL;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-07-25 12:07:37 -07:00
|
|
|
if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
|
|
|
|
kfree(b);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2023-05-20 17:57:55 -07:00
|
|
|
bch2_btree_lock_init(&b->c, 0);
|
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
bc->nr_freeable++;
|
2020-07-25 12:07:37 -07:00
|
|
|
list_add(&b->list, &bc->freeable);
|
|
|
|
return b;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
2024-08-19 12:22:55 -07:00
|
|
|
void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
|
|
|
|
{
|
|
|
|
mutex_lock(&c->btree_cache.lock);
|
|
|
|
list_move(&b->list, &c->btree_cache.freeable);
|
|
|
|
mutex_unlock(&c->btree_cache.lock);
|
|
|
|
|
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
}
|
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
|
|
|
|
{
|
|
|
|
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
|
|
|
|
|
|
|
|
u64 mask = bc->pinned_nodes_mask[!!b->c.level];
|
|
|
|
|
|
|
|
return ((mask & BIT_ULL(b->c.btree_id)) &&
|
|
|
|
bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
|
|
|
|
bbpos_cmp(bc->pinned_nodes_end, pos) >= 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_node_pin(struct bch_fs *c, struct btree *b)
|
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
BUG_ON(!__btree_node_pinned(bc, b));
|
|
|
|
if (b != btree_node_root(c, b) && !btree_node_pinned(b)) {
|
|
|
|
set_btree_node_pinned(b);
|
|
|
|
list_move(&b->list, &bc->live[1].list);
|
|
|
|
bc->live[0].nr--;
|
|
|
|
bc->live[1].nr++;
|
|
|
|
}
|
|
|
|
mutex_unlock(&bc->lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_btree_cache_unpin(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b, *n;
|
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
c->btree_cache.pinned_nodes_mask[0] = 0;
|
|
|
|
c->btree_cache.pinned_nodes_mask[1] = 0;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(b, n, &bc->live[1].list, list) {
|
|
|
|
clear_btree_node_pinned(b);
|
|
|
|
list_move(&b->list, &bc->live[0].list);
|
|
|
|
bc->live[0].nr++;
|
|
|
|
bc->live[1].nr--;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&bc->lock);
|
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
/* Btree in memory cache - hash table */
|
|
|
|
|
|
|
|
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
|
|
|
{
|
2024-09-05 16:37:56 -07:00
|
|
|
lockdep_assert_held(&bc->lock);
|
2021-08-31 21:50:18 -07:00
|
|
|
int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
|
2022-10-22 12:59:53 -07:00
|
|
|
|
2021-08-31 21:50:18 -07:00
|
|
|
BUG_ON(ret);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
/* Cause future lookups for this node to fail: */
|
2020-02-18 15:15:32 -07:00
|
|
|
b->hash_val = 0;
|
2024-05-05 06:47:53 -07:00
|
|
|
|
|
|
|
if (b->c.btree_id < BTREE_ID_NR)
|
2024-09-05 16:25:01 -07:00
|
|
|
--bc->nr_by_btree[b->c.btree_id];
|
2024-09-05 16:37:56 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
bc->live[btree_node_pinned(b)].nr--;
|
2024-09-05 16:37:56 -07:00
|
|
|
bc->nr_freeable++;
|
|
|
|
list_move(&b->list, &bc->freeable);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
|
|
|
{
|
2020-02-18 15:15:32 -07:00
|
|
|
BUG_ON(b->hash_val);
|
|
|
|
b->hash_val = btree_ptr_hash_val(&b->key);
|
|
|
|
|
2024-05-05 06:47:53 -07:00
|
|
|
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
|
|
|
|
bch_btree_cache_params);
|
2024-09-05 16:37:56 -07:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
if (b->c.btree_id < BTREE_ID_NR)
|
2024-09-05 16:25:01 -07:00
|
|
|
bc->nr_by_btree[b->c.btree_id]++;
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
|
|
|
|
bool p = __btree_node_pinned(bc, b);
|
|
|
|
mod_bit(BTREE_NODE_pinned, &b->flags, p);
|
|
|
|
|
|
|
|
list_move_tail(&b->list, &bc->live[p].list);
|
|
|
|
bc->live[p].nr++;
|
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
bc->nr_freeable--;
|
|
|
|
return 0;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
|
|
|
|
unsigned level, enum btree_id id)
|
|
|
|
{
|
2020-06-06 09:28:01 -07:00
|
|
|
b->c.level = level;
|
|
|
|
b->c.btree_id = id;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
2024-09-05 16:37:56 -07:00
|
|
|
int ret = __bch2_btree_node_hash_insert(bc, b);
|
2017-03-16 23:18:50 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2024-04-07 16:07:09 -07:00
|
|
|
void bch2_btree_node_update_key_early(struct btree_trans *trans,
|
|
|
|
enum btree_id btree, unsigned level,
|
|
|
|
struct bkey_s_c old, struct bkey_i *new)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct btree *b;
|
|
|
|
struct bkey_buf tmp;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
bch2_bkey_buf_init(&tmp);
|
|
|
|
bch2_bkey_buf_reassemble(&tmp, c, old);
|
|
|
|
|
|
|
|
b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true);
|
|
|
|
if (!IS_ERR_OR_NULL(b)) {
|
|
|
|
mutex_lock(&c->btree_cache.lock);
|
|
|
|
|
|
|
|
bch2_btree_node_hash_remove(&c->btree_cache, b);
|
|
|
|
|
|
|
|
bkey_copy(&b->key, new);
|
|
|
|
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
mutex_unlock(&c->btree_cache.lock);
|
|
|
|
six_unlock_read(&b->c.lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
bch2_bkey_buf_exit(&tmp, c);
|
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
__flatten
|
|
|
|
static inline struct btree *btree_cache_find(struct btree_cache *bc,
|
|
|
|
const struct bkey_i *k)
|
|
|
|
{
|
2020-02-18 15:15:32 -07:00
|
|
|
u64 v = btree_ptr_hash_val(k);
|
|
|
|
|
|
|
|
return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this version is for btree nodes that have already been freed (we're not
|
|
|
|
* reaping a real btree node)
|
|
|
|
*/
|
2022-09-29 20:37:15 -07:00
|
|
|
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&bc->lock);
|
2021-07-10 20:03:15 -07:00
|
|
|
wait_on_io:
|
|
|
|
if (b->flags & ((1U << BTREE_NODE_dirty)|
|
|
|
|
(1U << BTREE_NODE_read_in_flight)|
|
|
|
|
(1U << BTREE_NODE_write_in_flight))) {
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!flush) {
|
|
|
|
if (btree_node_dirty(b))
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
|
|
|
|
else if (btree_node_read_in_flight(b))
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
|
|
|
|
else if (btree_node_write_in_flight(b))
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
|
2023-03-14 12:35:57 -07:00
|
|
|
return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
2021-07-10 20:03:15 -07:00
|
|
|
|
|
|
|
/* XXX: waiting on IO with btree cache lock held */
|
|
|
|
bch2_btree_node_wait_on_read(b);
|
|
|
|
bch2_btree_node_wait_on_write(b);
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!six_trylock_intent(&b->c.lock)) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
|
2023-03-14 12:35:57 -07:00
|
|
|
return -BCH_ERR_ENOMEM_btree_node_reclaim;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!six_trylock_write(&b->c.lock)) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
|
2017-03-16 23:18:50 -07:00
|
|
|
goto out_unlock_intent;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
/* recheck under lock */
|
|
|
|
if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
|
|
|
|
(1U << BTREE_NODE_write_in_flight))) {
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!flush) {
|
|
|
|
if (btree_node_read_in_flight(b))
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
|
|
|
|
else if (btree_node_write_in_flight(b))
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
|
2021-07-10 20:03:15 -07:00
|
|
|
goto out_unlock;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
2021-07-10 20:03:15 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
goto wait_on_io;
|
|
|
|
}
|
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
if (btree_node_noevict(b)) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
if (btree_node_write_blocked(b)) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
|
2017-03-16 23:18:50 -07:00
|
|
|
goto out_unlock;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
|
|
|
if (btree_node_will_make_reachable(b)) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
if (btree_node_dirty(b)) {
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!flush) {
|
|
|
|
BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
|
2017-03-16 23:18:50 -07:00
|
|
|
goto out_unlock;
|
2022-09-29 20:37:15 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
/*
|
|
|
|
* Using the underscore version because we don't want to compact
|
|
|
|
* bsets after the write, since this node is about to be evicted
|
|
|
|
* - unless btree verify mode is enabled, since it runs out of
|
|
|
|
* the post write cleanup:
|
|
|
|
*/
|
2020-11-02 16:20:44 -07:00
|
|
|
if (bch2_verify_btree_ondisk)
|
2022-10-28 14:08:41 -07:00
|
|
|
bch2_btree_node_write(c, b, SIX_LOCK_intent,
|
|
|
|
BTREE_WRITE_cache_reclaim);
|
2017-03-16 23:18:50 -07:00
|
|
|
else
|
2022-10-28 14:08:41 -07:00
|
|
|
__bch2_btree_node_write(c, b,
|
|
|
|
BTREE_WRITE_cache_reclaim);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
goto wait_on_io;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
out:
|
2020-02-18 15:15:32 -07:00
|
|
|
if (b->hash_val && !ret)
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(c, btree_cache_reap, c, b);
|
2017-03-16 23:18:50 -07:00
|
|
|
return ret;
|
|
|
|
out_unlock:
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
out_unlock_intent:
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_intent(&b->c.lock);
|
2023-03-14 12:35:57 -07:00
|
|
|
ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
|
2017-03-16 23:18:50 -07:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
/*
 * Non-flushing reclaim attempt: never waits for IO and never writes a dirty
 * node. See __btree_node_reclaim() for the return value and locking.
 */
static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
{
	const bool flush = false;

	return __btree_node_reclaim(c, b, flush, shrinker_counter);
}
|
|
|
|
|
|
|
|
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
|
|
|
{
|
2022-09-29 20:37:15 -07:00
|
|
|
return __btree_node_reclaim(c, b, true, false);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
|
|
struct shrink_control *sc)
|
|
|
|
{
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
struct btree_cache_list *list = shrink->private_data;
|
|
|
|
struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]);
|
|
|
|
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree *b, *t;
|
|
|
|
unsigned long nr = sc->nr_to_scan;
|
2022-04-03 17:36:32 -07:00
|
|
|
unsigned long can_free = 0;
|
2017-03-16 23:18:50 -07:00
|
|
|
unsigned long freed = 0;
|
2022-09-25 11:49:14 -07:00
|
|
|
unsigned long touched = 0;
|
2020-10-15 18:48:58 -07:00
|
|
|
unsigned i, flags;
|
2021-12-27 18:45:07 -07:00
|
|
|
unsigned long ret = SHRINK_STOP;
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-11-02 16:20:44 -07:00
|
|
|
if (bch2_btree_shrinker_disabled)
|
2017-03-16 23:18:50 -07:00
|
|
|
return SHRINK_STOP;
|
|
|
|
|
2022-09-25 11:49:14 -07:00
|
|
|
mutex_lock(&bc->lock);
|
2020-10-15 18:48:58 -07:00
|
|
|
flags = memalloc_nofs_save();
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
/*
|
|
|
|
* It's _really_ critical that we don't free too many btree nodes - we
|
|
|
|
* have to always leave ourselves a reserve. The reserve is how we
|
|
|
|
* guarantee that allocating memory for a new btree node can always
|
|
|
|
* succeed, so that inserting keys into the btree can always succeed and
|
|
|
|
* IO can always make forward progress:
|
|
|
|
*/
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
can_free = btree_cache_can_free(list);
|
2017-03-16 23:18:50 -07:00
|
|
|
nr = min_t(unsigned long, nr, can_free);
|
|
|
|
|
|
|
|
i = 0;
|
|
|
|
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
2021-12-27 20:11:54 -07:00
|
|
|
/*
|
|
|
|
* Leave a few nodes on the freeable list, so that a btree split
|
|
|
|
* won't have to hit the system allocator:
|
|
|
|
*/
|
|
|
|
if (++i <= 3)
|
|
|
|
continue;
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
touched++;
|
|
|
|
|
2021-11-11 13:50:22 -07:00
|
|
|
if (touched >= nr)
|
2022-09-25 11:49:14 -07:00
|
|
|
goto out;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!btree_node_reclaim(c, b, true)) {
|
2017-03-16 23:18:50 -07:00
|
|
|
btree_node_data_free(c, b);
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
freed++;
|
2024-09-05 16:25:01 -07:00
|
|
|
bc->nr_freed++;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
restart:
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
list_for_each_entry_safe(b, t, &list->list, list) {
|
2022-09-25 11:49:14 -07:00
|
|
|
touched++;
|
|
|
|
|
2022-03-03 09:04:01 -07:00
|
|
|
if (btree_node_accessed(b)) {
|
|
|
|
clear_btree_node_accessed(b);
|
2024-09-01 10:36:42 -07:00
|
|
|
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
|
2024-09-04 14:19:24 -07:00
|
|
|
--touched;;
|
2022-09-29 20:37:15 -07:00
|
|
|
} else if (!btree_node_reclaim(c, b, true)) {
|
2024-09-05 16:37:56 -07:00
|
|
|
bch2_btree_node_hash_remove(bc, b);
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
freed++;
|
|
|
|
btree_node_data_free(c, b);
|
2024-09-05 16:25:01 -07:00
|
|
|
bc->nr_freed++;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-09-25 11:49:14 -07:00
|
|
|
if (freed == nr)
|
|
|
|
goto out_rotate;
|
|
|
|
} else if (trigger_writes &&
|
|
|
|
btree_node_dirty(b) &&
|
|
|
|
!btree_node_will_make_reachable(b) &&
|
|
|
|
!btree_node_write_blocked(b) &&
|
|
|
|
six_trylock_read(&b->c.lock)) {
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
list_move(&list->list, &b->list);
|
2022-09-25 11:49:14 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
2022-10-28 14:08:41 -07:00
|
|
|
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
|
2022-09-25 11:49:14 -07:00
|
|
|
six_unlock_read(&b->c.lock);
|
|
|
|
if (touched >= nr)
|
|
|
|
goto out_nounlock;
|
|
|
|
mutex_lock(&bc->lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
goto restart;
|
2022-03-03 09:04:01 -07:00
|
|
|
}
|
|
|
|
|
2022-09-25 11:49:14 -07:00
|
|
|
if (touched >= nr)
|
2022-03-03 09:04:01 -07:00
|
|
|
break;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
2022-09-25 11:49:14 -07:00
|
|
|
out_rotate:
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
if (&t->list != &list->list)
|
|
|
|
list_move_tail(&list->list, &t->list);
|
2017-03-16 23:18:50 -07:00
|
|
|
out:
|
2022-09-25 11:49:14 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
|
|
|
out_nounlock:
|
2022-04-03 17:36:32 -07:00
|
|
|
ret = freed;
|
2020-11-11 16:59:41 -07:00
|
|
|
memalloc_nofs_restore(flags);
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret);
|
2021-12-27 18:45:07 -07:00
|
|
|
return ret;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Report how many btree nodes could be freed from one btree cache list.
 *
 * The list is taken from shrink->private_data.  Returns 0 when
 * bch2_btree_shrinker_disabled is set, otherwise the value computed by
 * btree_cache_can_free() for that list.  @sc is not used by this function.
 *
 * NOTE(review): the signature matches a shrinker count callback; the place
 * where it is installed is not visible from this function - confirm.
 */
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
|
|
|
struct shrink_control *sc)
|
|
|
|
{
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
struct btree_cache_list *list = shrink->private_data;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-11-02 16:20:44 -07:00
|
|
|
/* Shrinking switched off: claim there is nothing to free. */
if (bch2_btree_shrinker_disabled)
|
2017-03-16 23:18:50 -07:00
|
|
|
return 0;
|
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
return btree_cache_can_free(list);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * bch2_fs_btree_cache_exit - tear down the btree node cache of filesystem @c.
 *
 * Order of operations:
 *  1. free the shrinkers attached to bc->live[1] and bc->live[0];
 *  2. enter a memalloc_nofs_save() section and take bc->lock;
 *  3. put the verify node (if any) and every btree root on bc->live[0];
 *  4. call bch2_btree_node_hash_remove() on every node of both live lists;
 *  5. call btree_node_data_free() on every node of bc->freeable;
 *  6. splice bc->freed_pcpu into bc->freed_nonpcpu and kfree() each entry;
 *  7. drop the lock, leave the nofs section, and destroy the hash table if
 *     bc->table_init_done is set.
 *
 * The BUG_ON()s check that no node has a read or write in flight, that
 * nr_dirty is zero unless bch2_journal_error() returns nonzero, and that
 * every accounting counter is zero after teardown.
 */
void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
2024-09-05 16:37:56 -07:00
|
|
|
struct btree *b, *t;
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
unsigned long flags;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
/* Free the shrinker of each live list before touching the lists. */
shrinker_free(bc->live[1].shrink);
|
|
|
|
shrinker_free(bc->live[0].shrink);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-10-11 13:33:49 -07:00
|
|
|
/* vfree() can allocate memory: */
|
|
|
|
flags = memalloc_nofs_save();
|
2017-03-16 23:18:50 -07:00
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
|
|
|
|
/* Put the verify node on live[0] so the loops below also process it. */
if (c->verify_data)
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
list_move(&c->verify_data->list, &bc->live[0].list);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-02-01 04:35:46 -07:00
|
|
|
kvfree(c->verify_ondisk);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
/*
 * Add every btree root node to live[0] as well.
 * NOTE(review): list_add() rather than list_move() suggests roots are not
 * on any list at this point - confirm.
 */
for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
|
2023-06-28 19:09:13 -07:00
|
|
|
struct btree_root *r = bch2_btree_id_root(c, i);
|
|
|
|
|
|
|
|
if (r->b)
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
list_add(&r->b->list, &bc->live[0].list);
|
2023-06-28 19:09:13 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
/*
 * Unhash every node on both live lists (the _safe iterator is used because
 * the callee may unlink the node).
 * NOTE(review): bch2_btree_node_hash_remove() presumably also moves each
 * node onto bc->freeable, which the following loop consumes - confirm
 * against its definition.
 */
list_for_each_entry_safe(b, t, &bc->live[1].list, list)
|
|
|
|
bch2_btree_node_hash_remove(bc, b);
|
|
|
|
list_for_each_entry_safe(b, t, &bc->live[0].list, list)
|
2024-09-05 16:37:56 -07:00
|
|
|
bch2_btree_node_hash_remove(bc, b);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
/* No node may still have I/O in flight when its data is freed. */
list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
2017-03-16 23:18:50 -07:00
|
|
|
BUG_ON(btree_node_read_in_flight(b) ||
|
|
|
|
btree_node_write_in_flight(b));
|
|
|
|
|
|
|
|
btree_node_data_free(c, b);
|
|
|
|
}
|
|
|
|
|
2023-11-30 21:32:20 -07:00
|
|
|
/* A nonzero dirty count is tolerated only when the journal reports an error. */
BUG_ON(!bch2_journal_error(&c->journal) &&
|
2024-09-05 16:25:01 -07:00
|
|
|
atomic_long_read(&c->btree_cache.nr_dirty));
|
2020-11-09 11:01:52 -07:00
|
|
|
|
2022-03-04 17:16:04 -07:00
|
|
|
/* Merge both freed lists so a single loop releases every struct btree. */
list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu);
|
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
list_for_each_entry_safe(b, t, &bc->freed_nonpcpu, list) {
|
2017-03-16 23:18:50 -07:00
|
|
|
list_del(&b->list);
|
2023-05-20 17:57:55 -07:00
|
|
|
six_lock_exit(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
kfree(b);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&bc->lock);
|
2020-10-11 13:33:49 -07:00
|
|
|
memalloc_nofs_restore(flags);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-09-05 16:37:56 -07:00
|
|
|
/* After teardown every accounting counter must be zero. */
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
|
|
|
|
BUG_ON(bc->nr_by_btree[i]);
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
BUG_ON(bc->live[0].nr);
|
|
|
|
BUG_ON(bc->live[1].nr);
|
2024-09-05 16:37:56 -07:00
|
|
|
BUG_ON(bc->nr_freeable);
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
/* Destroy the hash table only if the table_init_done flag is set. */
if (bc->table_init_done)
|
|
|
|
rhashtable_destroy(&bc->table);
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch2_fs_btree_cache_init(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series "Fixes and cleanups to compaction".
- Joel Fernandes has a patchset ("Optimize mremap during mutual
alignment within PMD") which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested.
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in the
following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series "Do not try to access unaccepted memory" Adrian Hunter
provides some fixups for the recently-added "unaccepted memory" feature.
To increase the feature's checking coverage. "Plug a few gaps where
RAM is exposed without checking if it is unaccepted memory".
- In the series "cleanups for lockless slab shrink" Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code.
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series "use refcount+RCU method to implement
lockless slab shrink".
- David Hildenbrand contributes some maintenance work for the rmap code
in the series "Anon rmap cleanups".
- Kefeng Wang does more folio conversions and some maintenance work in
the migration code. Series "mm: migrate: more folio conversion and
unification".
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series "Add and use bdev_getblk()".
- In the series "Use nth_page() in place of direct struct page
manipulation" Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames.
- In the series "mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO" has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of gigantic
pages are in use.
- Matthew Wilcox has sent the series "Small hugetlb cleanups" - code
rationalization and folio conversions in the hugetlb code.
- Yin Fengwei has improved mlock()'s handling of large folios in the
series "support large folio for mlock"
- In the series "Expose swapcache stat for memcg v1" Liu Shixin has
added statistics for memcg v1 users which are available (and useful)
under memcg v2.
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named "MDWE
without inheritance".
- Kefeng Wang has provided the series "mm: convert numa balancing
functions to use a folio" which does what it says.
- In the series "mm/ksm: add fork-exec support for prctl" Stefan Roesch
makes it possible for a process to propagate KSM treatment across
exec().
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use "high
bandwidth memory" in addition to Optane Data Center Persistent Memory
Modules (DCPMM). The series is named "memory tiering: calculate
abstract distance based on ACPI HMAT"
- In the series "Smart scanning mode for KSM" Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans.
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in the
series "mm: memcg: fix tracking of pending stats updates values".
- In the series "Implement IOCTL to get and optionally clear info about
PTEs" Peter Xu has added an ioctl to /proc/<pid>/pagemap which permits
us to atomically read-then-clear page softdirty state. This is mainly
used by CRIU.
- Hugh Dickins contributed the series "shmem,tmpfs: general maintenance"
- a bunch of relatively minor maintenance tweaks to this code.
- Matthew Wilcox has increased the use of the VMA lock over file-backed
page faults in the series "Handle more faults under the VMA lock". Some
rationalizations of the fault path became possible as a result.
- In the series "mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()" David Hildenbrand has implemented some cleanups
and folio conversions.
- In the series "various improvements to the GUP interface" Lorenzo
Stoakes has simplified and improved the GUP interface with an eye to
providing groundwork for future improvements.
- Andrey Konovalov has sent along the series "kasan: assorted fixes and
improvements" which does those things.
- Some page allocator maintenance work from Kemeng Shi in the series
"Two minor cleanups to break_down_buddy_pages".
- In the series "New selftest for mm" Breno Leitao has developed
another MM self test which tickles a race we had between madvise() and
page faults.
- In the series "Add folio_end_read" Matthew Wilcox provides cleanups
and an optimization to the core pagecache code.
- Nhat Pham has added memcg accounting for hugetlb memory in the series
"hugetlb memcg accounting".
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series "Abstract vma_merge() and split_vma()".
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series "Fix page_owner's use of free timestamps".
- Lorenzo Stoakes has fixed the handling of new mappings of sealed files
in the series "permit write-sealed memfd read-only shared mappings".
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series "Batch hugetlb vmemmap modification operations".
- Some buffer_head folio conversions and cleanups from Matthew Wilcox in
the series "Finish the create_empty_buffers() transition".
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the series
"mm: PCP high auto-tuning".
- Roman Gushchin has contributed the patchset "mm: improve performance
of accounted kernel memory allocations" which improves their performance
by ~30% as measured by a micro-benchmark.
- folio conversions from Kefeng Wang in the series "mm: convert page
cpupid functions to folios".
- Some kmemleak fixups in Liu Shixin's series "Some bugfix about
kmemleak".
- Qi Zheng has improved our handling of memoryless nodes by keeping them
off the allocation fallback list. This is done in the series "handle
memoryless nodes more appropriately".
- khugepaged conversions from Vishal Moola in the series "Some
khugepaged folio conversions".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZULEMwAKCRDdBJ7gKXxA
jhQHAQCYpD3g849x69DmHnHWHm/EHQLvQmRMDeYZI+nx/sCJOwEAw4AKg0Oemv9y
FgeUPAD1oasg6CP+INZvCj34waNxwAc=
=E+Y4
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series 'Fixes and cleanups to compaction'
- Joel Fernandes has a patchset ('Optimize mremap during mutual
alignment within PMD') which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in
the following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series 'Do not try to access unaccepted memory' Adrian
Hunter provides some fixups for the recently-added 'unaccepted
memory' feature. To increase the feature's checking coverage. 'Plug
a few gaps where RAM is exposed without checking if it is
unaccepted memory'
- In the series 'cleanups for lockless slab shrink' Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series 'use refcount+RCU method to
implement lockless slab shrink'
- David Hildenbrand contributes some maintenance work for the rmap
code in the series 'Anon rmap cleanups'
- Kefeng Wang does more folio conversions and some maintenance work
in the migration code. Series 'mm: migrate: more folio conversion
and unification'
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series 'Add and use bdev_getblk()'
- In the series 'Use nth_page() in place of direct struct page
manipulation' Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames
- In the series 'mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO' has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of
gigantic pages are in use
- Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
rationalization and folio conversions in the hugetlb code
- Yin Fengwei has improved mlock()'s handling of large folios in the
series 'support large folio for mlock'
- In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
added statistics for memcg v1 users which are available (and
useful) under memcg v2
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named 'MDWE
without inheritance'
- Kefeng Wang has provided the series 'mm: convert numa balancing
functions to use a folio' which does what it says
- In the series 'mm/ksm: add fork-exec support for prctl' Stefan
Roesch makes it possible for a process to propagate KSM treatment
across exec()
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use 'high
bandwidth memory' in addition to Optane Data Center Persistent
Memory Modules (DCPMM). The series is named 'memory tiering:
calculate abstract distance based on ACPI HMAT'
- In the series 'Smart scanning mode for KSM' Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in
the series 'mm: memcg: fix tracking of pending stats updates
values'
- In the series 'Implement IOCTL to get and optionally clear info
about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
which permits us to atomically read-then-clear page softdirty
state. This is mainly used by CRIU
- Hugh Dickins contributed the series 'shmem,tmpfs: general
maintenance', a bunch of relatively minor maintenance tweaks to
this code
- Matthew Wilcox has increased the use of the VMA lock over
file-backed page faults in the series 'Handle more faults under the
VMA lock'. Some rationalizations of the fault path became possible
as a result
- In the series 'mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()' David Hildenbrand has implemented some
cleanups and folio conversions
- In the series 'various improvements to the GUP interface' Lorenzo
Stoakes has simplified and improved the GUP interface with an eye
to providing groundwork for future improvements
- Andrey Konovalov has sent along the series 'kasan: assorted fixes
and improvements' which does those things
- Some page allocator maintenance work from Kemeng Shi in the series
'Two minor cleanups to break_down_buddy_pages'
- In the series 'New selftest for mm' Breno Leitao has developed
another MM self test which tickles a race we had between madvise()
and page faults
- In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
and an optimization to the core pagecache code
- Nhat Pham has added memcg accounting for hugetlb memory in the
series 'hugetlb memcg accounting'
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series 'Abstract vma_merge() and split_vma()'
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series 'Fix page_owner's use of free timestamps'
- Lorenzo Stoakes has fixed the handling of new mappings of sealed
files in the series 'permit write-sealed memfd read-only shared
mappings'
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series 'Batch hugetlb vmemmap modification operations'
- Some buffer_head folio conversions and cleanups from Matthew Wilcox
in the series 'Finish the create_empty_buffers() transition'
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the
series 'mm: PCP high auto-tuning'
- Roman Gushchin has contributed the patchset 'mm: improve
performance of accounted kernel memory allocations' which improves
their performance by ~30% as measured by a micro-benchmark
- folio conversions from Kefeng Wang in the series 'mm: convert page
cpupid functions to folios'
- Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
kmemleak'
- Qi Zheng has improved our handling of memoryless nodes by keeping
them off the allocation fallback list. This is done in the series
'handle memoryless nodes more appropriately'
- khugepaged conversions from Vishal Moola in the series 'Some
khugepaged folio conversions'"
[ bcachefs conflicts with the dynamically allocated shrinkers have been
resolved as per Stephen Rothwell in
https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/
with help from Qi Zheng.
The clone3 test filtering conflict was half-arsed by yours truly ]
* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
mm/damon/sysfs: update monitoring target regions for online input commit
mm/damon/sysfs: remove requested targets when online-commit inputs
selftests: add a sanity check for zswap
Documentation: maple_tree: fix word spelling error
mm/vmalloc: fix the unchecked dereference warning in vread_iter()
zswap: export compression failure stats
Documentation: ubsan: drop "the" from article title
mempolicy: migration attempt to match interleave nodes
mempolicy: mmap_lock is not needed while migrating folios
mempolicy: alloc_pages_mpol() for NUMA policy without vma
mm: add page_rmappable_folio() wrapper
mempolicy: remove confusing MPOL_MF_LAZY dead code
mempolicy: mpol_shared_policy_init() without pseudo-vma
mempolicy trivia: use pgoff_t in shared mempolicy tree
mempolicy trivia: slightly more consistent naming
mempolicy trivia: delete those ancient pr_debug()s
mempolicy: fix migrate_pages(2) syscall return nr_failed
kernfs: drop shared NUMA mempolicy hooks
hugetlbfs: drop shared NUMA mempolicy pretence
mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
...
2023-11-02 22:38:47 -07:00
|
|
|
struct shrinker *shrink;
|
2017-03-16 23:18:50 -07:00
|
|
|
unsigned i;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
|
|
|
|
if (ret)
|
2023-07-07 01:38:29 -07:00
|
|
|
goto err;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
bc->table_init_done = true;
|
|
|
|
|
|
|
|
bch2_recalc_btree_reserve(c);
|
|
|
|
|
2024-09-05 16:25:01 -07:00
|
|
|
for (i = 0; i < bc->nr_reserve; i++)
|
2023-07-07 01:38:29 -07:00
|
|
|
if (!__bch2_btree_node_mem_alloc(c))
|
|
|
|
goto err;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
list_splice_init(&bc->live[0].list, &bc->freeable);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
mutex_init(&c->verify_lock);
|
|
|
|
|
Second bcachefs pull request for 6.7-rc1
Here's the second big bcachefs pull request. This brings your tree up to
date with my master branch, which is what existing bcachefs users are
currently running.
All but the last few patches have been in linux-next, those being small
fixes. Test results from my dashboard:
https://evilpiepirate.org/~testdashboard/ci?commit=c7046ed0cf9bb33599aa7e72e7b67bba4be42d64
New features:
- rebalance_work btree (and metadata version 1.3): the rebalance thread
no longer has to scan to find extents that need processing - big
scalability improvement.
- sb_errors superblock section: this adds counters for each fsck error
type, since filesystem creation, along with the date of the most
recent error. It'll get us better bug reports (since users do not
typically report errors that fsck was able to fix), and I might add
telemetry for this in the future.
Fixes include:
- multiple snapshot deletion fixes
- members_v2 fixups
- deleted_inodes btree fixes
- copygc thread no longer spins when a device is full but has no
fragmented buckets (i.e. rebalance needs to move data around instead)
- a fix for a memory reclaim issue with the btree key cache: we're now
careful not to hold the srcu read lock that blocks key cache reclaim
for too long
- an early allocator locking fix, from Brian
- endianness fixes, from Brian
- CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y, a big
performance improvement on multithreaded workloads
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmVH9xYACgkQE6szbY3K
bnahLRAAiNRZL73SQ+MW79o4yPqGwt0Eyy/mvoiGpZf1B8uXp0oZ55j2w3l887Uf
LeM03mInAYCPdyp/d4vxqIr96j9BODmRRl8sEkkGdJDzokLG+22F0ovOe45KWTxL
kBoNdng/O/oeOe/1K7taP3KzBvMx2nOF6oA+xfgyCjECMArAIXek0iocyEUR4Ywd
vGKhLNn1k2c+94wacnDYwjjdcLBxoqxsFXlpu6V0BcaY+DX4J3aBaGmj75KEoCI0
VbBOzxrOO4QzJrzW2+hxZZWgGyvReCkBJvqfORfuPxiSbFobTim10MdfZOAMQA1U
Xr1FTEpK1wMX0/pPVgZRqaOsttC+yc/SsfPNgSxybgHPbDlMLaakDHjvYssbKOYG
urDWSMG5yCsktSLj95SXsvUFKZaZFD72SKBNdgdt/nZjwTHuNQ7IkdrMwIrCQ/PT
Ifn50UrR/Ahd8RAd5tyNCPw6U9VfwnxACSNl2KA7ONKpvHb+gSt1JsJTDyz1+gN9
nFVrw1SHKQ6EIV6XhVon/5DEuRTzqoYGWoN08FHEUq9fBlvnVpmbJErCQMplOjz9
OQnAfpJH4YqkpXyjFAjP1V0An+RUn8QvDgXNqC9TyvCYuOliVFuil4y7/c+7oIQU
NEoz+jVLenqsGOGAbduI4/Q567COojRgwEvbebSIxSImXuhCNj4=
=Lo4N
-----END PGP SIGNATURE-----
Merge tag 'bcachefs-2023-11-5' of https://evilpiepirate.org/git/bcachefs
Pull more bcachefs updates from Kent Overstreet:
"Here's the second big bcachefs pull request. This brings your tree up
to date with my master branch, which is what existing bcachefs users
are currently running.
New features:
- rebalance_work btree (and metadata version 1.3): the rebalance
thread no longer has to scan to find extents that need processing -
big scalability improvement.
- sb_errors superblock section: this adds counters for each fsck
error type, since filesystem creation, along with the date of the
most recent error. It'll get us better bug reports (since users do
not typically report errors that fsck was able to fix), and I might
add telemetry for this in the future.
Fixes include:
- multiple snapshot deletion fixes
- members_v2 fixups
- deleted_inodes btree fixes
- copygc thread no longer spins when a device is full but has no
fragmented buckets (i.e. rebalance needs to move data around
instead)
- a fix for a memory reclaim issue with the btree key cache: we're
now careful not to hold the srcu read lock that blocks key cache
reclaim for too long
- an early allocator locking fix, from Brian
- endianness fixes, from Brian
- CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y, a big
performance improvement on multithreaded workloads"
* tag 'bcachefs-2023-11-5' of https://evilpiepirate.org/git/bcachefs: (70 commits)
bcachefs: Improve stripe checksum error message
bcachefs: Simplify, fix bch2_backpointer_get_key()
bcachefs: kill thing_it_points_to arg to backpointer_not_found()
bcachefs: bch2_ec_read_extent() now takes btree_trans
bcachefs: bch2_stripe_to_text() now prints ptr gens
bcachefs: Don't iterate over journal entries just for btree roots
bcachefs: Break up bch2_journal_write()
bcachefs: Replace ERANGE with private error codes
bcachefs: bkey_copy() is no longer a macro
bcachefs: x-macro-ify inode flags enum
bcachefs: Convert bch2_fs_open() to darray
bcachefs: Move __bch2_members_v2_get_mut to sb-members.h
bcachefs: bch2_prt_datetime()
bcachefs: CONFIG_BCACHEFS_DEBUG_TRANSACTIONS no longer defaults to y
bcachefs: Add a comment for BTREE_INSERT_NOJOURNAL usage
bcachefs: rebalance_work btree is not a snapshots btree
bcachefs: Add missing printk newlines
bcachefs: Fix recovery when forced to use JSET_NO_FLUSH journal entry
bcachefs: .get_parent() should return an error pointer
bcachefs: Fix bch2_delete_dead_inodes()
...
2023-11-07 12:38:38 -07:00
|
|
|
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
|
Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series "Fixes and cleanups to compaction".
- Joel Fernandes has a patchset ("Optimize mremap during mutual
alignment within PMD") which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested.
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in the
following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series "Do not try to access unaccepted memory" Adrian Hunter
provides some fixups for the recently-added "unaccepted memory" feature.
To increase the feature's checking coverage. "Plug a few gaps where
RAM is exposed without checking if it is unaccepted memory".
- In the series "cleanups for lockless slab shrink" Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code.
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series "use refcount+RCU method to implement
lockless slab shrink".
- David Hildenbrand contributes some maintenance work for the rmap code
in the series "Anon rmap cleanups".
- Kefeng Wang does more folio conversions and some maintenance work in
the migration code. Series "mm: migrate: more folio conversion and
unification".
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series "Add and use bdev_getblk()".
- In the series "Use nth_page() in place of direct struct page
manipulation" Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames.
- In the series "mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO" has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of gigantic
pages are in use.
- Matthew Wilcox has sent the series "Small hugetlb cleanups" - code
rationalization and folio conversions in the hugetlb code.
- Yin Fengwei has improved mlock()'s handling of large folios in the
series "support large folio for mlock"
- In the series "Expose swapcache stat for memcg v1" Liu Shixin has
added statistics for memcg v1 users which are available (and useful)
under memcg v2.
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named "MDWE
without inheritance".
- Kefeng Wang has provided the series "mm: convert numa balancing
functions to use a folio" which does what it says.
- In the series "mm/ksm: add fork-exec support for prctl" Stefan Roesch
makes it possible for a process to propagate KSM treatment across
exec().
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use "high
bandwidth memory" in addition to Optane Data Center Persistent Memory
Modules (DCPMM). The series is named "memory tiering: calculate
abstract distance based on ACPI HMAT"
- In the series "Smart scanning mode for KSM" Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans.
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in the
series "mm: memcg: fix tracking of pending stats updates values".
- In the series "Implement IOCTL to get and optionally clear info about
PTEs" Peter Xu has added an ioctl to /proc/<pid>/pagemap which permits
us to atomically read-then-clear page softdirty state. This is mainly
used by CRIU.
- Hugh Dickins contributed the series "shmem,tmpfs: general maintenance"
- a bunch of relatively minor maintenance tweaks to this code.
- Matthew Wilcox has increased the use of the VMA lock over file-backed
page faults in the series "Handle more faults under the VMA lock". Some
rationalizations of the fault path became possible as a result.
- In the series "mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()" David Hildenbrand has implemented some cleanups
and folio conversions.
- In the series "various improvements to the GUP interface" Lorenzo
Stoakes has simplified and improved the GUP interface with an eye to
providing groundwork for future improvements.
- Andrey Konovalov has sent along the series "kasan: assorted fixes and
improvements" which does those things.
- Some page allocator maintenance work from Kemeng Shi in the series
"Two minor cleanups to break_down_buddy_pages".
- In the series "New selftest for mm" Breno Leitao has developed
another MM self test which tickles a race we had between madvise() and
page faults.
- In the series "Add folio_end_read" Matthew Wilcox provides cleanups
and an optimization to the core pagecache code.
- Nhat Pham has added memcg accounting for hugetlb memory in the series
"hugetlb memcg accounting".
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series "Abstract vma_merge() and split_vma()".
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series "Fix page_owner's use of free timestamps".
- Lorenzo Stoakes has fixed the handling of new mappings of sealed files
in the series "permit write-sealed memfd read-only shared mappings".
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series "Batch hugetlb vmemmap modification operations".
- Some buffer_head folio conversions and cleanups from Matthew Wilcox in
the series "Finish the create_empty_buffers() transition".
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the series
"mm: PCP high auto-tuning".
- Roman Gushchin has contributed the patchset "mm: improve performance
of accounted kernel memory allocations" which improves their performance
by ~30% as measured by a micro-benchmark.
- folio conversions from Kefeng Wang in the series "mm: convert page
cpupid functions to folios".
- Some kmemleak fixups in Liu Shixin's series "Some bugfix about
kmemleak".
- Qi Zheng has improved our handling of memoryless nodes by keeping them
off the allocation fallback list. This is done in the series "handle
memoryless nodes more appropriately".
- khugepaged conversions from Vishal Moola in the series "Some
khugepaged folio conversions".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZULEMwAKCRDdBJ7gKXxA
jhQHAQCYpD3g849x69DmHnHWHm/EHQLvQmRMDeYZI+nx/sCJOwEAw4AKg0Oemv9y
FgeUPAD1oasg6CP+INZvCj34waNxwAc=
=E+Y4
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series 'Fixes and cleanups to compaction'
- Joel Fernandes has a patchset ('Optimize mremap during mutual
alignment within PMD') which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in
the following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series 'Do not try to access unaccepted memory' Adrian
Hunter provides some fixups for the recently-added 'unaccepted
memory' feature. To increase the feature's checking coverage. 'Plug
a few gaps where RAM is exposed without checking if it is
unaccepted memory'
- In the series 'cleanups for lockless slab shrink' Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series 'use refcount+RCU method to
implement lockless slab shrink'
- David Hildenbrand contributes some maintenance work for the rmap
code in the series 'Anon rmap cleanups'
- Kefeng Wang does more folio conversions and some maintenance work
in the migration code. Series 'mm: migrate: more folio conversion
and unification'
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series 'Add and use bdev_getblk()'
- In the series 'Use nth_page() in place of direct struct page
manipulation' Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames
- In the series 'mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO' has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of
gigantic pages are in use
- Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
rationalization and folio conversions in the hugetlb code
- Yin Fengwei has improved mlock()'s handling of large folios in the
series 'support large folio for mlock'
- In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
added statistics for memcg v1 users which are available (and
useful) under memcg v2
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named 'MDWE
without inheritance'
- Kefeng Wang has provided the series 'mm: convert numa balancing
functions to use a folio' which does what it says
- In the series 'mm/ksm: add fork-exec support for prctl' Stefan
Roesch makes it possible for a process to propagate KSM treatment
across exec()
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use 'high
bandwidth memory' in addition to Optane Data Center Persistent
Memory Modules (DCPMM). The series is named 'memory tiering:
calculate abstract distance based on ACPI HMAT'
- In the series 'Smart scanning mode for KSM' Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in
the series 'mm: memcg: fix tracking of pending stats updates
values'
- In the series 'Implement IOCTL to get and optionally clear info
about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
which permits us to atomically read-then-clear page softdirty
state. This is mainly used by CRIU
- Hugh Dickins contributed the series 'shmem,tmpfs: general
maintenance', a bunch of relatively minor maintenance tweaks to
this code
- Matthew Wilcox has increased the use of the VMA lock over
file-backed page faults in the series 'Handle more faults under the
VMA lock'. Some rationalizations of the fault path became possible
as a result
- In the series 'mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()' David Hildenbrand has implemented some
cleanups and folio conversions
- In the series 'various improvements to the GUP interface' Lorenzo
Stoakes has simplified and improved the GUP interface with an eye
to providing groundwork for future improvements
- Andrey Konovalov has sent along the series 'kasan: assorted fixes
and improvements' which does those things
- Some page allocator maintenance work from Kemeng Shi in the series
'Two minor cleanups to break_down_buddy_pages'
- In the series 'New selftest for mm' Breno Leitao has developed
another MM self test which tickles a race we had between madvise()
and page faults
- In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
and an optimization to the core pagecache code
- Nhat Pham has added memcg accounting for hugetlb memory in the
series 'hugetlb memcg accounting'
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series 'Abstract vma_merge() and split_vma()'
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series 'Fix page_owner's use of free timestamps'
- Lorenzo Stoakes has fixed the handling of new mappings of sealed
files in the series 'permit write-sealed memfd read-only shared
mappings'
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series 'Batch hugetlb vmemmap modification operations'
- Some buffer_head folio conversions and cleanups from Matthew Wilcox
in the series 'Finish the create_empty_buffers() transition'
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the
series 'mm: PCP high auto-tuning'
- Roman Gushchin has contributed the patchset 'mm: improve
performance of accounted kernel memory allocations' which improves
their performance by ~30% as measured by a micro-benchmark
- folio conversions from Kefeng Wang in the series 'mm: convert page
cpupid functions to folios'
- Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
kmemleak'
- Qi Zheng has improved our handling of memoryless nodes by keeping
them off the allocation fallback list. This is done in the series
'handle memoryless nodes more appropriately'
- khugepaged conversions from Vishal Moola in the series 'Some
khugepaged folio conversions'"
[ bcachefs conflicts with the dynamically allocated shrinkers have been
resolved as per Stephen Rothwell in
https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/
with help from Qi Zheng.
The clone3 test filtering conflict was half-arsed by yours truly ]
* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
mm/damon/sysfs: update monitoring target regions for online input commit
mm/damon/sysfs: remove requested targets when online-commit inputs
selftests: add a sanity check for zswap
Documentation: maple_tree: fix word spelling error
mm/vmalloc: fix the unchecked dereference warning in vread_iter()
zswap: export compression failure stats
Documentation: ubsan: drop "the" from article title
mempolicy: migration attempt to match interleave nodes
mempolicy: mmap_lock is not needed while migrating folios
mempolicy: alloc_pages_mpol() for NUMA policy without vma
mm: add page_rmappable_folio() wrapper
mempolicy: remove confusing MPOL_MF_LAZY dead code
mempolicy: mpol_shared_policy_init() without pseudo-vma
mempolicy trivia: use pgoff_t in shared mempolicy tree
mempolicy trivia: slightly more consistent naming
mempolicy trivia: delete those ancient pr_debug()s
mempolicy: fix migrate_pages(2) syscall return nr_failed
kernfs: drop shared NUMA mempolicy hooks
hugetlbfs: drop shared NUMA mempolicy pretence
mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
...
2023-11-02 22:38:47 -07:00
|
|
|
if (!shrink)
|
2023-07-07 01:38:29 -07:00
|
|
|
goto err;
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
bc->live[0].shrink = shrink;
|
|
|
|
shrink->count_objects = bch2_btree_cache_count;
|
|
|
|
shrink->scan_objects = bch2_btree_cache_scan;
|
|
|
|
shrink->seeks = 2;
|
|
|
|
shrink->private_data = &bc->live[0];
|
|
|
|
shrinker_register(shrink);
|
|
|
|
|
|
|
|
shrink = shrinker_alloc(0, "%s-btree_cache-pinned", c->name);
|
|
|
|
if (!shrink)
|
|
|
|
goto err;
|
|
|
|
bc->live[1].shrink = shrink;
|
Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series "Fixes and cleanups to compaction".
- Joel Fernandes has a patchset ("Optimize mremap during mutual
alignment within PMD") which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested.
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in the
following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series "Do not try to access unaccepted memory" Adrian Hunter
provides some fixups for the recently-added "unaccepted memory" feature.
To increase the feature's checking coverage. "Plug a few gaps where
RAM is exposed without checking if it is unaccepted memory".
- In the series "cleanups for lockless slab shrink" Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code.
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series "use refcount+RCU method to implement
lockless slab shrink".
- David Hildenbrand contributes some maintenance work for the rmap code
in the series "Anon rmap cleanups".
- Kefeng Wang does more folio conversions and some maintenance work in
the migration code. Series "mm: migrate: more folio conversion and
unification".
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series "Add and use bdev_getblk()".
- In the series "Use nth_page() in place of direct struct page
manipulation" Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames.
- In the series "mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO" has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of gigantic
pages are in use.
- Matthew Wilcox has sent the series "Small hugetlb cleanups" - code
rationalization and folio conversions in the hugetlb code.
- Yin Fengwei has improved mlock()'s handling of large folios in the
series "support large folio for mlock"
- In the series "Expose swapcache stat for memcg v1" Liu Shixin has
added statistics for memcg v1 users which are available (and useful)
under memcg v2.
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named "MDWE
without inheritance".
- Kefeng Wang has provided the series "mm: convert numa balancing
functions to use a folio" which does what it says.
- In the series "mm/ksm: add fork-exec support for prctl" Stefan Roesch
makes it possible for a process to propagate KSM treatment across
exec().
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use "high
bandwidth memory" in addition to Optane Data Center Persistent Memory
Modules (DCPMM). The series is named "memory tiering: calculate
abstract distance based on ACPI HMAT"
- In the series "Smart scanning mode for KSM" Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans.
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in the
series "mm: memcg: fix tracking of pending stats updates values".
- In the series "Implement IOCTL to get and optionally clear info about
PTEs" Peter Xu has added an ioctl to /proc/<pid>/pagemap which permits
us to atomically read-then-clear page softdirty state. This is mainly
used by CRIU.
- Hugh Dickins contributed the series "shmem,tmpfs: general maintenance"
- a bunch of relatively minor maintenance tweaks to this code.
- Matthew Wilcox has increased the use of the VMA lock over file-backed
page faults in the series "Handle more faults under the VMA lock". Some
rationalizations of the fault path became possible as a result.
- In the series "mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()" David Hildenbrand has implemented some cleanups
and folio conversions.
- In the series "various improvements to the GUP interface" Lorenzo
Stoakes has simplified and improved the GUP interface with an eye to
providing groundwork for future improvements.
- Andrey Konovalov has sent along the series "kasan: assorted fixes and
improvements" which does those things.
- Some page allocator maintenance work from Kemeng Shi in the series
"Two minor cleanups to break_down_buddy_pages".
- In the series "New selftest for mm" Breno Leitao has developed
another MM self test which tickles a race we had between madvise() and
page faults.
- In the series "Add folio_end_read" Matthew Wilcox provides cleanups
and an optimization to the core pagecache code.
- Nhat Pham has added memcg accounting for hugetlb memory in the series
"hugetlb memcg accounting".
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series "Abstract vma_merge() and split_vma()".
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series "Fix page_owner's use of free timestamps".
- Lorenzo Stoakes has fixed the handling of new mappings of sealed files
in the series "permit write-sealed memfd read-only shared mappings".
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series "Batch hugetlb vmemmap modification operations".
- Some buffer_head folio conversions and cleanups from Matthew Wilcox in
the series "Finish the create_empty_buffers() transition".
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the series
"mm: PCP high auto-tuning".
- Roman Gushchin has contributed the patchset "mm: improve performance
of accounted kernel memory allocations" which improves their performance
by ~30% as measured by a micro-benchmark.
- folio conversions from Kefeng Wang in the series "mm: convert page
cpupid functions to folios".
- Some kmemleak fixups in Liu Shixin's series "Some bugfix about
kmemleak".
- Qi Zheng has improved our handling of memoryless nodes by keeping them
off the allocation fallback list. This is done in the series "handle
memoryless nodes more appropriately".
- khugepaged conversions from Vishal Moola in the series "Some
khugepaged folio conversions".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZULEMwAKCRDdBJ7gKXxA
jhQHAQCYpD3g849x69DmHnHWHm/EHQLvQmRMDeYZI+nx/sCJOwEAw4AKg0Oemv9y
FgeUPAD1oasg6CP+INZvCj34waNxwAc=
=E+Y4
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series 'Fixes and cleanups to compaction'
- Joel Fernandes has a patchset ('Optimize mremap during mutual
alignment within PMD') which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in
the following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series 'Do not try to access unaccepted memory' Adrian
Hunter provides some fixups for the recently-added 'unaccepted
memory' feature. To increase the feature's checking coverage. 'Plug
a few gaps where RAM is exposed without checking if it is
unaccepted memory'
- In the series 'cleanups for lockless slab shrink' Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series 'use refcount+RCU method to
implement lockless slab shrink'
- David Hildenbrand contributes some maintenance work for the rmap
code in the series 'Anon rmap cleanups'
- Kefeng Wang does more folio conversions and some maintenance work
in the migration code. Series 'mm: migrate: more folio conversion
and unification'
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series 'Add and use bdev_getblk()'
- In the series 'Use nth_page() in place of direct struct page
manipulation' Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames
- In the series 'mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO' has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of
gigantic pages are in use
- Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
rationalization and folio conversions in the hugetlb code
- Yin Fengwei has improved mlock()'s handling of large folios in the
series 'support large folio for mlock'
- In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
added statistics for memcg v1 users which are available (and
useful) under memcg v2
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named 'MDWE
without inheritance'
- Kefeng Wang has provided the series 'mm: convert numa balancing
functions to use a folio' which does what it says
- In the series 'mm/ksm: add fork-exec support for prctl' Stefan
Roesch makes it possible for a process to propagate KSM treatment
across exec()
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use 'high
bandwidth memory' in addition to Optane Data Center Persistent
Memory Modules (DCPMM). The series is named 'memory tiering:
calculate abstract distance based on ACPI HMAT'
- In the series 'Smart scanning mode for KSM' Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in
the series 'mm: memcg: fix tracking of pending stats updates
values'
- In the series 'Implement IOCTL to get and optionally clear info
about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
which permits us to atomically read-then-clear page softdirty
state. This is mainly used by CRIU
- Hugh Dickins contributed the series 'shmem,tmpfs: general
maintenance', a bunch of relatively minor maintenance tweaks to
this code
- Matthew Wilcox has increased the use of the VMA lock over
file-backed page faults in the series 'Handle more faults under the
VMA lock'. Some rationalizations of the fault path became possible
as a result
- In the series 'mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()' David Hildenbrand has implemented some
cleanups and folio conversions
- In the series 'various improvements to the GUP interface' Lorenzo
Stoakes has simplified and improved the GUP interface with an eye
to providing groundwork for future improvements
- Andrey Konovalov has sent along the series 'kasan: assorted fixes
and improvements' which does those things
- Some page allocator maintenance work from Kemeng Shi in the series
'Two minor cleanups to break_down_buddy_pages'
- In the series 'New selftest for mm' Breno Leitao has developed
another MM self test which tickles a race we had between madvise()
and page faults
- In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
and an optimization to the core pagecache code
- Nhat Pham has added memcg accounting for hugetlb memory in the
series 'hugetlb memcg accounting'
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series 'Abstract vma_merge() and split_vma()'
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series 'Fix page_owner's use of free timestamps'
- Lorenzo Stoakes has fixed the handling of new mappings of sealed
files in the series 'permit write-sealed memfd read-only shared
mappings'
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series 'Batch hugetlb vmemmap modification operations'
- Some buffer_head folio conversions and cleanups from Matthew Wilcox
in the series 'Finish the create_empty_buffers() transition'
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the
series 'mm: PCP high auto-tuning'
- Roman Gushchin has contributed the patchset 'mm: improve
performance of accounted kernel memory allocations' which improves
their performance by ~30% as measured by a micro-benchmark
- folio conversions from Kefeng Wang in the series 'mm: convert page
cpupid functions to folios'
- Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
kmemleak'
- Qi Zheng has improved our handling of memoryless nodes by keeping
them off the allocation fallback list. This is done in the series
'handle memoryless nodes more appropriately'
- khugepaged conversions from Vishal Moola in the series 'Some
khugepaged folio conversions'"
[ bcachefs conflicts with the dynamically allocated shrinkers have been
resolved as per Stephen Rothwell in
https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/
with help from Qi Zheng.
The clone3 test filtering conflict was half-arsed by yours truly ]
* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
mm/damon/sysfs: update monitoring target regions for online input commit
mm/damon/sysfs: remove requested targets when online-commit inputs
selftests: add a sanity check for zswap
Documentation: maple_tree: fix word spelling error
mm/vmalloc: fix the unchecked dereference warning in vread_iter()
zswap: export compression failure stats
Documentation: ubsan: drop "the" from article title
mempolicy: migration attempt to match interleave nodes
mempolicy: mmap_lock is not needed while migrating folios
mempolicy: alloc_pages_mpol() for NUMA policy without vma
mm: add page_rmappable_folio() wrapper
mempolicy: remove confusing MPOL_MF_LAZY dead code
mempolicy: mpol_shared_policy_init() without pseudo-vma
mempolicy trivia: use pgoff_t in shared mempolicy tree
mempolicy trivia: slightly more consistent naming
mempolicy trivia: delete those ancient pr_debug()s
mempolicy: fix migrate_pages(2) syscall return nr_failed
kernfs: drop shared NUMA mempolicy hooks
hugetlbfs: drop shared NUMA mempolicy pretence
mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
...
2023-11-02 22:38:47 -07:00
|
|
|
shrink->count_objects = bch2_btree_cache_count;
|
|
|
|
shrink->scan_objects = bch2_btree_cache_scan;
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
shrink->seeks = 8;
|
|
|
|
shrink->private_data = &bc->live[1];
|
Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series "Fixes and cleanups to compaction".
- Joel Fernandes has a patchset ("Optimize mremap during mutual
alignment within PMD") which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested.
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in the
following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series "Do not try to access unaccepted memory" Adrian Hunter
provides some fixups for the recently-added "unaccepted memory" feature.
To increase the feature's checking coverage. "Plug a few gaps where
RAM is exposed without checking if it is unaccepted memory".
- In the series "cleanups for lockless slab shrink" Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code.
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series "use refcount+RCU method to implement
lockless slab shrink".
- David Hildenbrand contributes some maintenance work for the rmap code
in the series "Anon rmap cleanups".
- Kefeng Wang does more folio conversions and some maintenance work in
the migration code. Series "mm: migrate: more folio conversion and
unification".
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series "Add and use bdev_getblk()".
- In the series "Use nth_page() in place of direct struct page
manipulation" Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames.
- In the series "mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO" Usama Arif has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of gigantic
pages are in use.
- Matthew Wilcox has sent the series "Small hugetlb cleanups" - code
rationalization and folio conversions in the hugetlb code.
- Yin Fengwei has improved mlock()'s handling of large folios in the
series "support large folio for mlock"
- In the series "Expose swapcache stat for memcg v1" Liu Shixin has
added statistics for memcg v1 users which are available (and useful)
under memcg v2.
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named "MDWE
without inheritance".
- Kefeng Wang has provided the series "mm: convert numa balancing
functions to use a folio" which does what it says.
- In the series "mm/ksm: add fork-exec support for prctl" Stefan Roesch
makes it possible for a process to propagate KSM treatment across
exec().
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use "high
bandwidth memory" in addition to Optane Data Center Persistent Memory
Modules (DCPMM). The series is named "memory tiering: calculate
abstract distance based on ACPI HMAT"
- In the series "Smart scanning mode for KSM" Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans.
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in the
series "mm: memcg: fix tracking of pending stats updates values".
- In the series "Implement IOCTL to get and optionally clear info about
PTEs" Peter Xu has added an ioctl to /proc/<pid>/pagemap which permits
us to atomically read-then-clear page softdirty state. This is mainly
used by CRIU.
- Hugh Dickins contributed the series "shmem,tmpfs: general maintenance"
- a bunch of relatively minor maintenance tweaks to this code.
- Matthew Wilcox has increased the use of the VMA lock over file-backed
page faults in the series "Handle more faults under the VMA lock". Some
rationalizations of the fault path became possible as a result.
- In the series "mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()" David Hildenbrand has implemented some cleanups
and folio conversions.
- In the series "various improvements to the GUP interface" Lorenzo
Stoakes has simplified and improved the GUP interface with an eye to
providing groundwork for future improvements.
- Andrey Konovalov has sent along the series "kasan: assorted fixes and
improvements" which does those things.
- Some page allocator maintenance work from Kemeng Shi in the series
"Two minor cleanups to break_down_buddy_pages".
- In the series "New selftest for mm" Breno Leitao has developed
another MM self test which tickles a race we had between madvise() and
page faults.
- In the series "Add folio_end_read" Matthew Wilcox provides cleanups
and an optimization to the core pagecache code.
- Nhat Pham has added memcg accounting for hugetlb memory in the series
"hugetlb memcg accounting".
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series "Abstract vma_merge() and split_vma()".
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series "Fix page_owner's use of free timestamps".
- Lorenzo Stoakes has fixed the handling of new mappings of sealed files
in the series "permit write-sealed memfd read-only shared mappings".
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series "Batch hugetlb vmemmap modification operations".
- Some buffer_head folio conversions and cleanups from Matthew Wilcox in
the series "Finish the create_empty_buffers() transition".
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the series
"mm: PCP high auto-tuning".
- Roman Gushchin has contributed the patchset "mm: improve performance
of accounted kernel memory allocations" which improves their performance
by ~30% as measured by a micro-benchmark.
- folio conversions from Kefeng Wang in the series "mm: convert page
cpupid functions to folios".
- Some kmemleak fixups in Liu Shixin's series "Some bugfix about
kmemleak".
- Qi Zheng has improved our handling of memoryless nodes by keeping them
off the allocation fallback list. This is done in the series "handle
memoryless nodes more appropriately".
- khugepaged conversions from Vishal Moola in the series "Some
khugepaged folio conversions".
-----BEGIN PGP SIGNATURE-----
iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZULEMwAKCRDdBJ7gKXxA
jhQHAQCYpD3g849x69DmHnHWHm/EHQLvQmRMDeYZI+nx/sCJOwEAw4AKg0Oemv9y
FgeUPAD1oasg6CP+INZvCj34waNxwAc=
=E+Y4
-----END PGP SIGNATURE-----
Merge tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Many singleton patches against the MM code. The patch series which are
included in this merge do the following:
- Kemeng Shi has contributed some compaction maintenance work in the
series 'Fixes and cleanups to compaction'
- Joel Fernandes has a patchset ('Optimize mremap during mutual
alignment within PMD') which fixes an obscure issue with mremap()'s
pagetable handling during a subsequent exec(), based upon an
implementation which Linus suggested
- More DAMON/DAMOS maintenance and feature work from SeongJae Park in
the following patch series:
mm/damon: misc fixups for documents, comments and its tracepoint
mm/damon: add a tracepoint for damos apply target regions
mm/damon: provide pseudo-moving sum based access rate
mm/damon: implement DAMOS apply intervals
mm/damon/core-test: Fix memory leaks in core-test
mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval
- In the series 'Do not try to access unaccepted memory' Adrian
Hunter provides some fixups for the recently-added 'unaccepted
memory' feature. To increase the feature's checking coverage. 'Plug
a few gaps where RAM is exposed without checking if it is
unaccepted memory'
- In the series 'cleanups for lockless slab shrink' Qi Zheng has done
some maintenance work which is preparation for the lockless slab
shrinking code
- Qi Zheng has redone the earlier (and reverted) attempt to make slab
shrinking lockless in the series 'use refcount+RCU method to
implement lockless slab shrink'
- David Hildenbrand contributes some maintenance work for the rmap
code in the series 'Anon rmap cleanups'
- Kefeng Wang does more folio conversions and some maintenance work
in the migration code. Series 'mm: migrate: more folio conversion
and unification'
- Matthew Wilcox has fixed an issue in the buffer_head code which was
causing long stalls under some heavy memory/IO loads. Some cleanups
were added on the way. Series 'Add and use bdev_getblk()'
- In the series 'Use nth_page() in place of direct struct page
manipulation' Zi Yan has fixed a potential issue with the direct
manipulation of hugetlb page frames
- In the series 'mm: hugetlb: Skip initialization of gigantic tail
struct pages if freed by HVO' Usama Arif has improved our handling of gigantic
pages in the hugetlb vmemmap optimization code. This provides
significant boot time improvements when significant amounts of
gigantic pages are in use
- Matthew Wilcox has sent the series 'Small hugetlb cleanups' - code
rationalization and folio conversions in the hugetlb code
- Yin Fengwei has improved mlock()'s handling of large folios in the
series 'support large folio for mlock'
- In the series 'Expose swapcache stat for memcg v1' Liu Shixin has
added statistics for memcg v1 users which are available (and
useful) under memcg v2
- Florent Revest has enhanced the MDWE (Memory-Deny-Write-Executable)
prctl so that userspace may direct the kernel to not automatically
propagate the denial to child processes. The series is named 'MDWE
without inheritance'
- Kefeng Wang has provided the series 'mm: convert numa balancing
functions to use a folio' which does what it says
- In the series 'mm/ksm: add fork-exec support for prctl' Stefan
Roesch makes it possible for a process to propagate KSM treatment
across exec()
- Huang Ying has enhanced memory tiering's calculation of memory
distances. This is used to permit the dax/kmem driver to use 'high
bandwidth memory' in addition to Optane Data Center Persistent
Memory Modules (DCPMM). The series is named 'memory tiering:
calculate abstract distance based on ACPI HMAT'
- In the series 'Smart scanning mode for KSM' Stefan Roesch has
optimized KSM by teaching it to retain and use some historical
information from previous scans
- Yosry Ahmed has fixed some inconsistencies in memcg statistics in
the series 'mm: memcg: fix tracking of pending stats updates
values'
- In the series 'Implement IOCTL to get and optionally clear info
about PTEs' Peter Xu has added an ioctl to /proc/<pid>/pagemap
which permits us to atomically read-then-clear page softdirty
state. This is mainly used by CRIU
- Hugh Dickins contributed the series 'shmem,tmpfs: general
maintenance', a bunch of relatively minor maintenance tweaks to
this code
- Matthew Wilcox has increased the use of the VMA lock over
file-backed page faults in the series 'Handle more faults under the
VMA lock'. Some rationalizations of the fault path became possible
as a result
- In the series 'mm/rmap: convert page_move_anon_rmap() to
folio_move_anon_rmap()' David Hildenbrand has implemented some
cleanups and folio conversions
- In the series 'various improvements to the GUP interface' Lorenzo
Stoakes has simplified and improved the GUP interface with an eye
to providing groundwork for future improvements
- Andrey Konovalov has sent along the series 'kasan: assorted fixes
and improvements' which does those things
- Some page allocator maintenance work from Kemeng Shi in the series
'Two minor cleanups to break_down_buddy_pages'
- In the series 'New selftest for mm' Breno Leitao has developed
another MM self test which tickles a race we had between madvise()
and page faults
- In the series 'Add folio_end_read' Matthew Wilcox provides cleanups
and an optimization to the core pagecache code
- Nhat Pham has added memcg accounting for hugetlb memory in the
series 'hugetlb memcg accounting'
- Cleanups and rationalizations to the pagemap code from Lorenzo
Stoakes, in the series 'Abstract vma_merge() and split_vma()'
- Audra Mitchell has fixed issues in the procfs page_owner code's new
timestamping feature which was causing some misbehaviours. In the
series 'Fix page_owner's use of free timestamps'
- Lorenzo Stoakes has fixed the handling of new mappings of sealed
files in the series 'permit write-sealed memfd read-only shared
mappings'
- Mike Kravetz has optimized the hugetlb vmemmap optimization in the
series 'Batch hugetlb vmemmap modification operations'
- Some buffer_head folio conversions and cleanups from Matthew Wilcox
in the series 'Finish the create_empty_buffers() transition'
- As a page allocator performance optimization Huang Ying has added
automatic tuning to the allocator's per-cpu-pages feature, in the
series 'mm: PCP high auto-tuning'
- Roman Gushchin has contributed the patchset 'mm: improve
performance of accounted kernel memory allocations' which improves
their performance by ~30% as measured by a micro-benchmark
- folio conversions from Kefeng Wang in the series 'mm: convert page
cpupid functions to folios'
- Some kmemleak fixups in Liu Shixin's series 'Some bugfix about
kmemleak'
- Qi Zheng has improved our handling of memoryless nodes by keeping
them off the allocation fallback list. This is done in the series
'handle memoryless nodes more appropriately'
- khugepaged conversions from Vishal Moola in the series 'Some
khugepaged folio conversions'"
[ bcachefs conflicts with the dynamically allocated shrinkers have been
resolved as per Stephen Rothwell in
https://lore.kernel.org/all/20230913093553.4290421e@canb.auug.org.au/
with help from Qi Zheng.
The clone3 test filtering conflict was half-arsed by yours truly ]
* tag 'mm-stable-2023-11-01-14-33' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (406 commits)
mm/damon/sysfs: update monitoring target regions for online input commit
mm/damon/sysfs: remove requested targets when online-commit inputs
selftests: add a sanity check for zswap
Documentation: maple_tree: fix word spelling error
mm/vmalloc: fix the unchecked dereference warning in vread_iter()
zswap: export compression failure stats
Documentation: ubsan: drop "the" from article title
mempolicy: migration attempt to match interleave nodes
mempolicy: mmap_lock is not needed while migrating folios
mempolicy: alloc_pages_mpol() for NUMA policy without vma
mm: add page_rmappable_folio() wrapper
mempolicy: remove confusing MPOL_MF_LAZY dead code
mempolicy: mpol_shared_policy_init() without pseudo-vma
mempolicy trivia: use pgoff_t in shared mempolicy tree
mempolicy trivia: slightly more consistent naming
mempolicy trivia: delete those ancient pr_debug()s
mempolicy: fix migrate_pages(2) syscall return nr_failed
kernfs: drop shared NUMA mempolicy hooks
hugetlbfs: drop shared NUMA mempolicy pretence
mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()
...
2023-11-02 22:38:47 -07:00
|
|
|
shrinker_register(shrink);
|
2023-07-07 01:38:29 -07:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
err:
|
|
|
|
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
|
|
|
{
|
|
|
|
mutex_init(&bc->lock);
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) {
|
|
|
|
bc->live[i].idx = i;
|
|
|
|
INIT_LIST_HEAD(&bc->live[i].list);
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
INIT_LIST_HEAD(&bc->freeable);
|
2022-03-04 17:16:04 -07:00
|
|
|
INIT_LIST_HEAD(&bc->freed_pcpu);
|
|
|
|
INIT_LIST_HEAD(&bc->freed_nonpcpu);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can only have one thread cannibalizing other cached btree nodes at a time,
|
|
|
|
* or we'll deadlock. We use an open coded mutex (bc->alloc_lock) to ensure that,
|
|
|
|
* which bch2_btree_cache_cannibalize_lock() takes. This means every time we unlock the root of
|
|
|
|
* the btree, we need to release this lock if we have it held.
|
|
|
|
*/
|
2023-12-02 01:36:27 -07:00
|
|
|
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2023-12-02 01:36:27 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
|
|
|
|
if (bc->alloc_lock == current) {
|
2023-12-02 01:36:27 -07:00
|
|
|
trace_and_count(c, btree_cache_cannibalize_unlock, trans);
|
2017-03-16 23:18:50 -07:00
|
|
|
bc->alloc_lock = NULL;
|
|
|
|
closure_wake_up(&bc->alloc_wait);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-12-02 01:36:27 -07:00
|
|
|
int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2023-12-02 01:36:27 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct task_struct *old;
|
|
|
|
|
2024-05-23 02:19:26 -07:00
|
|
|
old = NULL;
|
|
|
|
if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current)
|
2017-03-16 23:18:50 -07:00
|
|
|
goto success;
|
|
|
|
|
|
|
|
if (!cl) {
|
2023-12-02 01:36:27 -07:00
|
|
|
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
|
2023-03-14 12:35:57 -07:00
|
|
|
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
closure_wait(&bc->alloc_wait, cl);
|
|
|
|
|
|
|
|
/* Try again, after adding ourselves to waitlist */
|
2024-05-23 02:19:26 -07:00
|
|
|
old = NULL;
|
|
|
|
if (try_cmpxchg(&bc->alloc_lock, &old, current) || old == current) {
|
2017-03-16 23:18:50 -07:00
|
|
|
/* We raced */
|
|
|
|
closure_wake_up(&bc->alloc_wait);
|
|
|
|
goto success;
|
|
|
|
}
|
|
|
|
|
2023-12-02 01:36:27 -07:00
|
|
|
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
|
2022-12-13 13:17:40 -07:00
|
|
|
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
success:
|
2023-12-02 01:36:27 -07:00
|
|
|
trace_and_count(c, btree_cache_cannibalize_lock, trans);
|
2017-03-16 23:18:50 -07:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
|
|
|
{
|
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b;
|
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
|
|
|
|
list_for_each_entry_reverse(b, &bc->live[i].list, list)
|
|
|
|
if (!btree_node_reclaim(c, b, false))
|
|
|
|
return b;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
while (1) {
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a sequential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
|
|
|
|
list_for_each_entry_reverse(b, &bc->live[i].list, list)
|
|
|
|
if (!btree_node_write_and_reclaim(c, b))
|
|
|
|
return b;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Rare case: all nodes were intent-locked.
|
|
|
|
* Just busy-wait.
|
|
|
|
*/
|
|
|
|
WARN_ONCE(1, "btree cache cannibalize failed\n");
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-03-02 00:12:18 -07:00
|
|
|
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2023-03-02 00:12:18 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
2022-03-04 17:16:04 -07:00
|
|
|
struct list_head *freed = pcpu_read_locks
|
|
|
|
? &bc->freed_pcpu
|
|
|
|
: &bc->freed_nonpcpu;
|
2022-03-04 17:50:28 -07:00
|
|
|
struct btree *b, *b2;
|
2017-03-16 23:18:50 -07:00
|
|
|
u64 start_time = local_clock();
|
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We never free struct btree itself, just the memory that holds the on
|
|
|
|
* disk node. Check the freed list before allocating a new one:
|
|
|
|
*/
|
2022-03-04 17:16:04 -07:00
|
|
|
list_for_each_entry(b, freed, list)
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!btree_node_reclaim(c, b, false)) {
|
2022-03-04 17:50:28 -07:00
|
|
|
list_del_init(&b->list);
|
2020-06-09 14:49:24 -07:00
|
|
|
goto got_node;
|
2022-03-04 17:50:28 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-05-27 23:35:34 -07:00
|
|
|
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
|
2022-09-25 11:49:14 -07:00
|
|
|
if (!b) {
|
|
|
|
mutex_unlock(&bc->lock);
|
2023-05-27 23:35:34 -07:00
|
|
|
bch2_trans_unlock(trans);
|
2022-09-25 11:49:14 -07:00
|
|
|
b = __btree_node_mem_alloc(c, GFP_KERNEL);
|
|
|
|
if (!b)
|
|
|
|
goto err;
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
}
|
2022-03-04 17:50:28 -07:00
|
|
|
|
2023-05-20 17:57:55 -07:00
|
|
|
bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
|
2022-03-04 17:16:04 -07:00
|
|
|
|
2022-03-04 17:50:28 -07:00
|
|
|
BUG_ON(!six_trylock_intent(&b->c.lock));
|
|
|
|
BUG_ON(!six_trylock_write(&b->c.lock));
|
2020-06-09 14:49:24 -07:00
|
|
|
got_node:
|
|
|
|
|
2022-03-04 17:50:28 -07:00
|
|
|
/*
|
|
|
|
* btree_free() doesn't free memory; it sticks the node on the end of
|
|
|
|
* the list. Check if there's any freed nodes there:
|
|
|
|
*/
|
|
|
|
list_for_each_entry(b2, &bc->freeable, list)
|
2022-09-29 20:37:15 -07:00
|
|
|
if (!btree_node_reclaim(c, b2, false)) {
|
2022-03-04 17:50:28 -07:00
|
|
|
swap(b->data, b2->data);
|
|
|
|
swap(b->aux_data, b2->aux_data);
|
2022-03-04 17:16:04 -07:00
|
|
|
btree_node_to_freedlist(bc, b2);
|
2022-03-04 17:50:28 -07:00
|
|
|
six_unlock_write(&b2->c.lock);
|
|
|
|
six_unlock_intent(&b2->c.lock);
|
|
|
|
goto got_mem;
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-03-04 17:50:28 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
2020-06-09 14:49:24 -07:00
|
|
|
|
2023-05-27 23:35:34 -07:00
|
|
|
if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) {
|
|
|
|
bch2_trans_unlock(trans);
|
|
|
|
if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))
|
|
|
|
goto err;
|
|
|
|
}
|
2020-06-09 14:49:24 -07:00
|
|
|
|
2022-03-04 17:50:28 -07:00
|
|
|
mutex_lock(&bc->lock);
|
2024-09-05 16:37:56 -07:00
|
|
|
bc->nr_freeable++;
|
2022-03-04 17:50:28 -07:00
|
|
|
got_mem:
|
|
|
|
mutex_unlock(&bc->lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
BUG_ON(btree_node_hashed(b));
|
2021-07-10 20:03:15 -07:00
|
|
|
BUG_ON(btree_node_dirty(b));
|
2017-03-16 23:18:50 -07:00
|
|
|
BUG_ON(btree_node_write_in_flight(b));
|
|
|
|
out:
|
|
|
|
b->flags = 0;
|
|
|
|
b->written = 0;
|
|
|
|
b->nsets = 0;
|
|
|
|
b->sib_u64s[0] = 0;
|
|
|
|
b->sib_u64s[1] = 0;
|
|
|
|
b->whiteout_u64s = 0;
|
2020-11-02 16:20:44 -07:00
|
|
|
bch2_btree_keys_init(b);
|
2021-04-29 13:55:26 -07:00
|
|
|
set_btree_node_accessed(b);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
|
|
|
start_time);
|
|
|
|
|
2024-08-19 12:22:55 -07:00
|
|
|
int ret = bch2_trans_relock(trans);
|
|
|
|
if (unlikely(ret)) {
|
|
|
|
bch2_btree_node_to_freelist(c, b);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
return b;
|
|
|
|
err:
|
2020-06-09 14:49:24 -07:00
|
|
|
mutex_lock(&bc->lock);
|
2022-09-25 11:49:14 -07:00
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
/* Try to cannibalize another cached btree node: */
|
|
|
|
if (bc->alloc_lock == current) {
|
2022-03-04 17:50:28 -07:00
|
|
|
b2 = btree_node_cannibalize(c);
|
bcachefs: Clear btree_node_just_written() when node reused or evicted
This fixes the following bug:
Journal reclaim attempts to flush a node, but races with the node being
evicted from the btree node cache; when we lock the node, the data
buffers have already been freed.
We don't evict a node that's dirty, so calling btree_node_write() is
fine - it's a noop - except that the btree_node_just_written bit causes
bch2_btree_post_write_cleanup() to run (resorting the node), which then
causes a null ptr deref.
00078 Unable to handle kernel NULL pointer dereference at virtual address 000000000000009e
00078 Mem abort info:
00078 ESR = 0x0000000096000005
00078 EC = 0x25: DABT (current EL), IL = 32 bits
00078 SET = 0, FnV = 0
00078 EA = 0, S1PTW = 0
00078 FSC = 0x05: level 1 translation fault
00078 Data abort info:
00078 ISV = 0, ISS = 0x00000005
00078 CM = 0, WnR = 0
00078 user pgtable: 4k pages, 39-bit VAs, pgdp=000000007ed64000
00078 [000000000000009e] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
00078 Internal error: Oops: 0000000096000005 [#1] SMP
00078 Modules linked in:
00078 CPU: 75 PID: 1170 Comm: stress-ng-utime Not tainted 6.3.0-ktest-g5ef5b466e77e #2078
00078 Hardware name: linux,dummy-virt (DT)
00078 pstate: 80001005 (Nzcv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--)
00078 pc : btree_node_sort+0xc4/0x568
00078 lr : bch2_btree_post_write_cleanup+0x6c/0x1c0
00078 sp : ffffff803e30b350
00078 x29: ffffff803e30b350 x28: 0000000000000001 x27: ffffff80076e52a8
00078 x26: 0000000000000002 x25: 0000000000000000 x24: ffffffc00912e000
00078 x23: ffffff80076e52a8 x22: 0000000000000000 x21: ffffff80076e52bc
00078 x20: ffffff80076e5200 x19: 0000000000000000 x18: 0000000000000000
00078 x17: fffffffff8000000 x16: 0000000008000000 x15: 0000000008000000
00078 x14: 0000000000000002 x13: 0000000000000000 x12: 00000000000000a0
00078 x11: ffffff803e30b400 x10: ffffff803e30b408 x9 : 0000000000000001
00078 x8 : 0000000000000000 x7 : ffffff803e480000 x6 : 00000000000000a0
00078 x5 : 0000000000000088 x4 : 0000000000000000 x3 : 0000000000000010
00078 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff80076e52a8
00078 Call trace:
00078 btree_node_sort+0xc4/0x568
00078 bch2_btree_post_write_cleanup+0x6c/0x1c0
00078 bch2_btree_node_write+0x108/0x148
00078 __btree_node_flush+0x104/0x160
00078 bch2_btree_node_flush0+0x1c/0x30
00078 journal_flush_pins.constprop.0+0x184/0x2d0
00078 __bch2_journal_reclaim+0x4d4/0x508
00078 bch2_journal_reclaim+0x1c/0x30
00078 __bch2_journal_preres_get+0x244/0x268
00078 bch2_trans_journal_preres_get_cold+0xa4/0x180
00078 __bch2_trans_commit+0x61c/0x1bb0
00078 bch2_setattr_nonsize+0x254/0x318
00078 bch2_setattr+0x5c/0x78
00078 notify_change+0x2bc/0x408
00078 vfs_utimes+0x11c/0x218
00078 do_utimes+0x84/0x140
00078 __arm64_sys_utimensat+0x68/0xa8
00078 invoke_syscall.constprop.0+0x54/0xf0
00078 do_el0_svc+0x48/0xd8
00078 el0_svc+0x14/0x48
00078 el0t_64_sync_handler+0xb0/0xb8
00078 el0t_64_sync+0x14c/0x150
00078 Code: 8b050265 910020c6 8b060266 910060ac (79402cad)
00078 ---[ end trace 0000000000000000 ]---
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2023-05-21 21:49:06 -07:00
|
|
|
clear_btree_node_just_written(b2);
|
2022-03-04 17:50:28 -07:00
|
|
|
bch2_btree_node_hash_remove(bc, b2);
|
|
|
|
|
|
|
|
if (b) {
|
|
|
|
swap(b->data, b2->data);
|
|
|
|
swap(b->aux_data, b2->aux_data);
|
2022-03-04 17:16:04 -07:00
|
|
|
btree_node_to_freedlist(bc, b2);
|
2022-03-04 17:50:28 -07:00
|
|
|
six_unlock_write(&b2->c.lock);
|
|
|
|
six_unlock_intent(&b2->c.lock);
|
|
|
|
} else {
|
|
|
|
b = b2;
|
|
|
|
list_del_init(&b->list);
|
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-03-04 17:50:28 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-12-02 01:36:27 -07:00
|
|
|
trace_and_count(c, btree_cache_cannibalize, trans);
|
2017-03-16 23:18:50 -07:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&bc->lock);
|
2023-03-14 12:35:57 -07:00
|
|
|
return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Slowpath, don't want it inlined into btree_iter_traverse() */
|
2023-03-02 00:12:18 -07:00
|
|
|
static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
|
2021-08-30 12:18:31 -07:00
|
|
|
struct btree_path *path,
|
2017-03-16 23:18:50 -07:00
|
|
|
const struct bkey_i *k,
|
2020-03-15 20:29:43 -07:00
|
|
|
enum btree_id btree_id,
|
2017-03-16 23:18:50 -07:00
|
|
|
unsigned level,
|
|
|
|
enum six_lock_type lock_type,
|
|
|
|
bool sync)
|
|
|
|
{
|
2023-03-02 00:12:18 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b;
|
|
|
|
|
2024-04-12 12:34:14 -07:00
|
|
|
if (unlikely(level >= BTREE_MAX_DEPTH)) {
|
|
|
|
int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
|
|
|
|
level, BTREE_MAX_DEPTH);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(!bkey_is_btree_ptr(&k->k))) {
|
|
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
|
|
|
|
|
|
|
|
int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
|
|
|
|
printbuf_exit(&buf);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
|
|
|
|
struct printbuf buf = PRINTBUF;
|
|
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
|
|
|
|
|
|
|
|
int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
|
|
|
|
printbuf_exit(&buf);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
/*
|
|
|
|
* Parent node must be locked, else we could read in a btree node that's
|
|
|
|
* been freed:
|
|
|
|
*/
|
2023-03-02 00:12:18 -07:00
|
|
|
if (path && !bch2_btree_node_relock(trans, path, level + 1)) {
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path);
|
2022-07-17 20:06:38 -07:00
|
|
|
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock));
|
2021-07-25 14:19:52 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-03-02 00:12:18 -07:00
|
|
|
b = bch2_btree_node_mem_alloc(trans, level != 0);
|
2022-02-17 22:47:45 -07:00
|
|
|
|
2023-03-14 12:35:57 -07:00
|
|
|
if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
|
2024-01-22 12:25:00 -07:00
|
|
|
if (!path)
|
|
|
|
return b;
|
|
|
|
|
2022-02-17 22:47:45 -07:00
|
|
|
trans->memory_allocation_failure = true;
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
|
2022-07-17 20:06:38 -07:00
|
|
|
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
|
2022-02-17 22:47:45 -07:00
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
if (IS_ERR(b))
|
|
|
|
return b;
|
|
|
|
|
|
|
|
bkey_copy(&b->key, k);
|
2020-03-15 20:29:43 -07:00
|
|
|
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
|
2017-03-16 23:18:50 -07:00
|
|
|
/* raced with another fill: */
|
|
|
|
|
|
|
|
/* mark as unhashed... */
|
2020-02-18 15:15:32 -07:00
|
|
|
b->hash_val = 0;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
list_add(&b->list, &bc->freeable);
|
|
|
|
mutex_unlock(&bc->lock);
|
|
|
|
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
set_btree_node_read_in_flight(b);
|
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
if (path) {
|
|
|
|
u32 seq = six_lock_seq(&b->c.lock);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
/* Unlock before doing IO: */
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
bch2_trans_unlock_noassert(trans);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
bch2_btree_node_read(trans, b, sync);
|
2021-04-08 19:26:53 -07:00
|
|
|
|
2024-08-20 12:04:15 -07:00
|
|
|
int ret = bch2_trans_relock(trans);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
if (!sync)
|
|
|
|
return NULL;
|
2024-01-22 12:25:00 -07:00
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
if (!six_relock_type(&b->c.lock, lock_type, seq))
|
|
|
|
b = NULL;
|
|
|
|
} else {
|
|
|
|
bch2_btree_node_read(trans, b, sync);
|
|
|
|
if (lock_type == SIX_LOCK_read)
|
|
|
|
six_lock_downgrade(&b->c.lock);
|
2021-07-25 14:19:52 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
2021-04-23 13:05:49 -07:00
|
|
|
static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
|
|
|
|
{
|
2022-04-07 14:28:09 -07:00
|
|
|
struct printbuf buf = PRINTBUF;
|
2021-04-23 13:05:49 -07:00
|
|
|
|
2023-07-06 23:42:28 -07:00
|
|
|
if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
|
2021-04-23 13:05:49 -07:00
|
|
|
return;
|
|
|
|
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(&buf,
|
2022-04-07 14:28:09 -07:00
|
|
|
"btree node header doesn't match ptr\n"
|
|
|
|
"btree %s level %u\n"
|
|
|
|
"ptr: ",
|
2023-10-19 19:49:08 -07:00
|
|
|
bch2_btree_id_str(b->c.btree_id), b->c.level);
|
2022-04-07 14:28:09 -07:00
|
|
|
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
|
|
|
|
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(&buf, "\nheader: btree %s level %llu\n"
|
2022-04-07 14:28:09 -07:00
|
|
|
"min ",
|
2023-10-19 19:49:08 -07:00
|
|
|
bch2_btree_id_str(BTREE_NODE_ID(b->data)),
|
2022-04-07 14:28:09 -07:00
|
|
|
BTREE_NODE_LEVEL(b->data));
|
|
|
|
bch2_bpos_to_text(&buf, b->data->min_key);
|
|
|
|
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(&buf, "\nmax ");
|
2022-04-07 14:28:09 -07:00
|
|
|
bch2_bpos_to_text(&buf, b->data->max_key);
|
|
|
|
|
2024-03-23 16:29:19 -07:00
|
|
|
bch2_fs_topology_error(c, "%s", buf.buf);
|
|
|
|
|
2022-04-07 14:28:09 -07:00
|
|
|
printbuf_exit(&buf);
|
2021-04-23 13:05:49 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void btree_check_header(struct bch_fs *c, struct btree *b)
|
|
|
|
{
|
|
|
|
if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
|
|
|
|
b->c.level != BTREE_NODE_LEVEL(b->data) ||
|
2022-11-24 01:12:22 -07:00
|
|
|
!bpos_eq(b->data->max_key, b->key.k.p) ||
|
2021-04-23 13:05:49 -07:00
|
|
|
(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
|
2022-11-24 01:12:22 -07:00
|
|
|
!bpos_eq(b->data->min_key,
|
2021-04-23 13:05:49 -07:00
|
|
|
bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
|
|
|
|
btree_bad_header(c, b);
|
|
|
|
}
|
|
|
|
|
2022-11-22 20:05:45 -07:00
|
|
|
static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
|
|
|
|
const struct bkey_i *k, unsigned level,
|
|
|
|
enum six_lock_type lock_type,
|
|
|
|
unsigned long trace_ip)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2021-07-24 14:12:51 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b;
|
2023-05-27 23:35:34 -07:00
|
|
|
bool need_relock = false;
|
2022-07-17 20:06:38 -07:00
|
|
|
int ret;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
|
|
retry:
|
|
|
|
b = btree_cache_find(bc, k);
|
|
|
|
if (unlikely(!b)) {
|
|
|
|
/*
|
|
|
|
* We must have the parent locked to call bch2_btree_node_fill(),
|
|
|
|
* else we could read in a btree node from disk that's been
|
|
|
|
* freed:
|
|
|
|
*/
|
2023-03-02 00:12:18 -07:00
|
|
|
b = bch2_btree_node_fill(trans, path, k, path->btree_id,
|
2020-03-15 20:29:43 -07:00
|
|
|
level, lock_type, true);
|
2023-05-27 23:35:34 -07:00
|
|
|
need_relock = true;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
/* We raced and found the btree node in the cache */
|
|
|
|
if (!b)
|
|
|
|
goto retry;
|
|
|
|
|
|
|
|
if (IS_ERR(b))
|
|
|
|
return b;
|
|
|
|
} else {
|
2021-08-30 12:18:31 -07:00
|
|
|
if (btree_node_read_locked(path, level + 1))
|
2022-07-13 23:58:23 -07:00
|
|
|
btree_node_unlock(trans, path, level + 1);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2022-08-22 12:29:53 -07:00
|
|
|
ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
|
|
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
BUG_ON(ret);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2020-02-18 15:15:32 -07:00
|
|
|
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
2020-06-06 09:28:01 -07:00
|
|
|
b->c.level != level ||
|
2017-03-16 23:18:50 -07:00
|
|
|
race_fault())) {
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
2021-08-30 12:18:31 -07:00
|
|
|
if (bch2_btree_node_relock(trans, path, level + 1))
|
2017-03-16 23:18:50 -07:00
|
|
|
goto retry;
|
|
|
|
|
2022-08-27 09:48:36 -07:00
|
|
|
trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
|
2022-07-17 20:06:38 -07:00
|
|
|
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
2022-11-25 14:04:42 -07:00
|
|
|
|
|
|
|
/* avoid atomic set bit if it's not needed: */
|
|
|
|
if (!btree_node_accessed(b))
|
|
|
|
set_btree_node_accessed(b);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
2021-04-08 19:26:53 -07:00
|
|
|
if (unlikely(btree_node_read_in_flight(b))) {
|
2023-05-20 20:57:48 -07:00
|
|
|
u32 seq = six_lock_seq(&b->c.lock);
|
2021-07-10 20:03:15 -07:00
|
|
|
|
2021-04-08 19:26:53 -07:00
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
2021-07-24 14:12:51 -07:00
|
|
|
bch2_trans_unlock(trans);
|
2023-05-27 23:35:34 -07:00
|
|
|
need_relock = true;
|
2021-04-08 19:26:53 -07:00
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
bch2_btree_node_wait_on_read(b);
|
2021-04-08 19:26:53 -07:00
|
|
|
|
2024-08-18 12:08:12 -07:00
|
|
|
ret = bch2_trans_relock(trans);
|
|
|
|
if (ret)
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
2021-04-08 19:26:53 -07:00
|
|
|
/*
|
2021-08-30 12:18:31 -07:00
|
|
|
* should_be_locked is not set on this path yet, so we need to
|
|
|
|
* relock it specifically:
|
2021-04-08 19:26:53 -07:00
|
|
|
*/
|
2021-07-10 20:03:15 -07:00
|
|
|
if (!six_relock_type(&b->c.lock, lock_type, seq))
|
|
|
|
goto retry;
|
2021-04-08 19:26:53 -07:00
|
|
|
}
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-05-27 23:35:34 -07:00
|
|
|
if (unlikely(need_relock)) {
|
2023-09-12 15:41:22 -07:00
|
|
|
ret = bch2_trans_relock(trans) ?:
|
2023-05-27 23:35:34 -07:00
|
|
|
bch2_btree_path_relock_intent(trans, path);
|
|
|
|
if (ret) {
|
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-03-16 23:18:50 -07:00
|
|
|
prefetch(b->aux_data);
|
|
|
|
|
|
|
|
for_each_bset(b, t) {
|
|
|
|
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
|
|
|
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 0);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 1);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(btree_node_read_error(b))) {
|
2020-06-06 09:28:01 -07:00
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
2024-02-06 15:24:18 -07:00
|
|
|
return ERR_PTR(-BCH_ERR_btree_node_read_error);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
2021-08-30 12:18:31 -07:00
|
|
|
EBUG_ON(b->c.btree_id != path->btree_id);
|
2022-11-22 20:05:45 -07:00
|
|
|
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
|
|
|
|
btree_check_header(c, b);
|
|
|
|
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2023-09-12 15:41:22 -07:00
|
|
|
* bch2_btree_node_get - find a btree node in the cache and lock it, reading it
|
2022-11-22 20:05:45 -07:00
|
|
|
* in from disk if necessary.
|
|
|
|
*
|
2023-09-12 15:41:22 -07:00
|
|
|
* @trans: btree transaction object
|
|
|
|
* @path: btree_path being traversed
|
|
|
|
* @k: pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
|
|
|
|
* @level: level of btree node being looked up (0 == leaf node)
|
|
|
|
* @lock_type: SIX_LOCK_read or SIX_LOCK_intent
|
|
|
|
* @trace_ip: ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
|
|
|
|
*
|
2022-11-22 20:05:45 -07:00
|
|
|
* The btree node will have either a read or a write lock held, depending on
|
|
|
|
* the @write parameter.
|
2023-09-12 15:41:22 -07:00
|
|
|
*
|
|
|
|
* Returns: btree node or ERR_PTR()
|
2022-11-22 20:05:45 -07:00
|
|
|
*/
|
|
|
|
struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
|
|
|
|
const struct bkey_i *k, unsigned level,
|
|
|
|
enum six_lock_type lock_type,
|
|
|
|
unsigned long trace_ip)
|
|
|
|
{
|
|
|
|
struct bch_fs *c = trans->c;
|
|
|
|
struct btree *b;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
|
|
|
|
|
|
b = btree_node_mem_ptr(k);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check b->hash_val _before_ calling btree_node_lock() - this might not
|
|
|
|
* be the node we want anymore, and trying to lock the wrong node could
|
|
|
|
* cause an unneccessary transaction restart:
|
|
|
|
*/
|
|
|
|
if (unlikely(!c->opts.btree_node_mem_ptr_optimization ||
|
|
|
|
!b ||
|
|
|
|
b->hash_val != btree_ptr_hash_val(k)))
|
|
|
|
return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
|
|
|
|
|
|
|
|
if (btree_node_read_locked(path, level + 1))
|
|
|
|
btree_node_unlock(trans, path, level + 1);
|
|
|
|
|
|
|
|
ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip);
|
|
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
|
|
|
b->c.level != level ||
|
|
|
|
race_fault())) {
|
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
|
|
|
if (bch2_btree_node_relock(trans, path, level + 1))
|
|
|
|
return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
|
|
|
|
|
|
|
|
trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path);
|
|
|
|
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (unlikely(btree_node_read_in_flight(b))) {
|
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
2023-09-19 22:31:00 -07:00
|
|
|
return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
|
2022-11-22 20:05:45 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
prefetch(b->aux_data);
|
|
|
|
|
|
|
|
for_each_bset(b, t) {
|
|
|
|
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
|
|
|
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 0);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 1);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* avoid atomic set bit if it's not needed: */
|
|
|
|
if (!btree_node_accessed(b))
|
|
|
|
set_btree_node_accessed(b);
|
|
|
|
|
|
|
|
if (unlikely(btree_node_read_error(b))) {
|
|
|
|
six_unlock_type(&b->c.lock, lock_type);
|
2024-02-06 15:24:18 -07:00
|
|
|
return ERR_PTR(-BCH_ERR_btree_node_read_error);
|
2022-11-22 20:05:45 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
EBUG_ON(b->c.btree_id != path->btree_id);
|
2021-01-26 18:59:00 -07:00
|
|
|
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
|
2021-04-23 13:05:49 -07:00
|
|
|
btree_check_header(c, b);
|
2017-03-16 23:18:50 -07:00
|
|
|
|
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
2022-08-21 11:29:43 -07:00
|
|
|
struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans,
|
2020-03-15 20:29:43 -07:00
|
|
|
const struct bkey_i *k,
|
|
|
|
enum btree_id btree_id,
|
2021-01-26 18:59:00 -07:00
|
|
|
unsigned level,
|
|
|
|
bool nofill)
|
2020-03-15 20:29:43 -07:00
|
|
|
{
|
2022-08-21 11:29:43 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2020-03-15 20:29:43 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b;
|
2020-06-12 19:29:48 -07:00
|
|
|
int ret;
|
2020-03-15 20:29:43 -07:00
|
|
|
|
|
|
|
EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
|
|
|
2021-07-26 12:52:41 -07:00
|
|
|
if (c->opts.btree_node_mem_ptr_optimization) {
|
|
|
|
b = btree_node_mem_ptr(k);
|
|
|
|
if (b)
|
|
|
|
goto lock_node;
|
|
|
|
}
|
2020-03-15 20:29:43 -07:00
|
|
|
retry:
|
|
|
|
b = btree_cache_find(bc, k);
|
|
|
|
if (unlikely(!b)) {
|
2021-01-26 18:59:00 -07:00
|
|
|
if (nofill)
|
2021-02-23 19:41:25 -07:00
|
|
|
goto out;
|
2021-01-26 18:59:00 -07:00
|
|
|
|
2023-03-02 00:12:18 -07:00
|
|
|
b = bch2_btree_node_fill(trans, NULL, k, btree_id,
|
2020-03-15 20:29:43 -07:00
|
|
|
level, SIX_LOCK_read, true);
|
|
|
|
|
|
|
|
/* We raced and found the btree node in the cache */
|
|
|
|
if (!b)
|
|
|
|
goto retry;
|
|
|
|
|
2021-02-23 19:41:25 -07:00
|
|
|
if (IS_ERR(b) &&
|
2023-12-02 01:36:27 -07:00
|
|
|
!bch2_btree_cache_cannibalize_lock(trans, NULL))
|
2021-02-23 19:41:25 -07:00
|
|
|
goto retry;
|
|
|
|
|
2020-03-15 20:29:43 -07:00
|
|
|
if (IS_ERR(b))
|
2021-02-23 19:41:25 -07:00
|
|
|
goto out;
|
2020-03-15 20:29:43 -07:00
|
|
|
} else {
|
|
|
|
lock_node:
|
2023-02-04 17:39:59 -07:00
|
|
|
ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_);
|
2022-08-22 12:29:53 -07:00
|
|
|
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
|
|
|
|
return ERR_PTR(ret);
|
|
|
|
|
|
|
|
BUG_ON(ret);
|
2020-03-15 20:29:43 -07:00
|
|
|
|
|
|
|
if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
|
|
|
b->c.btree_id != btree_id ||
|
|
|
|
b->c.level != level)) {
|
|
|
|
six_unlock_read(&b->c.lock);
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* XXX: waiting on IO with btree locks held: */
|
2021-07-10 20:03:15 -07:00
|
|
|
__bch2_btree_node_wait_on_read(b);
|
2020-03-15 20:29:43 -07:00
|
|
|
|
|
|
|
prefetch(b->aux_data);
|
|
|
|
|
|
|
|
for_each_bset(b, t) {
|
|
|
|
void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
|
|
|
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 0);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 1);
|
|
|
|
prefetch(p + L1_CACHE_BYTES * 2);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* avoid atomic set bit if it's not needed: */
|
|
|
|
if (!btree_node_accessed(b))
|
|
|
|
set_btree_node_accessed(b);
|
|
|
|
|
|
|
|
if (unlikely(btree_node_read_error(b))) {
|
|
|
|
six_unlock_read(&b->c.lock);
|
2024-02-06 15:24:18 -07:00
|
|
|
b = ERR_PTR(-BCH_ERR_btree_node_read_error);
|
2021-02-23 19:41:25 -07:00
|
|
|
goto out;
|
2020-03-15 20:29:43 -07:00
|
|
|
}
|
|
|
|
|
2021-01-26 18:59:00 -07:00
|
|
|
EBUG_ON(b->c.btree_id != btree_id);
|
|
|
|
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
|
2021-04-23 13:05:49 -07:00
|
|
|
btree_check_header(c, b);
|
2021-02-23 19:41:25 -07:00
|
|
|
out:
|
2023-12-02 01:36:27 -07:00
|
|
|
bch2_btree_cache_cannibalize_unlock(trans);
|
2020-03-15 20:29:43 -07:00
|
|
|
return b;
|
|
|
|
}
|
|
|
|
|
2023-03-02 00:12:18 -07:00
|
|
|
int bch2_btree_node_prefetch(struct btree_trans *trans,
|
2021-08-30 12:18:31 -07:00
|
|
|
struct btree_path *path,
|
2021-07-24 16:50:40 -07:00
|
|
|
const struct bkey_i *k,
|
|
|
|
enum btree_id btree_id, unsigned level)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
2023-03-02 00:12:18 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2017-03-16 23:18:50 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
|
2024-01-22 12:25:00 -07:00
|
|
|
BUG_ON(path && !btree_node_locked(path, level + 1));
|
2017-03-16 23:18:50 -07:00
|
|
|
BUG_ON(level >= BTREE_MAX_DEPTH);
|
|
|
|
|
2024-04-12 12:54:33 -07:00
|
|
|
struct btree *b = btree_cache_find(bc, k);
|
2017-03-16 23:18:50 -07:00
|
|
|
if (b)
|
2021-07-24 16:50:40 -07:00
|
|
|
return 0;
|
2017-03-16 23:18:50 -07:00
|
|
|
|
2023-03-02 00:12:18 -07:00
|
|
|
b = bch2_btree_node_fill(trans, path, k, btree_id,
|
2021-08-30 11:22:43 -07:00
|
|
|
level, SIX_LOCK_read, false);
|
2024-04-12 12:54:33 -07:00
|
|
|
if (!IS_ERR_OR_NULL(b))
|
|
|
|
six_unlock_read(&b->c.lock);
|
|
|
|
return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
|
|
|
|
2022-08-21 11:29:43 -07:00
|
|
|
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
|
2021-04-25 13:24:03 -07:00
|
|
|
{
|
2022-08-21 11:29:43 -07:00
|
|
|
struct bch_fs *c = trans->c;
|
2021-04-25 13:24:03 -07:00
|
|
|
struct btree_cache *bc = &c->btree_cache;
|
|
|
|
struct btree *b;
|
|
|
|
|
|
|
|
b = btree_cache_find(bc, k);
|
|
|
|
if (!b)
|
|
|
|
return;
|
2024-03-24 16:52:03 -07:00
|
|
|
|
|
|
|
BUG_ON(b == btree_node_root(trans->c, b));
|
2021-07-10 20:03:15 -07:00
|
|
|
wait_on_io:
|
|
|
|
/* not allowed to wait on io with btree locks held: */
|
|
|
|
|
|
|
|
/* XXX we're called from btree_gc which will be holding other btree
|
|
|
|
* nodes locked
|
2022-10-19 15:31:33 -07:00
|
|
|
*/
|
2021-07-10 20:03:15 -07:00
|
|
|
__bch2_btree_node_wait_on_read(b);
|
|
|
|
__bch2_btree_node_wait_on_write(b);
|
2021-04-25 13:24:03 -07:00
|
|
|
|
2022-08-21 11:29:43 -07:00
|
|
|
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
|
|
|
|
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
|
2024-04-11 20:58:36 -07:00
|
|
|
if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
|
|
|
|
goto out;
|
2021-04-25 13:24:03 -07:00
|
|
|
|
2021-07-10 20:03:15 -07:00
|
|
|
if (btree_node_dirty(b)) {
|
2022-10-28 14:08:41 -07:00
|
|
|
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
|
2021-07-10 20:03:15 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
goto wait_on_io;
|
|
|
|
}
|
2021-04-25 13:24:03 -07:00
|
|
|
|
|
|
|
BUG_ON(btree_node_dirty(b));
|
|
|
|
|
|
|
|
mutex_lock(&bc->lock);
|
|
|
|
bch2_btree_node_hash_remove(bc, b);
|
2024-09-05 16:37:56 -07:00
|
|
|
btree_node_data_free(c, b);
|
2021-04-25 13:24:03 -07:00
|
|
|
mutex_unlock(&bc->lock);
|
2024-04-11 20:58:36 -07:00
|
|
|
out:
|
2021-04-25 13:24:03 -07:00
|
|
|
six_unlock_write(&b->c.lock);
|
|
|
|
six_unlock_intent(&b->c.lock);
|
|
|
|
}
|
|
|
|
|
2023-10-19 19:49:08 -07:00
|
|
|
const char *bch2_btree_id_str(enum btree_id btree)
|
|
|
|
{
|
|
|
|
return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
|
|
|
|
}
|
|
|
|
|
2024-06-07 15:19:39 -07:00
|
|
|
void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree)
|
|
|
|
{
|
|
|
|
if (btree < BTREE_ID_NR)
|
|
|
|
prt_str(out, __bch2_btree_ids[btree]);
|
|
|
|
else
|
|
|
|
prt_printf(out, "(unknown btree %u)", btree);
|
|
|
|
}
|
|
|
|
|
2023-10-19 19:49:08 -07:00
|
|
|
void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
|
|
|
|
{
|
|
|
|
prt_printf(out, "%s level %u/%u\n ",
|
|
|
|
bch2_btree_id_str(b->c.btree_id),
|
|
|
|
b->c.level,
|
|
|
|
bch2_btree_id_root(c, b->c.btree_id)->level);
|
|
|
|
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
|
2017-03-16 23:18:50 -07:00
|
|
|
{
|
|
|
|
struct bset_stats stats;
|
|
|
|
|
|
|
|
memset(&stats, 0, sizeof(stats));
|
|
|
|
|
|
|
|
bch2_btree_keys_stats(b, &stats);
|
|
|
|
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(out, "l %u ", b->c.level);
|
2021-03-04 13:20:22 -07:00
|
|
|
bch2_bpos_to_text(out, b->data->min_key);
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(out, " - ");
|
2021-03-04 13:20:22 -07:00
|
|
|
bch2_bpos_to_text(out, b->data->max_key);
|
2023-02-03 19:01:40 -07:00
|
|
|
prt_printf(out, ":\n"
|
2021-03-04 13:20:22 -07:00
|
|
|
" ptrs: ");
|
2018-11-01 12:10:01 -07:00
|
|
|
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
|
2023-08-03 11:42:37 -07:00
|
|
|
prt_newline(out);
|
2021-03-04 13:20:22 -07:00
|
|
|
|
2023-08-03 11:42:37 -07:00
|
|
|
prt_printf(out,
|
|
|
|
" format: ");
|
|
|
|
bch2_bkey_format_to_text(out, &b->format);
|
|
|
|
|
|
|
|
prt_printf(out,
|
2018-11-08 23:24:07 -07:00
|
|
|
" unpack fn len: %u\n"
|
|
|
|
" bytes used %zu/%zu (%zu%% full)\n"
|
2021-03-28 22:13:31 -07:00
|
|
|
" sib u64s: %u, %u (merge threshold %u)\n"
|
2018-11-08 23:24:07 -07:00
|
|
|
" nr packed keys %u\n"
|
|
|
|
" nr unpacked keys %u\n"
|
|
|
|
" floats %zu\n"
|
2019-10-23 11:56:20 -07:00
|
|
|
" failed unpacked %zu\n",
|
2018-11-08 23:24:07 -07:00
|
|
|
b->unpack_fn_len,
|
|
|
|
b->nr.live_u64s * sizeof(u64),
|
2024-01-16 11:29:59 -07:00
|
|
|
btree_buf_bytes(b) - sizeof(struct btree_node),
|
2018-11-08 23:24:07 -07:00
|
|
|
b->nr.live_u64s * 100 / btree_max_u64s(c),
|
|
|
|
b->sib_u64s[0],
|
|
|
|
b->sib_u64s[1],
|
2021-03-28 22:13:31 -07:00
|
|
|
c->btree_foreground_merge_threshold,
|
2018-11-08 23:24:07 -07:00
|
|
|
b->nr.packed_keys,
|
|
|
|
b->nr.unpacked_keys,
|
|
|
|
stats.floats,
|
2019-10-23 11:56:20 -07:00
|
|
|
stats.failed);
|
2017-03-16 23:18:50 -07:00
|
|
|
}
|
2020-11-19 18:13:30 -07:00
|
|
|
|
2024-05-05 06:47:53 -07:00
|
|
|
/*
 * Print one line of btree cache statistics: @label, a tab, @nr nodes
 * expressed as a human readable size (nr * btree_node_size), then the raw
 * count in parentheses.
 */
static void prt_btree_cache_line(struct printbuf *out, const struct bch_fs *c,
				 const char *label, size_t nr)
{
	prt_printf(out, "%s\t", label);

	/* size first, then the node count it was derived from */
	prt_human_readable_u64(out, nr * c->opts.btree_node_size);
	prt_printf(out, " (%zu)\n", nr);
}
|
|
|
|
|
2024-09-01 10:36:42 -07:00
|
|
|
static const char * const bch2_btree_cache_not_freed_reasons_strs[] = {
|
|
|
|
#define x(n) #n,
|
|
|
|
BCH_BTREE_CACHE_NOT_FREED_REASONS()
|
|
|
|
#undef x
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2022-09-29 20:37:15 -07:00
|
|
|
void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc)
|
2020-11-19 18:13:30 -07:00
|
|
|
{
|
2022-09-29 20:37:15 -07:00
|
|
|
struct bch_fs *c = container_of(bc, struct bch_fs, btree_cache);
|
2024-05-05 06:47:53 -07:00
|
|
|
|
|
|
|
if (!out->nr_tabstops)
|
2022-09-29 20:37:15 -07:00
|
|
|
printbuf_tabstop_push(out, 32);
|
2024-05-05 06:47:53 -07:00
|
|
|
|
bcachefs: Rework btree node pinning
In backpointers fsck, we do a seqential scan of one btree, and check
references to another: extents <-> backpointers
Checking references generates random lookups, so we want to pin that
btree in memory (or only a range, if it doesn't fit in ram).
Previously, this was done with a simple check in the shrinker - "if
btree node is in range being pinned, don't free it" - but this generated
OOMs, as our shrinker wasn't well behaved if there was less memory
available than expected.
Instead, we now have two different shrinkers and lru lists; the second
shrinker being for pinned nodes, with seeks set much higher than normal
- so they can still be freed if necessary, but we'll prefer not to.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
2024-09-04 17:49:37 -07:00
|
|
|
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
|
|
|
|
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
|
|
|
|
prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
|
|
|
|
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
|
2024-05-05 06:47:53 -07:00
|
|
|
prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
|
|
|
|
prt_newline(out);
|
|
|
|
|
2024-09-05 16:25:01 -07:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++)
|
|
|
|
prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]);
|
2022-09-29 20:37:15 -07:00
|
|
|
|
|
|
|
prt_newline(out);
|
2024-09-05 16:25:01 -07:00
|
|
|
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
|
2022-09-29 20:37:15 -07:00
|
|
|
prt_printf(out, "not freed:\n");
|
2024-09-01 10:36:42 -07:00
|
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(bc->not_freed); i++)
|
|
|
|
prt_printf(out, " %s\t%llu\n",
|
|
|
|
bch2_btree_cache_not_freed_reasons_strs[i], bc->not_freed[i]);
|
2020-11-19 18:13:30 -07:00
|
|
|
}
|