bcachefs: Kill bch2_btree_node_get_sibling()
This patch reworks the btree node merge path to use a second btree iterator to get the sibling node - which means bch2_btree_iter_get_sibling() can be deleted. Also, it uses bch2_btree_iter_traverse_all() if necessary - which means it should be more reliable. We don't currently even try to make it work when trans->nounlock is set - after a BTREE_INSERT_NOUNLOCK transaction commit, hopefully this will be a worthwhile tradeoff. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
1259cc31b2
commit
54ca47e114
@ -913,136 +913,6 @@ out:
|
||||
return b;
|
||||
}
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
|
||||
struct btree_iter *iter,
|
||||
struct btree *b,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree_trans *trans = iter->trans;
|
||||
struct btree *parent;
|
||||
struct btree_node_iter node_iter;
|
||||
struct bkey_packed *k;
|
||||
struct bkey_buf tmp;
|
||||
struct btree *ret = NULL;
|
||||
unsigned level = b->c.level;
|
||||
|
||||
bch2_bkey_buf_init(&tmp);
|
||||
|
||||
parent = btree_iter_node(iter, level + 1);
|
||||
if (!parent)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* There's a corner case where a btree_iter might have a node locked
|
||||
* that is just outside its current pos - when
|
||||
* bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
|
||||
*
|
||||
* But the lock ordering checks in __bch2_btree_node_lock() go off of
|
||||
* iter->pos, not the node's key: so if the iterator is marked as
|
||||
* needing to be traversed, we risk deadlock if we don't bail out here:
|
||||
*/
|
||||
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
|
||||
return ERR_PTR(-EINTR);
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1)) {
|
||||
ret = ERR_PTR(-EINTR);
|
||||
goto out;
|
||||
}
|
||||
|
||||
node_iter = iter->l[parent->c.level].iter;
|
||||
|
||||
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
|
||||
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
|
||||
|
||||
k = sib == btree_prev_sib
|
||||
? bch2_btree_node_iter_prev(&node_iter, parent)
|
||||
: (bch2_btree_node_iter_advance(&node_iter, parent),
|
||||
bch2_btree_node_iter_peek(&node_iter, parent));
|
||||
if (!k)
|
||||
goto out;
|
||||
|
||||
bch2_bkey_buf_unpack(&tmp, c, parent, k);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, tmp.k, level,
|
||||
SIX_LOCK_intent, _THIS_IP_);
|
||||
|
||||
if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
|
||||
struct btree_iter *linked;
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level + 1))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* We might have got -EINTR because trylock failed, and we're
|
||||
* holding other locks that would cause us to deadlock:
|
||||
*/
|
||||
trans_for_each_iter(trans, linked)
|
||||
if (btree_iter_lock_cmp(iter, linked) < 0)
|
||||
__bch2_btree_iter_unlock(linked);
|
||||
|
||||
if (sib == btree_prev_sib)
|
||||
btree_node_unlock(iter, level);
|
||||
|
||||
ret = bch2_btree_node_get(c, iter, tmp.k, level,
|
||||
SIX_LOCK_intent, _THIS_IP_);
|
||||
|
||||
/*
|
||||
* before btree_iter_relock() calls btree_iter_verify_locks():
|
||||
*/
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (!bch2_btree_node_relock(iter, level)) {
|
||||
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
||||
|
||||
if (!IS_ERR(ret)) {
|
||||
six_unlock_intent(&ret->c.lock);
|
||||
ret = ERR_PTR(-EINTR);
|
||||
}
|
||||
}
|
||||
|
||||
bch2_trans_relock(trans);
|
||||
}
|
||||
out:
|
||||
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
|
||||
btree_node_unlock(iter, level + 1);
|
||||
|
||||
if (PTR_ERR_OR_ZERO(ret) == -EINTR)
|
||||
bch2_btree_iter_upgrade(iter, level + 2);
|
||||
|
||||
BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
|
||||
|
||||
if (!IS_ERR_OR_NULL(ret)) {
|
||||
struct btree *n1 = ret, *n2 = b;
|
||||
|
||||
if (sib != btree_prev_sib)
|
||||
swap(n1, n2);
|
||||
|
||||
if (bpos_cmp(bpos_successor(n1->key.k.p),
|
||||
n2->data->min_key)) {
|
||||
char buf1[200], buf2[200];
|
||||
|
||||
bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
|
||||
bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
|
||||
|
||||
bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
|
||||
"prev: %s\n"
|
||||
"next: %s\n",
|
||||
bch2_btree_ids[iter->btree_id], level,
|
||||
buf1, buf2);
|
||||
|
||||
six_unlock_intent(&ret->c.lock);
|
||||
ret = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bch2_btree_trans_verify_locks(trans);
|
||||
|
||||
bch2_bkey_buf_exit(&tmp, c);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
|
||||
const struct bkey_i *k,
|
||||
enum btree_id btree_id, unsigned level)
|
||||
@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
" format: u64s %u fields %u %u %u %u %u\n"
|
||||
" unpack fn len: %u\n"
|
||||
" bytes used %zu/%zu (%zu%% full)\n"
|
||||
" sib u64s: %u, %u (merge threshold %zu)\n"
|
||||
" sib u64s: %u, %u (merge threshold %u)\n"
|
||||
" nr packed keys %u\n"
|
||||
" nr unpacked keys %u\n"
|
||||
" floats %zu\n"
|
||||
@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
|
||||
b->nr.live_u64s * 100 / btree_max_u64s(c),
|
||||
b->sib_u64s[0],
|
||||
b->sib_u64s[1],
|
||||
BTREE_FOREGROUND_MERGE_THRESHOLD(c),
|
||||
c->btree_foreground_merge_threshold,
|
||||
b->nr.packed_keys,
|
||||
b->nr.unpacked_keys,
|
||||
stats.floats,
|
||||
|
@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
|
||||
enum btree_id, unsigned, bool);
|
||||
|
||||
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
|
||||
struct btree *, enum btree_node_sibling);
|
||||
|
||||
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
|
||||
const struct bkey_i *, enum btree_id, unsigned);
|
||||
|
||||
|
@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
|
||||
enum btree_node_sibling sib)
|
||||
{
|
||||
struct btree_trans *trans = iter->trans;
|
||||
struct btree_iter *sib_iter = NULL;
|
||||
struct btree_update *as;
|
||||
struct bkey_format_state new_s;
|
||||
struct bkey_format new_f;
|
||||
struct bkey_i delete;
|
||||
struct btree *b, *m, *n, *prev, *next, *parent;
|
||||
struct bpos sib_pos;
|
||||
size_t sib_u64s;
|
||||
int ret = 0;
|
||||
|
||||
if (trans->nounlock)
|
||||
return;
|
||||
|
||||
BUG_ON(!btree_node_locked(iter, level));
|
||||
retry:
|
||||
ret = bch2_btree_iter_traverse(iter);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
BUG_ON(!btree_node_locked(iter, level));
|
||||
|
||||
b = iter->l[level].b;
|
||||
|
||||
parent = btree_node_parent(iter, b);
|
||||
if (!parent)
|
||||
if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
|
||||
(sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
|
||||
b->sib_u64s[sib] = U16_MAX;
|
||||
goto out;
|
||||
|
||||
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
|
||||
goto out;
|
||||
|
||||
/* XXX: can't be holding read locks */
|
||||
m = bch2_btree_node_get_sibling(c, iter, b, sib);
|
||||
if (IS_ERR(m)) {
|
||||
ret = PTR_ERR(m);
|
||||
goto err;
|
||||
}
|
||||
|
||||
/* NULL means no sibling: */
|
||||
if (!m) {
|
||||
sib_pos = sib == btree_prev_sib
|
||||
? bpos_predecessor(b->data->min_key)
|
||||
: bpos_successor(b->data->max_key);
|
||||
|
||||
sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
|
||||
sib_pos, U8_MAX, level,
|
||||
BTREE_ITER_INTENT);
|
||||
ret = bch2_btree_iter_traverse(sib_iter);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
m = sib_iter->l[level].b;
|
||||
|
||||
if (btree_node_parent(iter, b) !=
|
||||
btree_node_parent(sib_iter, m)) {
|
||||
b->sib_u64s[sib] = U16_MAX;
|
||||
goto out;
|
||||
}
|
||||
@ -1581,6 +1595,8 @@ retry:
|
||||
next = m;
|
||||
}
|
||||
|
||||
BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
|
||||
|
||||
bch2_bkey_format_init(&new_s);
|
||||
bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
|
||||
__bch2_btree_calc_format(&new_s, prev);
|
||||
@ -1598,23 +1614,21 @@ retry:
|
||||
}
|
||||
|
||||
sib_u64s = min(sib_u64s, btree_max_u64s(c));
|
||||
sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
|
||||
b->sib_u64s[sib] = sib_u64s;
|
||||
|
||||
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
|
||||
six_unlock_intent(&m->c.lock);
|
||||
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
|
||||
goto out;
|
||||
}
|
||||
|
||||
parent = btree_node_parent(iter, b);
|
||||
as = bch2_btree_update_start(iter, level,
|
||||
btree_update_reserve_required(c, parent) + 1,
|
||||
flags|
|
||||
BTREE_INSERT_NOFAIL|
|
||||
BTREE_INSERT_USE_RESERVE);
|
||||
ret = PTR_ERR_OR_ZERO(as);
|
||||
if (ret) {
|
||||
six_unlock_intent(&m->c.lock);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
trace_btree_merge(c, b);
|
||||
|
||||
@ -1648,6 +1662,7 @@ retry:
|
||||
bch2_btree_update_get_open_buckets(as, n);
|
||||
|
||||
six_lock_increment(&b->c.lock, SIX_LOCK_intent);
|
||||
six_lock_increment(&m->c.lock, SIX_LOCK_intent);
|
||||
bch2_btree_iter_node_drop(iter, b);
|
||||
bch2_btree_iter_node_drop(iter, m);
|
||||
|
||||
@ -1663,6 +1678,7 @@ retry:
|
||||
bch2_btree_update_done(as);
|
||||
out:
|
||||
bch2_btree_trans_verify_locks(trans);
|
||||
bch2_trans_iter_free(trans, sib_iter);
|
||||
|
||||
/*
|
||||
* Don't downgrade locks here: we're called after successful insert,
|
||||
@ -1675,11 +1691,16 @@ out:
|
||||
*/
|
||||
return;
|
||||
err:
|
||||
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
|
||||
bch2_trans_iter_put(trans, sib_iter);
|
||||
sib_iter = NULL;
|
||||
|
||||
if (ret == -EINTR && bch2_trans_relock(trans)) {
|
||||
ret = 0;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
|
||||
bch2_trans_unlock(trans);
|
||||
ret = bch2_btree_iter_traverse(iter);
|
||||
ret = bch2_btree_iter_traverse_all(trans);
|
||||
if (!ret)
|
||||
goto retry;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user