
bcachefs: Kill bch2_btree_node_get_sibling()

This patch reworks the btree node merge path to use a second btree
iterator to get the sibling node - which means
bch2_btree_node_get_sibling() can be deleted. It also uses
bch2_btree_iter_traverse_all() if necessary, which should make it more
reliable. We don't currently even try to make it work when
trans->nounlock is set - i.e. after a BTREE_INSERT_NOUNLOCK transaction
commit - hopefully skipping the merge there is a worthwhile tradeoff.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
commit 54ca47e114 (parent 1259cc31b2)
Kent Overstreet, 2021-03-29 01:13:31 -04:00 (committed by Kent Overstreet)
3 changed files with 45 additions and 157 deletions
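For orientation before the diff: the new merge path locates the sibling with a
second, ordinary node iterator instead of the deleted helper. A condensed
sketch of that lookup, assembled from the btree_update_interior.c hunks below
(declarations and cleanup omitted, so not verbatim):

	/* Position just outside this node's key range, on the requested side: */
	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	/* The second iterator takes its own intent lock on the sibling: */
	sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
					    sib_pos, U8_MAX, level,
					    BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(sib_iter);
	if (ret)
		goto err;

	m = sib_iter->l[level].b;

	/* Only siblings that share a parent can be merged: */
	if (btree_node_parent(iter, b) != btree_node_parent(sib_iter, m))
		b->sib_u64s[sib] = U16_MAX;	/* marks: no mergeable sibling */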

fs/bcachefs/btree_cache.c

@@ -913,136 +913,6 @@ out:
 	return b;
 }
 
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
-					  struct btree_iter *iter,
-					  struct btree *b,
-					  enum btree_node_sibling sib)
-{
-	struct btree_trans *trans = iter->trans;
-	struct btree *parent;
-	struct btree_node_iter node_iter;
-	struct bkey_packed *k;
-	struct bkey_buf tmp;
-	struct btree *ret = NULL;
-	unsigned level = b->c.level;
-
-	bch2_bkey_buf_init(&tmp);
-
-	parent = btree_iter_node(iter, level + 1);
-	if (!parent)
-		return NULL;
-
-	/*
-	 * There's a corner case where a btree_iter might have a node locked
-	 * that is just outside its current pos - when
-	 * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node.
-	 *
-	 * But the lock ordering checks in __bch2_btree_node_lock() go off of
-	 * iter->pos, not the node's key: so if the iterator is marked as
-	 * needing to be traversed, we risk deadlock if we don't bail out here:
-	 */
-	if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-		return ERR_PTR(-EINTR);
-
-	if (!bch2_btree_node_relock(iter, level + 1)) {
-		ret = ERR_PTR(-EINTR);
-		goto out;
-	}
-
-	node_iter = iter->l[parent->c.level].iter;
-
-	k = bch2_btree_node_iter_peek_all(&node_iter, parent);
-	BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
-
-	k = sib == btree_prev_sib
-		? bch2_btree_node_iter_prev(&node_iter, parent)
-		: (bch2_btree_node_iter_advance(&node_iter, parent),
-		   bch2_btree_node_iter_peek(&node_iter, parent));
-	if (!k)
-		goto out;
-
-	bch2_bkey_buf_unpack(&tmp, c, parent, k);
-
-	ret = bch2_btree_node_get(c, iter, tmp.k, level,
-				  SIX_LOCK_intent, _THIS_IP_);
-
-	if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
-		struct btree_iter *linked;
-
-		if (!bch2_btree_node_relock(iter, level + 1))
-			goto out;
-
-		/*
-		 * We might have got -EINTR because trylock failed, and we're
-		 * holding other locks that would cause us to deadlock:
-		 */
-		trans_for_each_iter(trans, linked)
-			if (btree_iter_lock_cmp(iter, linked) < 0)
-				__bch2_btree_iter_unlock(linked);
-
-		if (sib == btree_prev_sib)
-			btree_node_unlock(iter, level);
-
-		ret = bch2_btree_node_get(c, iter, tmp.k, level,
-					  SIX_LOCK_intent, _THIS_IP_);
-
-		/*
-		 * before btree_iter_relock() calls btree_iter_verify_locks():
-		 */
-		if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-			btree_node_unlock(iter, level + 1);
-
-		if (!bch2_btree_node_relock(iter, level)) {
-			btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
-
-			if (!IS_ERR(ret)) {
-				six_unlock_intent(&ret->c.lock);
-				ret = ERR_PTR(-EINTR);
-			}
-		}
-
-		bch2_trans_relock(trans);
-	}
-out:
-	if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
-		btree_node_unlock(iter, level + 1);
-
-	if (PTR_ERR_OR_ZERO(ret) == -EINTR)
-		bch2_btree_iter_upgrade(iter, level + 2);
-
-	BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level));
-
-	if (!IS_ERR_OR_NULL(ret)) {
-		struct btree *n1 = ret, *n2 = b;
-
-		if (sib != btree_prev_sib)
-			swap(n1, n2);
-
-		if (bpos_cmp(bpos_successor(n1->key.k.p),
-			     n2->data->min_key)) {
-			char buf1[200], buf2[200];
-
-			bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
-			bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
-
-			bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
-					     "prev: %s\n"
-					     "next: %s\n",
-					     bch2_btree_ids[iter->btree_id], level,
-					     buf1, buf2);
-
-			six_unlock_intent(&ret->c.lock);
-			ret = NULL;
-		}
-	}
-
-	bch2_btree_trans_verify_locks(trans);
-
-	bch2_bkey_buf_exit(&tmp, c);
-
-	return ret;
-}
-
 void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
 			      const struct bkey_i *k,
 			      enum btree_id btree_id, unsigned level)
@@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 	       " format: u64s %u fields %u %u %u %u %u\n"
 	       " unpack fn len: %u\n"
 	       " bytes used %zu/%zu (%zu%% full)\n"
-	       " sib u64s: %u, %u (merge threshold %zu)\n"
+	       " sib u64s: %u, %u (merge threshold %u)\n"
 	       " nr packed keys %u\n"
 	       " nr unpacked keys %u\n"
 	       " floats %zu\n"
@@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
 	       b->nr.live_u64s * 100 / btree_max_u64s(c),
 	       b->sib_u64s[0],
 	       b->sib_u64s[1],
-	       BTREE_FOREGROUND_MERGE_THRESHOLD(c),
+	       c->btree_foreground_merge_threshold,
 	       b->nr.packed_keys,
 	       b->nr.unpacked_keys,
 	       stats.floats,

fs/bcachefs/btree_cache.h

@@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
 					 enum btree_id, unsigned, bool);
 
-struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
-					  struct btree *, enum btree_node_sibling);
-
 void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
 			      const struct bkey_i *, enum btree_id, unsigned);

fs/bcachefs/btree_update_interior.c

@@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c,
 				   enum btree_node_sibling sib)
 {
 	struct btree_trans *trans = iter->trans;
+	struct btree_iter *sib_iter = NULL;
 	struct btree_update *as;
 	struct bkey_format_state new_s;
 	struct bkey_format new_f;
 	struct bkey_i delete;
 	struct btree *b, *m, *n, *prev, *next, *parent;
+	struct bpos sib_pos;
 	size_t sib_u64s;
 	int ret = 0;
 
+	if (trans->nounlock)
+		return;
+
 	BUG_ON(!btree_node_locked(iter, level));
 retry:
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
 	BUG_ON(!btree_node_locked(iter, level));
 
 	b = iter->l[level].b;
 
-	parent = btree_node_parent(iter, b);
-	if (!parent)
+	if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
+	    (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
+		b->sib_u64s[sib] = U16_MAX;
 		goto out;
+	}
 
-	if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
-		goto out;
-
-	/* XXX: can't be holding read locks */
-	m = bch2_btree_node_get_sibling(c, iter, b, sib);
-	if (IS_ERR(m)) {
-		ret = PTR_ERR(m);
-		goto err;
-	}
-
-	/* NULL means no sibling: */
-	if (!m) {
+	sib_pos = sib == btree_prev_sib
+		? bpos_predecessor(b->data->min_key)
+		: bpos_successor(b->data->max_key);
+
+	sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
+					    sib_pos, U8_MAX, level,
+					    BTREE_ITER_INTENT);
+	ret = bch2_btree_iter_traverse(sib_iter);
+	if (ret)
+		goto err;
+
+	m = sib_iter->l[level].b;
+
+	if (btree_node_parent(iter, b) !=
+	    btree_node_parent(sib_iter, m)) {
 		b->sib_u64s[sib] = U16_MAX;
 		goto out;
 	}
@@ -1581,6 +1595,8 @@ retry:
 		next = m;
 	}
 
+	BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+
 	bch2_bkey_format_init(&new_s);
 	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
 	__bch2_btree_calc_format(&new_s, prev);
@@ -1598,23 +1614,21 @@ retry:
 	}
 
 	sib_u64s = min(sib_u64s, btree_max_u64s(c));
+	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
 	b->sib_u64s[sib] = sib_u64s;
 
-	if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
-		six_unlock_intent(&m->c.lock);
+	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
 		goto out;
-	}
 
+	parent = btree_node_parent(iter, b);
 	as = bch2_btree_update_start(iter, level,
 			btree_update_reserve_required(c, parent) + 1,
 			flags|
 			BTREE_INSERT_NOFAIL|
 			BTREE_INSERT_USE_RESERVE);
 	ret = PTR_ERR_OR_ZERO(as);
-	if (ret) {
-		six_unlock_intent(&m->c.lock);
+	if (ret)
 		goto err;
-	}
 
 	trace_btree_merge(c, b);
@@ -1648,6 +1662,7 @@ retry:
 	bch2_btree_update_get_open_buckets(as, n);
 
 	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
+	six_lock_increment(&m->c.lock, SIX_LOCK_intent);
 	bch2_btree_iter_node_drop(iter, b);
 	bch2_btree_iter_node_drop(iter, m);
@@ -1663,6 +1678,7 @@ retry:
 	bch2_btree_update_done(as);
 out:
 	bch2_btree_trans_verify_locks(trans);
+	bch2_trans_iter_free(trans, sib_iter);
 
 	/*
 	 * Don't downgrade locks here: we're called after successful insert,
@@ -1675,11 +1691,16 @@ out:
 	 */
 	return;
 err:
 	BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
 
+	bch2_trans_iter_put(trans, sib_iter);
+	sib_iter = NULL;
+
+	if (ret == -EINTR && bch2_trans_relock(trans)) {
+		ret = 0;
+		goto retry;
+	}
+
 	if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
 		bch2_trans_unlock(trans);
-		ret = bch2_btree_iter_traverse(iter);
+		ret = bch2_btree_iter_traverse_all(trans);
 		if (!ret)
 			goto retry;
 	}
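
Read together, the hunks above give the error path this shape (a condensed
sketch with added commentary, not verbatim): on -EINTR the merge path first
tries the cheap bch2_trans_relock() and retries; if that fails and
BTREE_INSERT_NOUNLOCK isn't set, it drops all locks and retraverses with the
heavyweight bch2_btree_iter_traverse_all() - the "more reliable" fallback the
commit message refers to:

	err:
		bch2_trans_iter_put(trans, sib_iter);
		sib_iter = NULL;

		/* Cheap path: re-take the locks we already held, then retry: */
		if (ret == -EINTR && bch2_trans_relock(trans)) {
			ret = 0;
			goto retry;
		}

		/* Slow path: unlock everything, re-traverse all iterators: */
		if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
			bch2_trans_unlock(trans);
			ret = bch2_btree_iter_traverse_all(trans);
			if (!ret)
				goto retry;
		}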