From 54ca47e114c0cccc075a722f528de2b50b149b49 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 29 Mar 2021 01:13:31 -0400 Subject: [PATCH] bcachefs: Kill bch2_btree_node_get_sibling() This patch reworks the btree node merge path to use a second btree iterator to get the sibling node - which means bch2_btree_iter_get_sibling() can be deleted. Also, it uses bch2_btree_iter_traverse_all() if necessary - which means it should be more reliable. We don't currently even try to make it work when trans->nounlock is set - after a BTREE_INSERT_NOUNLOCK transaction commit, hopefully this will be a worthwhile tradeoff. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_cache.c | 134 +--------------------------- fs/bcachefs/btree_cache.h | 3 - fs/bcachefs/btree_update_interior.c | 65 +++++++++----- 3 files changed, 45 insertions(+), 157 deletions(-) diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 85ac08b9270a..2ec668c3427e 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -913,136 +913,6 @@ out: return b; } -struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, - struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) -{ - struct btree_trans *trans = iter->trans; - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - struct bkey_buf tmp; - struct btree *ret = NULL; - unsigned level = b->c.level; - - bch2_bkey_buf_init(&tmp); - - parent = btree_iter_node(iter, level + 1); - if (!parent) - return NULL; - - /* - * There's a corner case where a btree_iter might have a node locked - * that is just outside its current pos - when - * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. - * - * But the lock ordering checks in __bch2_btree_node_lock() go off of - * iter->pos, not the node's key: so if the iterator is marked as - * needing to be traversed, we risk deadlock if we don't bail out here: - */ - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return ERR_PTR(-EINTR); - - if (!bch2_btree_node_relock(iter, level + 1)) { - ret = ERR_PTR(-EINTR); - goto out; - } - - node_iter = iter->l[parent->c.level].iter; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - k = sib == btree_prev_sib - ? 
bch2_btree_node_iter_prev(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek(&node_iter, parent)); - if (!k) - goto out; - - bch2_bkey_buf_unpack(&tmp, c, parent, k); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { - struct btree_iter *linked; - - if (!bch2_btree_node_relock(iter, level + 1)) - goto out; - - /* - * We might have got -EINTR because trylock failed, and we're - * holding other locks that would cause us to deadlock: - */ - trans_for_each_iter(trans, linked) - if (btree_iter_lock_cmp(iter, linked) < 0) - __bch2_btree_iter_unlock(linked); - - if (sib == btree_prev_sib) - btree_node_unlock(iter, level); - - ret = bch2_btree_node_get(c, iter, tmp.k, level, - SIX_LOCK_intent, _THIS_IP_); - - /* - * before btree_iter_relock() calls btree_iter_verify_locks(): - */ - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (!bch2_btree_node_relock(iter, level)) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); - - if (!IS_ERR(ret)) { - six_unlock_intent(&ret->c.lock); - ret = ERR_PTR(-EINTR); - } - } - - bch2_trans_relock(trans); - } -out: - if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, level + 1); - - if (PTR_ERR_OR_ZERO(ret) == -EINTR) - bch2_btree_iter_upgrade(iter, level + 2); - - BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); - - if (!IS_ERR_OR_NULL(ret)) { - struct btree *n1 = ret, *n2 = b; - - if (sib != btree_prev_sib) - swap(n1, n2); - - if (bpos_cmp(bpos_successor(n1->key.k.p), - n2->data->min_key)) { - char buf1[200], buf2[200]; - - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key)); - bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key)); - - bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n" - "prev: %s\n" - "next: %s\n", - bch2_btree_ids[iter->btree_id], level, - buf1, buf2); - - six_unlock_intent(&ret->c.lock); - ret = NULL; - } - } - - bch2_btree_trans_verify_locks(trans); - - bch2_bkey_buf_exit(&tmp, c); - - return ret; -} - void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, enum btree_id btree_id, unsigned level) @@ -1082,7 +952,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, " format: u64s %u fields %u %u %u %u %u\n" " unpack fn len: %u\n" " bytes used %zu/%zu (%zu%% full)\n" - " sib u64s: %u, %u (merge threshold %zu)\n" + " sib u64s: %u, %u (merge threshold %u)\n" " nr packed keys %u\n" " nr unpacked keys %u\n" " floats %zu\n" @@ -1099,7 +969,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, b->nr.live_u64s * 100 / btree_max_u64s(c), b->sib_u64s[0], b->sib_u64s[1], - BTREE_FOREGROUND_MERGE_THRESHOLD(c), + c->btree_foreground_merge_threshold, b->nr.packed_keys, b->nr.unpacked_keys, stats.floats, diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 217988696a77..aa8fe4a1b04b 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -26,9 +26,6 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, - struct btree *, enum btree_node_sibling); - void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, const struct bkey_i *, enum 
btree_id, unsigned); diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index fddb0c3e7167..af7c2df56692 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1539,36 +1539,50 @@ void __bch2_foreground_maybe_merge(struct bch_fs *c, enum btree_node_sibling sib) { struct btree_trans *trans = iter->trans; + struct btree_iter *sib_iter = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; struct bkey_i delete; struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; size_t sib_u64s; int ret = 0; + if (trans->nounlock) + return; + BUG_ON(!btree_node_locked(iter, level)); retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + BUG_ON(!btree_node_locked(iter, level)); b = iter->l[level].b; - parent = btree_node_parent(iter, b); - if (!parent) + if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) { + b->sib_u64s[sib] = U16_MAX; goto out; - - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - goto out; - - /* XXX: can't be holding read locks */ - m = bch2_btree_node_get_sibling(c, iter, b, sib); - if (IS_ERR(m)) { - ret = PTR_ERR(m); - goto err; } - /* NULL means no sibling: */ - if (!m) { + sib_pos = sib == btree_prev_sib + ? bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + + sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, + sib_pos, U8_MAX, level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(sib_iter); + if (ret) + goto err; + + m = sib_iter->l[level].b; + + if (btree_node_parent(iter, b) != + btree_node_parent(sib_iter, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1581,6 +1595,8 @@ retry: next = m; } + BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)); + bch2_bkey_format_init(&new_s); bch2_bkey_format_add_pos(&new_s, prev->data->min_key); __bch2_btree_calc_format(&new_s, prev); @@ -1598,23 +1614,21 @@ retry: } sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); b->sib_u64s[sib] = sib_u64s; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { - six_unlock_intent(&m->c.lock); + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - } + parent = btree_node_parent(iter, b); as = bch2_btree_update_start(iter, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE); ret = PTR_ERR_OR_ZERO(as); - if (ret) { - six_unlock_intent(&m->c.lock); + if (ret) goto err; - } trace_btree_merge(c, b); @@ -1648,6 +1662,7 @@ retry: bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); + six_lock_increment(&m->c.lock, SIX_LOCK_intent); bch2_btree_iter_node_drop(iter, b); bch2_btree_iter_node_drop(iter, m); @@ -1663,6 +1678,7 @@ retry: bch2_btree_update_done(as); out: bch2_btree_trans_verify_locks(trans); + bch2_trans_iter_free(trans, sib_iter); /* * Don't downgrade locks here: we're called after successful insert, @@ -1675,11 +1691,16 @@ out: */ return; err: - BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); + bch2_trans_iter_put(trans, sib_iter); + sib_iter = NULL; + + if (ret == -EINTR && bch2_trans_relock(trans)) { + ret = 0; + goto retry; + } if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) { - bch2_trans_unlock(trans); - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse_all(trans); if (!ret) goto retry; }
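
For context on the hunks above: the heart of the rework is the new sibling lookup in __bch2_foreground_maybe_merge(), which replaces the deleted bch2_btree_node_get_sibling(). The sketch below condenses that flow using only calls that appear in the diff; the wrapper function name, its signature, and the include list are invented for illustration, and the -EINTR/retry handling the real code does at its err: label is left out.

#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_update_interior.h"

/*
 * Illustrative only - not the literal patched function body. Shows how
 * the merge path now finds its sibling with a second btree iterator.
 */
static int merge_sibling_lookup_sketch(struct btree_trans *trans,
				       struct btree_iter *iter,
				       struct btree *b, unsigned level,
				       enum btree_node_sibling sib,
				       struct btree_iter **sib_iter_out,
				       struct btree **m_out)
{
	struct btree_iter *sib_iter;
	struct btree *m;
	struct bpos sib_pos;
	int ret;

	/* The sibling's keyspace begins just before min_key / just after max_key: */
	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	/* Second iterator on the same btree, pointing into the sibling at @level: */
	sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
					    sib_pos, U8_MAX, level,
					    BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(sib_iter);
	if (ret) {
		bch2_trans_iter_put(trans, sib_iter);
		return ret;
	}

	m = sib_iter->l[level].b;

	/* Nodes may only be merged if they share a parent: */
	if (btree_node_parent(iter, b) != btree_node_parent(sib_iter, m)) {
		b->sib_u64s[sib] = U16_MAX;
		m = NULL;
	}

	/*
	 * The caller keeps sib_iter alive so the intent lock on the sibling
	 * is held for the duration of the merge; the real code only frees
	 * it at its out:/err: labels.
	 */
	*sib_iter_out = sib_iter;
	*m_out = m;
	return 0;
}

In the patched function itself, -EINTR from either traverse causes bch2_trans_relock() and a retry, falling back to bch2_btree_iter_traverse_all() when BTREE_INSERT_NOUNLOCK isn't set - which is where the "should be more reliable" claim in the commit message comes from.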