5dfd3746b6
Btree nodes are log structured; thus, we need to emit whiteouts when we're deleting a key that's been written out to disk. k->needs_whiteout tracks whether a key will need a whiteout when it's deleted, and this requires some careful handling; e.g. the key we're deleting may not have been written out to disk, but it may have overwritten a key that was - thus we need to carry this flag around on overwrites. Invariants: There may be multiple keys for the same position in a given node (because of overwrites), but only one of them will be a live (non deleted) key, and only one key for a given position will have the needs_whiteout flag set. Additionally, we don't want to carry around whiteouts that need to be written in the main searchable part of a btree node - btree_iter_peek() will have to skip past them, and this can lead to O(n^2) issues when doing sequential deletions (e.g. inode rm/truncate). So there's a separate region in the btree node buffer for unwritten whiteouts; these are merge sorted with the rest of the keys we're writing in the btree node write path. The unwritten whiteouts area was a later optimization that bch2_sort_keys() didn't take into account; the unwritten whiteouts area means that we never have deleted keys with needs_whiteout set in the main searchable part of a btree node. That means we can simplify and optimize some sort paths, and eliminate an assertion that syzbot found: - Unless we're in the btree node write path, it's always ok to drop whiteouts when sorting - When sorting for a btree node write, we drop the whiteout if it's not from the unwritten whiteouts area, or if it's overwritten by a real key at the same position. 
This completely eliminates some tricky logic for propagating the needs_whiteout flag: syzbot was able to hit the assertion that checked that there shouldn't be more than one key at the same pos with needs_whiteout set, likely due to a combination of flipping on needs_whiteout on all written keys (they need whiteouts if overwritten), combined with not always dropping unneeded whiteouts, and the tricky logic in the sort path for preserving needs_whiteout that wasn't really needed. Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
55 lines
1.3 KiB
C
55 lines
1.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _BCACHEFS_BKEY_SORT_H
|
|
#define _BCACHEFS_BKEY_SORT_H
|
|
|
|
/*
 * Collection of key ranges handed to the sort routines declared in this
 * header.
 *
 * @b:    btree node the iterator was initialised with (see sort_iter_init())
 * @used: number of entries of @data currently filled in
 * @size: number of @data entries available; sort_iter_add() BUG()s once
 *        @used has reached it
 * @data: flexible array of (k, end) pointer pairs; the struct itself
 *        reserves no storage for it, so whoever allocates or embeds the
 *        iterator must supply room for @size entries
 *
 * NOTE(review): (k, end) is presumably a half-open range [k, end) of packed
 * keys - sort_iter_add() treats k == end as empty - confirm against the sort
 * implementation.
 */
struct sort_iter {
	struct btree		*b;
	unsigned		used;
	unsigned		size;

	struct sort_iter_set {
		struct bkey_packed *k, *end;
	} data[];
};
|
|
|
|
static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
|
|
{
|
|
iter->b = b;
|
|
iter->used = 0;
|
|
iter->size = size;
|
|
}
|
|
|
|
struct sort_iter_stack {
|
|
struct sort_iter iter;
|
|
struct sort_iter_set sets[MAX_BSETS + 1];
|
|
};
|
|
|
|
static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
|
|
{
|
|
sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
|
|
}
|
|
|
|
static inline void sort_iter_add(struct sort_iter *iter,
|
|
struct bkey_packed *k,
|
|
struct bkey_packed *end)
|
|
{
|
|
BUG_ON(iter->used >= iter->size);
|
|
|
|
if (k != end)
|
|
iter->data[iter->used++] = (struct sort_iter_set) { k, end };
|
|
}
|
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably sorts the ranges held in the sort_iter into the
 * given bset while resolving keys that overlap, returning key counts for the
 * result - confirm against the definition.
 */
struct btree_nr_keys
bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
			      struct sort_iter *);
|
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably copies keys from the node iterator into the given
 * bset, repacking them with the supplied bkey_format; the meaning of the
 * trailing bool is not visible from here - confirm against the definition.
 */
struct btree_nr_keys
bch2_sort_repack(struct bset *, struct btree *,
		 struct btree_node_iter *,
		 struct bkey_format *, bool);
|
|
|
|
/*
 * Defined outside this header.
 * NOTE(review): judging by the name, this is the sort variant that retains
 * whiteouts coming from the unwritten whiteouts area (i.e. the one meant for
 * the btree node write path); the unit of the unsigned return value is not
 * visible from here - confirm against the definition.
 */
unsigned bch2_sort_keys_keep_unwritten_whiteouts(struct bkey_packed *, struct sort_iter *);
|
|
/*
 * Defined outside this header.
 * NOTE(review): presumably sorts the ranges held in the sort_iter into the
 * buffer given by the first argument, dropping whiteouts; the unit of the
 * unsigned return value is not visible from here - confirm against the
 * definition.
 */
unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *);
|
|
|
|
#endif /* _BCACHEFS_BKEY_SORT_H */
|