bcachefs fixes for 6.11-rc6
- Fix a rare data corruption in the rebalance path, caught as a nonce inconsistency on encrypted filesystems - Revert lockless buffered write path - Mark more errors as autofix -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmbT0+8ACgkQE6szbY3K bnafeg/+KQroY9Ig1Rn9qSnVKZOjkyDeqRq8sgvfOI5exDyuqcTgM69HU6HJbzzk wCFwVNoscx0PMMrHMLtnVKohevGnATHXqCMz0tZ1YIslFlPsHlQToYfDmae3keZQ ZX6crRCxIGxXUfx5VVf8tPn02ZFEqTkilHoZteCzp24w5d6dpjtlJwYzCJ5k+gTK 1lDcQp9IerwbbbFAvg0yu3BObTG6t2aHvtE0rHJ8gzlsVeDvxhnYRPRi4QJ5lar+ Zwpcp48559j4dl3lYh6y7rU4UfHEecxSu0blKF79D8h0u4dxzu0szyDZiZluVK84 uEI4/hNVDmL6W75mRbkjzzbwJqBdgIB35FomaziJ7Z2VFlaZf5YPWWRQE28NcMD6 nKGMtEc/ryFQKffqTHupAtp9cTZBXEQE9mZGcqWLX8mr7ClVztJLmJUCvicwAwBC sUKzhWiD6HgpAJYsDvukHNJEUGN/NBa4lp3x2lUu13n0zHRZkqY0+3b9EkDrO1KE 24ueRbD3l6g1SIRZmvCjiFCSSlOm5wpqzEYKrQndAyU3fXai/mCCncFT/fqs2zJs nH7TCR9pGvW3ln0GuyZyc8+lgcdZegPalAWLHtpNzy9xQWxbn19O4mCmRGhWCbKF irtL7Pn3+EKuUnhagIOp/ImDIH9po9yX9h5PmVndeJ9Dl6YhOF0= =LTM8 -----END PGP SIGNATURE----- Merge tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs Push bcachefs fixes from Kent Overstreet: "The data corruption in the buffered write path is troubling; inode lock should not have been able to cause that... - Fix a rare data corruption in the rebalance path, caught as a nonce inconsistency on encrypted filesystems - Revert lockless buffered write path - Mark more errors as autofix" * tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs: bcachefs: Mark more errors as autofix bcachefs: Revert lockless buffered IO path bcachefs: Fix bch2_extents_match() false positive bcachefs: Fix failure to return error in data_update_index_update()
This commit is contained in:
commit
a4c763129f
@ -337,6 +337,7 @@ restart_drop_extra_replicas:
|
||||
printbuf_exit(&buf);
|
||||
|
||||
bch2_fatal_error(c);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -257,7 +257,6 @@
|
||||
x(BCH_ERR_nopromote, nopromote_in_flight) \
|
||||
x(BCH_ERR_nopromote, nopromote_no_writes) \
|
||||
x(BCH_ERR_nopromote, nopromote_enomem) \
|
||||
x(0, need_inode_lock) \
|
||||
x(0, invalid_snapshot_node) \
|
||||
x(0, option_needs_open_fs)
|
||||
|
||||
|
@ -929,8 +929,29 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
|
||||
bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
|
||||
if (p1.ptr.dev == p2.ptr.dev &&
|
||||
p1.ptr.gen == p2.ptr.gen &&
|
||||
|
||||
/*
|
||||
* This checks that the two pointers point
|
||||
* to the same region on disk - adjusting
|
||||
* for the difference in where the extents
|
||||
* start, since one may have been trimmed:
|
||||
*/
|
||||
(s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
|
||||
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
|
||||
(s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) &&
|
||||
|
||||
/*
|
||||
* This additionally checks that the
|
||||
* extents overlap on disk, since the
|
||||
* previous check may trigger spuriously
|
||||
* when one extent is immediately partially
|
||||
* overwritten with another extent (so that
|
||||
* on disk they are adjacent) and
|
||||
* compression is in use:
|
||||
*/
|
||||
((p1.ptr.offset >= p2.ptr.offset &&
|
||||
p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) ||
|
||||
(p2.ptr.offset >= p1.ptr.offset &&
|
||||
p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size)))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
@ -802,8 +802,7 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
|
||||
static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
struct address_space *mapping,
|
||||
struct iov_iter *iter,
|
||||
loff_t pos, unsigned len,
|
||||
bool inode_locked)
|
||||
loff_t pos, unsigned len)
|
||||
{
|
||||
struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
||||
struct bch2_folio_reservation res;
|
||||
@ -827,15 +826,6 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
|
||||
BUG_ON(!fs.nr);
|
||||
|
||||
/*
|
||||
* If we're not using the inode lock, we need to lock all the folios for
|
||||
* atomiticity of writes vs. other writes:
|
||||
*/
|
||||
if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
|
||||
ret = -BCH_ERR_need_inode_lock;
|
||||
goto out;
|
||||
}
|
||||
|
||||
f = darray_first(fs);
|
||||
if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
|
||||
ret = bch2_read_single_folio(f, mapping);
|
||||
@ -932,10 +922,8 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
|
||||
end = pos + copied;
|
||||
|
||||
spin_lock(&inode->v.i_lock);
|
||||
if (end > inode->v.i_size) {
|
||||
BUG_ON(!inode_locked);
|
||||
if (end > inode->v.i_size)
|
||||
i_size_write(&inode->v, end);
|
||||
}
|
||||
spin_unlock(&inode->v.i_lock);
|
||||
|
||||
f_pos = pos;
|
||||
@ -979,68 +967,12 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
loff_t pos;
|
||||
bool inode_locked = false;
|
||||
ssize_t written = 0, written2 = 0, ret = 0;
|
||||
|
||||
/*
|
||||
* We don't take the inode lock unless i_size will be changing. Folio
|
||||
* locks provide exclusion with other writes, and the pagecache add lock
|
||||
* provides exclusion with truncate and hole punching.
|
||||
*
|
||||
* There is one nasty corner case where atomicity would be broken
|
||||
* without great care: when copying data from userspace to the page
|
||||
* cache, we do that with faults disable - a page fault would recurse
|
||||
* back into the filesystem, taking filesystem locks again, and
|
||||
* deadlock; so it's done with faults disabled, and we fault in the user
|
||||
* buffer when we aren't holding locks.
|
||||
*
|
||||
* If we do part of the write, but we then race and in the userspace
|
||||
* buffer have been evicted and are no longer resident, then we have to
|
||||
* drop our folio locks to re-fault them in, breaking write atomicity.
|
||||
*
|
||||
* To fix this, we restart the write from the start, if we weren't
|
||||
* holding the inode lock.
|
||||
*
|
||||
* There is another wrinkle after that; if we restart the write from the
|
||||
* start, and then get an unrecoverable error, we _cannot_ claim to
|
||||
* userspace that we did not write data we actually did - so we must
|
||||
* track (written2) the most we ever wrote.
|
||||
*/
|
||||
|
||||
if ((iocb->ki_flags & IOCB_APPEND) ||
|
||||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
|
||||
inode_lock(&inode->v);
|
||||
inode_locked = true;
|
||||
}
|
||||
|
||||
ret = generic_write_checks(iocb, iter);
|
||||
if (ret <= 0)
|
||||
goto unlock;
|
||||
|
||||
ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
|
||||
if (ret) {
|
||||
if (!inode_locked) {
|
||||
inode_lock(&inode->v);
|
||||
inode_locked = true;
|
||||
ret = file_remove_privs_flags(file, 0);
|
||||
}
|
||||
if (ret)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
ret = file_update_time(file);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
pos = iocb->ki_pos;
|
||||
loff_t pos = iocb->ki_pos;
|
||||
ssize_t written = 0;
|
||||
int ret = 0;
|
||||
|
||||
bch2_pagecache_add_get(inode);
|
||||
|
||||
if (!inode_locked &&
|
||||
(iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
|
||||
goto get_inode_lock;
|
||||
|
||||
do {
|
||||
unsigned offset = pos & (PAGE_SIZE - 1);
|
||||
unsigned bytes = iov_iter_count(iter);
|
||||
@ -1065,17 +997,12 @@ again:
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
|
||||
goto get_inode_lock;
|
||||
|
||||
if (unlikely(fatal_signal_pending(current))) {
|
||||
ret = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
|
||||
if (ret == -BCH_ERR_need_inode_lock)
|
||||
goto get_inode_lock;
|
||||
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
|
||||
if (unlikely(ret < 0))
|
||||
break;
|
||||
|
||||
@ -1096,46 +1023,50 @@ again:
|
||||
}
|
||||
pos += ret;
|
||||
written += ret;
|
||||
written2 = max(written, written2);
|
||||
|
||||
if (ret != bytes && !inode_locked)
|
||||
goto get_inode_lock;
|
||||
ret = 0;
|
||||
|
||||
balance_dirty_pages_ratelimited(mapping);
|
||||
|
||||
if (0) {
|
||||
get_inode_lock:
|
||||
bch2_pagecache_add_put(inode);
|
||||
inode_lock(&inode->v);
|
||||
inode_locked = true;
|
||||
bch2_pagecache_add_get(inode);
|
||||
|
||||
iov_iter_revert(iter, written);
|
||||
pos -= written;
|
||||
written = 0;
|
||||
ret = 0;
|
||||
}
|
||||
} while (iov_iter_count(iter));
|
||||
|
||||
bch2_pagecache_add_put(inode);
|
||||
unlock:
|
||||
if (inode_locked)
|
||||
inode_unlock(&inode->v);
|
||||
|
||||
iocb->ki_pos += written;
|
||||
|
||||
ret = max(written, written2) ?: ret;
|
||||
if (ret > 0)
|
||||
ret = generic_write_sync(iocb, ret);
|
||||
return ret;
|
||||
return written ? written : ret;
|
||||
}
|
||||
|
||||
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
|
||||
ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
ssize_t ret = iocb->ki_flags & IOCB_DIRECT
|
||||
? bch2_direct_write(iocb, iter)
|
||||
: bch2_buffered_write(iocb, iter);
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct bch_inode_info *inode = file_bch_inode(file);
|
||||
ssize_t ret;
|
||||
|
||||
if (iocb->ki_flags & IOCB_DIRECT) {
|
||||
ret = bch2_direct_write(iocb, from);
|
||||
goto out;
|
||||
}
|
||||
|
||||
inode_lock(&inode->v);
|
||||
|
||||
ret = generic_write_checks(iocb, from);
|
||||
if (ret <= 0)
|
||||
goto unlock;
|
||||
|
||||
ret = file_remove_privs(file);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
ret = file_update_time(file);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
ret = bch2_buffered_write(iocb, from);
|
||||
if (likely(ret > 0))
|
||||
iocb->ki_pos += ret;
|
||||
unlock:
|
||||
inode_unlock(&inode->v);
|
||||
|
||||
if (ret > 0)
|
||||
ret = generic_write_sync(iocb, ret);
|
||||
out:
|
||||
return bch2_err_class(ret);
|
||||
}
|
||||
|
||||
|
@ -23,7 +23,7 @@ enum bch_fsck_flags {
|
||||
x(jset_past_bucket_end, 9, 0) \
|
||||
x(jset_seq_blacklisted, 10, 0) \
|
||||
x(journal_entries_missing, 11, 0) \
|
||||
x(journal_entry_replicas_not_marked, 12, 0) \
|
||||
x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \
|
||||
x(journal_entry_past_jset_end, 13, 0) \
|
||||
x(journal_entry_replicas_data_mismatch, 14, 0) \
|
||||
x(journal_entry_bkey_u64s_0, 15, 0) \
|
||||
@ -288,10 +288,10 @@ enum bch_fsck_flags {
|
||||
x(invalid_btree_id, 274, 0) \
|
||||
x(alloc_key_io_time_bad, 275, 0) \
|
||||
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
|
||||
x(accounting_key_junk_at_end, 277, 0) \
|
||||
x(accounting_key_replicas_nr_devs_0, 278, 0) \
|
||||
x(accounting_key_replicas_nr_required_bad, 279, 0) \
|
||||
x(accounting_key_replicas_devs_unsorted, 280, 0) \
|
||||
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
|
||||
x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \
|
||||
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
|
||||
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
|
||||
|
||||
enum bch_sb_error_id {
|
||||
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
|
||||
|
Loading…
Reference in New Issue
Block a user