1

for-6.10-rc2-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmZjGxEACgkQxWXV+ddt
 WDt/lQ/9G5Ieozi7BoHcBTmE5BpyRXqfQ5dj9VaWFLZEH+rvdOiZlQ5QlcI1gkUg
 jbSp7SB8qhrIbBwv2b58xgnGuuUxE2dd3tPP655bBhe7o4xvAKOK8XBVyVQQxvzq
 eRI1J4daQC08FjlSNnHiY1ozl1OIOTSVllLBqkhkYOaBbIKbMByCVPpj4cwzqAvC
 5iBCmL2RalAQ/ZasYwdBid2L2VD4dWGJ9uHY6pi+i9f/Q7nxGRlXqhHo25J5T2Gl
 YE+75fS1lN9hQIkhZRlpiqOdpjJcxkkGJ0elXtRQ0t856lXUAOacIN+HW6aaLxj/
 qv8tnaZO1M/Aw8t65N/8m/ktnBQEwXeX3rAcTpvaMLKAQRGP1oZdsyN5YzLpEi0h
 0Y+yXyFdOYbTJBrGie+kRyh9RoRY3CLcQgRw/2u7ysRjBqdXPcaGXAH4ydE/67Qy
 hdD9DVwDkjzmxh/V4o8raIM+b3Ql4FuBR0X93f8A0LeanJUlgCrc4lokgdA1tPEf
 i/+epnnlEnBOpS4ht5PFn8FkpzeuKAR63AM5sKL18DYswpP5oCyyh76zMMoBh33k
 JFEwVSUDW8AH6YSMAA5eLp9bGTDOGYyvNHKBRXAv5GHQkBXq4ahjZEUh2b/r1903
 FzivbyRtUsxEh62SCsXCZqKlb1BhXRkhSqdbBFbYsKYmVEkylQo=
 =7rUb
 -----END PGP SIGNATURE-----

Merge tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - fix handling of folio private changes.

   The private value holds pointer to our extent buffer structure
   representing a metadata range. Release and create of the range was
   not properly synchronized when updating the private bit which ended
   up in double folio_put, leading to all sorts of breakage

 - fix a crash, reported as duplicate key in metadata, but caused by a
   race of fsync and size extending write. Requires prealloc target
   range + fsync and other conditions (log tree state, timing)

 - fix leak of qgroup extent records after transaction abort

* tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: protect folio::private when attaching extent buffer folios
  btrfs: fix leak of qgroup extent records after transaction abort
  btrfs: fix crash on racing fsync and size-extending write into prealloc
This commit is contained in:
Linus Torvalds 2024-06-07 15:13:12 -07:00
commit 07978330e6
3 changed files with 43 additions and 44 deletions

View File

@ -4538,18 +4538,10 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info)
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
struct btrfs_delayed_ref_node *ref;
delayed_refs = &trans->delayed_refs;
spin_lock(&delayed_refs->lock);
if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
btrfs_debug(fs_info, "delayed_refs has NO entry");
return;
}
while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;

View File

@ -3689,6 +3689,8 @@ static struct extent_buffer *grab_extent_buffer(
struct folio *folio = page_folio(page);
struct extent_buffer *exists;
lockdep_assert_held(&page->mapping->i_private_lock);
/*
* For subpage case, we completely rely on radix tree to ensure we
* don't try to insert two ebs for the same bytenr. So here we always
@ -3756,13 +3758,14 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
struct btrfs_subpage *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
const unsigned long index = eb->start >> PAGE_SHIFT;
struct folio *existing_folio;
struct folio *existing_folio = NULL;
int ret;
ASSERT(found_eb_ret);
@ -3774,12 +3777,14 @@ retry:
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
return 0;
goto finish;
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
if (IS_ERR(existing_folio))
if (IS_ERR(existing_folio)) {
existing_folio = NULL;
goto retry;
}
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@ -3790,14 +3795,13 @@ retry:
return -EAGAIN;
}
if (fs_info->nodesize < PAGE_SIZE) {
/*
* We're going to reuse the existing page, can drop our page
* and subpage structure now.
*/
finish:
spin_lock(&mapping->i_private_lock);
if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
} else {
} else if (existing_folio) {
struct extent_buffer *existing_eb;
existing_eb = grab_extent_buffer(fs_info,
@ -3805,6 +3809,7 @@ retry:
if (existing_eb) {
/* The extent buffer still exists, we can use it directly. */
*found_eb_ret = existing_eb;
spin_unlock(&mapping->i_private_lock);
folio_unlock(existing_folio);
folio_put(existing_folio);
return 1;
@ -3813,6 +3818,22 @@ retry:
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
}
eb->folio_size = folio_size(eb->folios[i]);
eb->folio_shift = folio_shift(eb->folios[i]);
/* Should not fail, as we have preallocated the memory. */
ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc);
ASSERT(!ret);
/*
* To inform we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
* eb hasn't been inserted into radix tree yet.
*
* The ref will be decreased when the eb releases the page, in
* detach_extent_buffer_page(). Thus needs no special handling in the
* error path.
*/
btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]);
spin_unlock(&mapping->i_private_lock);
return 0;
}
@ -3824,7 +3845,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
@ -3890,7 +3910,7 @@ reallocate:
for (int i = 0; i < num_folios; i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
if (ret > 0) {
ASSERT(existing_eb);
goto out;
@ -3927,24 +3947,6 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
eb->folio_size = folio_size(folio);
eb->folio_shift = folio_shift(folio);
spin_lock(&mapping->i_private_lock);
/* Should not fail, as we have preallocated the memory */
ret = attach_extent_buffer_folio(eb, folio, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
btrfs_folio_inc_eb_refs(fs_info, folio);
spin_unlock(&mapping->i_private_lock);
WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
/*

View File

@ -4860,18 +4860,23 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
*/
/*
* Avoid overlapping items in the log tree. The first time we
* get here, get rid of everything from a past fsync. After
* that, if the current extent starts before the end of the last
* extent we copied, truncate the last one. This can happen if
* an ordered extent completion modifies the subvolume tree
* while btrfs_next_leaf() has the tree unlocked.
*/
if (!dropped_extents || key.offset < truncate_offset) {
ret = truncate_inode_items(trans, root->log_root, inode,
truncate_offset,
min(key.offset, truncate_offset),
BTRFS_EXTENT_DATA_KEY);
if (ret)
goto out;
dropped_extents = true;
}
truncate_offset = btrfs_file_extent_end(path);
if (ins_nr == 0)
start_slot = slot;
ins_nr++;