2022-11-15 02:44:05 -07:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
/*
|
|
|
|
* Copyright (C) 2007 Oracle. All rights reserved.
|
|
|
|
* Copyright (C) 2022 Christoph Hellwig.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef BTRFS_BIO_H
|
|
|
|
#define BTRFS_BIO_H
|
|
|
|
|
2024-01-26 19:19:56 -07:00
|
|
|
#include <linux/types.h>
|
2022-11-15 02:44:05 -07:00
|
|
|
#include <linux/bio.h>
|
|
|
|
#include <linux/workqueue.h>
|
|
|
|
#include "tree-checker.h"
|
|
|
|
|
|
|
|
struct btrfs_bio;
|
|
|
|
struct btrfs_fs_info;
|
2024-01-26 19:19:56 -07:00
|
|
|
struct btrfs_inode;
|
2022-11-15 02:44:05 -07:00
|
|
|
|
|
|
|
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Maximum number of sectors for a single bio to limit the size of the
|
|
|
|
* checksum array. This matches the number of bio_vecs per bio and thus the
|
|
|
|
* I/O size for buffered I/O.
|
|
|
|
*/
|
|
|
|
#define BTRFS_MAX_BIO_SECTORS (256)
|
|
|
|
|
|
|
|
typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
|
|
|
|
|
|
|
|
/*
|
2023-01-20 23:49:59 -07:00
|
|
|
* High-level btrfs I/O structure. It is allocated by btrfs_bio_alloc() and
|
2024-08-26 18:40:11 -07:00
|
|
|
* passed to btrfs_submit_bbio() for mapping to the physical devices.
|
2022-11-15 02:44:05 -07:00
|
|
|
*/
|
|
|
|
struct btrfs_bio {
|
2023-03-23 02:01:20 -07:00
|
|
|
/*
|
|
|
|
* Inode and offset into it that this I/O operates on.
|
|
|
|
* Only set for data I/O.
|
|
|
|
*/
|
2023-01-20 23:50:00 -07:00
|
|
|
struct btrfs_inode *inode;
|
2022-11-15 02:44:05 -07:00
|
|
|
u64 file_offset;
|
|
|
|
|
|
|
|
union {
|
2023-01-20 23:50:13 -07:00
|
|
|
/*
|
2023-05-24 08:03:08 -07:00
|
|
|
* For data reads: checksumming and original I/O information.
|
2024-08-26 18:40:11 -07:00
|
|
|
* (for internal use in the btrfs_submit_bbio() machinery only)
|
2023-01-20 23:50:13 -07:00
|
|
|
*/
|
2022-11-15 02:44:05 -07:00
|
|
|
struct {
|
|
|
|
u8 *csum;
|
|
|
|
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
|
2023-01-20 23:50:13 -07:00
|
|
|
struct bvec_iter saved_iter;
|
2022-11-15 02:44:05 -07:00
|
|
|
};
|
|
|
|
|
2023-05-24 08:03:08 -07:00
|
|
|
/*
|
|
|
|
* For data writes:
|
2023-05-31 00:54:02 -07:00
|
|
|
* - ordered extent covering the bio
|
2023-05-24 08:03:08 -07:00
|
|
|
* - pointer to the checksums for this bio
|
|
|
|
* - original physical address from the allocator
|
|
|
|
* (for zone append only)
|
|
|
|
*/
|
|
|
|
struct {
|
2023-05-31 00:54:02 -07:00
|
|
|
struct btrfs_ordered_extent *ordered;
|
2023-05-24 08:03:08 -07:00
|
|
|
struct btrfs_ordered_sum *sums;
|
|
|
|
u64 orig_physical;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* For metadata reads: parentness verification. */
|
2022-11-15 02:44:05 -07:00
|
|
|
struct btrfs_tree_parent_check parent_check;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* End I/O information supplied to btrfs_bio_alloc */
|
|
|
|
btrfs_bio_end_io_t end_io;
|
|
|
|
void *private;
|
|
|
|
|
2023-01-20 23:49:59 -07:00
|
|
|
/* For internal use in read end I/O handling */
|
2023-01-20 23:50:14 -07:00
|
|
|
unsigned int mirror_num;
|
2023-01-20 23:50:20 -07:00
|
|
|
atomic_t pending_ios;
|
2022-11-15 02:44:05 -07:00
|
|
|
struct work_struct end_io_work;
|
|
|
|
|
2023-03-23 02:01:20 -07:00
|
|
|
/* File system that this I/O operates on. */
|
|
|
|
struct btrfs_fs_info *fs_info;
|
|
|
|
|
btrfs: fix error propagation of split bios
The purpose of btrfs_bbio_propagate_error() is to propagate an error
of a split bio to its original btrfs_bio, and to report the error to the upper
layer. However, it does not work well in some cases.
* Case 1. Immediate (or quick) end_bio with an error
When btrfs sends btrfs_bio to mirrored devices, btrfs calls
btrfs_bio_end_io() when all the mirroring bios are completed. If that
btrfs_bio was split, it is from btrfs_clone_bioset and its end_io function
is btrfs_orig_write_end_io. For this case, btrfs_bbio_propagate_error()
accesses the orig_bbio's bio context to increase the error count.
That works well in most cases. However, if the end_io is called fast
enough, orig_bbio's (remaining part after split) bio context may not be
properly set at that time. Since the bio context is set when the orig_bbio
(the last btrfs_bio) is sent to devices, that might be too late for earlier
split btrfs_bio's completion. That will result in NULL pointer
dereference.
That bug is easily reproducible by running btrfs/146 on zoned devices [1]
and it shows the following trace.
[1] You need the raid-stripe-tree feature, as it creates a "-d raid0 -m raid1" FS.
BUG: kernel NULL pointer dereference, address: 0000000000000020
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] PREEMPT SMP PTI
CPU: 1 UID: 0 PID: 13 Comm: kworker/u32:1 Not tainted 6.11.0-rc7-BTRFS-ZNS+ #474
Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
Workqueue: writeback wb_workfn (flush-btrfs-5)
RIP: 0010:btrfs_bio_end_io+0xae/0xc0 [btrfs]
BTRFS error (device dm-0): bdev /dev/mapper/error-test errs: wr 2, rd 0, flush 0, corrupt 0, gen 0
RSP: 0018:ffffc9000006f248 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffff888005a7f080 RCX: ffffc9000006f1dc
RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff888005a7f080
RBP: ffff888011dfc540 R08: 0000000000000000 R09: 0000000000000001
R10: ffffffff82e508e0 R11: 0000000000000005 R12: ffff88800ddfbe58
R13: ffff888005a7f080 R14: ffff888005a7f158 R15: ffff888005a7f158
FS: 0000000000000000(0000) GS:ffff88803ea80000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000020 CR3: 0000000002e22006 CR4: 0000000000370ef0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
<TASK>
? __die_body.cold+0x19/0x26
? page_fault_oops+0x13e/0x2b0
? _printk+0x58/0x73
? do_user_addr_fault+0x5f/0x750
? exc_page_fault+0x76/0x240
? asm_exc_page_fault+0x22/0x30
? btrfs_bio_end_io+0xae/0xc0 [btrfs]
? btrfs_log_dev_io_error+0x7f/0x90 [btrfs]
btrfs_orig_write_end_io+0x51/0x90 [btrfs]
dm_submit_bio+0x5c2/0xa50 [dm_mod]
? find_held_lock+0x2b/0x80
? blk_try_enter_queue+0x90/0x1e0
__submit_bio+0xe0/0x130
? ktime_get+0x10a/0x160
? lockdep_hardirqs_on+0x74/0x100
submit_bio_noacct_nocheck+0x199/0x410
btrfs_submit_bio+0x7d/0x150 [btrfs]
btrfs_submit_chunk+0x1a1/0x6d0 [btrfs]
? lockdep_hardirqs_on+0x74/0x100
? __folio_start_writeback+0x10/0x2c0
btrfs_submit_bbio+0x1c/0x40 [btrfs]
submit_one_bio+0x44/0x60 [btrfs]
submit_extent_folio+0x13f/0x330 [btrfs]
? btrfs_set_range_writeback+0xa3/0xd0 [btrfs]
extent_writepage_io+0x18b/0x360 [btrfs]
extent_write_locked_range+0x17c/0x340 [btrfs]
? __pfx_end_bbio_data_write+0x10/0x10 [btrfs]
run_delalloc_cow+0x71/0xd0 [btrfs]
btrfs_run_delalloc_range+0x176/0x500 [btrfs]
? find_lock_delalloc_range+0x119/0x260 [btrfs]
writepage_delalloc+0x2ab/0x480 [btrfs]
extent_write_cache_pages+0x236/0x7d0 [btrfs]
btrfs_writepages+0x72/0x130 [btrfs]
do_writepages+0xd4/0x240
? find_held_lock+0x2b/0x80
? wbc_attach_and_unlock_inode+0x12c/0x290
? wbc_attach_and_unlock_inode+0x12c/0x290
__writeback_single_inode+0x5c/0x4c0
? do_raw_spin_unlock+0x49/0xb0
writeback_sb_inodes+0x22c/0x560
__writeback_inodes_wb+0x4c/0xe0
wb_writeback+0x1d6/0x3f0
wb_workfn+0x334/0x520
process_one_work+0x1ee/0x570
? lock_is_held_type+0xc6/0x130
worker_thread+0x1d1/0x3b0
? __pfx_worker_thread+0x10/0x10
kthread+0xee/0x120
? __pfx_kthread+0x10/0x10
ret_from_fork+0x30/0x50
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Modules linked in: dm_mod btrfs blake2b_generic xor raid6_pq rapl
CR2: 0000000000000020
* Case 2. Earlier completion of orig_bbio for mirrored btrfs_bios
btrfs_bbio_propagate_error() assumes the end_io function for orig_bbio is
called last among split bios. In that case, btrfs_orig_write_end_io() sets
the bio->bi_status to BLK_STS_IOERR by seeing the bioc->error [2].
Otherwise, the increased orig_bio's bioc->error is not checked by anyone
and BLK_STS_OK is returned to the upper layer.
[2] Actually, this is not true. Because we only increase orig_bioc->errors
by max_errors, the condition "atomic_read(&bioc->error) > bioc->max_errors"
is still not met if only one split btrfs_bio fails.
* Case 3. Later completion of orig_bbio for un-mirrored btrfs_bios
In contrast to the above case, btrfs_bbio_propagate_error() is not working
well if un-mirrored orig_bbio is completed last. It sets
orig_bbio->bio.bi_status to the btrfs_bio's error. But, that is easily
over-written by orig_bbio's completion status. If the status is BLK_STS_OK,
the upper layer would not know the failure.
* Solution
Considering the above cases, we can only save the error status in the
orig_bbio (remaining part after split) itself as it is always
available. Also, the saved error status should be propagated when all the
split btrfs_bios are finished (i.e., bbio->pending_ios == 0).
This commit introduces "status" to btrfs_bio and saves the first error of
split bios to original btrfs_bio's "status" variable. When all the split
bios are finished, the saved status is loaded into original btrfs_bio's
status.
With this commit, btrfs/146 on zoned devices does not hit the NULL pointer
dereference anymore.
Fixes: 852eee62d31a ("btrfs: allow btrfs_submit_bio to split bios")
CC: stable@vger.kernel.org # 6.6+
Reviewed-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2024-10-09 06:52:06 -07:00
|
|
|
/* Save the first error status of split bio. */
|
|
|
|
blk_status_t status;
|
|
|
|
|
2022-11-15 02:44:05 -07:00
|
|
|
/*
|
|
|
|
* This member must come last: bio_alloc_bioset() will allocate enough
|
|
|
|
* bytes for the entire btrfs_bio but relies on bio being last.
|
|
|
|
*/
|
|
|
|
struct bio bio;
|
|
|
|
};
|
|
|
|
|
|
|
|
static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
|
|
|
|
{
|
|
|
|
return container_of(bio, struct btrfs_bio, bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
int __init btrfs_bioset_init(void);
|
|
|
|
void __cold btrfs_bioset_exit(void);
|
|
|
|
|
2023-03-23 02:01:20 -07:00
|
|
|
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
|
2023-01-20 23:50:21 -07:00
|
|
|
btrfs_bio_end_io_t end_io, void *private);
|
2023-03-07 09:39:44 -07:00
|
|
|
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
|
2023-03-23 02:01:20 -07:00
|
|
|
struct btrfs_fs_info *fs_info,
|
2023-03-07 09:39:44 -07:00
|
|
|
btrfs_bio_end_io_t end_io, void *private);
|
2023-05-31 00:54:02 -07:00
|
|
|
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
|
2022-11-15 02:44:05 -07:00
|
|
|
|
2023-03-26 17:49:51 -07:00
|
|
|
/* Submit using blkcg_punt_bio_submit. */
|
|
|
|
#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE
|
|
|
|
|
2024-08-26 18:40:11 -07:00
|
|
|
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
|
2023-03-19 19:12:49 -07:00
|
|
|
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
|
2022-11-15 02:44:05 -07:00
|
|
|
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
|
btrfs: migrate btrfs_repair_io_failure() to folio interfaces
[BUG]
Test case btrfs/124 fails if larger metadata folios are enabled; the
dying message looks like this:
BTRFS error (device dm-2): bad tree block start, mirror 2 want 31686656 have 0
BTRFS info (device dm-2): read error corrected: ino 0 off 31686656 (dev /dev/mapper/test-scratch2 sector 20928)
BUG: kernel NULL pointer dereference, address: 0000000000000020
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
CPU: 6 PID: 350881 Comm: btrfs Tainted: G OE 6.7.0-rc3-custom+ #128
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 2/2/2022
RIP: 0010:btrfs_read_extent_buffer+0x106/0x180 [btrfs]
PKRU: 55555554
Call Trace:
<TASK>
read_tree_block+0x33/0xb0 [btrfs]
read_block_for_search+0x23e/0x340 [btrfs]
btrfs_search_slot+0x2f9/0xe60 [btrfs]
btrfs_lookup_csum+0x75/0x160 [btrfs]
btrfs_lookup_bio_sums+0x21a/0x560 [btrfs]
btrfs_submit_chunk+0x152/0x680 [btrfs]
btrfs_submit_bio+0x1c/0x50 [btrfs]
submit_one_bio+0x40/0x80 [btrfs]
submit_extent_page+0x158/0x390 [btrfs]
btrfs_do_readpage+0x330/0x740 [btrfs]
extent_readahead+0x38d/0x6c0 [btrfs]
read_pages+0x94/0x2c0
page_cache_ra_unbounded+0x12d/0x190
relocate_file_extent_cluster+0x7c1/0x9d0 [btrfs]
relocate_block_group+0x2d3/0x560 [btrfs]
btrfs_relocate_block_group+0x2c7/0x4b0 [btrfs]
btrfs_relocate_chunk+0x4c/0x1a0 [btrfs]
btrfs_balance+0x925/0x13c0 [btrfs]
btrfs_ioctl+0x19f1/0x25d0 [btrfs]
__x64_sys_ioctl+0x90/0xd0
do_syscall_64+0x3f/0xf0
entry_SYSCALL_64_after_hwframe+0x6e/0x76
[CAUSE]
The dying line is at btrfs_repair_io_failure() call inside
btrfs_repair_eb_io_failure().
The function is still relying on the extent buffer using page sized
folios.
When the extent buffer is using a larger folio, we go into the 2nd slot of
folios[] and trigger the NULL pointer dereference.
[FIX]
Migrate btrfs_repair_io_failure() to folio interfaces.
So that when we hit a larger folio, we just submit the whole folio in
one go.
This also affects the data repair path through btrfs_end_repair_bio().
Thankfully, data is still fully page based, so we can just add an
ASSERT() and use page_folio() to convert the page to a folio.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-12-11 22:24:10 -07:00
|
|
|
u64 length, u64 logical, struct folio *folio,
|
|
|
|
unsigned int folio_offset, int mirror_num);
|
2022-11-15 02:44:05 -07:00
|
|
|
|
|
|
|
#endif
|