2018-04-03 10:16:55 -07:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
2009-03-13 07:10:06 -07:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2008 Oracle. All rights reserved.
|
|
|
|
*/
|
2018-04-03 10:16:55 -07:00
|
|
|
|
|
|
|
#ifndef BTRFS_DELAYED_REF_H
|
|
|
|
#define BTRFS_DELAYED_REF_H
|
2009-03-13 07:10:06 -07:00
|
|
|
|
2024-01-26 20:31:30 -07:00
|
|
|
#include <linux/types.h>
|
2017-03-03 01:55:15 -07:00
|
|
|
#include <linux/refcount.h>
|
2024-01-26 20:31:30 -07:00
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <uapi/linux/btrfs_tree.h>
|
|
|
|
|
|
|
|
struct btrfs_trans_handle;
|
|
|
|
struct btrfs_fs_info;
|
2017-03-03 01:55:15 -07:00
|
|
|
|
2012-09-20 17:21:01 -07:00
|
|
|
/*
 * Possible values of struct btrfs_delayed_ref_node->action.
 *
 * Numbering starts at 1, so 0 is not a valid action.
 */
enum btrfs_delayed_ref_action {
	/* Add one backref to the tree */
	BTRFS_ADD_DELAYED_REF = 1,
	/* Delete one backref from the tree */
	BTRFS_DROP_DELAYED_REF,
	/* Record a full extent allocation */
	BTRFS_ADD_DELAYED_EXTENT,
	/* Not changing ref count on head ref */
	BTRFS_UPDATE_DELAYED_HEAD,
} __packed;
|
2009-03-13 07:10:06 -07:00
|
|
|
|
2024-04-12 21:11:22 -07:00
|
|
|
struct btrfs_data_ref {
|
|
|
|
/* For EXTENT_DATA_REF */
|
2024-04-12 14:16:40 -07:00
|
|
|
|
2024-04-12 21:11:22 -07:00
|
|
|
/* Inode which refers to this data extent */
|
2024-04-12 14:16:40 -07:00
|
|
|
u64 objectid;
|
2024-04-12 21:11:22 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* file_offset - extent_offset
|
|
|
|
*
|
|
|
|
* file_offset is the key.offset of the EXTENT_DATA key.
|
|
|
|
* extent_offset is btrfs_file_extent_offset() of the EXTENT_DATA data.
|
|
|
|
*/
|
2024-04-12 14:16:40 -07:00
|
|
|
u64 offset;
|
|
|
|
};
|
|
|
|
|
2024-04-12 21:11:22 -07:00
|
|
|
/*
 * Tree-block specific part of a delayed ref (embedded in
 * struct btrfs_delayed_ref_node).
 */
struct btrfs_tree_ref {
	/*
	 * Level of this tree block.
	 *
	 * Shared for skinny (TREE_BLOCK_REF) and normal tree ref.
	 */
	int level;

	/* For non-skinny metadata, no special member needed */
};
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
struct btrfs_delayed_ref_node {
|
2017-10-19 11:16:00 -07:00
|
|
|
struct rb_node ref_node;
|
btrfs: improve delayed refs iterations
This issue was found when I tried to delete a heavily reflinked file,
when deleting such files, other transaction operation will not have a
chance to make progress, for example, start_transaction() will blocked
in wait_current_trans(root) for long time, sometimes it even triggers
soft lockups, and the time taken to delete such heavily reflinked file
is also very large, often hundreds of seconds. Using perf top, it reports
that:
PerfTop: 7416 irqs/sec kernel:99.8% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
---------------------------------------------------------------------------------------
84.37% [btrfs] [k] __btrfs_run_delayed_refs.constprop.80
11.02% [kernel] [k] delay_tsc
0.79% [kernel] [k] _raw_spin_unlock_irq
0.78% [kernel] [k] _raw_spin_unlock_irqrestore
0.45% [kernel] [k] do_raw_spin_lock
0.18% [kernel] [k] __slab_alloc
It seems __btrfs_run_delayed_refs() took most cpu time, after some debug
work, I found it's select_delayed_ref() causing this issue, for a delayed
head, in our case, it'll be full of BTRFS_DROP_DELAYED_REF nodes, but
select_delayed_ref() will firstly try to iterate node list to find
BTRFS_ADD_DELAYED_REF nodes, obviously it's a disaster in this case, and
waste much time.
To fix this issue, we introduce a new ref_add_list in struct btrfs_delayed_ref_head,
then in select_delayed_ref(), if this list is not empty, we can directly use
nodes in this list. With this patch, it just took about 10~15 seconds to
delte the same file. Now using perf top, it reports that:
PerfTop: 2734 irqs/sec kernel:99.5% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
----------------------------------------------------------------------------------------
20.74% [kernel] [k] _raw_spin_unlock_irqrestore
16.33% [kernel] [k] __slab_alloc
5.41% [kernel] [k] lock_acquired
4.42% [kernel] [k] lock_acquire
4.05% [kernel] [k] lock_release
3.37% [kernel] [k] _raw_spin_unlock_irq
For normal files, this patch also gives help, at least we do not need to
iterate whole list to found BTRFS_ADD_DELAYED_REF nodes.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-10-26 03:07:33 -07:00
|
|
|
/*
|
|
|
|
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
|
|
|
|
* ref_head->ref_add_list, then we do not need to iterate the
|
|
|
|
* whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
|
|
|
|
*/
|
|
|
|
struct list_head add_list;
|
2015-03-30 02:03:00 -07:00
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
/* the starting bytenr of the extent */
|
|
|
|
u64 bytenr;
|
|
|
|
|
|
|
|
/* the size of the extent */
|
|
|
|
u64 num_bytes;
|
|
|
|
|
2011-09-14 03:37:00 -07:00
|
|
|
/* seq number to keep track of insertion order */
|
|
|
|
u64 seq;
|
|
|
|
|
2024-04-12 19:57:13 -07:00
|
|
|
/* The ref_root for this ref */
|
|
|
|
u64 ref_root;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The parent for this ref, if this isn't set the ref_root is the
|
|
|
|
* reference owner.
|
|
|
|
*/
|
|
|
|
u64 parent;
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
/* ref count on this data structure */
|
2017-03-03 01:55:15 -07:00
|
|
|
refcount_t refs;
|
2009-03-13 07:10:06 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* how many refs is this entry adding or deleting. For
|
|
|
|
* head refs, this may be a negative number because it is keeping
|
|
|
|
* track of the total mods done to the reference count.
|
|
|
|
* For individual refs, this will always be a positive number
|
|
|
|
*
|
|
|
|
* It may be more than one, since it is possible for a single
|
|
|
|
* parent to have more than one ref on an extent
|
|
|
|
*/
|
|
|
|
int ref_mod;
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 07:45:14 -07:00
|
|
|
unsigned int action:8;
|
|
|
|
unsigned int type:8;
|
2024-04-12 14:16:40 -07:00
|
|
|
|
|
|
|
union {
|
2024-04-12 21:11:22 -07:00
|
|
|
struct btrfs_tree_ref tree_ref;
|
|
|
|
struct btrfs_data_ref data_ref;
|
2024-04-12 14:16:40 -07:00
|
|
|
};
|
2009-03-13 07:10:06 -07:00
|
|
|
};
|
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 07:45:14 -07:00
|
|
|
struct btrfs_delayed_extent_op {
|
|
|
|
struct btrfs_disk_key key;
|
2015-11-30 08:51:29 -07:00
|
|
|
bool update_key;
|
|
|
|
bool update_flags;
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 07:45:14 -07:00
|
|
|
u64 flags_to_set;
|
|
|
|
};
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
/*
|
|
|
|
* the head refs are used to hold a lock on a given extent, which allows us
|
|
|
|
* to make sure that only one process is running the delayed refs
|
|
|
|
* at a time for a single extent. They also store the sum of all the
|
|
|
|
* reference count modifications we've queued up.
|
|
|
|
*/
|
|
|
|
struct btrfs_delayed_ref_head {
|
2017-09-29 12:43:57 -07:00
|
|
|
u64 bytenr;
|
|
|
|
u64 num_bytes;
|
btrfs: reorder some members of struct btrfs_delayed_ref_head
Currently struct delayed_ref_head has its 'bytenr' and 'href_node' members
in different cache lines (even on a release, non-debug, kernel). This is
not optimal because when iterating the red black tree of delayed ref heads
for inserting a new delayed ref head (htree_insert()) we have to pull in 2
cache lines of delayed ref heads we find in a path, one for the tree node
(struct rb_node) and another one for the 'bytenr' field. The same applies
when searching for an existing delayed ref head (find_ref_head()).
On a release (non-debug) kernel, the structure also has two 4 bytes holes,
which makes it 8 bytes longer than necessary. Its current layout is the
following:
struct btrfs_delayed_ref_head {
u64 bytenr; /* 0 8 */
u64 num_bytes; /* 8 8 */
refcount_t refs; /* 16 4 */
/* XXX 4 bytes hole, try to pack */
struct mutex mutex; /* 24 32 */
spinlock_t lock; /* 56 4 */
/* XXX 4 bytes hole, try to pack */
/* --- cacheline 1 boundary (64 bytes) --- */
struct rb_root_cached ref_tree; /* 64 16 */
struct list_head ref_add_list; /* 80 16 */
struct rb_node href_node __attribute__((__aligned__(8))); /* 96 24 */
struct btrfs_delayed_extent_op * extent_op; /* 120 8 */
/* --- cacheline 2 boundary (128 bytes) --- */
int total_ref_mod; /* 128 4 */
int ref_mod; /* 132 4 */
unsigned int must_insert_reserved:1; /* 136: 0 4 */
unsigned int is_data:1; /* 136: 1 4 */
unsigned int is_system:1; /* 136: 2 4 */
unsigned int processing:1; /* 136: 3 4 */
/* size: 144, cachelines: 3, members: 15 */
/* sum members: 128, holes: 2, sum holes: 8 */
/* sum bitfield members: 4 bits (0 bytes) */
/* padding: 4 */
/* bit_padding: 28 bits */
/* forced alignments: 1 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));
This change reorders the 'href_node' and 'refs' members so that we have
the 'href_node' in the same cache line as the 'bytenr' field, while also
eliminating the two holes and reducing the structure size from 144 bytes
down to 136 bytes, so we can now have 30 ref heads per 4K page (on x86_64)
instead of 28. The new structure layout after this change is now:
struct btrfs_delayed_ref_head {
u64 bytenr; /* 0 8 */
u64 num_bytes; /* 8 8 */
struct rb_node href_node __attribute__((__aligned__(8))); /* 16 24 */
struct mutex mutex; /* 40 32 */
/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
refcount_t refs; /* 72 4 */
spinlock_t lock; /* 76 4 */
struct rb_root_cached ref_tree; /* 80 16 */
struct list_head ref_add_list; /* 96 16 */
struct btrfs_delayed_extent_op * extent_op; /* 112 8 */
int total_ref_mod; /* 120 4 */
int ref_mod; /* 124 4 */
/* --- cacheline 2 boundary (128 bytes) --- */
unsigned int must_insert_reserved:1; /* 128: 0 4 */
unsigned int is_data:1; /* 128: 1 4 */
unsigned int is_system:1; /* 128: 2 4 */
unsigned int processing:1; /* 128: 3 4 */
/* size: 136, cachelines: 3, members: 15 */
/* padding: 4 */
/* bit_padding: 28 bits */
/* forced alignments: 1 */
/* last cacheline: 8 bytes */
} __attribute__((__aligned__(8)));
Running the following fs_mark test shows some significant improvement.
$ cat test.sh
#!/bin/bash
# 15G null block device
DEV=/dev/nullb0
MNT=/mnt/nullb0
FILES=100000
THREADS=$(nproc --all)
FILE_SIZE=0
echo "performance" | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $DEV
mount -o ssd $DEV $MNT
OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k"
for ((i = 1; i <= $THREADS; i++)); do
OPTS="$OPTS -d $MNT/d$i"
done
fs_mark $OPTS
umount $MNT
Before this change:
FSUse% Count Size Files/sec App Overhead
10 1200000 0 112631.3 11928055
16 2400000 0 189943.8 12140777
23 3600000 0 150719.2 13178480
50 4800000 0 99137.3 12504293
53 6000000 0 111733.9 12670836
Total files/sec: 664165.5
After this change:
FSUse% Count Size Files/sec App Overhead
10 1200000 0 148589.5 11565889
16 2400000 0 227743.8 11561596
23 3600000 0 191590.5 12550755
30 4800000 0 179812.3 12629610
53 6000000 0 92471.4 12352383
Total files/sec: 840207.5
Measuring the execution times of htree_insert(), in nanoseconds, during
those fs_mark runs:
Before this change:
Range: 0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231
Percentiles: 90th: 980.000; 95th: 1208.000; 99th: 2090.000
0.000 - 6.384: 257 |
6.384 - 26.259: 977 |
26.259 - 99.635: 4963 |
99.635 - 370.526: 136800 #############
370.526 - 1370.603: 566110 #####################################################
1370.603 - 5062.704: 24945 ##
5062.704 - 18693.248: 944 |
18693.248 - 69014.670: 211 |
69014.670 - 254791.959: 30 |
254791.959 - 940647.000: 4 |
After this change:
Range: 0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422
Percentiles: 90th: 918.000; 95th: 1113.000; 99th: 1987.000
0.000 - 5.585: 163 |
5.585 - 20.678: 452 |
20.678 - 70.369: 1806 |
70.369 - 233.965: 26268 ####
233.965 - 772.564: 333519 #####################################################
772.564 - 2545.771: 91820 ###############
2545.771 - 8383.615: 2238 |
8383.615 - 27603.280: 170 |
27603.280 - 90879.297: 68 |
90879.297 - 299200.000: 12 |
Mean, percentiles, maximum times are all better, as well as a lower
standard deviation.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-05-29 08:16:57 -07:00
|
|
|
/*
|
|
|
|
* For insertion into struct btrfs_delayed_ref_root::href_root.
|
|
|
|
* Keep it in the same cache line as 'bytenr' for more efficient
|
|
|
|
* searches in the rbtree.
|
|
|
|
*/
|
|
|
|
struct rb_node href_node;
|
2009-03-13 07:10:06 -07:00
|
|
|
/*
|
|
|
|
* the mutex is held while running the refs, and it is also
|
|
|
|
* held when checking the sum of reference modifications.
|
|
|
|
*/
|
|
|
|
struct mutex mutex;
|
|
|
|
|
btrfs: reorder some members of struct btrfs_delayed_ref_head
Currently struct delayed_ref_head has its 'bytenr' and 'href_node' members
in different cache lines (even on a release, non-debug, kernel). This is
not optimal because when iterating the red black tree of delayed ref heads
for inserting a new delayed ref head (htree_insert()) we have to pull in 2
cache lines of delayed ref heads we find in a path, one for the tree node
(struct rb_node) and another one for the 'bytenr' field. The same applies
when searching for an existing delayed ref head (find_ref_head()).
On a release (non-debug) kernel, the structure also has two 4 bytes holes,
which makes it 8 bytes longer than necessary. Its current layout is the
following:
struct btrfs_delayed_ref_head {
u64 bytenr; /* 0 8 */
u64 num_bytes; /* 8 8 */
refcount_t refs; /* 16 4 */
/* XXX 4 bytes hole, try to pack */
struct mutex mutex; /* 24 32 */
spinlock_t lock; /* 56 4 */
/* XXX 4 bytes hole, try to pack */
/* --- cacheline 1 boundary (64 bytes) --- */
struct rb_root_cached ref_tree; /* 64 16 */
struct list_head ref_add_list; /* 80 16 */
struct rb_node href_node __attribute__((__aligned__(8))); /* 96 24 */
struct btrfs_delayed_extent_op * extent_op; /* 120 8 */
/* --- cacheline 2 boundary (128 bytes) --- */
int total_ref_mod; /* 128 4 */
int ref_mod; /* 132 4 */
unsigned int must_insert_reserved:1; /* 136: 0 4 */
unsigned int is_data:1; /* 136: 1 4 */
unsigned int is_system:1; /* 136: 2 4 */
unsigned int processing:1; /* 136: 3 4 */
/* size: 144, cachelines: 3, members: 15 */
/* sum members: 128, holes: 2, sum holes: 8 */
/* sum bitfield members: 4 bits (0 bytes) */
/* padding: 4 */
/* bit_padding: 28 bits */
/* forced alignments: 1 */
/* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));
This change reorders the 'href_node' and 'refs' members so that we have
the 'href_node' in the same cache line as the 'bytenr' field, while also
eliminating the two holes and reducing the structure size from 144 bytes
down to 136 bytes, so we can now have 30 ref heads per 4K page (on x86_64)
instead of 28. The new structure layout after this change is now:
struct btrfs_delayed_ref_head {
u64 bytenr; /* 0 8 */
u64 num_bytes; /* 8 8 */
struct rb_node href_node __attribute__((__aligned__(8))); /* 16 24 */
struct mutex mutex; /* 40 32 */
/* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
refcount_t refs; /* 72 4 */
spinlock_t lock; /* 76 4 */
struct rb_root_cached ref_tree; /* 80 16 */
struct list_head ref_add_list; /* 96 16 */
struct btrfs_delayed_extent_op * extent_op; /* 112 8 */
int total_ref_mod; /* 120 4 */
int ref_mod; /* 124 4 */
/* --- cacheline 2 boundary (128 bytes) --- */
unsigned int must_insert_reserved:1; /* 128: 0 4 */
unsigned int is_data:1; /* 128: 1 4 */
unsigned int is_system:1; /* 128: 2 4 */
unsigned int processing:1; /* 128: 3 4 */
/* size: 136, cachelines: 3, members: 15 */
/* padding: 4 */
/* bit_padding: 28 bits */
/* forced alignments: 1 */
/* last cacheline: 8 bytes */
} __attribute__((__aligned__(8)));
Running the following fs_mark test shows some significant improvement.
$ cat test.sh
#!/bin/bash
# 15G null block device
DEV=/dev/nullb0
MNT=/mnt/nullb0
FILES=100000
THREADS=$(nproc --all)
FILE_SIZE=0
echo "performance" | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $DEV
mount -o ssd $DEV $MNT
OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k"
for ((i = 1; i <= $THREADS; i++)); do
OPTS="$OPTS -d $MNT/d$i"
done
fs_mark $OPTS
umount $MNT
Before this change:
FSUse% Count Size Files/sec App Overhead
10 1200000 0 112631.3 11928055
16 2400000 0 189943.8 12140777
23 3600000 0 150719.2 13178480
50 4800000 0 99137.3 12504293
53 6000000 0 111733.9 12670836
Total files/sec: 664165.5
After this change:
FSUse% Count Size Files/sec App Overhead
10 1200000 0 148589.5 11565889
16 2400000 0 227743.8 11561596
23 3600000 0 191590.5 12550755
30 4800000 0 179812.3 12629610
53 6000000 0 92471.4 12352383
Total files/sec: 840207.5
Measuring the execution times of htree_insert(), in nanoseconds, during
those fs_mark runs:
Before this change:
Range: 0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231
Percentiles: 90th: 980.000; 95th: 1208.000; 99th: 2090.000
0.000 - 6.384: 257 |
6.384 - 26.259: 977 |
26.259 - 99.635: 4963 |
99.635 - 370.526: 136800 #############
370.526 - 1370.603: 566110 #####################################################
1370.603 - 5062.704: 24945 ##
5062.704 - 18693.248: 944 |
18693.248 - 69014.670: 211 |
69014.670 - 254791.959: 30 |
254791.959 - 940647.000: 4 |
After this change:
Range: 0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422
Percentiles: 90th: 918.000; 95th: 1113.000; 99th: 1987.000
0.000 - 5.585: 163 |
5.585 - 20.678: 452 |
20.678 - 70.369: 1806 |
70.369 - 233.965: 26268 ####
233.965 - 772.564: 333519 #####################################################
772.564 - 2545.771: 91820 ###############
2545.771 - 8383.615: 2238 |
8383.615 - 27603.280: 170 |
27603.280 - 90879.297: 68 |
90879.297 - 299200.000: 12 |
Mean, percentiles, maximum times are all better, as well as a lower
standard deviation.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-05-29 08:16:57 -07:00
|
|
|
refcount_t refs;
|
|
|
|
|
|
|
|
/* Protects 'ref_tree' and 'ref_add_list'. */
|
2014-01-23 07:21:38 -07:00
|
|
|
spinlock_t lock;
|
2018-08-22 12:51:50 -07:00
|
|
|
struct rb_root_cached ref_tree;
|
btrfs: improve delayed refs iterations
This issue was found when I tried to delete a heavily reflinked file,
when deleting such files, other transaction operations will not have a
chance to make progress, for example, start_transaction() will be blocked
in wait_current_trans(root) for a long time, sometimes it even triggers
soft lockups, and the time taken to delete such heavily reflinked file
is also very large, often hundreds of seconds. Using perf top, it reports
that:
PerfTop: 7416 irqs/sec kernel:99.8% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
---------------------------------------------------------------------------------------
84.37% [btrfs] [k] __btrfs_run_delayed_refs.constprop.80
11.02% [kernel] [k] delay_tsc
0.79% [kernel] [k] _raw_spin_unlock_irq
0.78% [kernel] [k] _raw_spin_unlock_irqrestore
0.45% [kernel] [k] do_raw_spin_lock
0.18% [kernel] [k] __slab_alloc
It seems __btrfs_run_delayed_refs() took most cpu time, after some debug
work, I found it's select_delayed_ref() causing this issue, for a delayed
head, in our case, it'll be full of BTRFS_DROP_DELAYED_REF nodes, but
select_delayed_ref() will firstly try to iterate node list to find
BTRFS_ADD_DELAYED_REF nodes, obviously it's a disaster in this case, and
waste much time.
To fix this issue, we introduce a new ref_add_list in struct btrfs_delayed_ref_head,
then in select_delayed_ref(), if this list is not empty, we can directly use
nodes in this list. With this patch, it just took about 10~15 seconds to
delete the same file. Now using perf top, it reports that:
PerfTop: 2734 irqs/sec kernel:99.5% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs)
----------------------------------------------------------------------------------------
20.74% [kernel] [k] _raw_spin_unlock_irqrestore
16.33% [kernel] [k] __slab_alloc
5.41% [kernel] [k] lock_acquired
4.42% [kernel] [k] lock_acquire
4.05% [kernel] [k] lock_release
3.37% [kernel] [k] _raw_spin_unlock_irq
For normal files, this patch also helps: at least we do not need to
iterate the whole list to find BTRFS_ADD_DELAYED_REF nodes.
Signed-off-by: Wang Xiaoguang <wangxg.fnst@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2016-10-26 03:07:33 -07:00
|
|
|
/* Accumulates the BTRFS_ADD_DELAYED_REF nodes of this head. */
|
|
|
|
struct list_head ref_add_list;
|
2009-03-13 07:17:05 -07:00
|
|
|
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 07:45:14 -07:00
|
|
|
struct btrfs_delayed_extent_op *extent_op;
|
2015-02-03 08:50:16 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This is used to track the final ref_mod from all the refs associated
|
|
|
|
* with this head ref, this is not adjusted as delayed refs are run,
|
|
|
|
* this is meant to track if we need to do the csum accounting or not.
|
|
|
|
*/
|
|
|
|
int total_ref_mod;
|
|
|
|
|
2017-09-29 12:43:57 -07:00
|
|
|
/*
|
|
|
|
* This is the current outstanding mod references for this bytenr. This
|
|
|
|
* is used with lookup_extent_info to get an accurate reference count
|
|
|
|
* for a bytenr, so it is adjusted as delayed refs are run so that any
|
|
|
|
* on disk reference count + ref_mod is accurate.
|
|
|
|
*/
|
|
|
|
int ref_mod;
|
|
|
|
|
2023-06-28 14:03:35 -07:00
|
|
|
/*
|
|
|
|
* The root that triggered the allocation when must_insert_reserved is
|
|
|
|
* set to true.
|
|
|
|
*/
|
|
|
|
u64 owning_root;
|
|
|
|
|
btrfs: record simple quota deltas in delayed refs
At the moment that we run delayed refs, we make the final ref-count
based decision on creating/removing extent (and metadata) items.
Therefore, it is exactly the spot to hook up simple quotas.
There are a few important subtleties to the fields we must collect to
accurately track simple quotas, particularly when removing an extent.
When removing a data extent, the ref could be in any tree (due to
reflink, for example) and so we need to recover the owning root id from
the owner ref item. When removing a metadata extent, we know the owning
root from the owner field in the header when we create the delayed ref,
so we can recover it from there.
We must also be careful to handle reservations properly to not leak
reserved space. The happy path is freeing the reservation when the
simple quota delta runs on a data extent. If that doesn't happen, due to
refs canceling out or some error, the ref head already has the
must_insert_reserved machinery to handle this, so we piggy back on that
and use it to clean up the reserved data.
Signed-off-by: Boris Burkov <boris@bur.io>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-06-28 11:00:15 -07:00
|
|
|
/*
|
|
|
|
* Track reserved bytes when setting must_insert_reserved. On success
|
|
|
|
* or cleanup, we will need to free the reservation.
|
|
|
|
*/
|
|
|
|
u64 reserved_bytes;
|
|
|
|
|
2024-06-20 11:51:32 -07:00
|
|
|
/* Tree block level, for metadata only. */
|
|
|
|
u8 level;
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
/*
|
|
|
|
* when a new extent is allocated, it is just reserved in memory
|
|
|
|
* The actual extent isn't inserted into the extent allocation tree
|
|
|
|
* until the delayed ref is processed. must_insert_reserved is
|
|
|
|
* used to flag a delayed ref so the accounting can be updated
|
|
|
|
* when a full insert is done.
|
|
|
|
*
|
|
|
|
* It is possible the extent will be freed before it is ever
|
|
|
|
* inserted into the extent allocation tree. In this case
|
|
|
|
* we need to update the in ram accounting to properly reflect
|
|
|
|
* the free has happened.
|
|
|
|
*/
|
2023-05-29 08:17:04 -07:00
|
|
|
bool must_insert_reserved;
|
2023-06-28 14:03:35 -07:00
|
|
|
|
2023-05-29 08:17:04 -07:00
|
|
|
bool is_data;
|
|
|
|
bool is_system;
|
|
|
|
bool processing;
|
2009-03-13 07:10:06 -07:00
|
|
|
};
|
|
|
|
|
btrfs: only let one thread pre-flush delayed refs in commit
I've been running a stress test that runs 20 workers in their own
subvolume, which are running an fsstress instance with 4 threads per
worker, which is 80 total fsstress threads. In addition to this I'm
running balance in the background as well as creating and deleting
snapshots. This test takes around 12 hours to run normally, going
slower and slower as the test goes on.
The reason for this is because fsstress is running fsync sometimes, and
because we're messing with block groups we often fall through to
btrfs_commit_transaction, so will often have 20-30 threads all calling
btrfs_commit_transaction at the same time.
These all get stuck contending on the extent tree while they try to run
delayed refs during the initial part of the commit.
This is suboptimal, really because the extent tree is a single point of
failure we only want one thread acting on that tree at once to reduce
lock contention.
Fix this by making the flushing mechanism a bit operation, to make it
easy to use test_and_set_bit() in order to make sure only one task does
this initial flush.
Once we're into the transaction commit we only have one thread doing
delayed ref running, it's just this initial pre-flush that is
problematic. With this patch my stress test takes around 90 minutes to
run, instead of 12 hours.
The memory barrier is not necessary for the flushing bit as it's
ordered, unlike plain int. The transaction state accessed in
btrfs_should_end_transaction could be affected by that too as it's not
always used under transaction lock. Upon Nikolay's analysis in [1]
it's not necessary:
In should_end_transaction it's read without holding any locks. (U)
It's modified in btrfs_cleanup_transaction without holding the
fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set.
set in cleanup_transaction under fs_info->trans_lock (L)
set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_UNBLOCK under
fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this
point the transaction is finished and fs_info->running_trans is NULL (U
but irrelevant).
So by the looks of it we can have a concurrent READ race with a WRITE,
due to reads not taking a lock. In this case what we want to ensure is
we either see new or old state. I consulted with Will Deacon and he said
that in such a case we'd want to annotate the accesses to ->state with
(READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I
don't think this could happen but I imagine at some point KCSAN would
flag such an access as racy (which it is).
[1] https://lore.kernel.org/linux-btrfs/e1fd5cc1-0f28-f670-69f4-e9958b4964e6@suse.com
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add comments regarding memory barrier ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-18 12:24:20 -07:00
|
|
|
enum btrfs_delayed_ref_flags {
|
|
|
|
/* Indicate that we are flushing delayed refs for the commit */
|
|
|
|
BTRFS_DELAYED_REFS_FLUSHING,
|
|
|
|
};
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
struct btrfs_delayed_ref_root {
|
2013-10-13 21:59:45 -07:00
|
|
|
/* head ref rbtree */
|
2018-08-22 12:51:49 -07:00
|
|
|
struct rb_root_cached href_root;
|
2013-10-13 21:59:45 -07:00
|
|
|
|
2024-09-24 07:58:31 -07:00
|
|
|
/*
|
|
|
|
* Track dirty extent records.
|
|
|
|
* The keys correspond to the logical address of the extent ("bytenr")
|
|
|
|
* right shifted by fs_info->sectorsize_bits. This is both to get a more
|
|
|
|
* dense index space (optimizes xarray structure) and because indexes in
|
|
|
|
* xarrays are of "unsigned long" type, meaning they are 32 bits wide on
|
|
|
|
* 32 bits platforms, limiting the extent range to 4G which is too low
|
|
|
|
* and makes it unusable (truncated index values) on 32 bits platforms.
|
|
|
|
*/
|
2024-06-07 07:30:21 -07:00
|
|
|
struct xarray dirty_extents;
|
2015-04-15 23:34:17 -07:00
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
/* this spin lock protects the rbtree and the entries inside */
|
|
|
|
spinlock_t lock;
|
|
|
|
|
|
|
|
/* how many delayed ref updates we've queued, used by the
|
|
|
|
* throttling code
|
|
|
|
*/
|
2014-01-23 07:21:38 -07:00
|
|
|
atomic_t num_entries;
|
2009-03-13 07:10:06 -07:00
|
|
|
|
2009-03-13 07:17:05 -07:00
|
|
|
/* total number of head nodes in tree */
|
|
|
|
unsigned long num_heads;
|
|
|
|
|
|
|
|
/* total number of head nodes ready for processing */
|
|
|
|
unsigned long num_heads_ready;
|
|
|
|
|
2015-02-03 08:50:16 -07:00
|
|
|
u64 pending_csums;
|
|
|
|
|
btrfs: only let one thread pre-flush delayed refs in commit
I've been running a stress test that runs 20 workers in their own
subvolume, which are running an fsstress instance with 4 threads per
worker, which is 80 total fsstress threads. In addition to this I'm
running balance in the background as well as creating and deleting
snapshots. This test takes around 12 hours to run normally, going
slower and slower as the test goes on.
The reason for this is because fsstress is running fsync sometimes, and
because we're messing with block groups we often fall through to
btrfs_commit_transaction, so will often have 20-30 threads all calling
btrfs_commit_transaction at the same time.
These all get stuck contending on the extent tree while they try to run
delayed refs during the initial part of the commit.
This is suboptimal, really because the extent tree is a single point of
failure we only want one thread acting on that tree at once to reduce
lock contention.
Fix this by making the flushing mechanism a bit operation, to make it
easy to use test_and_set_bit() in order to make sure only one task does
this initial flush.
Once we're into the transaction commit we only have one thread doing
delayed ref running, it's just this initial pre-flush that is
problematic. With this patch my stress test takes around 90 minutes to
run, instead of 12 hours.
The memory barrier is not necessary for the flushing bit as it's
ordered, unlike plain int. The transaction state accessed in
btrfs_should_end_transaction could be affected by that too as it's not
always used under transaction lock. Upon Nikolay's analysis in [1]
it's not necessary:
In should_end_transaction it's read without holding any locks. (U)
It's modified in btrfs_cleanup_transaction without holding the
fs_info->trans_lock (U), but the STATE_ERROR flag is going to be set.
set in cleanup_transaction under fs_info->trans_lock (L)
set in btrfs_commit_trans to COMMIT_START under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_DOING under fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_UNBLOCK under
fs_info->trans_lock.(L)
set in btrfs_commit_trans to COMMIT_COMPLETED without locks but at this
point the transaction is finished and fs_info->running_trans is NULL (U
but irrelevant).
So by the looks of it we can have a concurrent READ race with a WRITE,
due to reads not taking a lock. In this case what we want to ensure is
we either see new or old state. I consulted with Will Deacon and he said
that in such a case we'd want to annotate the accesses to ->state with
(READ|WRITE)_ONCE so as to avoid a theoretical tear, in this case I
don't think this could happen but I imagine at some point KCSAN would
flag such an access as racy (which it is).
[1] https://lore.kernel.org/linux-btrfs/e1fd5cc1-0f28-f670-69f4-e9958b4964e6@suse.com
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add comments regarding memory barrier ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-12-18 12:24:20 -07:00
|
|
|
unsigned long flags;
|
2009-03-13 07:17:05 -07:00
|
|
|
|
|
|
|
u64 run_delayed_start;
|
2015-04-19 18:53:50 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* To make qgroup to skip given root.
|
2016-05-19 18:18:45 -07:00
|
|
|
* This is for snapshot, as btrfs_qgroup_inherit() will manually
|
2015-04-19 18:53:50 -07:00
|
|
|
* modify counters for snapshot and its source, so we should skip
|
|
|
|
* the snapshot in new_root/old_roots or it will get calculated twice
|
|
|
|
*/
|
|
|
|
u64 qgroup_to_skip;
|
2009-03-13 07:10:06 -07:00
|
|
|
};
|
|
|
|
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
enum btrfs_ref_type {
|
|
|
|
BTRFS_REF_NOT_SET,
|
|
|
|
BTRFS_REF_DATA,
|
|
|
|
BTRFS_REF_METADATA,
|
|
|
|
BTRFS_REF_LAST,
|
2023-09-07 16:09:40 -07:00
|
|
|
} __packed;
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
|
|
|
|
struct btrfs_ref {
|
|
|
|
enum btrfs_ref_type type;
|
2023-09-07 16:09:40 -07:00
|
|
|
enum btrfs_delayed_ref_action action;
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Whether this extent should go through qgroup record.
|
|
|
|
*
|
|
|
|
* Normally false, but for certain cases like delayed subtree scan,
|
|
|
|
* setting this flag can hugely reduce qgroup overhead.
|
|
|
|
*/
|
|
|
|
bool skip_qgroup;
|
|
|
|
|
2021-10-12 01:21:37 -07:00
|
|
|
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
|
|
|
|
/* Through which root is this modification. */
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
u64 real_root;
|
2021-10-12 01:21:37 -07:00
|
|
|
#endif
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
u64 bytenr;
|
2024-04-12 17:52:26 -07:00
|
|
|
u64 num_bytes;
|
2023-03-28 16:04:02 -07:00
|
|
|
u64 owning_root;
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
|
2024-04-12 16:37:53 -07:00
|
|
|
/*
|
|
|
|
* The root that owns the reference for this reference, this will be set
|
|
|
|
* or ->parent will be set, depending on what type of reference this is.
|
|
|
|
*/
|
|
|
|
u64 ref_root;
|
|
|
|
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
/* Bytenr of the parent tree block */
|
|
|
|
u64 parent;
|
|
|
|
union {
|
|
|
|
struct btrfs_data_ref data_ref;
|
|
|
|
struct btrfs_tree_ref tree_ref;
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
2012-11-20 19:21:28 -07:00
|
|
|
extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
|
2024-04-12 14:16:40 -07:00
|
|
|
extern struct kmem_cache *btrfs_delayed_ref_node_cachep;
|
2012-11-20 19:21:28 -07:00
|
|
|
extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
|
|
|
|
|
2017-11-02 16:21:50 -07:00
|
|
|
int __init btrfs_delayed_ref_init(void);
|
2018-02-19 09:24:18 -07:00
|
|
|
void __cold btrfs_delayed_ref_exit(void);
|
2012-11-20 19:21:28 -07:00
|
|
|
|
2023-03-21 04:13:55 -07:00
|
|
|
static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info,
|
|
|
|
int num_delayed_refs)
|
|
|
|
{
|
|
|
|
u64 num_bytes;
|
|
|
|
|
|
|
|
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have to check the mount option here because we could be enabling
|
|
|
|
* the free space tree for the first time and don't have the compat_ro
|
|
|
|
* option set yet.
|
|
|
|
*
|
|
|
|
* We need extra reservations if we have the free space tree because
|
|
|
|
* we'll have to modify that tree as well.
|
|
|
|
*/
|
|
|
|
if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
|
|
|
|
num_bytes *= 2;
|
|
|
|
|
|
|
|
return num_bytes;
|
|
|
|
}
|
|
|
|
|
btrfs: stop doing excessive space reservation for csum deletion
Currently when reserving space for deleting the csum items for a data
extent, when adding or updating a delayed ref head, we determine how
many leaves of csum items we can have and then pass that number to the
helper btrfs_calc_delayed_ref_bytes(). This helper is used for calculating
space for all tree modifications we need when running delayed references,
however the amount of space it computes is excessive for deleting csum
items because:
1) It uses btrfs_calc_insert_metadata_size() which is excessive because
we only need to delete csum items from the csum tree, we don't need
to insert any items, so btrfs_calc_metadata_size() is all we need (as
it computes space needed to delete an item);
2) If the free space tree is enabled, it doubles the amount of space,
which is pointless for csum deletion since we don't need to touch the
free space tree or any other tree other than the csum tree.
So improve on this by tracking how many csum deletions we have and using
a new helper to calculate space for csum deletions (just a wrapper around
btrfs_calc_metadata_size() with a comment). This reduces the amount of
space we need to reserve for csum deletions by a factor of 4, and it helps
reduce the number of times we have to block space reservations and have
the reclaim task enter the space flushing algorithm (flush delayed items,
flush delayed refs, etc) in order to satisfy tickets.
For example this results in a total time decrease when unlinking (or
truncating) files with many extents, as we end up having to block on space
metadata reservations less often. Example test:
$ cat test.sh
#!/bin/bash
DEV=/dev/nullb0
MNT=/mnt/test
umount $DEV &> /dev/null
mkfs.btrfs -f $DEV
# Use compression to quickly create files with a lot of extents
# (each with a size of 128K).
mount -o compress=lzo $DEV $MNT
# 120G gives at least 983040 extents with a size of 128K.
xfs_io -f -c "pwrite -S 0xab -b 1M 0 120G" $MNT/foobar
# Flush all delalloc and clear all metadata from memory.
umount $MNT
mount -o compress=lzo $DEV $MNT
start=$(date +%s%N)
rm -f $MNT/foobar
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "rm took $dur milliseconds"
umount $MNT
Before this change rm took: 7504 milliseconds
After this change rm took: 6574 milliseconds (-12.4%)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-08 10:20:37 -07:00
|
|
|
static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info,
|
|
|
|
int num_csum_items)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* Deleting csum items does not result in new nodes/leaves and does not
|
|
|
|
* require changing the free space tree, only the csum tree, so this is
|
|
|
|
* all we need.
|
|
|
|
*/
|
|
|
|
return btrfs_calc_metadata_size(fs_info, num_csum_items);
|
|
|
|
}
|
|
|
|
|
2024-04-12 16:37:53 -07:00
|
|
|
void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
|
|
|
|
bool skip_qgroup);
|
|
|
|
void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
|
2024-02-16 06:27:28 -07:00
|
|
|
u64 mod_root, bool skip_qgroup);
|
btrfs: delayed-ref: Introduce better documented delayed ref structures
Current delayed ref interface has several problems:
- Longer and longer parameter lists
bytenr
num_bytes
parent
---------- so far so good
ref_root
owner
offset
---------- I don't feel good now
- Different interpretation of the same parameter
Above @owner for data ref is inode number (u64),
while for tree ref, it's level (int).
They are even in different size range.
For level we only need 0 ~ 8, while for ino it's
BTRFS_FIRST_FREE_OBJECTID ~ BTRFS_LAST_FREE_OBJECTID.
And @offset doesn't even make sense for tree ref.
Such parameter reuse may look clever as a hidden union, but it
destroys code readability.
To solve both problems, we introduce a new structure, btrfs_ref to solve
them:
- Structure instead of long parameter list
This makes later expansion easier, and is better documented.
- Use btrfs_ref::type to distinguish data and tree ref
- Use proper union to store data/tree ref specific structures.
- Use separate functions to fill data/tree ref data, with a common generic
function to fill common bytenr/num_bytes members.
All parameters will find their place in btrfs_ref, and an extra member,
@real_root, inspired by ref-verify code, is newly introduced for later
qgroup code, to record which tree is triggered by this extent modification.
This patch doesn't touch any code, but provides the basis for further
refactoring.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-03 23:45:29 -07:00
|
|
|
|
2012-11-20 19:21:28 -07:00
|
|
|
static inline struct btrfs_delayed_extent_op *
|
|
|
|
btrfs_alloc_delayed_extent_op(void)
|
|
|
|
{
|
|
|
|
return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void
|
|
|
|
btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
|
|
|
|
{
|
|
|
|
if (op)
|
|
|
|
kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
|
|
|
|
}
|
|
|
|
|
2024-02-16 06:27:28 -07:00
|
|
|
void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref);
|
2009-03-13 07:10:06 -07:00
|
|
|
|
2021-01-15 14:48:55 -07:00
|
|
|
static inline u64 btrfs_ref_head_to_space_flags(
|
|
|
|
struct btrfs_delayed_ref_head *head_ref)
|
|
|
|
{
|
|
|
|
if (head_ref->is_data)
|
|
|
|
return BTRFS_BLOCK_GROUP_DATA;
|
|
|
|
else if (head_ref->is_system)
|
|
|
|
return BTRFS_BLOCK_GROUP_SYSTEM;
|
|
|
|
return BTRFS_BLOCK_GROUP_METADATA;
|
|
|
|
}
|
|
|
|
|
2017-09-29 12:43:57 -07:00
|
|
|
static inline void btrfs_put_delayed_ref_head(struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
|
|
|
if (refcount_dec_and_test(&head->refs))
|
|
|
|
kmem_cache_free(btrfs_delayed_ref_head_cachep, head);
|
|
|
|
}
|
|
|
|
|
2018-06-20 05:48:53 -07:00
|
|
|
int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
|
2019-04-03 23:45:31 -07:00
|
|
|
struct btrfs_ref *generic_ref,
|
2021-01-15 14:48:55 -07:00
|
|
|
struct btrfs_delayed_extent_op *extent_op);
|
2018-06-20 05:48:54 -07:00
|
|
|
int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
|
2019-04-03 23:45:32 -07:00
|
|
|
struct btrfs_ref *generic_ref,
|
2021-01-15 14:48:55 -07:00
|
|
|
u64 reserved);
|
2019-03-20 03:42:34 -07:00
|
|
|
int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
|
2024-06-20 11:51:32 -07:00
|
|
|
u64 bytenr, u64 num_bytes, u8 level,
|
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE)
This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.
When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one. At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.
The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root. This commit reduces the
transaction overhead by avoiding the need for dead root records.
When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.
This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.
We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allows us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.
This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.
This commit adds a per-subvolume red-black tree to keep track of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers are within a given range.
This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduces the overhead of checking back refs.
The improved balancing code scales significantly better with a large
number of snapshots.
This is a very large commit and was written in a number of
pieces. But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.
Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 07:45:14 -07:00
|
|
|
struct btrfs_delayed_extent_op *extent_op);
|
2022-12-12 02:02:49 -07:00
|
|
|
void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
|
2012-08-07 13:00:32 -07:00
|
|
|
struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_delayed_ref_head *head);
|
2009-03-13 07:10:06 -07:00
|
|
|
|
2009-03-13 07:11:24 -07:00
|
|
|
struct btrfs_delayed_ref_head *
|
2017-01-30 13:24:37 -07:00
|
|
|
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
u64 bytenr);
|
2018-10-10 22:40:34 -07:00
|
|
|
int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
|
2009-03-13 07:17:05 -07:00
|
|
|
struct btrfs_delayed_ref_head *head);
|
2012-12-19 01:10:10 -07:00
|
|
|
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
|
|
|
|
{
|
|
|
|
mutex_unlock(&head->mutex);
|
|
|
|
}
|
2018-12-03 08:20:29 -07:00
|
|
|
void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
|
|
|
|
struct btrfs_delayed_ref_head *head);
|
2014-01-23 07:21:38 -07:00
|
|
|
|
2018-10-10 22:40:33 -07:00
|
|
|
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
|
|
|
|
struct btrfs_delayed_ref_root *delayed_refs);
|
2011-09-14 03:37:00 -07:00
|
|
|
|
2018-04-04 05:57:42 -07:00
|
|
|
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
|
2011-09-14 03:37:00 -07:00
|
|
|
|
btrfs: stop doing excessive space reservation for csum deletion
Currently when reserving space for deleting the csum items for a data
extent, when adding or updating a delayed ref head, we determine how
many leaves of csum items we can have and then pass that number to the
helper btrfs_calc_delayed_ref_bytes(). This helper is used for calculating
space for all tree modifications we need when running delayed references,
however the amount of space it computes is excessive for deleting csum
items because:
1) It uses btrfs_calc_insert_metadata_size() which is excessive because
we only need to delete csum items from the csum tree, we don't need
to insert any items, so btrfs_calc_metadata_size() is all we need (as
it computes space needed to delete an item);
2) If the free space tree is enabled, it doubles the amount of space,
which is pointless for csum deletion since we don't need to touch the
free space tree or any other tree other than the csum tree.
So improve on this by tracking how many csum deletions we have and using
a new helper to calculate space for csum deletions (just a wrapper around
btrfs_calc_metadata_size() with a comment). This reduces the amount of
space we need to reserve for csum deletions by a factor of 4, and it helps
reduce the number of times we have to block space reservations and have
the reclaim task enter the space flushing algorithm (flush delayed items,
flush delayed refs, etc) in order to satisfy tickets.
For example this results in a total time decrease when unlinking (or
truncating) files with many extents, as we end up having to block on space
metadata reservations less often. Example test:
$ cat test.sh
#!/bin/bash
DEV=/dev/nullb0
MNT=/mnt/test
umount $DEV &> /dev/null
mkfs.btrfs -f $DEV
# Use compression to quickly create files with a lot of extents
# (each with a size of 128K).
mount -o compress=lzo $DEV $MNT
# 120G gives at least 983040 extents with a size of 128K.
xfs_io -f -c "pwrite -S 0xab -b 1M 0 120G" $MNT/foobar
# Flush all delalloc and clear all metadata from memory.
umount $MNT
mount -o compress=lzo $DEV $MNT
start=$(date +%s%N)
rm -f $MNT/foobar
end=$(date +%s%N)
dur=$(( (end - start) / 1000000 ))
echo "rm took $dur milliseconds"
umount $MNT
Before this change rm took: 7504 milliseconds
After this change rm took: 6574 milliseconds (-12.4%)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-09-08 10:20:37 -07:00
|
|
|
/*
 * NOTE(review): the definition is not visible here.  Presumably releases
 * delayed refs reservation space for nr_refs delayed refs and nr_csums csum
 * deletions, which are accounted separately - confirm against the
 * implementation.
 */
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums);
|
2019-06-19 12:11:58 -07:00
|
|
|
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
|
2023-09-28 03:12:50 -07:00
|
|
|
void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
|
|
|
|
void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info);
|
2023-09-28 03:12:49 -07:00
|
|
|
void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
|
|
|
|
void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info);
|
2019-06-19 12:11:58 -07:00
|
|
|
int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
|
|
|
|
enum btrfs_reserve_flush_enum flush);
|
|
|
|
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
|
btrfs: check delayed refs when we're checking if a ref exists
In the patch 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete
resume") I added some code to handle file systems that had been
corrupted by a bug that incorrectly skipped updating the drop progress
key while dropping a snapshot. This code would check to see if we had
already deleted our reference for a child block, and skip the deletion
if we had already.
Unfortunately there is a bug, as the check would only check the on-disk
references. I made an incorrect assumption that blocks in an already
deleted snapshot that was having the deletion resume on mount wouldn't
be modified.
If we have 2 pending deleted snapshots that share blocks, we can easily
modify the rules for a block. Take the following example
subvolume a exists, and subvolume b is a snapshot of subvolume a. They
share references to block 1. Block 1 will have 2 full references, one
for subvolume a and one for subvolume b, and it belongs to subvolume a
(btrfs_header_owner(block 1) == subvolume a).
When deleting subvolume a, we will drop our full reference for block 1,
and because we are the owner we will drop our full reference for all of
block 1's children, convert block 1 to FULL BACKREF, and add a shared
reference to all of block 1's children.
Then we will start the snapshot deletion of subvolume b. We look up the
extent info for block 1, which checks delayed refs and tells us that
FULL BACKREF is set, so sets parent to the bytenr of block 1. However
because this is a resumed snapshot deletion, we call into
check_ref_exists(). Because check_ref_exists() only looks at the disk,
it doesn't find the shared backref for the child of block 1, and thus
returns 0 and we skip deleting the reference for the child of block 1
and continue. This orphans the child of block 1.
The fix is to lookup the delayed refs, similar to what we do in
btrfs_lookup_extent_info(). However we only care about whether the
reference exists or not. If we fail to find our reference on disk, go
look up the bytenr in the delayed refs, and if it exists look for an
existing ref in the delayed ref head. If that exists then we know we
can delete the reference safely and carry on. If it doesn't exist we
know we have to skip over this block.
This bug has existed since I introduced this fix, however it requires
having multiple deleted snapshots pending when we unmount. We noticed
this in production because our shutdown path stops the container on the
system, which deletes a bunch of subvolumes, and then reboots the box.
This gives us plenty of opportunities to hit this issue. Looking at the
history we've seen this occasionally in production, but we had a big
spike recently thanks to faster machines getting jobs with multiple
subvolumes in the job.
Chris Mason wrote a reproducer which does the following
mount /dev/nvme4n1 /btrfs
btrfs subvol create /btrfs/s1
simoop -E -f 4k -n 200000 -z /btrfs/s1
while(true) ; do
btrfs subvol snap /btrfs/s1 /btrfs/s2
simoop -f 4k -n 200000 -r 10 -z /btrfs/s2
btrfs subvol snap /btrfs/s2 /btrfs/s3
btrfs balance start -dusage=80 /btrfs
btrfs subvol del /btrfs/s2 /btrfs/s3
umount /btrfs
btrfsck /dev/nvme4n1 || exit 1
mount /dev/nvme4n1 /btrfs
done
On the second loop this would fail consistently, with my patch it has
been running for hours and hasn't failed.
I also used dm-log-writes to capture the state of the failure so I could
debug the problem. Using the existing failure case to test my patch
validated that it fixes the problem.
Fixes: 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete resume")
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2024-04-11 13:41:20 -07:00
|
|
|
/*
 * NOTE(review): the definition is not visible here.  Presumably reports
 * whether the given delayed ref head already holds a tree ref matching
 * root/parent - confirm exact matching rules against the implementation.
 */
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
|
|
|
|
u64 root, u64 parent);
|
2019-06-19 12:11:58 -07:00
|
|
|
|
2024-04-12 20:27:49 -07:00
|
|
|
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
|
|
|
|
{
|
|
|
|
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
|
|
|
|
node->type == BTRFS_SHARED_DATA_REF_KEY)
|
|
|
|
return node->data_ref.objectid;
|
|
|
|
return node->tree_ref.level;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
|
|
|
|
{
|
|
|
|
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
|
|
|
|
node->type == BTRFS_SHARED_DATA_REF_KEY)
|
|
|
|
return node->data_ref.offset;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2024-04-12 16:44:55 -07:00
|
|
|
static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
|
|
|
|
{
|
|
|
|
ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
|
|
|
|
|
|
|
|
if (ref->type == BTRFS_REF_DATA) {
|
|
|
|
if (ref->parent)
|
|
|
|
return BTRFS_SHARED_DATA_REF_KEY;
|
|
|
|
else
|
|
|
|
return BTRFS_EXTENT_DATA_REF_KEY;
|
|
|
|
} else {
|
|
|
|
if (ref->parent)
|
|
|
|
return BTRFS_SHARED_BLOCK_REF_KEY;
|
|
|
|
else
|
|
|
|
return BTRFS_TREE_BLOCK_REF_KEY;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-03-13 07:10:06 -07:00
|
|
|
#endif
|