1
linux/fs/xfs/xfs_sysfs.c

782 lines
15 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2014 Red Hat, Inc.
* All Rights Reserved.
*/
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sysfs.h"
xfs: AIL needs asynchronous CIL forcing The AIL pushing is stalling on log forces when it comes across pinned items. This is happening on removal workloads where the AIL is dominated by stale items that are removed from AIL when the checkpoint that marks the items stale is committed to the journal. This results is relatively few items in the AIL, but those that are are often pinned as directories items are being removed from are still being logged. As a result, many push cycles through the CIL will first issue a blocking log force to unpin the items. This can take some time to complete, with tracing regularly showing push delays of half a second and sometimes up into the range of several seconds. Sequences like this aren't uncommon: .... 399.829437: xfsaild: last lsn 0x11002dd000 count 101 stuck 101 flushing 0 tout 20 <wanted 20ms, got 270ms delay> 400.099622: xfsaild: target 0x11002f3600, prev 0x11002f3600, last lsn 0x0 400.099623: xfsaild: first lsn 0x11002f3600 400.099679: xfsaild: last lsn 0x1100305000 count 16 stuck 11 flushing 0 tout 50 <wanted 50ms, got 500ms delay> 400.589348: xfsaild: target 0x110032e600, prev 0x11002f3600, last lsn 0x0 400.589349: xfsaild: first lsn 0x1100305000 400.589595: xfsaild: last lsn 0x110032e600 count 156 stuck 101 flushing 30 tout 50 <wanted 50ms, got 460ms delay> 400.950341: xfsaild: target 0x1100353000, prev 0x110032e600, last lsn 0x0 400.950343: xfsaild: first lsn 0x1100317c00 400.950436: xfsaild: last lsn 0x110033d200 count 105 stuck 101 flushing 0 tout 20 <wanted 20ms, got 200ms delay> 401.142333: xfsaild: target 0x1100361600, prev 0x1100353000, last lsn 0x0 401.142334: xfsaild: first lsn 0x110032e600 401.142535: xfsaild: last lsn 0x1100353000 count 122 stuck 101 flushing 8 tout 10 <wanted 10ms, got 10ms delay> 401.154323: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x1100353000 401.154328: xfsaild: first lsn 0x1100353000 401.154389: xfsaild: last lsn 0x1100353000 count 101 stuck 101 flushing 0 tout 20 <wanted 
20ms, got 300ms delay> 401.451525: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0 401.451526: xfsaild: first lsn 0x1100353000 401.451804: xfsaild: last lsn 0x1100377200 count 170 stuck 22 flushing 122 tout 50 <wanted 50ms, got 500ms delay> 401.933581: xfsaild: target 0x1100361600, prev 0x1100361600, last lsn 0x0 .... In each of these cases, every AIL pass saw 101 log items stuck on the AIL (pinned) with very few other items being found. Each pass, a log force was issued, and delay between last/first is the sleep time + the sync log force time. Some of these 101 items pinned the tail of the log. The tail of the log does slowly creep forward (first lsn), but the problem is that the log is actually out of reservation space because it's been running so many transactions that stale items that never reach the AIL but consume log space. Hence we have a largely empty AIL, with long term pins on items that pin the tail of the log that don't get pushed frequently enough to keep log space available. The problem is the hundreds of milliseconds that we block in the log force pushing the CIL out to disk. The AIL should not be stalled like this - it needs to run and flush items that are at the tail of the log with minimal latency. What we really need to do is trigger a log flush, but then not wait for it at all - we've already done our waiting for stuff to complete when we backed off prior to the log force being issued. Even if we remove the XFS_LOG_SYNC from the xfs_log_force() call, we still do a blocking flush of the CIL and that is what is causing the issue. Hence we need a new interface for the CIL to trigger an immediate background push of the CIL to get it moving faster but not to wait on that to occur. While the CIL is pushing, the AIL can also be pushing. We already have an internal interface to do this - xlog_cil_push_now() - but we need a wrapper for it to be used externally. 
xlog_cil_force_seq() can easily be extended to do what we need as it already implements the synchronous CIL push via xlog_cil_push_now(). Add the necessary flags and "push current sequence" semantics to xlog_cil_force_seq() and convert the AIL pushing to use it. One of the complexities here is that the CIL push does not guarantee that the commit record for the CIL checkpoint is written to disk. The current log force ensures this by submitting the current ACTIVE iclog that the commit record was written to. We need the CIL to actually write this commit record to disk for an async push to ensure that the checkpoint actually makes it to disk and unpins the pinned items in the checkpoint on completion. Hence we need to pass down to the CIL push that we are doing an async flush so that it can switch out the commit_iclog if necessary to get written to disk when the commit iclog is finally released. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Reviewed-by: Allison Henderson <allison.henderson@oracle.com> Signed-off-by: Darrick J. Wong <djwong@kernel.org>
2021-08-10 18:00:44 -07:00
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
/*
 * XFS-private sysfs attribute: a generic struct attribute plus show/store
 * callbacks that receive only the kobject. xfs_sysfs_object_show() and
 * xfs_sysfs_object_store() recover this structure from the embedded
 * attribute and invoke the matching callback.
 */
struct xfs_sysfs_attr {
struct attribute attr;
/* Format the attribute value into buf; may be NULL. */
ssize_t (*show)(struct kobject *kobject, char *buf);
/* Parse count bytes from buf and apply the value; may be NULL. */
ssize_t (*store)(struct kobject *kobject, const char *buf,
size_t count);
};
/* Map an embedded struct attribute back to its enclosing xfs_sysfs_attr. */
static inline struct xfs_sysfs_attr *
to_attr(struct attribute *attr)
{
	struct xfs_sysfs_attr	*xfs_attr;

	xfs_attr = container_of(attr, struct xfs_sysfs_attr, attr);
	return xfs_attr;
}
/*
 * Each XFS_SYSFS_ATTR_* macro defines a static struct xfs_sysfs_attr called
 * xfs_sysfs_attr_<name>, initialised with the corresponding __ATTR_RW,
 * __ATTR_RO or __ATTR_WO helper (defined outside this file).
 */
#define XFS_SYSFS_ATTR_RW(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
#define XFS_SYSFS_ATTR_WO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
/* Address of the struct attribute embedded in xfs_sysfs_attr_<name>. */
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
/*
 * sysfs ->show dispatcher: hand the read to the attribute's own show
 * callback. An attribute without a show callback yields zero bytes.
 */
STATIC ssize_t
xfs_sysfs_object_show(
	struct kobject		*kobject,
	struct attribute	*attr,
	char			*buf)
{
	struct xfs_sysfs_attr	*xfs_attr = to_attr(attr);

	if (!xfs_attr->show)
		return 0;
	return xfs_attr->show(kobject, buf);
}
/*
 * sysfs ->store dispatcher: hand the write to the attribute's own store
 * callback. An attribute without a store callback reports zero bytes
 * consumed.
 */
STATIC ssize_t
xfs_sysfs_object_store(
	struct kobject		*kobject,
	struct attribute	*attr,
	const char		*buf,
	size_t			count)
{
	struct xfs_sysfs_attr	*xfs_attr = to_attr(attr);

	if (!xfs_attr->store)
		return 0;
	return xfs_attr->store(kobject, buf, count);
}
/*
 * sysfs operations shared by the XFS kobject types in this file; both entry
 * points dispatch to the per-attribute callbacks in struct xfs_sysfs_attr.
 */
static const struct sysfs_ops xfs_sysfs_ops = {
.show = xfs_sysfs_object_show,
.store = xfs_sysfs_object_store,
};
/* Attribute list for xfs_mp_ktype; it holds only the NULL terminator. */
static struct attribute *xfs_mp_attrs[] = {
NULL,
};
/*
 * Presumably generates the xfs_mp_groups symbol referenced by xfs_mp_ktype;
 * ATTRIBUTE_GROUPS is defined outside this file -- confirm.
 */
ATTRIBUTE_GROUPS(xfs_mp);
/*
 * kobject type built on the shared xfs_sysfs_ops dispatchers and the (empty)
 * xfs_mp attribute groups. Judging by the name this is the per-mount kobject
 * type -- confirm against its users. xfs_sysfs_release is not defined in
 * this file.
 */
const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
};
#ifdef DEBUG
/* debug */
/*
 * Set the global bug_on_assert flag from a sysfs write. Only the integer
 * values 0 and 1 are accepted.
 *
 * Returns @count on success, the kstrtoint() error if the input does not
 * parse, or -EINVAL for any other integer.
 */
STATIC ssize_t
bug_on_assert_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			error;
	int			setting;

	error = kstrtoint(buf, 0, &setting);
	if (error)
		return error;

	switch (setting) {
	case 0:
		xfs_globals.bug_on_assert = false;
		break;
	case 1:
		xfs_globals.bug_on_assert = true;
		break;
	default:
		return -EINVAL;
	}
	return count;
}
/* Report the global bug_on_assert flag as a decimal integer plus newline. */
STATIC ssize_t
bug_on_assert_show(
	struct kobject		*kobject,
	char			*buf)
{
	ssize_t			len;

	len = sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert);
	return len;
}
XFS_SYSFS_ATTR_RW(bug_on_assert);
xfs: export log_recovery_delay to delay mount time log recovery XFS log recovery has been discovered to have race conditions with buffers when I/O errors occur. External tools are available to simulate I/O errors to XFS, but this alone is not sufficient for testing log recovery. XFS unconditionally resets the inactive region of the log prior to log recovery to avoid confusion over processing any partially written log records that might have been written before an unclean shutdown. Therefore, unconditional write I/O failures at mount time are caught by the reset sequence rather than log recovery and hinder the ability to test the latter. The device-mapper dm-flakey module uses an up/down timer to define a cycle for when to fail I/Os. Create a pre log recovery delay tunable that can be used to coordinate XFS log recovery with I/O errors simulated by dm-flakey. This facilitates coordination in userspace that allows the reset of stale log blocks to succeed and writes due to log recovery to fail. For example, define a dm-flakey instance with an uptime long enough to allow log reset to succeed and a log recovery delay long enough to allow the dm-flakey uptime to expire. The 'log_recovery_delay' sysfs tunable is exported under /sys/fs/xfs/debug and is only enabled for kernels compiled in XFS debug mode. The value is exported in units of seconds and allows for a delay of up to 60 seconds. Note that this is for XFS debug and test instrumentation purposes only and should not be used by applications. No delay is enabled by default. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-09-08 18:56:13 -07:00
/*
 * Set the global log_recovery_delay value from a sysfs write.
 *
 * The input is parsed with kstrtoint() (base auto-detected) and must lie in
 * the range 0..60 inclusive.
 *
 * Returns @count on success, the kstrtoint() error if the input does not
 * parse, or -EINVAL if the value is out of range.
 */
STATIC ssize_t
log_recovery_delay_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			ret;
	int			val;

	ret = kstrtoint(buf, 0, &val);
	if (ret)
		return ret;

	if (val < 0 || val > 60)
		return -EINVAL;

	xfs_globals.log_recovery_delay = val;

	return count;
}
/* Report the global log_recovery_delay value as a decimal integer. */
STATIC ssize_t
log_recovery_delay_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay);
}
XFS_SYSFS_ATTR_RW(log_recovery_delay);
xfs: add mount delay debug option Similar to log_recovery_delay, this delay occurs between the VFS superblock being initialised and the xfs_mount being fully initialised. It also poisons the per-ag radix tree node so that it can be used for triggering shrinker races during mount such as the following: <run memory pressure workload in background> $ cat dirty-mount.sh #! /bin/bash umount -f /dev/pmem0 mkfs.xfs -f /dev/pmem0 mount /dev/pmem0 /mnt/test rm -f /mnt/test/foo xfs_io -fxc "pwrite 0 4k" -c fsync -c "shutdown" /mnt/test/foo umount /dev/pmem0 # let's crash it now! echo 30 > /sys/fs/xfs/debug/mount_delay mount /dev/pmem0 /mnt/test echo 0 > /sys/fs/xfs/debug/mount_delay umount /dev/pmem0 $ sudo ./dirty-mount.sh ..... [ 60.378118] CPU: 3 PID: 3577 Comm: fs_mark Tainted: G D W 4.16.0-rc5-dgc #440 [ 60.378120] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 60.378124] RIP: 0010:radix_tree_next_chunk+0x76/0x320 [ 60.378127] RSP: 0018:ffffc9000276f4f8 EFLAGS: 00010282 [ 60.383670] RAX: a5a5a5a5a5a5a5a4 RBX: 0000000000000010 RCX: 000000000000001a [ 60.385277] RDX: 0000000000000000 RSI: ffffc9000276f540 RDI: 0000000000000000 [ 60.386554] RBP: 0000000000000000 R08: 0000000000000000 R09: a5a5a5a5a5a5a5a5 [ 60.388194] R10: 0000000000000006 R11: 0000000000000001 R12: ffffc9000276f598 [ 60.389288] R13: 0000000000000040 R14: 0000000000000228 R15: ffff880816cd6458 [ 60.390827] FS: 00007f5c124b9740(0000) GS:ffff88083fc00000(0000) knlGS:0000000000000000 [ 60.392253] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 60.393423] CR2: 00007f5c11bba0b8 CR3: 000000035580e001 CR4: 00000000000606e0 [ 60.394519] Call Trace: [ 60.395252] radix_tree_gang_lookup_tag+0xc4/0x130 [ 60.395948] xfs_perag_get_tag+0x37/0xf0 [ 60.396522] xfs_reclaim_inodes_count+0x32/0x40 [ 60.397178] xfs_fs_nr_cached_objects+0x11/0x20 [ 60.397837] super_cache_count+0x35/0xc0 [ 60.399159] shrink_slab.part.66+0xb1/0x370 [ 60.400194] shrink_node+0x7e/0x1a0 [ 60.401058] 
try_to_free_pages+0x199/0x470 [ 60.402081] __alloc_pages_slowpath+0x3a1/0xd20 [ 60.403729] __alloc_pages_nodemask+0x1c3/0x200 [ 60.404941] cache_grow_begin+0x20b/0x2e0 [ 60.406164] fallback_alloc+0x160/0x200 [ 60.407088] kmem_cache_alloc+0x111/0x4e0 [ 60.408038] ? xfs_buf_rele+0x61/0x430 [ 60.408925] kmem_zone_alloc+0x61/0xe0 [ 60.409965] xfs_inode_alloc+0x24/0x1d0 ..... Signed-Off-By: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-05-10 21:50:23 -07:00
/*
 * Set the global mount_delay value from a sysfs write. Accepts integers in
 * the range 0..60 inclusive.
 *
 * Returns @count on success, the kstrtoint() error if the input does not
 * parse, or -EINVAL if the value is out of range.
 */
STATIC ssize_t
mount_delay_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			error;
	int			delay;

	error = kstrtoint(buf, 0, &delay);
	if (error)
		return error;

	if (delay < 0)
		return -EINVAL;
	if (delay > 60)
		return -EINVAL;

	xfs_globals.mount_delay = delay;
	return count;
}
/* Report the global mount_delay value as a decimal integer plus newline. */
STATIC ssize_t
mount_delay_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay);
}
XFS_SYSFS_ATTR_RW(mount_delay);
xfs: introduce an always_cow mode Add a mode where XFS never overwrites existing blocks in place. This is to aid debugging our COW code, and also put infrastructure in place for things like possible future support for zoned block devices, which can't support overwrites. This mode is enabled globally by doing a: echo 1 > /sys/fs/xfs/debug/always_cow Note that the parameter is global to allow running all tests in xfstests easily in this mode, which would not easily be possible with a per-fs sysfs file. In always_cow mode persistent preallocations are disabled, and fallocate will fail when called with a 0 mode (with or without FALLOC_FL_KEEP_SIZE), and will not create unwritten extents for zeroed space when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE. There are a few interesting xfstests failures when run in always_cow mode: - generic/392 fails because the bytes used in the file used to test hole punch recovery are less after the log replay. This is because the blocks written and then punched out are only freed with a delay due to the logging mechanism. - xfs/170 will fail as the already fragile file streams mechanism doesn't seem to interact well with the COW allocator - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim the file system is badly fragmented, but there is not much we can do to avoid that when always writing out of place - xfs/205 fails because overwriting a file in always_cow mode will require new space allocation and the assumptions in the test thus don't work anymore. - xfs/326 fails to modify the file at all in always_cow mode after injecting the refcount error, leading to an unexpected md5sum after the remount, but that again is expected Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-02-18 10:38:49 -07:00
static ssize_t
always_cow_store(
struct kobject *kobject,
const char *buf,
size_t count)
{
ssize_t ret;
ret = kstrtobool(buf, &xfs_globals.always_cow);
if (ret < 0)
return ret;
return count;
}
/* Report the global always_cow flag as a decimal integer plus newline. */
static ssize_t
always_cow_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.always_cow);
}
XFS_SYSFS_ATTR_RW(always_cow);
/*
* Override how many threads the parallel work queue is allowed to create.
* This has to be a debug-only global (instead of an errortag) because one of
* the main users of parallel workqueues is mount time quotacheck.
*/
STATIC ssize_t
pwork_threads_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			ret;
	int			val;

	ret = kstrtoint(buf, 0, &val);
	if (ret)
		return ret;

	/* Accepted range is -1 .. num_possible_cpus() inclusive. */
	if (val < -1)
		return -EINVAL;

	/*
	 * Only compare non-negative values against the CPU count. A direct
	 * "val > num_possible_cpus()" promotes val to unsigned when the CPU
	 * count is an unsigned type, which turns the permitted value -1 into
	 * a huge number and rejects it.
	 */
	if (val > 0 && (unsigned int)val > num_possible_cpus())
		return -EINVAL;

	xfs_globals.pwork_threads = val;

	return count;
}
/* Report the global pwork_threads value as a decimal integer plus newline. */
STATIC ssize_t
pwork_threads_show(
	struct kobject		*kobject,
	char			*buf)
{
	ssize_t			len;

	len = sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads);
	return len;
}
XFS_SYSFS_ATTR_RW(pwork_threads);
/*
* The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob
* sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on
* V5 filesystems. As a result, the intermediate progress of all setxattr and
* removexattr operations are tracked via the log and can be restarted during
* recovery. This is useful for testing xattr recovery prior to merging of the
* parent pointer feature which requires it to maintain consistency, and may be
* enabled for userspace xattrs in the future.
*/
static ssize_t
larp_store(
struct kobject *kobject,
const char *buf,
size_t count)
{
ssize_t ret;
ret = kstrtobool(buf, &xfs_globals.larp);
if (ret < 0)
return ret;
return count;
}
/*
 * Report the global larp flag as a decimal integer plus newline. Uses
 * sysfs_emit() like the other show handlers in this file.
 */
STATIC ssize_t
larp_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.larp);
}
XFS_SYSFS_ATTR_RW(larp);
/*
 * Set the global bload_leaf_slack value from a sysfs write. Any integer that
 * kstrtoint() accepts is stored; no range check is applied.
 *
 * Returns @count on success or the kstrtoint() error.
 */
STATIC ssize_t
bload_leaf_slack_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			error;
	int			slack;

	error = kstrtoint(buf, 0, &slack);
	if (error)
		return error;

	xfs_globals.bload_leaf_slack = slack;
	return count;
}
/*
 * Report the global bload_leaf_slack value as a decimal integer plus
 * newline. Uses sysfs_emit() like the other show handlers in this file.
 */
STATIC ssize_t
bload_leaf_slack_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.bload_leaf_slack);
}
XFS_SYSFS_ATTR_RW(bload_leaf_slack);
/*
 * Set the global bload_node_slack value from a sysfs write. Any integer that
 * kstrtoint() accepts is stored; no range check is applied.
 *
 * Returns @count on success or the kstrtoint() error.
 */
STATIC ssize_t
bload_node_slack_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	int			error;
	int			slack;

	error = kstrtoint(buf, 0, &slack);
	if (error)
		return error;

	xfs_globals.bload_node_slack = slack;
	return count;
}
/*
 * Report the global bload_node_slack value as a decimal integer plus
 * newline. Uses sysfs_emit() like the other show handlers in this file.
 */
STATIC ssize_t
bload_node_slack_show(
	struct kobject		*kobject,
	char			*buf)
{
	return sysfs_emit(buf, "%d\n", xfs_globals.bload_node_slack);
}
XFS_SYSFS_ATTR_RW(bload_node_slack);
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
xfs: export log_recovery_delay to delay mount time log recovery XFS log recovery has been discovered to have race conditions with buffers when I/O errors occur. External tools are available to simulate I/O errors to XFS, but this alone is not sufficient for testing log recovery. XFS unconditionally resets the inactive region of the log prior to log recovery to avoid confusion over processing any partially written log records that might have been written before an unclean shutdown. Therefore, unconditional write I/O failures at mount time are caught by the reset sequence rather than log recovery and hinder the ability to test the latter. The device-mapper dm-flakey module uses an up/down timer to define a cycle for when to fail I/Os. Create a pre log recovery delay tunable that can be used to coordinate XFS log recovery with I/O errors simulated by dm-flakey. This facilitates coordination in userspace that allows the reset of stale log blocks to succeed and writes due to log recovery to fail. For example, define a dm-flakey instance with an uptime long enough to allow log reset to succeed and a log recovery delay long enough to allow the dm-flakey uptime to expire. The 'log_recovery_delay' sysfs tunable is exported under /sys/fs/xfs/debug and is only enabled for kernels compiled in XFS debug mode. The value is exported in units of seconds and allows for a delay of up to 60 seconds. Note that this is for XFS debug and test instrumentation purposes only and should not be used by applications. No delay is enabled by default. Signed-off-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com>
2014-09-08 18:56:13 -07:00
ATTR_LIST(log_recovery_delay),
xfs: add mount delay debug option Similar to log_recovery_delay, this delay occurs between the VFS superblock being initialised and the xfs_mount being fully initialised. It also poisons the per-ag radix tree node so that it can be used for triggering shrinker races during mount such as the following: <run memory pressure workload in background> $ cat dirty-mount.sh #! /bin/bash umount -f /dev/pmem0 mkfs.xfs -f /dev/pmem0 mount /dev/pmem0 /mnt/test rm -f /mnt/test/foo xfs_io -fxc "pwrite 0 4k" -c fsync -c "shutdown" /mnt/test/foo umount /dev/pmem0 # let's crash it now! echo 30 > /sys/fs/xfs/debug/mount_delay mount /dev/pmem0 /mnt/test echo 0 > /sys/fs/xfs/debug/mount_delay umount /dev/pmem0 $ sudo ./dirty-mount.sh ..... [ 60.378118] CPU: 3 PID: 3577 Comm: fs_mark Tainted: G D W 4.16.0-rc5-dgc #440 [ 60.378120] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 60.378124] RIP: 0010:radix_tree_next_chunk+0x76/0x320 [ 60.378127] RSP: 0018:ffffc9000276f4f8 EFLAGS: 00010282 [ 60.383670] RAX: a5a5a5a5a5a5a5a4 RBX: 0000000000000010 RCX: 000000000000001a [ 60.385277] RDX: 0000000000000000 RSI: ffffc9000276f540 RDI: 0000000000000000 [ 60.386554] RBP: 0000000000000000 R08: 0000000000000000 R09: a5a5a5a5a5a5a5a5 [ 60.388194] R10: 0000000000000006 R11: 0000000000000001 R12: ffffc9000276f598 [ 60.389288] R13: 0000000000000040 R14: 0000000000000228 R15: ffff880816cd6458 [ 60.390827] FS: 00007f5c124b9740(0000) GS:ffff88083fc00000(0000) knlGS:0000000000000000 [ 60.392253] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 60.393423] CR2: 00007f5c11bba0b8 CR3: 000000035580e001 CR4: 00000000000606e0 [ 60.394519] Call Trace: [ 60.395252] radix_tree_gang_lookup_tag+0xc4/0x130 [ 60.395948] xfs_perag_get_tag+0x37/0xf0 [ 60.396522] xfs_reclaim_inodes_count+0x32/0x40 [ 60.397178] xfs_fs_nr_cached_objects+0x11/0x20 [ 60.397837] super_cache_count+0x35/0xc0 [ 60.399159] shrink_slab.part.66+0xb1/0x370 [ 60.400194] shrink_node+0x7e/0x1a0 [ 60.401058] 
try_to_free_pages+0x199/0x470 [ 60.402081] __alloc_pages_slowpath+0x3a1/0xd20 [ 60.403729] __alloc_pages_nodemask+0x1c3/0x200 [ 60.404941] cache_grow_begin+0x20b/0x2e0 [ 60.406164] fallback_alloc+0x160/0x200 [ 60.407088] kmem_cache_alloc+0x111/0x4e0 [ 60.408038] ? xfs_buf_rele+0x61/0x430 [ 60.408925] kmem_zone_alloc+0x61/0xe0 [ 60.409965] xfs_inode_alloc+0x24/0x1d0 ..... Signed-Off-By: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2018-05-10 21:50:23 -07:00
ATTR_LIST(mount_delay),
xfs: introduce an always_cow mode Add a mode where XFS never overwrites existing blocks in place. This is to aid debugging our COW code, and also put infatructure in place for things like possible future support for zoned block devices, which can't support overwrites. This mode is enabled globally by doing a: echo 1 > /sys/fs/xfs/debug/always_cow Note that the parameter is global to allow running all tests in xfstests easily in this mode, which would not easily be possible with a per-fs sysfs file. In always_cow mode persistent preallocations are disabled, and fallocate will fail when called with a 0 mode (with our without FALLOC_FL_KEEP_SIZE), and not create unwritten extent for zeroed space when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE. There are a few interesting xfstests failures when run in always_cow mode: - generic/392 fails because the bytes used in the file used to test hole punch recovery are less after the log replay. This is because the blocks written and then punched out are only freed with a delay due to the logging mechanism. - xfs/170 will fail as the already fragile file streams mechanism doesn't seem to interact well with the COW allocator - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim the file system is badly fragmented, but there is not much we can do to avoid that when always writing out of place - xfs/205 fails because overwriting a file in always_cow mode will require new space allocation and the assumption in the test thus don't work anymore. - xfs/326 fails to modify the file at all in always_cow mode after injecting the refcount error, leading to an unexpected md5sum after the remount, but that again is expected Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
2019-02-18 10:38:49 -07:00
ATTR_LIST(always_cow),
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
ATTR_LIST(bload_leaf_slack),
ATTR_LIST(bload_node_slack),
NULL,
};
ATTRIBUTE_GROUPS(xfs_dbg);
/* kobject type for the debug directory; attributes come from xfs_dbg_groups. */
const struct kobj_type xfs_dbg_ktype = {
	.default_groups	= xfs_dbg_groups,
	.sysfs_ops	= &xfs_sysfs_ops,
	.release	= xfs_sysfs_release,
};
#endif /* DEBUG */
/* stats */
/* Map a sysfs kobject back to the struct xstats that embeds it via xs_kobj. */
static inline struct xstats *
to_xstats(struct kobject *kobject)
{
	return container_of(to_kobj(kobject), struct xstats, xs_kobj);
}
/* Format the statistics behind this kobject into @buf via xfs_stats_format(). */
STATIC ssize_t
stats_show(
	struct kobject	*kobject,
	char		*buf)
{
	return xfs_stats_format(to_xstats(kobject)->xs_stats, buf);
}
XFS_SYSFS_ATTR_RO(stats);
/*
 * Clear all statistics behind this kobject.  Only the integer value 1 is
 * accepted; any other integer yields -EINVAL and unparsable input returns
 * the kstrtoint() error.
 */
STATIC ssize_t
stats_clear_store(
	struct kobject	*kobject,
	const char	*buf,
	size_t		count)
{
	int		val;
	int		error = kstrtoint(buf, 0, &val);

	if (error)
		return error;
	if (val != 1)
		return -EINVAL;

	xfs_stats_clearall(to_xstats(kobject)->xs_stats);
	return count;
}
XFS_SYSFS_ATTR_WO(stats_clear);
/*
 * NULL-terminated attribute list for the stats kobject: "stats" (declared
 * read-only above) and "stats_clear" (declared write-only above).  The
 * ATTRIBUTE_GROUPS() line is what xfs_stats_ktype's xfs_stats_groups
 * reference relies on.
 */
static struct attribute *xfs_stats_attrs[] = {
ATTR_LIST(stats),
ATTR_LIST(stats_clear),
NULL,
};
ATTRIBUTE_GROUPS(xfs_stats);
/* kobject type for stats directories; attributes come from xfs_stats_groups. */
const struct kobj_type xfs_stats_ktype = {
	.default_groups	= xfs_stats_groups,
	.sysfs_ops	= &xfs_sysfs_ops,
	.release	= xfs_sysfs_release,
};
/* xlog */
/* Map a sysfs kobject back to the struct xlog that embeds it via l_kobj. */
static inline struct xlog *
to_xlog(struct kobject *kobject)
{
	return container_of(to_kobj(kobject), struct xlog, l_kobj);
}
/* Report the log's current cycle and block as "<cycle>:<block>". */
STATIC ssize_t
log_head_lsn_show(
	struct kobject	*kobject,
	char		*buf)
{
	struct xlog	*log = to_xlog(kobject);
	int		head_cycle;
	int		head_block;

	/* Read both fields while holding l_icloglock. */
	spin_lock(&log->l_icloglock);
	head_cycle = log->l_curr_cycle;
	head_block = log->l_curr_block;
	spin_unlock(&log->l_icloglock);

	return sysfs_emit(buf, "%d:%d\n", head_cycle, head_block);
}
XFS_SYSFS_ATTR_RO(log_head_lsn);
/*
 * Report the log tail as "<cycle>:<block>", split out of l_tail_lsn by
 * xlog_crack_atomic_lsn().
 */
STATIC ssize_t
log_tail_lsn_show(
	struct kobject	*kobject,
	char		*buf)
{
	struct xlog	*log = to_xlog(kobject);
	int		tail_cycle;
	int		tail_block;

	xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_block);

	return sysfs_emit(buf, "%d:%d\n", tail_cycle, tail_block);
}
XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
reserve_grant_head_bytes_show(
struct kobject *kobject,
char *buf)
{
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
return sysfs_emit(buf, "%lld\n",
atomic64_read(&to_xlog(kobject)->l_reserve_head.grant));
}
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
XFS_SYSFS_ATTR_RO(reserve_grant_head_bytes);
STATIC ssize_t
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
write_grant_head_bytes_show(
struct kobject *kobject,
char *buf)
{
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
return sysfs_emit(buf, "%lld\n",
atomic64_read(&to_xlog(kobject)->l_write_head.grant));
}
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
XFS_SYSFS_ATTR_RO(write_grant_head_bytes);
static struct attribute *xfs_log_attrs[] = {
ATTR_LIST(log_head_lsn),
ATTR_LIST(log_tail_lsn),
xfs: grant heads track byte counts, not LSNs The grant heads in the log track the space reserved in the log for running transactions. They do this by tracking how far ahead of the tail that the reservation has reached, and the units for doing this are {cycle,bytes} for the reserve head rather than {cycle,blocks} which are normal used by LSNs. This is annoyingly complex because we have to split, crack and combined these tuples for any calculation we do to determine log space and targets. This is computationally expensive as well as difficult to do atomically and locklessly, as well as limiting the size of the log to 2^32 bytes. Really, though, all the grant heads are tracking is how much space is currently available for use in the log. We can track this as a simply byte count - we just don't care what the actual physical location in the log the head and tail are at, just how much space we have remaining before the head and tail overlap. So, convert the grant heads to track the byte reservations that are active rather than the current (cycle, offset) tuples. This means an empty log has zero bytes consumed, and a full log is when the reservations reach the size of the log minus the space consumed by the AIL. This greatly simplifies the accounting and checks for whether there is space available. We no longer need to crack or combine LSNs to determine how much space the log has left, nor do we need to look at the head or tail of the log to determine how close to full we are. There is, however, a complexity that needs to be handled. We know how much space is being tracked in the AIL now via log->l_tail_space and the log tickets track active reservations and return the unused portions to the grant heads when ungranted. Unfortunately, we don't track the used portion of the grant, so when we transfer log items from the CIL to the AIL, the space accounted to the grant heads is transferred to the log tail space. 
Hence when we move the AIL head forwards on item insert, we have to remove that space from the grant heads. We also remove the xlog_verify_grant_tail() debug function as it is no longer useful. The check it performs has been racy since delayed logging was introduced, but now it is clearly only detecting false positives so remove it. The result of this substantially simpler accounting algorithm is an increase in sustained transaction rate from ~1.3 million transactions/s to ~1.9 million transactions/s with no increase in CPU usage. We also remove the 32 bit space limitation on the grant heads, which will allow us to increase the journal size beyond 2GB in future. Note that this renames the sysfs files exposing the log grant space now that the values are exported in bytes. This allows xfstests to auto-detect the old or new ABI. [hch: move xlog_grant_sub_space out of line, update the xlog_grant_{add,sub}_space prototypes, rename the sysfs files to allow auto-detection in xfstests] Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Darrick J. Wong <djwong@kernel.org> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
2024-06-20 00:21:27 -07:00
ATTR_LIST(reserve_grant_head_bytes),
ATTR_LIST(write_grant_head_bytes),
NULL,
};
ATTRIBUTE_GROUPS(xfs_log);
/* kobject type for the log directory; attributes come from xfs_log_groups. */
const struct kobj_type xfs_log_ktype = {
	.default_groups	= xfs_log_groups,
	.sysfs_ops	= &xfs_sysfs_ops,
	.release	= xfs_sysfs_release,
};
/*
* Metadata IO error configuration
*
* The sysfs structure here is:
* ...xfs/<dev>/error/<class>/<errno>/<error_attrs>
*
* where <class> allows us to discriminate between data IO and metadata IO,
* and any other future type of IO (e.g. special inode or directory error
* handling) we care to support.
*/
/* Map a sysfs kobject back to the struct xfs_error_cfg that embeds it. */
static inline struct xfs_error_cfg *
to_error_cfg(struct kobject *kobject)
{
	return container_of(to_kobj(kobject), struct xfs_error_cfg, kobj);
}
/*
 * Translate a raw kobject into the xfs_mount that embeds it as its
 * m_error_kobj member.
 */
static inline struct xfs_mount *
err_to_mp(struct kobject *kobject)
{
	return container_of(to_kobj(kobject), struct xfs_mount, m_error_kobj);
}
/*
 * Emit the retry limit of this error configuration.  A limit of
 * XFS_ERR_RETRY_FOREVER is reported as -1; anything else is reported as the
 * stored value.
 */
static ssize_t
max_retries_show(
	struct kobject		*kobject,
	char			*buf)
{
	struct xfs_error_cfg	*cfg = to_error_cfg(kobject);
	int			shown = -1;

	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER)
		shown = cfg->max_retries;

	return sysfs_emit(buf, "%d\n", shown);
}
/*
 * Update the retry limit of this error configuration from a user supplied
 * integer.  -1 selects XFS_ERR_RETRY_FOREVER, values below -1 are rejected
 * with -EINVAL and any other value is stored as given.  Returns @count on
 * success or the parse error from kstrtoint().
 */
static ssize_t
max_retries_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	struct xfs_error_cfg	*cfg = to_error_cfg(kobject);
	int			requested;
	int			error;

	error = kstrtoint(buf, 0, &requested);
	if (error)
		return error;

	if (requested < -1)
		return -EINVAL;

	if (requested != -1) {
		cfg->max_retries = requested;
		return count;
	}

	cfg->max_retries = XFS_ERR_RETRY_FOREVER;
	return count;
}
XFS_SYSFS_ATTR_RW(max_retries);
/*
 * Emit the retry timeout of this error configuration in whole seconds.
 * The stored value is kept in jiffies; XFS_ERR_RETRY_FOREVER is reported
 * as -1.
 */
static ssize_t
retry_timeout_seconds_show(
	struct kobject		*kobject,
	char			*buf)
{
	struct xfs_error_cfg	*cfg = to_error_cfg(kobject);
	int			seconds = -1;

	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER)
		seconds = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC;

	return sysfs_emit(buf, "%d\n", seconds);
}
/*
 * Update the retry timeout of this error configuration from a user supplied
 * number of seconds.  -1 selects XFS_ERR_RETRY_FOREVER; otherwise the value
 * is converted to jiffies before being stored.  Values outside -1..86400
 * are rejected with -EINVAL.  Returns @count on success or the parse error
 * from kstrtoint().
 */
static ssize_t
retry_timeout_seconds_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	struct xfs_error_cfg	*cfg = to_error_cfg(kobject);
	int			seconds;
	int			error;

	error = kstrtoint(buf, 0, &seconds);
	if (error)
		return error;

	/* 1 day timeout maximum, -1 means infinite */
	if (seconds < -1 || seconds > 86400)
		return -EINVAL;

	if (seconds == -1) {
		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
		return count;
	}

	cfg->retry_timeout = msecs_to_jiffies(seconds * MSEC_PER_SEC);
	ASSERT(msecs_to_jiffies(seconds * MSEC_PER_SEC) < LONG_MAX);
	return count;
}
XFS_SYSFS_ATTR_RW(retry_timeout_seconds);
static ssize_t
fail_at_unmount_show(
struct kobject *kobject,
char *buf)
{
struct xfs_mount *mp = err_to_mp(kobject);
return sysfs_emit(buf, "%d\n", mp->m_fail_unmount);
}
/*
 * Update the mount's m_fail_unmount setting from a user supplied integer.
 * Only 0 and 1 are accepted; anything else yields -EINVAL.  Returns @count
 * on success or the parse error from kstrtoint().
 */
static ssize_t
fail_at_unmount_store(
	struct kobject		*kobject,
	const char		*buf,
	size_t			count)
{
	struct xfs_mount	*mp = err_to_mp(kobject);
	int			flag;
	int			error;

	error = kstrtoint(buf, 0, &flag);
	if (error)
		return error;

	if (flag != 0 && flag != 1)
		return -EINVAL;

	mp->m_fail_unmount = flag;
	return count;
}
XFS_SYSFS_ATTR_RW(fail_at_unmount);
/*
 * Attributes attached to every kobject that uses xfs_error_cfg_ktype:
 * max_retries and retry_timeout_seconds.  The list is NULL terminated.
 */
static struct attribute *xfs_error_attrs[] = {
ATTR_LIST(max_retries),
ATTR_LIST(retry_timeout_seconds),
NULL,
};
ATTRIBUTE_GROUPS(xfs_error);
/*
 * kobject type for the per-errno configuration directories created in
 * xfs_error_sysfs_init_class(); its default attributes are the xfs_error
 * attribute group.
 */
static const struct kobj_type xfs_error_cfg_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_error_groups,
};
/*
 * kobject type for the "error" directory and the per-class directories
 * below it.  It declares no default attribute group.
 */
static const struct kobj_type xfs_error_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
};
/*
* Error initialization tables. These need to be ordered in the same
* order as the enums used to index the array. All class init tables need to
* define a "default" behaviour as the first entry, all other entries can be
* empty.
*/
/*
 * Seed values for one error configuration entry.
 *
 * name is only read (it is handed on as the sysfs directory name for the
 * entry), and the tables built from this type point it at string literals,
 * so it is const-qualified.
 */
struct xfs_error_init {
	const char	*name;
	int		max_retries;
	int		retry_timeout;	/* in seconds */
};
/*
 * Initial retry policy for the metadata error class, handed to
 * xfs_error_sysfs_init_class() by xfs_error_sysfs_init().  Entry i seeds
 * m_error_cfg[class][i], so the order here must match the index values
 * used to look entries up in xfs_error_get_cfg().
 */
static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = {
{ .name = "default",
.max_retries = XFS_ERR_RETRY_FOREVER,
.retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "EIO",
.max_retries = XFS_ERR_RETRY_FOREVER,
.retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENOSPC",
.max_retries = XFS_ERR_RETRY_FOREVER,
.retry_timeout = XFS_ERR_RETRY_FOREVER,
},
{ .name = "ENODEV",
.max_retries = 0, /* We can't recover from devices disappearing */
.retry_timeout = 0,
},
};
static int
xfs_error_sysfs_init_class(
struct xfs_mount *mp,
int class,
const char *parent_name,
struct xfs_kobj *parent_kobj,
const struct xfs_error_init init[])
{
struct xfs_error_cfg *cfg;
int error;
int i;
ASSERT(class < XFS_ERR_CLASS_MAX);
error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype,
&mp->m_error_kobj, parent_name);
if (error)
return error;
for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) {
cfg = &mp->m_error_cfg[class][i];
error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype,
parent_kobj, init[i].name);
if (error)
goto out_error;
cfg->max_retries = init[i].max_retries;
if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
else
cfg->retry_timeout = msecs_to_jiffies(
init[i].retry_timeout * MSEC_PER_SEC);
}
return 0;
out_error:
/* unwind the entries that succeeded */
for (i--; i >= 0; i--) {
cfg = &mp->m_error_cfg[class][i];
xfs_sysfs_del(&cfg->kobj);
}
xfs_sysfs_del(parent_kobj);
return error;
}
/*
 * Build the error configuration sysfs tree for a mount: the "error"
 * directory below mp->m_kobj, its fail_at_unmount file, and the "metadata"
 * class directory.
 *
 * Returns 0 on success.  If creating the file or the class directory fails,
 * the "error" directory is removed again and the error is returned.
 */
int
xfs_error_sysfs_init(
	struct xfs_mount	*mp)
{
	int			error;

	/* .../xfs/<dev>/error/ */
	error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
				&mp->m_kobj, "error");
	if (error)
		return error;

	error = sysfs_create_file(&mp->m_error_kobj.kobject,
				  ATTR_LIST(fail_at_unmount));
	if (!error) {
		/* .../xfs/<dev>/error/metadata/ */
		error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
				"metadata", &mp->m_error_meta_kobj,
				xfs_error_meta_init);
	}
	if (!error)
		return 0;

	/* one of the two steps above failed: drop the "error" directory */
	xfs_sysfs_del(&mp->m_error_kobj);
	return error;
}
/*
 * Tear down the error configuration sysfs tree of a mount: every per-errno
 * kobject of every class first, then the metadata class directory, and
 * finally the "error" directory itself.
 */
void
xfs_error_sysfs_del(
	struct xfs_mount	*mp)
{
	int			class;
	int			slot;

	for (class = 0; class < XFS_ERR_CLASS_MAX; class++)
		for (slot = 0; slot < XFS_ERR_ERRNO_MAX; slot++)
			xfs_sysfs_del(&mp->m_error_cfg[class][slot].kobj);

	xfs_sysfs_del(&mp->m_error_meta_kobj);
	xfs_sysfs_del(&mp->m_error_kobj);
}
/*
 * Look up the error configuration for @error within @error_class.
 *
 * @error may be passed as a positive or negative errno.  EIO, ENOSPC and
 * ENODEV have dedicated entries; every other value maps to the class's
 * default entry.
 */
struct xfs_error_cfg *
xfs_error_get_cfg(
	struct xfs_mount	*mp,
	int			error_class,
	int			error)
{
	int			slot;

	if (error < 0)
		error = -error;

	switch (error) {
	case EIO:
		slot = XFS_ERR_EIO;
		break;
	case ENOSPC:
		slot = XFS_ERR_ENOSPC;
		break;
	case ENODEV:
		slot = XFS_ERR_ENODEV;
		break;
	default:
		slot = XFS_ERR_DEFAULT;
		break;
	}

	return &mp->m_error_cfg[error_class][slot];
}