1

XFS Bug fixes for 6.12-rc4

* Fix integer overflow in xrep_bmap
 * Fix stale delalloc punching for COW IO
 
 Signed-off-by: Carlos Maiolino <cem@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iJUEABMJAB0WIQQMHYkcUKcy4GgPe2RGdaER5QtfpgUCZw5LIwAKCRBGdaER5Qtf
 puRlAYDezbvs1dDSkKIGOt3inGdLptNAu4qniXBUkbYI9BzmtIVDueWP4Wo0dV3d
 gu3xrWQBfjFXdmEuBlwLuAFrp07AN18BVMj+DWCiEShsPHSoSPcF/IrDiz4BHvGv
 MKYq9CywFw==
 =Gj9b
 -----END PGP SIGNATURE-----

Merge tag 'xfs-6.12-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Carlos Maiolino:

 - Fix integer overflow in xrep_bmap

 - Fix stale delalloc punching for COW IO

* tag 'xfs-6.12-fixes-4' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: punch delalloc extents from the COW fork for COW writes
  xfs: set IOMAP_F_SHARED for all COW fork allocations
  xfs: share more code in xfs_buffered_write_iomap_begin
  xfs: support the COW fork in xfs_bmap_punch_delalloc_range
  xfs: IOMAP_ZERO and IOMAP_UNSHARE already hold invalidate_lock
  xfs: take XFS_MMAPLOCK_EXCL xfs_file_write_zero_eof
  xfs: factor out a xfs_file_write_zero_eof helper
  iomap: move locking out of iomap_write_delalloc_release
  iomap: remove iomap_file_buffered_write_punch_delalloc
  iomap: factor out a iomap_last_written_block helper
  xfs: fix integer overflow in xrep_bmap
This commit is contained in:
Linus Torvalds 2024-10-18 11:28:39 -07:00
commit 568570fdf2
9 changed files with 199 additions and 165 deletions

View File

@ -208,7 +208,7 @@ The filesystem must arrange to `cancel
such `reservations such `reservations
<https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/>`_ <https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/>`_
because writeback will not consume the reservation. because writeback will not consume the reservation.
The ``iomap_file_buffered_write_punch_delalloc`` can be called from a The ``iomap_write_delalloc_release`` can be called from a
``->iomap_end`` function to find all the clean areas of the folios ``->iomap_end`` function to find all the clean areas of the folios
caching a fresh (``IOMAP_F_NEW``) delalloc mapping. caching a fresh (``IOMAP_F_NEW``) delalloc mapping.
It takes the ``invalidate_lock``. It takes the ``invalidate_lock``.

View File

@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode,
} }
/* /*
* When a short write occurs, the filesystem might need to use ->iomap_end
* to remove space reservations created in ->iomap_begin.
*
* For filesystems that use delayed allocation, there can be dirty pages over
* the delalloc extent outside the range of a short write but still within the
* delalloc extent allocated for this iomap if the write raced with page
* faults.
*
* Punch out all the delalloc blocks in the range given except for those that * Punch out all the delalloc blocks in the range given except for those that
* have dirty data still pending in the page cache - those are going to be * have dirty data still pending in the page cache - those are going to be
* written and so must still retain the delalloc backing for writeback. * written and so must still retain the delalloc backing for writeback.
* *
* The punch() callback *must* only punch delalloc extents in the range passed
* to it. It must skip over all other types of extents in the range and leave
* them completely unchanged. It must do this punch atomically with respect to
* other extent modifications.
*
* The punch() callback may be called with a folio locked to prevent writeback
* extent allocation racing at the edge of the range we are currently punching.
* The locked folio may or may not cover the range being punched, so it is not
* safe for the punch() callback to lock folios itself.
*
* Lock order is:
*
* inode->i_rwsem (shared or exclusive)
* inode->i_mapping->invalidate_lock (exclusive)
* folio_lock()
* ->punch
* internal filesystem allocation lock
*
* As we are scanning the page cache for data, we don't need to reimplement the * As we are scanning the page cache for data, we don't need to reimplement the
* wheel - mapping_seek_hole_data() does exactly what we need to identify the * wheel - mapping_seek_hole_data() does exactly what we need to identify the
* start and end of data ranges correctly even for sub-folio block sizes. This * start and end of data ranges correctly even for sub-folio block sizes. This
@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode,
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
* the code to subtle off-by-one bugs.... * the code to subtle off-by-one bugs....
*/ */
static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
loff_t end_byte, unsigned flags, struct iomap *iomap, loff_t end_byte, unsigned flags, struct iomap *iomap,
iomap_punch_t punch) iomap_punch_t punch)
{ {
@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
loff_t scan_end_byte = min(i_size_read(inode), end_byte); loff_t scan_end_byte = min(i_size_read(inode), end_byte);
/* /*
* Lock the mapping to avoid races with page faults re-instantiating * The caller must hold invalidate_lock to avoid races with page faults
* folios and dirtying them via ->page_mkwrite whilst we walk the * re-instantiating folios and dirtying them via ->page_mkwrite whilst
* cache and perform delalloc extent removal. Failing to do this can * we walk the cache and perform delalloc extent removal. Failing to do
* leave dirty pages with no space reservation in the cache. * this can leave dirty pages with no space reservation in the cache.
*/ */
filemap_invalidate_lock(inode->i_mapping); lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);
while (start_byte < scan_end_byte) { while (start_byte < scan_end_byte) {
loff_t data_end; loff_t data_end;
@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
if (start_byte == -ENXIO || start_byte == scan_end_byte) if (start_byte == -ENXIO || start_byte == scan_end_byte)
break; break;
if (WARN_ON_ONCE(start_byte < 0)) if (WARN_ON_ONCE(start_byte < 0))
goto out_unlock; return;
WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte < punch_start_byte);
WARN_ON_ONCE(start_byte > scan_end_byte); WARN_ON_ONCE(start_byte > scan_end_byte);
@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
scan_end_byte, SEEK_HOLE); scan_end_byte, SEEK_HOLE);
if (WARN_ON_ONCE(data_end < 0)) if (WARN_ON_ONCE(data_end < 0))
goto out_unlock; return;
/* /*
* If we race with post-direct I/O invalidation of the page cache, * If we race with post-direct I/O invalidation of the page cache,
@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
if (punch_start_byte < end_byte) if (punch_start_byte < end_byte)
punch(inode, punch_start_byte, end_byte - punch_start_byte, punch(inode, punch_start_byte, end_byte - punch_start_byte,
iomap); iomap);
out_unlock:
filemap_invalidate_unlock(inode->i_mapping);
} }
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
/*
* When a short write occurs, the filesystem may need to remove reserved space
* that was allocated in ->iomap_begin from it's ->iomap_end method. For
* filesystems that use delayed allocation, we need to punch out delalloc
* extents from the range that are not dirty in the page cache. As the write can
* race with page faults, there can be dirty pages over the delalloc extent
* outside the range of a short write but still within the delalloc extent
* allocated for this iomap.
*
* This function uses [start_byte, end_byte) intervals (i.e. open ended) to
* simplify range iterations.
*
* The punch() callback *must* only punch delalloc extents in the range passed
* to it. It must skip over all other types of extents in the range and leave
* them completely unchanged. It must do this punch atomically with respect to
* other extent modifications.
*
* The punch() callback may be called with a folio locked to prevent writeback
* extent allocation racing at the edge of the range we are currently punching.
* The locked folio may or may not cover the range being punched, so it is not
* safe for the punch() callback to lock folios itself.
*
* Lock order is:
*
* inode->i_rwsem (shared or exclusive)
* inode->i_mapping->invalidate_lock (exclusive)
* folio_lock()
* ->punch
* internal filesystem allocation lock
*/
void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
loff_t pos, loff_t length, ssize_t written, unsigned flags,
struct iomap *iomap, iomap_punch_t punch)
{
loff_t start_byte;
loff_t end_byte;
unsigned int blocksize = i_blocksize(inode);
if (iomap->type != IOMAP_DELALLOC)
return;
/* If we didn't reserve the blocks, we're not allowed to punch them. */
if (!(iomap->flags & IOMAP_F_NEW))
return;
/*
* start_byte refers to the first unused block after a short write. If
* nothing was written, round offset down to point at the first block in
* the range.
*/
if (unlikely(!written))
start_byte = round_down(pos, blocksize);
else
start_byte = round_up(pos + written, blocksize);
end_byte = round_up(pos + length, blocksize);
/* Nothing to do if we've written the entire delalloc extent */
if (start_byte >= end_byte)
return;
iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
punch);
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
static loff_t iomap_unshare_iter(struct iomap_iter *iter) static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{ {

View File

@ -801,7 +801,7 @@ xrep_bmap(
{ {
struct xrep_bmap *rb; struct xrep_bmap *rb;
char *descr; char *descr;
unsigned int max_bmbt_recs; xfs_extnum_t max_bmbt_recs;
bool large_extcount; bool large_extcount;
int error = 0; int error = 0;

View File

@ -116,7 +116,7 @@ xfs_end_ioend(
if (unlikely(error)) { if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) { if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, offset, xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size); offset + size);
} }
goto done; goto done;
@ -456,7 +456,7 @@ xfs_discard_folio(
* byte of the next folio. Hence the end offset is only dependent on the * byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in. * folio itself and not the start offset that is passed in.
*/ */
xfs_bmap_punch_delalloc_range(ip, pos, xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio)); folio_pos(folio) + folio_size(folio));
} }

View File

@ -442,11 +442,12 @@ out_unlock_iolock:
void void
xfs_bmap_punch_delalloc_range( xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip, struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte, xfs_off_t start_byte,
xfs_off_t end_byte) xfs_off_t end_byte)
{ {
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = &ip->i_df; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del; struct xfs_bmbt_irec got, del;
@ -474,11 +475,14 @@ xfs_bmap_punch_delalloc_range(
continue; continue;
} }
xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del); xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
if (!xfs_iext_get_extent(ifp, &icur, &got)) if (!xfs_iext_get_extent(ifp, &icur, &got))
break; break;
} }
if (whichfork == XFS_COW_FORK && !ifp->if_bytes)
xfs_inode_clear_cowblocks_tag(ip);
out_unlock: out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_iunlock(ip, XFS_ILOCK_EXCL);
} }
@ -580,7 +584,7 @@ xfs_free_eofblocks(
*/ */
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
if (ip->i_delayed_blks) { if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
LLONG_MAX); LLONG_MAX);
} }

View File

@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
} }
#endif /* CONFIG_XFS_RT */ #endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
xfs_off_t start_byte, xfs_off_t end_byte); xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap { struct kgetbmap {

View File

@ -347,10 +347,83 @@ xfs_file_splice_read(
return ret; return ret;
} }
/*
* Take care of zeroing post-EOF blocks when they might exist.
*
* @iocb supplies the write position (ki_pos) and the IOCB_* flags, @from is the
* data iterator, @iolock points at the XFS_IOLOCK_* mode currently held (it is
* updated to XFS_IOLOCK_EXCL if the lock is upgraded here), @count is the
* length @from is re-expanded to after the iolock has been cycled, and
* @drained_dio records whether an earlier pass already waited for in-flight
* direct I/O.
*
* Returns 0 if the write does not start beyond EOF and nothing needs zeroing,
* a negative error for a failure (-EAGAIN if zeroing is needed but IOCB_NOWAIT
* is set), or 1 if this function waited for direct I/O to drain - upgrading a
* shared iolock to exclusive on the way - and the caller needs to restart the
* write sanity checks. Once the drain has been done, the return value is
* whatever xfs_zero_range() returns.
*/
static ssize_t
xfs_file_write_zero_eof(
struct kiocb *iocb,
struct iov_iter *from,
unsigned int *iolock,
size_t count,
bool *drained_dio)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
int error;
/*
* We need to serialise against EOF updates that occur in IO completions
* here. We want to make sure that nobody is changing the size while
* we do this check until we have placed an IO barrier (i.e. hold
* XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
* spinlock effectively forms a memory barrier once we have
* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
* hence be able to correctly determine if we need to run zeroing.
*/
spin_lock(&ip->i_flags_lock);
isize = i_size_read(VFS_I(ip));
if (iocb->ki_pos <= isize) {
spin_unlock(&ip->i_flags_lock);
return 0;
}
spin_unlock(&ip->i_flags_lock);
/*
* Everything below may have to wait (lock cycling, DIO drain, zeroing),
* so non-blocking callers are sent back with -EAGAIN.
*/
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
if (!*drained_dio) {
/*
* If zeroing is needed and we are currently holding the iolock
* shared, we need to update it to exclusive which implies
* having to redo all checks before.
*/
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, *iolock);
iov_iter_reexpand(from, count);
}
/*
* We now have an IO submission barrier in place, but AIO can do
* EOF updates during IO completion and hence we now need to
* wait for all of them to drain. Non-AIO DIO will have drained
* before we are given the XFS_IOLOCK_EXCL, and so for most
* cases this wait is a no-op.
*/
inode_dio_wait(VFS_I(ip));
*drained_dio = true;
/*
* Ask the caller to redo its checks; a later call with *drained_dio
* set skips this block and goes straight to the zeroing below.
*/
return 1;
}
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
/* Hold the mmaplock exclusively for the duration of the zeroing. */
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
}
/* /*
* Common pre-write limit and setup checks. * Common pre-write limit and setup checks.
* *
* Called with the iolocked held either shared and exclusive according to * Called with the iolock held either shared and exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive * @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size. * if called for a direct write beyond i_size.
*/ */
@ -360,13 +433,10 @@ xfs_file_write_checks(
struct iov_iter *from, struct iov_iter *from,
unsigned int *iolock) unsigned int *iolock)
{ {
struct file *file = iocb->ki_filp; struct inode *inode = iocb->ki_filp->f_mapping->host;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from); size_t count = iov_iter_count(from);
bool drained_dio = false; bool drained_dio = false;
loff_t isize; ssize_t error;
restart: restart:
error = generic_write_checks(iocb, from); error = generic_write_checks(iocb, from);
@ -389,7 +459,7 @@ restart:
* exclusively. * exclusively.
*/ */
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
xfs_iunlock(ip, *iolock); xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL; *iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock); error = xfs_ilock_iocb(iocb, *iolock);
if (error) { if (error) {
@ -400,64 +470,24 @@ restart:
} }
/* /*
* If the offset is beyond the size of the file, we need to zero any * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this * blocks that fall between the existing EOF and the start of this
* write. If zeroing is needed and we are currently holding the iolock * write.
* shared, we need to update it to exclusive which implies having to
* redo all checks before.
* *
* We need to serialise against EOF updates that occur in IO completions * We can do an unlocked check for i_size here safely as I/O completion
* here. We want to make sure that nobody is changing the size while we * can only extend EOF. Truncate is locked out at this point, so the
* do this check until we have placed an IO barrier (i.e. hold the * EOF can not move backwards, only forwards. Hence we only need to take
* XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The * the slow path when we are at or beyond the current EOF.
* spinlock effectively forms a memory barrier once we have the
* XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
* hence be able to correctly determine if we need to run zeroing.
*
* We can do an unlocked check here safely as IO completion can only
* extend EOF. Truncate is locked out at this point, so the EOF can
* not move backwards, only forwards. Hence we only need to take the
* slow path and spin locks when we are at or beyond the current EOF.
*/ */
if (iocb->ki_pos <= i_size_read(inode)) if (iocb->ki_pos > i_size_read(inode)) {
goto out; error = xfs_file_write_zero_eof(iocb, from, iolock, count,
&drained_dio);
spin_lock(&ip->i_flags_lock); if (error == 1)
isize = i_size_read(inode);
if (iocb->ki_pos > isize) {
spin_unlock(&ip->i_flags_lock);
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
if (!drained_dio) {
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, *iolock);
iov_iter_reexpand(from, count);
}
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
* we now need to wait for all of them to drain. Non-AIO
* DIO will have drained before we are given the
* XFS_IOLOCK_EXCL, and so for most cases this wait is a
* no-op.
*/
inode_dio_wait(inode);
drained_dio = true;
goto restart; goto restart;
}
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error) if (error)
return error; return error;
} else }
spin_unlock(&ip->i_flags_lock);
out:
return kiocb_modified(iocb); return kiocb_modified(iocb);
} }

View File

@ -975,6 +975,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK; int allocfork = XFS_DATA_FORK;
int error = 0; int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL; unsigned int lockmode = XFS_ILOCK_EXCL;
unsigned int iomap_flags = 0;
u64 seq; u64 seq;
if (xfs_is_shutdown(mp)) if (xfs_is_shutdown(mp))
@ -1145,6 +1146,11 @@ xfs_buffered_write_iomap_begin(
} }
} }
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
iomap_flags |= IOMAP_F_NEW;
if (allocfork == XFS_COW_FORK) { if (allocfork == XFS_COW_FORK) {
error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
end_fsb - offset_fsb, prealloc_blocks, &cmap, end_fsb - offset_fsb, prealloc_blocks, &cmap,
@ -1162,19 +1168,11 @@ xfs_buffered_write_iomap_begin(
if (error) if (error)
goto out_unlock; goto out_unlock;
/*
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
found_imap: found_imap:
seq = xfs_iomap_inode_sequence(ip, 0); seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode); xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
convert_delay: convert_delay:
xfs_iunlock(ip, lockmode); xfs_iunlock(ip, lockmode);
@ -1188,20 +1186,20 @@ convert_delay:
return 0; return 0;
found_cow: found_cow:
seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) { if (imap.br_startoff <= offset_fsb) {
error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0,
xfs_iomap_inode_sequence(ip, 0));
if (error) if (error)
goto out_unlock; goto out_unlock;
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); } else {
xfs_iunlock(ip, lockmode); xfs_trim_extent(&cmap, offset_fsb,
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, imap.br_startoff - offset_fsb);
IOMAP_F_SHARED, seq);
} }
xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); iomap_flags |= IOMAP_F_SHARED;
seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode); xfs_iunlock(ip, lockmode);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq);
out_unlock: out_unlock:
xfs_iunlock(ip, lockmode); xfs_iunlock(ip, lockmode);
@ -1215,7 +1213,10 @@ xfs_buffered_write_delalloc_punch(
loff_t length, loff_t length,
struct iomap *iomap) struct iomap *iomap)
{ {
xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
offset, offset + length);
} }
static int static int
@ -1227,8 +1228,30 @@ xfs_buffered_write_iomap_end(
unsigned flags, unsigned flags,
struct iomap *iomap) struct iomap *iomap)
{ {
iomap_file_buffered_write_punch_delalloc(inode, offset, length, written, loff_t start_byte, end_byte;
flags, iomap, &xfs_buffered_write_delalloc_punch);
/* If we didn't reserve the blocks, we're not allowed to punch them. */
if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
return 0;
/* Nothing to do if we've written the entire delalloc extent */
start_byte = iomap_last_written_block(inode, offset, written);
end_byte = round_up(offset + length, i_blocksize(inode));
if (start_byte >= end_byte)
return 0;
/* For zeroing operations the callers already hold invalidate_lock. */
if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
rwsem_assert_held_write(&inode->i_mapping->invalidate_lock);
iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
iomap, xfs_buffered_write_delalloc_punch);
} else {
filemap_invalidate_lock(inode->i_mapping);
iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
iomap, xfs_buffered_write_delalloc_punch);
filemap_invalidate_unlock(inode->i_mapping);
}
return 0; return 0;
} }
@ -1435,6 +1458,8 @@ xfs_zero_range(
{ {
struct inode *inode = VFS_I(ip); struct inode *inode = VFS_I(ip);
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
if (IS_DAX(inode)) if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero, return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops); &xfs_dax_write_iomap_ops);

View File

@ -256,6 +256,20 @@ static inline const struct iomap *iomap_iter_srcmap(const struct iomap_iter *i)
return &i->iomap; return &i->iomap;
} }
/*
 * Compute the block-aligned file offset at which the untouched part of a
 * write range begins after a short write.
 *
 * When @written is zero the result is @pos rounded down to the inode's block
 * size, so the block containing @pos is part of the untouched range.
 * Otherwise the result is @pos + @written rounded up, which steps past the
 * block that was only partially written.
 */
static inline loff_t iomap_last_written_block(struct inode *inode, loff_t pos,
		ssize_t written)
{
	const unsigned int blocksize = i_blocksize(inode);
	loff_t boundary;

	if (unlikely(!written))
		boundary = round_down(pos, blocksize);
	else
		boundary = round_up(pos + written, blocksize);
	return boundary;
}
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops, void *private); const struct iomap_ops *ops, void *private);
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
@ -276,9 +290,9 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap); struct iomap *iomap);
void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
loff_t length, ssize_t written, unsigned flag, loff_t end_byte, unsigned flags, struct iomap *iomap,
struct iomap *iomap, iomap_punch_t punch); iomap_punch_t punch);
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len, const struct iomap_ops *ops); u64 start, u64 len, const struct iomap_ops *ops);