1

for-6.12/io_uring-discard-20240913

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmbkboUQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpj7DD/oDqQ13NOHuotVbufPRDWuG6+UEaN/Pukp/
 RYDWwYu/DB4v7LVWBV9COqN5jQqY2wrMpgBdZqtEnDtC7yjN6QYAT4TQdfIq/HNo
 NooN4ULmJzOOC6sR9MBGyzOsCbz7kmRt1nBZ7vdEXMrLXeX9JDX3bDrELf7jhKsk
 84lKE/Mxs530LSzxAtN9KaOQncK5gXen4WSrZsYraU2vJFAPBkJwQGAL5pOdmsp9
 NqvNE3QonPr4v99XnDJH80q44afuqffUITPjtGX52tBMO3CCUQFUpZp5fiUjfa1v
 Okz+SyeBE6gB7c008BGqTOgmKdQOMs3uwFDQ/xMw+pYwy+wHH4skzPP776DwAdgn
 C/SaVFsaXkqOXX4f+CiNJ01LmD4EOBy16LM5qE4NwLNpjQu/3EdHjNqaYfM/LCca
 YyQoUOsnYIRj21+oNFpKekscuEAPKG9ewyMyvfxbkk167j00lgwVwybb/2JfYvRJ
 i0GBY5phJnkeNUerU9SDm6RBTAjDOZ0stubTtFjugDZdrz2FmA4pBFGWjgYLiLhH
 3ZCyaCAOoYW8yxxkogTzKbLx6wXb5wgS7jTHgsk+eeSSWRBTnv2sd0fn/D5m3Uw7
 uBHKvauDp3zEd9MdF26QG7U6RlojEbVoyTYjnJskPsClxbch4WSpwvoEILdJRvls
 1dTczxgdyw==
 =wlzo
 -----END PGP SIGNATURE-----

Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux

Pull io_uring async discard support from Jens Axboe:
 "Sitting on top of both the 6.12 block and io_uring core branches,
  here's support for async discard through io_uring.

  This allows applications to issue async discards, rather than rely on
  the blocking sync ioctl discards we already have. The sync support is
  difficult to use outside of idle/cleanup periods.

  On a real (but slow) device, testing shows the following results when
  compared to sync discard:

	qd64 sync discard: 21K IOPS, lat avg 3 msec (max 21 msec)
	qd64 async discard: 76K IOPS, lat avg 845 usec (max 2.2 msec)

	qd64 sync discard: 14K IOPS, lat avg 5 msec (max 25 msec)
	qd64 async discard: 56K IOPS, lat avg 1153 usec (max 3.6 msec)

  and synthetic null_blk testing with the same queue depth and block
  size settings as above shows:

	Type    Trim size       IOPS    Lat avg (usec)  Lat Max (usec)
	==============================================================
	sync    4k               144K       444            20314
	async   4k              1353K        47              595
	sync    1M                56K      1136            21031
	async   1M                94K       680              760"

* tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux:
  block: implement async io_uring discard cmd
  block: introduce blk_validate_byte_range()
  filemap: introduce filemap_invalidate_pages
  io_uring/cmd: give inline space in request to cmds
  io_uring/cmd: expose iowq to cmds
This commit is contained in:
Linus Torvalds 2024-09-16 13:50:14 +02:00
commit adfc3ded5c
10 changed files with 209 additions and 24 deletions

View File

@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;

View File

@ -17,6 +17,7 @@
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/module.h>
#include <linux/io_uring/cmd.h>
#include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file)
@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC,
};

View File

@ -11,6 +11,9 @@
#include <linux/blktrace_api.h>
#include <linux/pr.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h"
static int blkpg_do_ioctl(struct block_device *bdev,
@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
/*
* Check that [start, start + len) is a valid range from the block device's
* perspective, including verifying that it can be correctly translated into
* logical block addresses.
*/
static int blk_validate_byte_range(struct block_device *bdev,
uint64_t start, uint64_t len)
{
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
uint64_t end;
if ((start | len) & bs_mask)
return -EINVAL;
if (!len)
return -EINVAL;
if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev))
return -EINVAL;
return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
uint64_t range[2], start, len, end;
uint64_t range[2], start, len;
struct bio *prev = NULL, *bio;
sector_t sector, nr_sects;
struct blk_plug plug;
int err;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT;
start = range[0];
len = range[1];
if (!len)
return -EINVAL;
if ((start | len) & bs_mask)
return -EINVAL;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (check_add_overflow(start, len, &end) ||
end > bdev_nr_bytes(bdev))
return -EINVAL;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;
filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1);
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err)
goto fail;
@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ret;
}
#endif
struct blk_iou_cmd {
int res;
bool nowait;
};
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
if (bic->res == -EAGAIN && bic->nowait)
io_uring_cmd_issue_blocking(cmd);
else
io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
}
static void bio_cmd_bio_end_io(struct bio *bio)
{
struct io_uring_cmd *cmd = bio->bi_private;
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
if (unlikely(bio->bi_status) && !bic->res)
bic->res = blk_status_to_errno(bio->bi_status);
io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
bio_put(bio);
}
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
struct block_device *bdev,
uint64_t start, uint64_t len, bool nowait)
{
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
sector_t sector = start >> SECTOR_SHIFT;
sector_t nr_sects = len >> SECTOR_SHIFT;
struct bio *prev = NULL, *bio;
int err;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
return -EBADF;
if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;
err = filemap_invalidate_pages(bdev->bd_mapping, start,
start + len - 1, nowait);
if (err)
return err;
while (true) {
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
if (!bio)
break;
if (nowait) {
/*
* Don't allow multi-bio non-blocking submissions as
* subsequent bios may fail but we won't get a direct
* indication of that. Normally, the caller should
* retry from a blocking context.
*/
if (unlikely(nr_sects)) {
bio_put(bio);
return -EAGAIN;
}
bio->bi_opf |= REQ_NOWAIT;
}
prev = bio_chain_and_submit(prev, bio);
}
if (unlikely(!prev))
return -EAGAIN;
if (unlikely(nr_sects))
bic->res = -EAGAIN;
prev->bi_private = cmd;
prev->bi_end_io = bio_cmd_bio_end_io;
submit_bio(prev);
return -EIOCBQUEUED;
}
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
const struct io_uring_sqe *sqe = cmd->sqe;
u32 cmd_op = cmd->cmd_op;
uint64_t start, len;
if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
sqe->rw_flags || sqe->file_index))
return -EINVAL;
bic->res = 0;
bic->nowait = issue_flags & IO_URING_F_NONBLOCK;
start = READ_ONCE(sqe->addr);
len = READ_ONCE(sqe->addr3);
switch (cmd_op) {
case BLOCK_URING_CMD_DISCARD:
return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait);
}
return -EINVAL;
}

View File

@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
return sqe->cmd;
}
static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
{
BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
}
#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
((pdu_type *)&(cmd)->pdu) \
)
#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags);
/* Execute the request from a blocking context */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
}
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
}
#endif
/*

View File

@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait);
int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);

View File

@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_BLKDEV_H
#define _UAPI_LINUX_BLKDEV_H
#include <linux/ioctl.h>
#include <linux/types.h>
/*
* io_uring block file commands, see IORING_OP_URING_CMD.
* It's a different number space from ioctl(), reuse the block's code 0x12.
*/
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
#endif

View File

@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req)
io_queue_linked_timeout(link);
}
static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts)
{
io_queue_iowq(req);
}
void io_req_queue_iowq(struct io_kiocb *req)
{
req->io_task_work.func = io_req_queue_iowq_tw;
io_req_task_work_add(req);
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{
while (!list_empty(&ctx->defer_list)) {

View File

@ -94,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task,
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
void io_req_queue_iowq(struct io_kiocb *req);
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);

View File

@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
io_req_queue_iowq(req);
}
static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd,
unsigned int issue_flags)

View File

@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
loff_t pos = iocb->ki_pos;
loff_t end = pos + count - 1;
int ret;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (nowait) {
/* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN;
@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT);
}
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
return filemap_invalidate_pages(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1,
iocb->ki_flags & IOCB_NOWAIT);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/**