From c79f52f0656eeb3e4a12f7f358f760077ae111b6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 27 Jan 2024 13:44:58 -0700 Subject: [PATCH 1/6] io_uring/rw: ensure poll based multishot read retries appropriately io_read_mshot() always relies on poll triggering retries, and this works fine as long as we do a retry per size of the buffer being read. The buffer size is given by the size of the buffer(s) in the given buffer group ID. But if we're reading less than what is available, then we don't always get to read everything that is available. For example, if the buffers available are 32 bytes and we have 64 bytes to read, then we'll correctly read the first 32 bytes and then wait for another poll trigger before we attempt the next read. This next poll trigger may never happen, in which case we just sit forever and never make progress, or it may trigger at some point in the future, and now we're just delivering the available data much later than we should have. io_read_mshot() could do retries itself, but that is wasteful as we'll be going through all of __io_read() again, and most likely in vain. Rather than do that, bump our poll reference count and have io_poll_check_events() do one more loop and check with vfs_poll() if we have more data to read. If we do, io_read_mshot() will get invoked again directly and we'll read the next chunk. io_poll_multishot_retry() must only get called from inside io_poll_issue(), which is our multishot retry handler, as we know we already "own" the request at this point. 
Cc: stable@vger.kernel.org Link: https://github.com/axboe/liburing/issues/1041 Fixes: fc68fcda0491 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Jens Axboe --- io_uring/poll.h | 9 +++++++++ io_uring/rw.c | 10 +++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/io_uring/poll.h b/io_uring/poll.h index ff4d5d753387..1dacae9e816c 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,6 +24,15 @@ struct async_poll { struct io_poll *double_poll; }; +/* + * Must only be called inside issue_flags & IO_URING_F_MULTISHOT, or + * potentially other cases where we already "own" this poll request. + */ +static inline void io_poll_multishot_retry(struct io_kiocb *req) +{ + atomic_inc(&req->poll_refs); +} + int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/rw.c b/io_uring/rw.c index 118cc9f1cf16..d5e79d9bdc71 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -18,6 +18,7 @@ #include "opdef.h" #include "kbuf.h" #include "rsrc.h" +#include "poll.h" #include "rw.h" struct io_rw { @@ -962,8 +963,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, cflags | IORING_CQE_F_MORE)) { - if (issue_flags & IO_URING_F_MULTISHOT) + if (issue_flags & IO_URING_F_MULTISHOT) { + /* + * Force retry, as we might have more data to + * be read and otherwise it won't get retried + * until (if ever) another poll is triggered. + */ + io_poll_multishot_retry(req); return IOU_ISSUE_SKIP_COMPLETE; + } return -EAGAIN; } } From e84b01a880f635e3084a361afba41f95ff500d12 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Jan 2024 11:52:54 -0700 Subject: [PATCH 2/6] io_uring/poll: move poll execution helpers higher up In preparation for calling __io_poll_execute() higher up, move the functions to avoid forward declarations. 
No functional changes in this patch. Signed-off-by: Jens Axboe --- io_uring/poll.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index d59b74a99d4e..785a5b191003 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -228,6 +228,26 @@ enum { IOU_POLL_REISSUE = 3, }; +static void __io_poll_execute(struct io_kiocb *req, int mask) +{ + unsigned flags = 0; + + io_req_set_res(req, mask, 0); + req->io_task_work.func = io_poll_task_func; + + trace_io_uring_task_add(req, mask); + + if (!(req->flags & REQ_F_POLL_NO_LAZY)) + flags = IOU_F_TWQ_LAZY_WAKE; + __io_req_task_work_add(req, flags); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res); +} + /* * All poll tw should go through this. Checks for poll events, manages * references, does rewait, etc. @@ -364,26 +384,6 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } } -static void __io_poll_execute(struct io_kiocb *req, int mask) -{ - unsigned flags = 0; - - io_req_set_res(req, mask, 0); - req->io_task_work.func = io_poll_task_func; - - trace_io_uring_task_add(req, mask); - - if (!(req->flags & REQ_F_POLL_NO_LAZY)) - flags = IOU_F_TWQ_LAZY_WAKE; - __io_req_task_work_add(req, flags); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res); -} - static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); From 91e5d765a82fb2c9d0b7ad930d8953208081ddf1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Jan 2024 11:54:18 -0700 Subject: [PATCH 3/6] io_uring/net: un-indent mshot retry path in io_recv_finish() In preparation for putting some retry logic in there, have the done path just skip straight to the end rather than have too much nesting in here. No functional changes in this patch. 
Signed-off-by: Jens Axboe --- io_uring/net.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 75d494dad7e2..740c6bfa5b59 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -645,23 +645,27 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, return true; } - if (!mshot_finished) { - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { - io_recv_prep_retry(req); - /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || - msg->msg_inq == -1) - return false; - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_ISSUE_SKIP_COMPLETE; - else - *ret = -EAGAIN; - return true; - } - /* Otherwise stop multishot but use the current result. */ - } + if (mshot_finished) + goto finish; + /* + * Fill CQE for this receive and see if we should keep trying to + * receive from this socket. + */ + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE)) { + io_recv_prep_retry(req); + /* Known not-empty or unknown state, retry */ + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) + return false; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_ISSUE_SKIP_COMPLETE; + else + *ret = -EAGAIN; + return true; + } + /* Otherwise stop multishot but use the current result. */ +finish: io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) From 704ea888d646cb9d715662944cf389c823252ee0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Jan 2024 11:57:11 -0700 Subject: [PATCH 4/6] io_uring/poll: add requeue return code from poll multishot handling Since our poll handling is edge triggered, multishot handlers retry internally until they know that no more data is available. 
In preparation for limiting these retries, add an internal return code, IOU_REQUEUE, which can be used to inform the poll backend about the handler wanting to retry, but that this should happen through a normal task_work requeue rather than keep hammering on the issue side for this one request. No functional changes in this patch, nobody is using this return code just yet. Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 8 +++++++- io_uring/poll.c | 9 ++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 04e33f25919c..d5495710c178 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,11 +15,17 @@ #include #endif - enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, + /* + * Requeue the task_work to restart operations on this request. The + * actual value isn't important, should just be not an otherwise + * valid error code, yet less than -MAX_ERRNO and valid internally. + */ + IOU_REQUEUE = -3072, + /* * Intended only when both IO_URING_F_MULTISHOT is passed * to indicate to the poll runner that multishot should be diff --git a/io_uring/poll.c b/io_uring/poll.c index 785a5b191003..7513afc7b702 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -226,6 +226,7 @@ enum { IOU_POLL_NO_ACTION = 1, IOU_POLL_REMOVE_POLL_USE_RES = 2, IOU_POLL_REISSUE = 3, + IOU_POLL_REQUEUE = 4, }; static void __io_poll_execute(struct io_kiocb *req, int mask) @@ -329,6 +330,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; + else if (ret == IOU_REQUEUE) + return IOU_POLL_REQUEUE; if (ret < 0) return ret; } @@ -351,8 +354,12 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) int ret; ret = io_poll_check_events(req, ts); - if (ret == IOU_POLL_NO_ACTION) + if (ret == IOU_POLL_NO_ACTION) { return; + } else if (ret == IOU_POLL_REQUEUE) { + 
__io_poll_execute(req, 0); + return; + } io_poll_remove_entries(req); io_poll_tw_hash_eject(req, ts); From 76b367a2d83163cf19173d5cb0b562acbabc8eac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 29 Jan 2024 12:00:58 -0700 Subject: [PATCH 5/6] io_uring/net: limit inline multishot retries If we have multiple clients and some/all are flooding the receives to such an extent that we can retry a LOT handling multishot receives, then we can be starving some clients and hence serving traffic in an imbalanced fashion. Limit multishot retry attempts to some arbitrary value, whose only purpose is to ensure that we don't keep serving a single connection for way too long. We default to 32 retries, which should be more than enough to provide fairness, yet not so small that we'll spend too much time requeuing rather than handling traffic. Cc: stable@vger.kernel.org Depends-on: 704ea888d646 ("io_uring/poll: add requeue return code from poll multishot handling") Depends-on: 91e5d765a82f ("io_uring/net: un-indent mshot retry path in io_recv_finish()") Depends-on: e84b01a880f6 ("io_uring/poll: move poll execution helpers higher up") Fixes: b3fdea6ecb55 ("io_uring: multishot recv") Fixes: 9bb66906f23e ("io_uring: support multishot in recvmsg") Link: https://github.com/axboe/liburing/issues/1043 Signed-off-by: Jens Axboe --- io_uring/net.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 740c6bfa5b59..a12ff69e6843 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -60,6 +60,7 @@ struct io_sr_msg { unsigned len; unsigned done_io; unsigned msg_flags; + unsigned nr_multishot_loops; u16 flags; /* initialised and used only by !msg send variants */ u16 addr_len; @@ -70,6 +71,13 @@ struct io_sr_msg { struct io_kiocb *notif; }; +/* + * Number of times we'll try and do receives if there's more data. If we + * exceed this limit, then add us to the back of the queue and retry from + * there. 
This helps fairness between flooding clients. + */ +#define MULTISHOT_MAX_RETRY 32 + static inline bool io_check_multishot(struct io_kiocb *req, unsigned int issue_flags) { @@ -611,6 +619,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->done_io = 0; + sr->nr_multishot_loops = 0; return 0; } @@ -654,12 +663,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, */ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, *ret, cflags | IORING_CQE_F_MORE)) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; + io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) - return false; + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq == -1) { + if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) + return false; + /* mshot retries exceeded, force a requeue */ + sr->nr_multishot_loops = 0; + mshot_retry_ret = IOU_REQUEUE; + } if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_ISSUE_SKIP_COMPLETE; + *ret = mshot_retry_ret; else *ret = -EAGAIN; return true; From 72bd80252feeb3bef8724230ee15d9f7ab541c6e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Feb 2024 06:42:36 -0700 Subject: [PATCH 6/6] io_uring/net: fix sr->len for IORING_OP_RECV with MSG_WAITALL and buffers If we use IORING_OP_RECV with provided buffers and pass in '0' as the length of the request, the length is retrieved from the selected buffer. If MSG_WAITALL is also set and we get a short receive, then we may hit the retry path which decrements sr->len and increments the buffer for a retry. However, the length is still zero at this point, which means that sr->len now becomes huge and import_ubuf() will cap it to MAX_RW_COUNT and subsequently return -EFAULT for the range as a whole. Fix this by always assigning sr->len once the buffer has been selected. 
Cc: stable@vger.kernel.org Fixes: 7ba89d2af17a ("io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly") Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index a12ff69e6843..43bc9a5f96f9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -923,6 +923,7 @@ retry_multishot: if (!buf) return -ENOBUFS; sr->buf = buf; + sr->len = len; } ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter);