1

for-6.11/io_uring-20240714

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmaTgusQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpr+1EAC4I7pRAM341sfmhe/9QQKMM8VzGwy5Tlr1
 AFLO3BujRTl6X8S9fQjIjN1coW6u4F42I19+vVlxqvB7CUnqt9VWpexEjxe4K0FR
 R+hIZW+fWV9K/eMrcsLcI7oReN5kIihHOzzy3wz0rENoGB5dCl6JAZMHDUCSqP0/
 ZJJQ5ut8ah20Y/myHnzP5o4TfdE7nGo73Di2YoE2g3KqeX/dlAKW9+5hqKzzrHhM
 2U25k/6KLy0ROzKpy2qW0QRE3pT5udoHLK2ue9+XwXF8JWVTlfVkHBzGY7NstyyT
 z07SEzW1q4xV1HdCwGDAU7cL2NJMRXSG0p2WZTm8QyaVTdsZQvEx08GLsVdLvFH5
 Gg+oOaxVE+INzW+/Lwz7lFHgq6XEjdAlEAOXDtGkZoni6Rt6iCzFCW6RTf/guy8o
 Cub7tatMyegxai9+FTN/oFVoydRR0tsMf0OHrWnLOperh9CaxAwXvmKFeT/UTwiB
 KIuIOJop7aThJbiV42a/xwTrEjNMZRv6uVBBEtJX3rxpmIhqTbjcAv9rKMmgtLMk
 s6yX1MvYdOLhhEDyoUBX0dJdEETBf3KbnYIwi8kb4Sbkw/ZDgnkmSxFysom61wUF
 byAFEpah3ZFR8aES0uNKUE6UHK6i5qqp0Za/n6gA927E/WGCU9ndaS+01gyknog0
 8FqFYwruHQ==
 =50CO
 -----END PGP SIGNATURE-----

Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring updates queued up for 6.11.

  Nothing major this time around, various minor improvements and
  cleanups/fixes. This contains:

   - Add bind/listen opcodes. Main motivation is to support direct
     descriptors, to avoid needing a regular fd just for doing these two
     operations (Gabriel)

   - Probe fixes (Gabriel)

   - Treat io-wq work flags as atomics. Not fixing a real issue, but may
     as well and it silences a KCSAN warning (me)

   - Cleanup of rsrc __set_current_state() usage (me)

   - Add 64-bit for {m,f}advise operations (me)

   - Improve performance of data ring messages (me)

   - Fix for ring message overflow posting (Pavel)

   - Fix for freezer interaction with TWA_NOTIFY_SIGNAL. Not strictly an
     io_uring thing, but since TWA_NOTIFY_SIGNAL was originally added
     for faster task_work signaling for io_uring, bundling it with this
     pull (Pavel)

   - Add Pavel as a co-maintainer

   - Various cleanups (me, Thorsten)"

* tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits)
  io_uring/net: check socket is valid in io_bind()/io_listen()
  kernel: rerun task_work while freezing in get_signal()
  io_uring/io-wq: limit retrying worker initialisation
  io_uring/napi: Remove unnecessary s64 cast
  io_uring/net: cleanup io_recv_finish() bundle handling
  io_uring/msg_ring: fix overflow posting
  MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer
  io_uring/msg_ring: use kmem_cache_free() to free request
  io_uring/msg_ring: check for dead submitter task
  io_uring/msg_ring: add an alloc cache for io_kiocb entries
  io_uring/msg_ring: improve handling of target CQE posting
  io_uring: add io_add_aux_cqe() helper
  io_uring: add remote task_work execution helper
  io_uring/msg_ring: tighten requirement for remote posting
  io_uring: Allocate only necessary memory in io_probe
  io_uring: Fix probe of disabled operations
  io_uring: Introduce IORING_OP_LISTEN
  io_uring: Introduce IORING_OP_BIND
  net: Split a __sys_listen helper for io_uring
  net: Split a __sys_bind helper for io_uring
  ...
This commit is contained in:
Linus Torvalds 2024-07-15 13:49:10 -07:00
commit 3a56e24173
23 changed files with 547 additions and 319 deletions

View File

@ -11551,7 +11551,7 @@ F: include/linux/iosys-map.h
IO_URING IO_URING
M: Jens Axboe <axboe@kernel.dk> M: Jens Axboe <axboe@kernel.dk>
R: Pavel Begunkov <asml.silence@gmail.com> M: Pavel Begunkov <asml.silence@gmail.com>
L: io-uring@vger.kernel.org L: io-uring@vger.kernel.org
S: Maintained S: Maintained
T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/linux-block

View File

@ -50,7 +50,7 @@ struct io_wq_work_list {
struct io_wq_work { struct io_wq_work {
struct io_wq_work_node list; struct io_wq_work_node list;
unsigned flags; atomic_t flags;
/* place it here instead of io_kiocb as it fills padding and saves 4B */ /* place it here instead of io_kiocb as it fills padding and saves 4B */
int cancel_seq; int cancel_seq;
}; };
@ -210,14 +210,6 @@ struct io_submit_state {
struct blk_plug plug; struct blk_plug plug;
}; };
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
atomic_t refs;
atomic_t ops;
};
struct io_alloc_cache { struct io_alloc_cache {
void **entries; void **entries;
unsigned int nr_cached; unsigned int nr_cached;
@ -372,7 +364,6 @@ struct io_ring_ctx {
struct io_restriction restrictions; struct io_restriction restrictions;
/* slow path rsrc auxilary data, used by update/register */ /* slow path rsrc auxilary data, used by update/register */
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data; struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data; struct io_rsrc_data *buf_data;
@ -405,6 +396,9 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work; struct callback_head poll_wq_task_work;
struct list_head defer_list; struct list_head defer_list;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
struct list_head napi_list; /* track busy poll napi_id */ struct list_head napi_list; /* track busy poll napi_id */
spinlock_t napi_lock; /* napi_list lock */ spinlock_t napi_lock; /* napi_list lock */

View File

@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
extern int __sys_socket(int family, int type, int protocol); extern int __sys_socket(int family, int type, int protocol);
extern struct file *__sys_socket_file(int family, int type, int protocol); extern struct file *__sys_socket_file(int family, int type, int protocol);
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen); extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
int addrlen);
extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr, extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
int addrlen, int file_flags); int addrlen, int file_flags);
extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
int addrlen); int addrlen);
extern int __sys_listen(int fd, int backlog); extern int __sys_listen(int fd, int backlog);
extern int __sys_listen_socket(struct socket *sock, int backlog);
extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len); int __user *usockaddr_len);
extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,

View File

@ -257,6 +257,8 @@ enum io_uring_op {
IORING_OP_FUTEX_WAITV, IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL, IORING_OP_FIXED_FD_INSTALL,
IORING_OP_FTRUNCATE, IORING_OP_FTRUNCATE,
IORING_OP_BIND,
IORING_OP_LISTEN,
/* this goes last, obviously */ /* this goes last, obviously */
IORING_OP_LAST, IORING_OP_LAST,

View File

@ -4,9 +4,9 @@
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \ tctx.o filetable.o rw.o net.o poll.o \
uring_cmd.o openclose.o sqpoll.o \ eventfd.o uring_cmd.o openclose.o \
xattr.o nop.o fs.o splice.o sync.o \ sqpoll.o xattr.o nop.o fs.o splice.o \
msg_ring.o advise.o openclose.o \ sync.o msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \ epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \ cancel.o waitid.o register.o \
truncate.o memmap.o truncate.o memmap.o

View File

@ -17,14 +17,14 @@
struct io_fadvise { struct io_fadvise {
struct file *file; struct file *file;
u64 offset; u64 offset;
u32 len; u64 len;
u32 advice; u32 advice;
}; };
struct io_madvise { struct io_madvise {
struct file *file; struct file *file;
u64 addr; u64 addr;
u32 len; u64 len;
u32 advice; u32 advice;
}; };
@ -33,10 +33,12 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
if (sqe->buf_index || sqe->off || sqe->splice_fd_in) if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL; return -EINVAL;
ma->addr = READ_ONCE(sqe->addr); ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->off);
if (!ma->len)
ma->len = READ_ONCE(sqe->len); ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice); ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC; req->flags |= REQ_F_FORCE_ASYNC;
@ -78,10 +80,12 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{ {
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL; return -EINVAL;
fa->offset = READ_ONCE(sqe->off); fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->addr);
if (!fa->len)
fa->len = READ_ONCE(sqe->len); fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice); fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa)) if (io_fadvise_force_async(fa))

160
io_uring/eventfd.c Normal file
View File

@ -0,0 +1,160 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/eventfd.h>
#include <linux/eventpoll.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io-wq.h"
#include "eventfd.h"
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
atomic_t refs;
atomic_t ops;
};
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
};
static void io_eventfd_free(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
static void io_eventfd_do_signal(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
if (atomic_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu);
}
void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd = NULL;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
guard(rcu)();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
* and eventfd_signal
*/
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists incase an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (unlikely(!ev_fd))
return;
if (!atomic_inc_not_zero(&ev_fd->refs))
return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
if (likely(eventfd_signal_allowed())) {
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
return;
}
}
out:
if (atomic_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
}
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
bool skip;
spin_lock(&ctx->completion_lock);
/*
* Eventfd should only get triggered when at least one event has been
* posted. Some applications rely on the eventfd notification count
* only changing IFF a new CQE has been added to the CQ ring. There's
* no depedency on 1:1 relationship between how many times this
* function is called (and hence the eventfd count) and number of CQEs
* posted to the CQ ring.
*/
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (skip)
return;
io_eventfd_signal(ctx);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (atomic_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0;
}
return -ENXIO;
}

8
io_uring/eventfd.h Normal file
View File

@ -0,0 +1,8 @@
struct io_ring_ctx;
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
void io_eventfd_signal(struct io_ring_ctx *ctx);

View File

@ -23,6 +23,7 @@
#include "io_uring.h" #include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ) #define WORKER_IDLE_TIMEOUT (5 * HZ)
#define WORKER_INIT_LIMIT 3
enum { enum {
IO_WORKER_F_UP = 0, /* up and active */ IO_WORKER_F_UP = 0, /* up and active */
@ -58,6 +59,7 @@ struct io_worker {
unsigned long create_state; unsigned long create_state;
struct callback_head create_work; struct callback_head create_work;
int init_retries;
union { union {
struct rcu_head rcu; struct rcu_head rcu;
@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
struct io_wq_work *work) struct io_wq_work *work)
{ {
return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND));
} }
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
static inline unsigned int io_get_work_hash(struct io_wq_work *work) static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{ {
return work->flags >> IO_WQ_HASH_SHIFT; return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT;
} }
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
next_hashed = wq_next_work(work); next_hashed = wq_next_work(work);
if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND)) if (do_kill &&
work->flags |= IO_WQ_WORK_CANCEL; (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work); wq->do_work(work);
io_assign_current_work(worker, NULL); io_assign_current_work(worker, NULL);
@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
return true; return true;
} }
static inline bool io_should_retry_thread(long err) static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{ {
/* /*
* Prevent perpetual task_work retry, if the task (or its group) is * Prevent perpetual task_work retry, if the task (or its group) is
@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
*/ */
if (fatal_signal_pending(current)) if (fatal_signal_pending(current))
return false; return false;
if (worker->init_retries++ >= WORKER_INIT_LIMIT)
return false;
switch (err) { switch (err) {
case -EAGAIN: case -EAGAIN:
@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
io_init_new_worker(wq, worker, tsk); io_init_new_worker(wq, worker, tsk);
io_worker_release(worker); io_worker_release(worker);
return; return;
} else if (!io_should_retry_thread(PTR_ERR(tsk))) { } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq_acct *acct = io_wq_get_acct(worker);
atomic_dec(&acct->nr_running); atomic_dec(&acct->nr_running);
@ -845,7 +850,7 @@ fail:
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) { if (!IS_ERR(tsk)) {
io_init_new_worker(wq, worker, tsk); io_init_new_worker(wq, worker, tsk);
} else if (!io_should_retry_thread(PTR_ERR(tsk))) { } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
kfree(worker); kfree(worker);
goto fail; goto fail;
} else { } else {
@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{ {
do { do {
work->flags |= IO_WQ_WORK_CANCEL; atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work); wq->do_work(work);
work = wq->free_work(work); work = wq->free_work(work);
} while (work); } while (work);
@ -926,7 +931,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{ {
struct io_wq_acct *acct = io_work_get_acct(wq, work); struct io_wq_acct *acct = io_work_get_acct(wq, work);
unsigned long work_flags = work->flags; unsigned int work_flags = atomic_read(&work->flags);
struct io_cb_cancel_data match = { struct io_cb_cancel_data match = {
.fn = io_wq_work_match_item, .fn = io_wq_work_match_item,
.data = work, .data = work,
@ -939,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
* been marked as one that should not get executed, cancel it here. * been marked as one that should not get executed, cancel it here.
*/ */
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
(work->flags & IO_WQ_WORK_CANCEL)) { (work_flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wq); io_run_cancel(work, wq);
return; return;
} }
@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
unsigned int bit; unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER); bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT)); atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags);
} }
static bool __io_wq_worker_cancel(struct io_worker *worker, static bool __io_wq_worker_cancel(struct io_worker *worker,
@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
struct io_wq_work *work) struct io_wq_work *work)
{ {
if (work && match->fn(work, match->data)) { if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL; atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
__set_notify_signal(worker->task); __set_notify_signal(worker->task);
return true; return true;
} }

View File

@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work) static inline bool io_wq_is_hashed(struct io_wq_work *work)
{ {
return work->flags & IO_WQ_WORK_HASHED; return atomic_read(&work->flags) & IO_WQ_WORK_HASHED;
} }
typedef bool (work_cancel_fn)(struct io_wq_work *, void *); typedef bool (work_cancel_fn)(struct io_wq_work *, void *);

View File

@ -95,12 +95,14 @@
#include "futex.h" #include "futex.h"
#include "napi.h" #include "napi.h"
#include "uring_cmd.h" #include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h" #include "memmap.h"
#include "timeout.h" #include "timeout.h"
#include "poll.h" #include "poll.h"
#include "rw.h" #include "rw.h"
#include "alloc_cache.h" #include "alloc_cache.h"
#include "eventfd.h"
#define IORING_MAX_ENTRIES 32768 #define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct io_async_rw)); sizeof(struct io_async_rw));
ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct uring_cache)); sizeof(struct uring_cache));
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_kiocb));
ret |= io_futex_cache_init(ctx); ret |= io_futex_cache_init(ctx);
if (ret) if (ret)
goto err; goto err;
@ -350,6 +355,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree); io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx); io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs); kfree(ctx->cancel_table_locked.hbs);
@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req)
} }
req->work.list.next = NULL; req->work.list.next = NULL;
req->work.flags = 0; atomic_set(&req->work.flags, 0);
if (req->flags & REQ_F_FORCE_ASYNC) if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT; atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags);
if (req->file && !(req->flags & REQ_F_FIXED_FILE)) if (req->file && !(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(req->file); req->flags |= io_file_get_flags(req->file);
@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req)
io_wq_hash_work(&req->work, file_inode(req->file)); io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file) if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND; atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
} }
} }
@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* worker for it). * worker for it).
*/ */
if (WARN_ON_ONCE(!same_thread_group(req->task, current))) if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL; atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work); io_wq_enqueue(tctx->io_wq, &req->work);
@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
} }
} }
void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
* it regardless.
*/
if (atomic_dec_and_test(&ev_fd->refs)) {
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd = NULL;
rcu_read_lock();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
* and eventfd_signal
*/
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists incase an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (unlikely(!ev_fd))
goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
goto out;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
if (likely(eventfd_signal_allowed())) {
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
else
atomic_dec(&ev_fd->refs);
}
out:
rcu_read_unlock();
}
static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
bool skip;
spin_lock(&ctx->completion_lock);
/*
* Eventfd should only get triggered when at least one event has been
* posted. Some applications rely on the eventfd notification count
* only changing IFF a new CQE has been added to the CQ ring. There's
* no depedency on 1:1 relationship between how many times this
* function is called (and hence the eventfd count) and number of CQEs
* posted to the CQ ring.
*/
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (skip)
return;
io_eventfd_signal(ctx);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{ {
if (ctx->poll_activated) if (ctx->poll_activated)
@ -878,19 +806,42 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false; return false;
} }
static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
bool filled;
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
return filled;
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{ {
bool filled; bool filled;
io_cq_lock(ctx); io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags); filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
io_cq_unlock_post(ctx); io_cq_unlock_post(ctx);
return filled; return filled;
} }
/*
* Must be called from inline task_work so we now a flush will happen later,
* and obviously with ctx->uring_lock held (tw always has that).
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
spin_unlock(&ctx->completion_lock);
}
ctx->submit_state.cq_flush = true;
}
/* /*
* A helper for multishot requests posting additional CQEs. * A helper for multishot requests posting additional CQEs.
* Should only be used from a task_work including IO_URING_F_MULTISHOT. * Should only be used from a task_work including IO_URING_F_MULTISHOT.
@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret); WARN_ON_ONCE(ret);
} }
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) static inline void io_req_local_work_add(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned flags)
{ {
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev; unsigned nr_wait, nr_tw, nr_tw_prev;
struct llist_node *head; struct llist_node *head;
@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE; flags &= ~IOU_F_TWQ_LAZY_WAKE;
guard(rcu)();
head = READ_ONCE(ctx->work_llist.first); head = READ_ONCE(ctx->work_llist.first);
do { do {
nr_tw_prev = 0; nr_tw_prev = 0;
@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req)
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{ {
if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
rcu_read_lock(); io_req_local_work_add(req, req->ctx, flags);
io_req_local_work_add(req, flags); else
rcu_read_unlock();
} else {
io_req_normal_work_add(req); io_req_normal_work_add(req);
} }
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags)
{
if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
return;
io_req_local_work_add(req, ctx, flags);
} }
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
} }
__io_cq_unlock_post(ctx); __io_cq_unlock_post(ctx);
if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { if (!wq_list_empty(&state->compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first); io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs); INIT_WQ_LIST(&state->compl_reqs);
} }
@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work)
io_arm_ltimeout(req); io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */ /* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) { if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
fail: fail:
io_req_task_queue_fail(req, err); io_req_task_queue_fail(req, err);
return; return;
} }
if (!io_assign_file(req, def, issue_flags)) { if (!io_assign_file(req, def, issue_flags)) {
err = -EBADF; err = -EBADF;
work->flags |= IO_WQ_WORK_CANCEL; atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
goto fail; goto fail;
} }
@ -2649,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree); io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx); io_futex_cache_free(ctx);
io_destroy_buffers(ctx); io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);

View File

@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx); int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags); unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req); bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all); bool cancel_all);
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
};
void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx); void io_activate_pollwq(struct io_ring_ctx *ctx);
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)

View File

@ -11,9 +11,9 @@
#include "io_uring.h" #include "io_uring.h"
#include "rsrc.h" #include "rsrc.h"
#include "filetable.h" #include "filetable.h"
#include "alloc_cache.h"
#include "msg_ring.h" #include "msg_ring.h"
/* All valid masks for MSG_RING */ /* All valid masks for MSG_RING */
#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \
IORING_MSG_RING_FLAGS_PASS) IORING_MSG_RING_FLAGS_PASS)
@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
{ {
if (!target_ctx->task_complete) return target_ctx->task_complete;
return false;
return current != target_ctx->submitter_task;
} }
static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_ring_ctx *ctx = req->file->private_data; struct io_ring_ctx *ctx = req->ctx;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct task_struct *task = READ_ONCE(ctx->submitter_task);
if (unlikely(!task)) io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
return -EOWNERDEAD; if (spin_trylock(&ctx->msg_lock)) {
if (io_alloc_cache_put(&ctx->msg_cache, req))
init_task_work(&msg->tw, func); req = NULL;
if (task_work_add(task, &msg->tw, TWA_SIGNAL)) spin_unlock(&ctx->msg_lock);
return -EOWNERDEAD; }
if (req)
return IOU_ISSUE_SKIP_COMPLETE; kmem_cache_free(req_cachep, req);
percpu_ref_put(&ctx->refs);
} }
static void io_msg_tw_complete(struct callback_head *head) static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
int res, u32 cflags, u64 user_data)
{
req->task = READ_ONCE(ctx->submitter_task);
if (!req->task) {
kmem_cache_free(req_cachep, req);
return -EOWNERDEAD;
}
req->cqe.user_data = user_data;
io_req_set_res(req, res, cflags);
percpu_ref_get(&ctx->refs);
req->ctx = ctx;
req->io_task_work.func = io_msg_tw_complete;
io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
return 0;
}
static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
{
struct io_kiocb *req = NULL;
if (spin_trylock(&ctx->msg_lock)) {
req = io_alloc_cache_get(&ctx->msg_cache);
spin_unlock(&ctx->msg_lock);
}
if (req)
return req;
return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN);
}
static int io_msg_data_remote(struct io_kiocb *req)
{ {
struct io_msg *msg = container_of(head, struct io_msg, tw);
struct io_kiocb *req = cmd_to_io_kiocb(msg);
struct io_ring_ctx *target_ctx = req->file->private_data; struct io_ring_ctx *target_ctx = req->file->private_data;
int ret = 0; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_kiocb *target;
if (current->flags & PF_EXITING) {
ret = -EOWNERDEAD;
} else {
u32 flags = 0; u32 flags = 0;
target = io_msg_get_kiocb(req->ctx);
if (unlikely(!target))
return -ENOMEM;
if (msg->flags & IORING_MSG_RING_FLAGS_PASS) if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags; flags = msg->cqe_flags;
/* return io_msg_remote_post(target_ctx, target, msg->len, flags,
* If the target ring is using IOPOLL mode, then we need to be msg->user_data);
* holding the uring_lock for posting completions. Other ring
* types rely on the regular completion locking, which is
* handled while posting.
*/
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&target_ctx->uring_lock);
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
ret = -EOVERFLOW;
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&target_ctx->uring_lock);
}
if (ret < 0)
req_set_fail(req);
io_req_queue_tw_complete(req, ret);
} }
static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
return -EBADFD; return -EBADFD;
if (io_msg_need_remote(target_ctx)) if (io_msg_need_remote(target_ctx))
return io_msg_exec_remote(req, io_msg_tw_complete); return io_msg_data_remote(req);
if (msg->flags & IORING_MSG_RING_FLAGS_PASS) if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags; flags = msg->cqe_flags;
@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head)
io_req_queue_tw_complete(req, ret); io_req_queue_tw_complete(req, ret);
} }
static int io_msg_fd_remote(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct task_struct *task = READ_ONCE(ctx->submitter_task);
if (unlikely(!task))
return -EOWNERDEAD;
init_task_work(&msg->tw, io_msg_tw_fd_complete);
if (task_work_add(task, &msg->tw, TWA_SIGNAL))
return -EOWNERDEAD;
return IOU_ISSUE_SKIP_COMPLETE;
}
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{ {
struct io_ring_ctx *target_ctx = req->file->private_data; struct io_ring_ctx *target_ctx = req->file->private_data;
@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
} }
if (io_msg_need_remote(target_ctx)) if (io_msg_need_remote(target_ctx))
return io_msg_exec_remote(req, io_msg_tw_fd_complete); return io_msg_fd_remote(req);
return io_msg_install_complete(req, issue_flags); return io_msg_install_complete(req, issue_flags);
} }
@ -294,3 +321,10 @@ done:
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);
return IOU_OK; return IOU_OK;
} }
void io_msg_cache_free(const void *entry)
{
struct io_kiocb *req = (struct io_kiocb *) entry;
kmem_cache_free(req_cachep, req);
}

View File

@ -3,3 +3,4 @@
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req); void io_msg_ring_cleanup(struct io_kiocb *req);
void io_msg_cache_free(const void *entry);

View File

@ -283,7 +283,7 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
s64 poll_to_ns = timespec64_to_ns(ts); s64 poll_to_ns = timespec64_to_ns(ts);
if (poll_to_ns > 0) { if (poll_to_ns > 0) {
u64 val = poll_to_ns + 999; u64 val = poll_to_ns + 999;
do_div(val, (s64) 1000); do_div(val, 1000);
poll_to = val; poll_to = val;
} }
} }

View File

@ -51,6 +51,16 @@ struct io_connect {
bool seen_econnaborted; bool seen_econnaborted;
}; };
struct io_bind {
struct file *file;
int addr_len;
};
struct io_listen {
struct file *file;
int backlog;
};
struct io_sr_msg { struct io_sr_msg {
struct file *file; struct file *file;
union { union {
@ -817,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
bool mshot_finished, unsigned issue_flags) bool mshot_finished, unsigned issue_flags)
{ {
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
unsigned int cflags; unsigned int cflags = 0;
if (sr->flags & IORING_RECVSEND_BUNDLE)
cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
issue_flags);
else
cflags = io_put_kbuf(req, issue_flags);
if (kmsg->msg.msg_inq > 0) if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY; cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
issue_flags);
/* bundle with no more immediate buffers, we're done */ /* bundle with no more immediate buffers, we're done */
if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) if (req->flags & REQ_F_BL_EMPTY)
goto finish; goto finish;
} else {
cflags |= io_put_kbuf(req, issue_flags);
}
/* /*
* Fill CQE for this receive and see if we should keep trying to * Fill CQE for this receive and see if we should keep trying to
@ -1717,6 +1727,70 @@ out:
return IOU_OK; return IOU_OK;
} }
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct sockaddr __user *uaddr;
struct io_async_msghdr *io;
if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
bind->addr_len = READ_ONCE(sqe->addr2);
io = io_msg_alloc_async(req);
if (unlikely(!io))
return -ENOMEM;
return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}
int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct io_async_msghdr *io = req->async_data;
struct socket *sock;
int ret;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return 0;
}
int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
return -EINVAL;
listen->backlog = READ_ONCE(sqe->len);
return 0;
}
int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
struct socket *sock;
int ret;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
ret = __sys_listen_socket(sock, listen->backlog);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return 0;
}
void io_netmsg_cache_free(const void *entry) void io_netmsg_cache_free(const void *entry)
{ {
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;

View File

@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req); void io_send_zc_cleanup(struct io_kiocb *req);
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_bind(struct io_kiocb *req, unsigned int issue_flags);
int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_listen(struct io_kiocb *req, unsigned int issue_flags);
void io_netmsg_cache_free(const void *entry); void io_netmsg_cache_free(const void *entry);
#else #else
static inline void io_netmsg_cache_free(const void *entry) static inline void io_netmsg_cache_free(const void *entry)

View File

@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_ftruncate_prep, .prep = io_ftruncate_prep,
.issue = io_ftruncate, .issue = io_ftruncate,
}, },
[IORING_OP_BIND] = {
#if defined(CONFIG_NET)
.needs_file = 1,
.prep = io_bind_prep,
.issue = io_bind,
.async_size = sizeof(struct io_async_msghdr),
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_LISTEN] = {
#if defined(CONFIG_NET)
.needs_file = 1,
.prep = io_listen_prep,
.issue = io_listen,
.async_size = sizeof(struct io_async_msghdr),
#else
.prep = io_eopnotsupp_prep,
#endif
},
}; };
const struct io_cold_def io_cold_defs[] = { const struct io_cold_def io_cold_defs[] = {
@ -716,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FTRUNCATE] = { [IORING_OP_FTRUNCATE] = {
.name = "FTRUNCATE", .name = "FTRUNCATE",
}, },
[IORING_OP_BIND] = {
.name = "BIND",
},
[IORING_OP_LISTEN] = {
.name = "LISTEN",
},
}; };
const char *io_uring_get_opcode(u8 opcode) const char *io_uring_get_opcode(u8 opcode)
@ -725,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode)
return "INVALID"; return "INVALID";
} }
bool io_uring_op_supported(u8 opcode)
{
if (opcode < IORING_OP_LAST &&
io_issue_defs[opcode].prep != io_eopnotsupp_prep)
return true;
return false;
}
void __init io_uring_optable_init(void) void __init io_uring_optable_init(void)
{ {
int i; int i;

View File

@ -17,8 +17,6 @@ struct io_issue_def {
unsigned poll_exclusive : 1; unsigned poll_exclusive : 1;
/* op supports buffer selection */ /* op supports buffer selection */
unsigned buffer_select : 1; unsigned buffer_select : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* skip auditing */ /* skip auditing */
unsigned audit_skip : 1; unsigned audit_skip : 1;
/* supports ioprio */ /* supports ioprio */
@ -47,5 +45,7 @@ struct io_cold_def {
extern const struct io_issue_def io_issue_defs[]; extern const struct io_issue_def io_issue_defs[];
extern const struct io_cold_def io_cold_defs[]; extern const struct io_cold_def io_cold_defs[];
bool io_uring_op_supported(u8 opcode);
void io_uring_optable_init(void); void io_uring_optable_init(void);
#endif #endif

View File

@ -27,65 +27,11 @@
#include "cancel.h" #include "cancel.h"
#include "kbuf.h" #include "kbuf.h"
#include "napi.h" #include "napi.h"
#include "eventfd.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST) IORING_REGISTER_LAST + IORING_OP_LAST)
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
return 0;
}
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
call_rcu(&ev_fd->rcu, io_eventfd_ops);
return 0;
}
return -ENXIO;
}
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args) unsigned nr_args)
{ {
@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
size_t size; size_t size;
int i, ret; int i, ret;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
size = struct_size(p, ops, nr_args); size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL); p = kzalloc(size, GFP_KERNEL);
if (!p) if (!p)
return -ENOMEM; return -ENOMEM;
@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
goto out; goto out;
p->last_op = IORING_OP_LAST - 1; p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) { for (i = 0; i < nr_args; i++) {
p->ops[i].op = i; p->ops[i].op = i;
if (!io_issue_defs[i].not_supported) if (io_uring_op_supported(i))
p->ops[i].flags = IO_URING_OP_SUPPORTED; p->ops[i].flags = IO_URING_OP_SUPPORTED;
} }
p->ops_len = i; p->ops_len = i;

View File

@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0; return 0;
} }
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
void __user *arg, unsigned index)
{
struct iovec __user *src;
#ifdef CONFIG_COMPAT
if (ctx->compat) {
struct compat_iovec __user *ciovs;
struct compat_iovec ciov;
ciovs = (struct compat_iovec __user *) arg;
if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
return -EFAULT;
dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
dst->iov_len = ciov.iov_len;
return 0;
}
#endif
src = (struct iovec __user *) arg;
if (copy_from_user(dst, &src[index], sizeof(*dst)))
return -EFAULT;
return 0;
}
static int io_buffer_validate(struct iovec *iov) static int io_buffer_validate(struct iovec *iov)
{ {
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
@ -249,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = io_run_task_work_sig(ctx); ret = io_run_task_work_sig(ctx);
if (ret < 0) { if (ret < 0) {
__set_current_state(TASK_RUNNING); finish_wait(&ctx->rsrc_quiesce_wq, &we);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list)) if (list_empty(&ctx->rsrc_ref_list))
ret = 0; ret = 0;
@ -257,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
} }
schedule(); schedule();
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
ret = 0; ret = 0;
} while (!list_empty(&ctx->rsrc_ref_list)); } while (!list_empty(&ctx->rsrc_ref_list));
@ -420,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up, struct io_uring_rsrc_update2 *up,
unsigned int nr_args) unsigned int nr_args)
{ {
struct iovec __user *uvec = u64_to_user_ptr(up->data);
u64 __user *tags = u64_to_user_ptr(up->tags); u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct iovec fast_iov, *iov;
struct page *last_hpage = NULL; struct page *last_hpage = NULL;
__u32 done; __u32 done;
int i, err; int i, err;
@ -435,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_mapped_ubuf *imu; struct io_mapped_ubuf *imu;
u64 tag = 0; u64 tag = 0;
err = io_copy_iov(ctx, &iov, iovs, done); iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat);
if (err) if (IS_ERR(iov)) {
err = PTR_ERR(iov);
break; break;
}
if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
err = -EFAULT; err = -EFAULT;
break; break;
} }
err = io_buffer_validate(&iov); err = io_buffer_validate(iov);
if (err) if (err)
break; break;
if (!iov.iov_base && tag) { if (!iov->iov_base && tag) {
err = -EINVAL; err = -EINVAL;
break; break;
} }
err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
if (err) if (err)
break; break;
@ -971,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
{ {
struct page *last_hpage = NULL; struct page *last_hpage = NULL;
struct io_rsrc_data *data; struct io_rsrc_data *data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec = (struct iovec * __user) arg;
int i, ret; int i, ret;
struct iovec iov;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
@ -989,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret; return ret;
} }
if (!arg)
memset(iov, 0, sizeof(*iov));
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
if (arg) { if (arg) {
ret = io_copy_iov(ctx, &iov, arg, i); iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
ret = PTR_ERR(iov);
break;
}
ret = io_buffer_validate(iov);
if (ret) if (ret)
break; break;
ret = io_buffer_validate(&iov);
if (ret)
break;
} else {
memset(&iov, 0, sizeof(iov));
} }
if (!iov.iov_base && *io_get_tag_slot(data, i)) { if (!iov->iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL; ret = -EINVAL;
break; break;
} }
ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
&last_hpage); &last_hpage);
if (ret) if (ret)
break; break;

View File

@ -2600,6 +2600,14 @@ static void do_freezer_trap(void)
spin_unlock_irq(&current->sighand->siglock); spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen(); cgroup_enter_frozen();
schedule(); schedule();
/*
* We could've been woken by task_work, run it to clear
* TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
*/
clear_notify_signal();
if (unlikely(task_work_pending(current)))
task_work_run();
} }
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)

View File

@ -1822,6 +1822,20 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
return __sys_socketpair(family, type, protocol, usockvec); return __sys_socketpair(family, type, protocol, usockvec);
} }
int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
int addrlen)
{
int err;
err = security_socket_bind(sock, (struct sockaddr *)address,
addrlen);
if (!err)
err = READ_ONCE(sock->ops)->bind(sock,
(struct sockaddr *)address,
addrlen);
return err;
}
/* /*
* Bind a name to a socket. Nothing much to do here since it's * Bind a name to a socket. Nothing much to do here since it's
* the protocol's responsibility to handle the local address. * the protocol's responsibility to handle the local address.
@ -1839,15 +1853,8 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
sock = sockfd_lookup_light(fd, &err, &fput_needed); sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) { if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address); err = move_addr_to_kernel(umyaddr, addrlen, &address);
if (!err) {
err = security_socket_bind(sock,
(struct sockaddr *)&address,
addrlen);
if (!err) if (!err)
err = READ_ONCE(sock->ops)->bind(sock, err = __sys_bind_socket(sock, &address, addrlen);
(struct sockaddr *)
&address, addrlen);
}
fput_light(sock->file, fput_needed); fput_light(sock->file, fput_needed);
} }
return err; return err;
@ -1863,15 +1870,10 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
* necessary for a listen, and if that works, we mark the socket as * necessary for a listen, and if that works, we mark the socket as
* ready for listening. * ready for listening.
*/ */
int __sys_listen_socket(struct socket *sock, int backlog)
int __sys_listen(int fd, int backlog)
{ {
struct socket *sock; int somaxconn, err;
int err, fput_needed;
int somaxconn;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn); somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
if ((unsigned int)backlog > somaxconn) if ((unsigned int)backlog > somaxconn)
backlog = somaxconn; backlog = somaxconn;
@ -1879,7 +1881,17 @@ int __sys_listen(int fd, int backlog)
err = security_socket_listen(sock, backlog); err = security_socket_listen(sock, backlog);
if (!err) if (!err)
err = READ_ONCE(sock->ops)->listen(sock, backlog); err = READ_ONCE(sock->ops)->listen(sock, backlog);
return err;
}
int __sys_listen(int fd, int backlog)
{
struct socket *sock;
int err, fput_needed;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
err = __sys_listen_socket(sock, backlog);
fput_light(sock->file, fput_needed); fput_light(sock->file, fput_needed);
} }
return err; return err;