1
linux/fs/signalfd.c
Davi Arnaut b3762bfc8d signalfd: retrieve multiple signals with one read() call
Gathering signals in bulk enables server applications to drain a signal
queue (almost full of realtime signals) more efficiently by reducing the
syscall and file look-up overhead.

Very similar to the sigtimedwait4() call described by Niels Provos, Chuck
Lever, and Stephen Tweedie in a paper entitled "Analyzing the Overload
Behavior of a Simple Web Server".  The paper lists more details and
advantages.

Signed-off-by: Davi E. M. Arnaut <davi@haxent.com.br>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-23 20:14:14 -07:00

380 lines
9.6 KiB
C

/*
* fs/signalfd.c
*
* Copyright (C) 2003 Linus Torvalds
*
* Mon Mar 5, 2007: Davide Libenzi <davidel@xmailserver.org>
* Changed ->read() to return a siginfo structure instead of signal number.
* Fixed locking in ->poll().
* Added sighand-detach notification.
* Added fd re-use in sys_signalfd() syscall.
* Now using anonymous inode source.
* Thanks to Oleg Nesterov for useful code review and suggestions.
* More comments and suggestions from Arnd Bergmann.
* Sat May 19, 2007: Davi E. M. Arnaut <davi@haxent.com.br>
* Retrieve multiple signals with one read() call
*/
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/list.h>
#include <linux/anon_inodes.h>
#include <linux/signalfd.h>
/*
 * Per-descriptor signalfd state, reached through file->private_data.
 */
struct signalfd_ctx {
/* Entry in the attached sighand's signalfd_list. */
struct list_head lnk;
/*
 * Readers and pollers wait here; woken by signalfd_deliver() and by
 * sys_signalfd() after a mask update.
 */
wait_queue_head_t wqh;
/*
 * Inverted signal mask: a zero bit marks a signal the user is
 * interested in.
 */
sigset_t sigmask;
/*
 * Task whose signals are read; reset to NULL by signalfd_deliver()
 * once the sighand has been orphaned.
 */
struct task_struct *tsk;
};
/*
 * State carried from signalfd_lock() to the matching signalfd_unlock().
 */
struct signalfd_lockctx {
/* Snapshot of ctx->tsk taken by signalfd_lock(). */
struct task_struct *tsk;
/*
 * Passed to lock_task_sighand()/unlock_task_sighand(); presumably the
 * saved interrupt flags -- confirm against those helpers.
 */
unsigned long flags;
};
/*
 * Try to take the sighand lock of the task this context is attached to.
 *
 * Neither the sighand use count nor the task struct is pinned, so the
 * task pointer is sampled and the lock is taken inside an RCU read-side
 * section. ctx->tsk can be reset to NULL (in signalfd_deliver()) and the
 * sighand can be detached, so ctx->tsk is checked once more with the
 * lock held.
 *
 * Returns 1 with the sighand lock held (release it with signalfd_unlock()),
 * or 0 with no lock held if the sighand has been detached.
 */
static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
{
	struct sighand_struct *locked = NULL;

	rcu_read_lock();
	lk->tsk = rcu_dereference(ctx->tsk);
	if (likely(lk->tsk != NULL))
		locked = lock_task_sighand(lk->tsk, &lk->flags);
	rcu_read_unlock();

	if (!locked)
		return 0;
	if (ctx->tsk)
		return 1;

	/* The context was detached while the lock was being taken: back out. */
	unlock_task_sighand(lk->tsk, &lk->flags);
	return 0;
}
/*
 * Release the sighand lock obtained by a successful signalfd_lock(),
 * using the task and flags recorded in the lock context.
 */
static void signalfd_unlock(struct signalfd_lockctx *lk)
{
	struct task_struct *owner = lk->tsk;

	unlock_task_sighand(owner, &lk->flags);
}
/*
 * Notify the signalfd contexts hanging off tsk's sighand about a signal.
 *
 * A negative sig is a broadcast that the sighand has been orphaned:
 * every context attached to tsk is detached (ctx->tsk cleared, removed
 * from the list) and its waiters are woken. Otherwise the waiters of
 * each context interested in sig are woken.
 *
 * This must be called with the sighand lock held.
 */
void signalfd_deliver(struct task_struct *tsk, int sig)
{
	struct sighand_struct *sighand = tsk->sighand;
	struct signalfd_ctx *ctx, *tmp;

	BUG_ON(!sig);

	list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
		if (sig >= 0) {
			/*
			 * ctx->sigmask is inverted: a clear bit means the
			 * user is interested in that signal.
			 */
			if (!sigismember(&ctx->sigmask, sig))
				wake_up(&ctx->wqh);
			continue;
		}

		/* Orphan broadcast: only contexts bound to this task. */
		if (ctx->tsk != tsk)
			continue;

		ctx->tsk = NULL;
		list_del_init(&ctx->lnk);
		wake_up(&ctx->wqh);
	}
}
/*
 * Unlink a context from its sighand (if still attached) and free it.
 */
static void signalfd_cleanup(struct signalfd_ctx *ctx)
{
	struct signalfd_lockctx lk;
	int attached = signalfd_lock(ctx, &lk);

	/*
	 * Only an attached context needs unlinking. If the sighand is
	 * gone, the list itself is not there anymore, so there is nothing
	 * to remove the context from.
	 */
	if (attached) {
		list_del(&ctx->lnk);
		signalfd_unlock(&lk);
	}

	kfree(ctx);
}
/*
 * ->release() handler: tear down the context stored in the file.
 * Always returns 0.
 */
static int signalfd_release(struct inode *inode, struct file *file)
{
	struct signalfd_ctx *ctx = file->private_data;

	signalfd_cleanup(ctx);
	return 0;
}
/*
 * ->poll() handler.
 *
 * Reports POLLIN when a signal selected by the context's mask is pending
 * on the task (private or shared queue), and also when the sighand has
 * been detached.
 */
static unsigned int signalfd_poll(struct file *file, poll_table *wait)
{
	struct signalfd_ctx *ctx = file->private_data;
	struct signalfd_lockctx lk;
	unsigned int mask = 0;

	poll_wait(file, &ctx->wqh, wait);

	/*
	 * Detached sighand: let the caller get a POLLIN, ala socket recv()
	 * when the peer disconnects.
	 */
	if (!signalfd_lock(ctx, &lk))
		return POLLIN;

	if (next_signal(&lk.tsk->pending, &ctx->sigmask) > 0)
		mask = POLLIN;
	else if (next_signal(&lk.tsk->signal->shared_pending,
			     &ctx->sigmask) > 0)
		mask = POLLIN;

	signalfd_unlock(&lk);
	return mask;
}
/*
 * Fill the user-space "struct signalfd_siginfo" at uinfo from the kernel
 * siginfo_t at kinfo. Copied from copy_siginfo_to_user() in kernel/signal.c.
 *
 * The destination is cleared first; then signo, err and code are stored,
 * followed by the fields selected by (si_code & __SI_MASK).
 *
 * Returns sizeof(struct signalfd_siginfo) on success, or -EFAULT if any
 * of the user-space stores reported an error.
 *
 * NOTE(review): the __clear_user()/__put_user() variants presumably skip
 * the access check -- confirm the caller has validated the buffer.
 */
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
siginfo_t const *kinfo)
{
long err;
/* Break the build if the user-visible record is not exactly 128 bytes. */
BUILD_BUG_ON(sizeof(struct signalfd_siginfo) != 128);
/*
 * Unused members should be zero ...
 */
err = __clear_user(uinfo, sizeof(*uinfo));
/*
 * If you change siginfo_t structure, please be sure
 * this code is fixed accordingly.
 */
/* Errors are OR-ed together and checked once at the end. */
err |= __put_user(kinfo->si_signo, &uinfo->signo);
err |= __put_user(kinfo->si_errno, &uinfo->err);
err |= __put_user((short)kinfo->si_code, &uinfo->code);
switch (kinfo->si_code & __SI_MASK) {
case __SI_KILL:
err |= __put_user(kinfo->si_pid, &uinfo->pid);
err |= __put_user(kinfo->si_uid, &uinfo->uid);
break;
case __SI_TIMER:
err |= __put_user(kinfo->si_tid, &uinfo->tid);
err |= __put_user(kinfo->si_overrun, &uinfo->overrun);
err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
break;
case __SI_POLL:
err |= __put_user(kinfo->si_band, &uinfo->band);
err |= __put_user(kinfo->si_fd, &uinfo->fd);
break;
case __SI_FAULT:
err |= __put_user((long)kinfo->si_addr, &uinfo->addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(kinfo->si_trapno, &uinfo->trapno);
#endif
break;
case __SI_CHLD:
err |= __put_user(kinfo->si_pid, &uinfo->pid);
err |= __put_user(kinfo->si_uid, &uinfo->uid);
err |= __put_user(kinfo->si_status, &uinfo->status);
err |= __put_user(kinfo->si_utime, &uinfo->utime);
err |= __put_user(kinfo->si_stime, &uinfo->stime);
break;
case __SI_RT: /* This is not generated by the kernel as of now. */
case __SI_MESGQ: /* But this is */
err |= __put_user(kinfo->si_pid, &uinfo->pid);
err |= __put_user(kinfo->si_uid, &uinfo->uid);
err |= __put_user((long)kinfo->si_ptr, &uinfo->svptr);
break;
default: /* this is just in case for now ... */
err |= __put_user(kinfo->si_pid, &uinfo->pid);
err |= __put_user(kinfo->si_uid, &uinfo->uid);
break;
}
return err ? -EFAULT: sizeof(*uinfo);
}
/*
 * Dequeue one signal selected by ctx->sigmask from the attached task
 * into *info, optionally sleeping until one is available.
 *
 * Returns:
 *   the nonzero dequeue_signal() result when a signal was dequeued
 *     (presumably the signal number -- confirm against dequeue_signal()),
 *   0 if the sighand has been detached,
 *   -EAGAIN if nothing is pending and nonblock is set,
 *   -ERESTARTSYS if the calling task has a signal pending while waiting.
 *
 * The sighand lock is never held on return.
 */
static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
int nonblock)
{
ssize_t ret;
struct signalfd_lockctx lk;
DECLARE_WAITQUEUE(wait, current);
/* Detached sighand: report zero, nothing can be dequeued. */
if (!signalfd_lock(ctx, &lk))
return 0;
/* First attempt, with the sighand lock held. */
ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
switch (ret) {
case 0:
/* Nothing pending: a blocking caller goes on to the wait loop. */
if (!nonblock)
break;
ret = -EAGAIN;
/* fall through: unlock and return -EAGAIN */
default:
signalfd_unlock(&lk);
return ret;
}
/*
 * Blocking path. The sighand lock is still held here; each iteration
 * marks the task sleeping before retrying the dequeue, then drops the
 * lock before it may schedule().
 */
add_wait_queue(&ctx->wqh, &wait);
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
signalfd_unlock(&lk);
if (ret != 0)
break;
/* The caller itself has a signal pending: stop waiting. */
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
schedule();
/* Re-take the lock for the next dequeue attempt. */
ret = signalfd_lock(ctx, &lk);
if (unlikely(!ret)) {
/*
 * Let the caller read zero bytes, ala socket
 * recv() when the peer disconnects. This test
 * must be done before doing a dequeue_signal(),
 * because if the sighand has been orphaned,
 * the dequeue_signal() call is going to crash
 * because ->sighand will be long gone.
 */
break;
}
}
remove_wait_queue(&ctx->wqh, &wait);
__set_current_state(TASK_RUNNING);
return ret;
}
/*
 * ->read() handler: copy as many pending signals as fit into the buffer.
 *
 * "count" must be at least the size of one "struct signalfd_siginfo",
 * otherwise -EINVAL is returned. Only the first dequeue may block (and
 * only if the file is not O_NONBLOCK); once one record has been copied
 * the remaining dequeues are non-blocking.
 *
 * Returns the number of bytes copied (a multiple of the record size)
 * when at least one record was copied. Otherwise returns the result of
 * the step that stopped the loop: zero if the sighand we are attached to
 * has been orphaned, or a negative error code.
 */
static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
			     loff_t *ppos)
{
	struct signalfd_ctx *ctx = file->private_data;
	struct signalfd_siginfo __user *siginfo =
		(struct signalfd_siginfo __user *) buf;
	size_t slots = count / sizeof(struct signalfd_siginfo);
	int nonblock = file->f_flags & O_NONBLOCK;
	ssize_t total = 0;
	ssize_t ret;
	siginfo_t info;

	if (slots == 0)
		return -EINVAL;

	for (;;) {
		ret = signalfd_dequeue(ctx, &info, nonblock);
		if (unlikely(ret <= 0))
			break;

		ret = signalfd_copyinfo(siginfo, &info);
		if (ret < 0)
			break;

		siginfo++;
		total += ret;
		/* Never sleep again once something has been delivered. */
		nonblock = 1;

		if (--slots == 0)
			break;
	}

	if (total)
		return total;
	return ret;
}
/*
 * File operations backing a signalfd descriptor; sys_signalfd() also
 * compares against this table to recognise an existing signalfd file.
 */
static const struct file_operations signalfd_fops = {
	.read		= signalfd_read,
	.poll		= signalfd_poll,
	.release	= signalfd_release,
};
/*
 * Create a file descriptor that is associated with our signal
 * state. We can pass it around to others if we want to, but
 * it will always be _our_ signal state.
 *
 * With ufd == -1 a new signalfd is created for the calling task; any
 * other value must name an existing signalfd, whose mask is replaced.
 * SIGKILL and SIGSTOP are always dropped from the requested set.
 *
 * Returns the file descriptor on success. Errors: -EINVAL if sizemask
 * is not sizeof(sigset_t), if the mask cannot be copied from user space,
 * or if ufd is not a signalfd; -EBADF if ufd is not an open descriptor;
 * -ENOMEM if the context cannot be allocated; or the error reported by
 * anon_inode_getfd().
 */
asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
{
	int error;
	sigset_t mask;
	struct signalfd_ctx *ctx;
	struct sighand_struct *sighand;
	struct file *file;
	struct inode *inode;
	struct signalfd_lockctx lk;

	if (sizemask != sizeof(sigset_t))
		return -EINVAL;
	if (copy_from_user(&mask, user_mask, sizeof(mask)))
		return -EINVAL;

	/* Store the mask inverted: a clear bit is a signal of interest. */
	sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
	signotset(&mask);

	if (ufd != -1) {
		/* Update the mask of an existing signalfd. */
		file = fget(ufd);
		if (!file)
			return -EBADF;
		if (file->f_op != &signalfd_fops) {
			fput(file);
			return -EINVAL;
		}
		ctx = file->private_data;

		/*
		 * We need to be prepared for the fact that the sighand this
		 * fd is attached to has been detached. In that case
		 * signalfd_lock() will return 0, and we'll just skip setting
		 * the new mask.
		 */
		if (signalfd_lock(ctx, &lk)) {
			ctx->sigmask = mask;
			signalfd_unlock(&lk);
		}
		wake_up(&ctx->wqh);
		fput(file);
		return ufd;
	}

	/* Create a brand new signalfd bound to the calling task. */
	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	init_waitqueue_head(&ctx->wqh);
	ctx->sigmask = mask;
	ctx->tsk = current;

	/*
	 * Add this fd to the list of signal listeners.
	 */
	sighand = current->sighand;
	spin_lock_irq(&sighand->siglock);
	list_add_tail(&ctx->lnk, &sighand->signalfd_list);
	spin_unlock_irq(&sighand->siglock);

	/*
	 * When we call this, the initialization must be complete, since
	 * anon_inode_getfd() will install the fd.
	 */
	error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]",
				 &signalfd_fops, ctx);
	if (error) {
		/* Unlink the context from the sighand list and free it. */
		signalfd_cleanup(ctx);
		return error;
	}

	return ufd;
}