1
linux/kernel/exit.c
Linus Torvalds b5683a37c8 vfs-6.9.pidfd
-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZem4/wAKCRCRxhvAZXjc
 opnBAQCaQWwxjT0VLHebPniw6tel/KYlZ9jH9kBQwLrk1pembwEA+BsCY2C8YS4a
 75v9jOPxr+Z8j1SjxwwubcONPyqYXwQ=
 =+Wa3
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.9.pidfd' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pdfd updates from Christian Brauner:

 - Until now pidfds could only be created for thread-group leaders but
   not for threads. There was no technical reason for this. We simply
   had no users that needed support for this. Now we do have users that
   need support for this.

   This introduces a new PIDFD_THREAD flag for pidfd_open(). If that
   flag is set pidfd_open() creates a pidfd that refers to a specific
   thread.

   In addition, we now allow clone() and clone3() to be called with
   CLONE_PIDFD | CLONE_THREAD which wasn't possible before.

   A pidfd that refers to an individual thread differs from a pidfd that
   refers to a thread-group leader:

    (1) Pidfds are pollable. A task may poll a pidfd and get notified
        when the task has exited.

        For thread-group leader pidfds the polling task is woken if the
        thread-group is empty. In other words, if the thread-group
        leader task exits when there are still threads alive in its
        thread-group the polling task will not be woken when the
        thread-group leader exits but rather when the last thread in the
        thread-group exits.

        For thread-specific pidfds the polling task is woken if the
        thread exits.

    (2) Passing a thread-group leader pidfd to pidfd_send_signal() will
        generate thread-group directed signals like kill(2) does.

        Passing a thread-specific pidfd to pidfd_send_signal() will
        generate thread-specific signals like tgkill(2) does.

        The default scope of the signal is thus determined by the type
        of the pidfd.

        Since use-cases exist where the default scope of the provided
        pidfd needs to be overriden the following flags are added to
        pidfd_send_signal():

         - PIDFD_SIGNAL_THREAD
           Send a thread-specific signal.

         - PIDFD_SIGNAL_THREAD_GROUP
           Send a thread-group directed signal.

         - PIDFD_SIGNAL_PROCESS_GROUP
           Send a process-group directed signal.

        The scope change will only work if the struct pid is actually
        used for this scope.

        For example, in order to send a thread-group directed signal the
        provided pidfd must be used as a thread-group leader and
        similarly for PIDFD_SIGNAL_PROCESS_GROUP the struct pid must be
        used as a process group leader.

 - Move pidfds from the anonymous inode infrastructure to a tiny pseudo
   filesystem. This will unblock further work that we weren't able to do
   simply because of the very justified limitations of anonymous inodes.
   Moving pidfds to a tiny pseudo filesystem allows for statx on pidfds
   to become useful for the first time. They can now be compared by
   inode number which are unique for the system lifetime.

   Instead of stashing struct pid in file->private_data we can now stash
   it in inode->i_private. This makes it possible to introduce concepts
   that operate on a process once all file descriptors have been closed.
   A concrete example is kill-on-last-close. Another side-effect is that
   file->private_data is now freed up for per-file options for pidfds.

   Now, each struct pid will refer to a different inode but the same
   struct pid will refer to the same inode if it's opened multiple
   times. In contrast to now where each struct pid refers to the same
   inode.

   The tiny pseudo filesystem is not visible anywhere in userspace
   exactly like e.g., pipefs and sockfs. There's no lookup, there's no
   complex inode operations, nothing. Dentries and inodes are always
   deleted when the last pidfd is closed.

   We allocate a new inode and dentry for each struct pid and we reuse
   that inode and dentry for all pidfds that refer to the same struct
   pid. The code is entirely optional and fairly small. If it's not
   selected we fallback to anonymous inodes. Heavily inspired by nsfs.

   The dentry and inode allocation mechanism is moved into generic
   infrastructure that is now shared between nsfs and pidfs. The
   path_from_stashed() helper must be provided with a stashing location,
   an inode number, a mount, and the private data that is supposed to be
   used and it will provide a path that can be passed to dentry_open().

   The helper will try retrieve an existing dentry from the provided
   stashing location. If a valid dentry is found it is reused. If not a
   new one is allocated and we try to stash it in the provided location.
   If this fails we retry until we either find an existing dentry or the
   newly allocated dentry could be stashed. Subsequent openers of the
   same namespace or task are then able to reuse it.

 - Currently it is only possible to get notified when a task has exited,
   i.e., become a zombie and userspace gets notified with EPOLLIN. We
   now also support waiting until the task has been reaped, notifying
   userspace with EPOLLHUP.

 - Ensure that ESRCH is reported for getfd if a task is exiting instead
   of the confusing EBADF.

 - Various smaller cleanups to pidfd functions.

* tag 'vfs-6.9.pidfd' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (23 commits)
  libfs: improve path_from_stashed()
  libfs: add stashed_dentry_prune()
  libfs: improve path_from_stashed() helper
  pidfs: convert to path_from_stashed() helper
  nsfs: convert to path_from_stashed() helper
  libfs: add path_from_stashed()
  pidfd: add pidfs
  pidfd: move struct pidfd_fops
  pidfd: allow to override signal scope in pidfd_send_signal()
  pidfd: change pidfd_send_signal() to respect PIDFD_THREAD
  signal: fill in si_code in prepare_kill_siginfo()
  selftests: add ESRCH tests for pidfd_getfd()
  pidfd: getfd should always report ESRCH if a task is exiting
  pidfd: clone: allow CLONE_THREAD | CLONE_PIDFD together
  pidfd: exit: kill the no longer used thread_group_exited()
  pidfd: change do_notify_pidfd() to use __wake_up(poll_to_key(EPOLLIN))
  pid: kill the obsolete PIDTYPE_PID code in transfer_pid()
  pidfd: kill the no longer needed do_notify_pidfd() in de_thread()
  pidfd_poll: report POLLHUP when pid_task() == NULL
  pidfd: implement PIDFD_THREAD flag for pidfd_open()
  ...
2024-03-11 10:21:06 -07:00

1914 lines
47 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <uapi/linux/wait.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
#include "exit.h"
/*
* The default value should be high enough to not crash a system that randomly
* crashes its kernel from time to time, but low enough to at least not permit
* overflowing 32-bit refcounts or the ldsem writer count.
*/
static unsigned int oops_limit = 10000;
#ifdef CONFIG_SYSCTL
static struct ctl_table kern_exit_table[] = {
{
.procname = "oops_limit",
.data = &oops_limit,
.maxlen = sizeof(oops_limit),
.mode = 0644,
.proc_handler = proc_douintvec,
},
{ }
};
static __init int kernel_exit_sysctls_init(void)
{
register_sysctl_init("kernel", kern_exit_table);
return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif
static atomic_t oops_count = ATOMIC_INIT(0);
#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
char *page)
{
return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}
static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
static __init int kernel_exit_sysfs_init(void)
{
sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
detach_pid(p, PIDTYPE_PID);
if (group_dead) {
detach_pid(p, PIDTYPE_TGID);
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
}
list_del_rcu(&p->thread_node);
}
/*
* This function expects the tasklist_lock write-locked.
*/
static void __exit_signal(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
struct tty_struct *tty;
u64 utime, stime;
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
spin_lock(&sighand->siglock);
#ifdef CONFIG_POSIX_TIMERS
posix_cpu_timers_exit(tsk);
if (group_dead)
posix_cpu_timers_exit_group(tsk);
#endif
if (group_dead) {
tty = sig->tty;
sig->tty = NULL;
} else {
/*
* If there is any task waiting for the group exit
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
}
add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
sizeof(unsigned long long));
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
* but we want to avoid the race with thread_group_cputime() which can
* see the empty ->thread_head list.
*/
task_cputime(tsk, &utime, &stime);
write_seqlock(&sig->stats_lock);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
sig->nivcsw += tsk->nivcsw;
sig->inblock += task_io_get_inblock(tsk);
sig->oublock += task_io_get_oublock(tsk);
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
*/
flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
}
}
static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
}
void put_task_struct_rcu_user(struct task_struct *task)
{
if (refcount_dec_and_test(&task->rcu_users))
call_rcu(&task->rcu, delayed_put_task_struct);
}
void __weak release_thread(struct task_struct *dead_task)
{
}
void release_task(struct task_struct *p)
{
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
thread_pid = get_pid(p->thread_pid);
__exit_signal(p);
/*
* If we are the last non-leader member of the thread
* group, and the leader is zombie, then notify the
* group leader's parent process. (if it wants notification.)
*/
zap_leader = 0;
leader = p->group_leader;
if (leader != p && thread_group_empty(leader)
&& leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
* then we are the one who should release the leader.
*/
zap_leader = do_notify_parent(leader, leader->exit_signal);
if (zap_leader)
leader->exit_state = EXIT_DEAD;
}
write_unlock_irq(&tasklist_lock);
seccomp_filter_release(p);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
release_thread(p);
put_task_struct_rcu_user(p);
p = leader;
if (unlikely(zap_leader))
goto repeat;
}
int rcuwait_wake_up(struct rcuwait *w)
{
int ret = 0;
struct task_struct *task;
rcu_read_lock();
/*
* Order condition vs @task, such that everything prior to the load
* of @task is visible. This is the condition as to why the user called
* rcuwait_wake() in the first place. Pairs with set_current_state()
* barrier (A) in rcuwait_wait_event().
*
* WAIT WAKE
* [S] tsk = current [S] cond = true
* MB (A) MB (B)
* [L] cond [L] tsk
*/
smp_mb(); /* (B) */
task = rcu_dereference(w->task);
if (task)
ret = wake_up_process(task);
rcu_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);
/*
* Determine if a process group is "orphaned", according to the POSIX
* definition in 2.2.2.52. Orphaned process groups are not to be affected
* by terminal-generated stop signals. Newly orphaned process groups are
* to receive a SIGHUP and a SIGCONT.
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
static int will_become_orphaned_pgrp(struct pid *pgrp,
struct task_struct *ignored_task)
{
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
if ((p == ignored_task) ||
(p->exit_state && thread_group_empty(p)) ||
is_global_init(p->real_parent))
continue;
if (task_pgrp(p->real_parent) != pgrp &&
task_session(p->real_parent) == task_session(p))
return 0;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return 1;
}
int is_current_pgrp_orphaned(void)
{
int retval;
read_lock(&tasklist_lock);
retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
read_unlock(&tasklist_lock);
return retval;
}
static bool has_stopped_jobs(struct pid *pgrp)
{
struct task_struct *p;
do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return true;
} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
return false;
}
/*
* Check to see if any process groups have become orphaned as
* a result of our exiting, and if they have any stopped jobs,
* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
struct pid *pgrp = task_pgrp(tsk);
struct task_struct *ignored_task = tsk;
if (!parent)
/* exit: our father is in a different pgrp than
* we are and we were the only connection outside.
*/
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
* we are, and it was the only connection outside.
*/
ignored_task = NULL;
if (task_pgrp(parent) != pgrp &&
task_session(parent) == task_session(tsk) &&
will_become_orphaned_pgrp(pgrp, ignored_task) &&
has_stopped_jobs(pgrp)) {
__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}
}
static void coredump_task_exit(struct task_struct *tsk)
{
struct core_state *core_state;
/*
* Serialize with any possible pending coredump.
* We must hold siglock around checking core_state
* and setting PF_POSTCOREDUMP. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group without PF_POSTCOREDUMP set.
*/
spin_lock_irq(&tsk->sighand->siglock);
tsk->flags |= PF_POSTCOREDUMP;
core_state = tsk->signal->core_state;
spin_unlock_irq(&tsk->sighand->siglock);
/* The vhost_worker does not particpate in coredumps */
if (core_state &&
((tsk->flags & (PF_IO_WORKER | PF_USER_WORKER)) != PF_USER_WORKER)) {
struct core_thread self;
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE);
if (!self.task) /* see coredump_finish() */
break;
schedule();
}
__set_current_state(TASK_RUNNING);
}
}
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
*/
void mm_update_next_owner(struct mm_struct *mm)
{
struct task_struct *c, *g, *p = current;
retry:
/*
* If the exiting or execing task is not the owner, it's
* someone else's problem.
*/
if (mm->owner != p)
return;
/*
* The current owner is exiting/execing and there are no other
* candidates. Do not leave the mm pointing to a possibly
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
WRITE_ONCE(mm->owner, NULL);
return;
}
read_lock(&tasklist_lock);
/*
* Search in the children
*/
list_for_each_entry(c, &p->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search in the siblings
*/
list_for_each_entry(c, &p->real_parent->children, sibling) {
if (c->mm == mm)
goto assign_new_owner;
}
/*
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
if (c->mm == mm)
goto assign_new_owner;
if (c->mm)
break;
}
}
read_unlock(&tasklist_lock);
/*
* We found no owner yet mm_users > 1: this implies that we are
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
WRITE_ONCE(mm->owner, NULL);
return;
assign_new_owner:
BUG_ON(c == p);
get_task_struct(c);
/*
* The task_lock protects c->mm from changing.
* We always want mm->owner->mm == mm
*/
task_lock(c);
/*
* Delay read_unlock() till we have the task_lock()
* to ensure that c does not slip away underneath us
*/
read_unlock(&tasklist_lock);
if (c->mm != mm) {
task_unlock(c);
put_task_struct(c);
goto retry;
}
WRITE_ONCE(mm->owner, c);
lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
#endif /* CONFIG_MEMCG */
/*
* Turn us into a lazy TLB process if we
* aren't already..
*/
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
exit_mm_release(current, mm);
if (!mm)
return;
mmap_read_lock(mm);
mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
/*
* When a thread stops operating on an address space, the loop
* in membarrier_private_expedited() may not observe that
* tsk->mm, and the loop in membarrier_global_expedited() may
* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
* rq->membarrier_state, so those would not issue an IPI.
* Membarrier requires a memory barrier after accessing
* user-space memory, before clearing tsk->mm or the
* rq->membarrier_state.
*/
smp_mb__after_spinlock();
local_irq_disable();
current->mm = NULL;
membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
local_irq_enable();
task_unlock(current);
mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
exit_oom_victim();
}
static struct task_struct *find_alive_thread(struct task_struct *p)
{
struct task_struct *t;
for_each_thread(p, t) {
if (!(t->flags & PF_EXITING))
return t;
}
return NULL;
}
static struct task_struct *find_child_reaper(struct task_struct *father,
struct list_head *dead)
__releases(&tasklist_lock)
__acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *reaper = pid_ns->child_reaper;
struct task_struct *p, *n;
if (likely(reaper != father))
return reaper;
reaper = find_alive_thread(father);
if (reaper) {
pid_ns->child_reaper = reaper;
return reaper;
}
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
return father;
}
/*
* When we die, we re-parent all our children, and try to:
* 1. give them to another thread in our thread group, if such a member exists
* 2. give it to the first ancestor process which prctl'd itself as a
* child_subreaper for its children (like a service manager)
* 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father,
struct task_struct *child_reaper)
{
struct task_struct *thread, *reaper;
thread = find_alive_thread(father);
if (thread)
return thread;
if (father->signal->has_child_subreaper) {
unsigned int ns_level = task_pid(father)->level;
/*
* Find the first ->is_child_subreaper ancestor in our pid_ns.
* We can't check reaper != child_reaper to ensure we do not
* cross the namespaces, the exiting parent could be injected
* by setns() + fork().
* We check pid->level, this is slightly more efficient than
* task_active_pid_ns(reaper) != task_active_pid_ns(father).
*/
for (reaper = father->real_parent;
task_pid(reaper)->level == ns_level;
reaper = reaper->real_parent) {
if (reaper == &init_task)
break;
if (!reaper->signal->is_child_subreaper)
continue;
thread = find_alive_thread(reaper);
if (thread)
return thread;
}
}
return child_reaper;
}
/*
* Any that need to be release_task'd are put on the @dead list.
*/
static void reparent_leader(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
if (unlikely(p->exit_state == EXIT_DEAD))
return;
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
if (do_notify_parent(p, p->exit_signal)) {
p->exit_state = EXIT_DEAD;
list_add(&p->ptrace_entry, dead);
}
}
kill_orphaned_pgrp(p, father);
}
/*
* This does two things:
*
* A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
static void forget_original_parent(struct task_struct *father,
struct list_head *dead)
{
struct task_struct *p, *t, *reaper;
if (unlikely(!list_empty(&father->ptraced)))
exit_ptrace(father, dead);
/* Can drop and reacquire tasklist_lock */
reaper = find_child_reaper(father, dead);
if (list_empty(&father->children))
return;
reaper = find_new_reaper(father, reaper);
list_for_each_entry(p, &father->children, sibling) {
for_each_thread(p, t) {
RCU_INIT_POINTER(t->real_parent, reaper);
BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
if (likely(!t->ptrace))
t->parent = t->real_parent;
if (t->pdeath_signal)
group_send_sig_info(t->pdeath_signal,
SEND_SIG_NOINFO, t,
PIDTYPE_TGID);
}
/*
* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (!same_thread_group(reaper, father))
reparent_leader(father, p, dead);
}
list_splice_tail_init(&father->children, &reaper->children);
}
/*
* Send signals to all our closest relatives so that they know
* to properly mourn us..
*/
static void exit_notify(struct task_struct *tsk, int group_dead)
{
bool autoreap;
struct task_struct *p, *n;
LIST_HEAD(dead);
write_lock_irq(&tasklist_lock);
forget_original_parent(tsk, &dead);
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
/*
* sub-thread or delay_group_leader(), wake up the
* PIDFD_THREAD waiters.
*/
if (!thread_group_empty(tsk))
do_notify_pidfd(tsk);
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
!ptrace_reparented(tsk) ?
tsk->exit_signal : SIGCHLD;
autoreap = do_notify_parent(tsk, sig);
} else if (thread_group_leader(tsk)) {
autoreap = thread_group_empty(tsk) &&
do_notify_parent(tsk, tsk->exit_signal);
} else {
autoreap = true;
}
if (autoreap) {
tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
}
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
#ifdef CONFIG_DEBUG_STACK_USAGE
static void check_stack_usage(void)
{
static DEFINE_SPINLOCK(low_water_lock);
static int lowest_to_date = THREAD_SIZE;
unsigned long free;
free = stack_not_used(current);
if (free >= lowest_to_date)
return;
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
}
#else
static inline void check_stack_usage(void) {}
#endif
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
struct sighand_struct *sighand = tsk->sighand;
struct signal_struct *signal = tsk->signal;
spin_lock_irq(&sighand->siglock);
signal->quick_threads--;
if ((signal->quick_threads == 0) &&
!(signal->flags & SIGNAL_GROUP_EXIT)) {
signal->flags = SIGNAL_GROUP_EXIT;
signal->group_exit_code = code;
signal->group_stop_count = 0;
}
spin_unlock_irq(&sighand->siglock);
}
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
WARN_ON(irqs_disabled());
synchronize_group_exit(tsk, code);
WARN_ON(tsk->plug);
kcov_task_exit(tsk);
kmsan_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
user_events_exit(tsk);
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
acct_update_integrals(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
if (group_dead) {
/*
* If the last thread of global init has exited, panic
* immediately to get a useable coredump.
*/
if (unlikely(is_global_init(tsk)))
panic("Attempted to kill init! exitcode=0x%08x\n",
tsk->signal->group_exit_code ?: (int)code);
#ifdef CONFIG_POSIX_TIMERS
hrtimer_cancel(&tsk->signal->real_timer);
exit_itimers(tsk);
#endif
if (tsk->mm)
setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
}
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
exit_mm();
if (group_dead)
acct_process();
trace_sched_process_exit(tsk);
exit_sem(tsk);
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
if (group_dead)
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
* gets woken up by child-exit notifications.
*
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
flush_ptrace_hw_breakpoint(tsk);
exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
if (unlikely(current->pi_state_cache))
kfree(current->pi_state_cache);
#endif
/*
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
if (tsk->io_context)
exit_io_context(tsk);
if (tsk->splice_pipe)
free_pipe_info(tsk->splice_pipe);
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
exit_tasks_rcu_finish();
lockdep_free_task(tsk);
do_task_dead();
}
void __noreturn make_task_dead(int signr)
{
/*
* Take the task off the cpu after something catastrophic has
* happened.
*
* We can get here from a kernel oops, sometimes with preemption off.
* Start by checking for critical errors.
* Then fix up important state like USER_DS and preemption.
* Then do everything else.
*/
struct task_struct *tsk = current;
unsigned int limit;
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
if (unlikely(irqs_disabled())) {
pr_info("note: %s[%d] exited with irqs disabled\n",
current->comm, task_pid_nr(current));
local_irq_enable();
}
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
/*
* Every time the system oopses, if the oops happens while a reference
* to an object was held, the reference leaks.
* If the oops doesn't also leak memory, repeated oopsing can cause
* reference counters to wrap around (if they're not using refcount_t).
* This means that repeated oopsing can make unexploitable-looking bugs
* exploitable through repeated oopsing.
* To make sure this can't happen, place an upper bound on how often the
* kernel may oops without panic().
*/
limit = READ_ONCE(oops_limit);
if (atomic_inc_return(&oops_count) >= limit && limit)
panic("Oopsed too often (kernel.oops_limit is %d)", limit);
/*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
futex_exit_recursive(tsk);
tsk->exit_state = EXIT_DEAD;
refcount_inc(&tsk->rcu_users);
do_task_dead();
}
do_exit(signr);
}
SYSCALL_DEFINE1(exit, int, error_code)
{
do_exit((error_code&0xff)<<8);
}
/*
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
else {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
zap_other_threads(current);
}
spin_unlock_irq(&sighand->siglock);
}
do_exit(exit_code);
/* NOTREACHED */
}
/*
* this kills every thread in the thread group. Note that any externally
* wait4()-ing process will get the correct exit code - even if this
* thread is not the thread group leader.
*/
SYSCALL_DEFINE1(exit_group, int, error_code)
{
do_group_exit((error_code & 0xff) << 8);
/* NOTREACHED */
return 0;
}
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
return wo->wo_type == PIDTYPE_MAX ||
task_pid_type(p, wo->wo_type) == wo->wo_pid;
}
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return 0;
/*
* Wait for all children (clone and not) if __WALL is set or
* if it is traced by us.
*/
if (ptrace || (wo->wo_flags & __WALL))
return 1;
/*
* Otherwise, wait for clone children *only* if __WCLONE is set;
* otherwise, wait for non-clone children *only*.
*
* Note: a "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD, or a non-leader thread which
* we can only see if it is traced by us.
*/
if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
return 0;
return 1;
}
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
*/
state = (ptrace_reparented(p) && thread_group_leader(p)) ?
EXIT_TRACE : EXIT_DEAD;
if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
return 0;
/*
* We own this thread, nobody else can reap it.
*/
read_unlock(&tasklist_lock);
sched_annotate_sleep();
/*
* Check thread_group_leader() to exclude the traced sub-threads.
*/
if (state == EXIT_DEAD && thread_group_leader(p)) {
struct signal_struct *sig = p->signal;
struct signal_struct *psig = current->signal;
unsigned long maxrss;
u64 tgutime, tgstime;
/*
* The resource counters for the group leader are in its
* own task_struct. Those for dead threads in the group
* are in its signal_struct, as are those for the child
* processes it has previously reaped. All these
* accumulate in the parent's signal_struct c* fields.
*
* We don't bother to take a lock here to protect these
* p->signal fields because the whole thread group is dead
* and nobody can change them.
*
* psig->stats_lock also protects us from our sub-threads
* which can reap other children at the same time.
*
* We use thread_group_cputime_adjusted() to get times for
* the thread group, which consolidates times for all threads
* in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
write_seqlock_irq(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
p->maj_flt + sig->maj_flt + sig->cmaj_flt;
psig->cnvcsw +=
p->nvcsw + sig->nvcsw + sig->cnvcsw;
psig->cnivcsw +=
p->nivcsw + sig->nivcsw + sig->cnivcsw;
psig->cinblock +=
task_io_get_inblock(p) +
sig->inblock + sig->cinblock;
psig->coublock +=
task_io_get_oublock(p) +
sig->oublock + sig->coublock;
maxrss = max(sig->maxrss, sig->cmaxrss);
if (psig->cmaxrss < maxrss)
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
write_sequnlock_irq(&psig->stats_lock);
}
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
/* We dropped tasklist, ptracer could die and untrace */
ptrace_unlink(p);
/* If parent wants a zombie, don't release it now */
state = EXIT_ZOMBIE;
if (do_notify_parent(p, p->exit_signal))
state = EXIT_DEAD;
p->exit_state = state;
write_unlock_irq(&tasklist_lock);
}
if (state == EXIT_DEAD)
release_task(p);
out_info:
infop = wo->wo_info;
if (infop) {
if ((status & 0x7f) == 0) {
infop->cause = CLD_EXITED;
infop->status = status >> 8;
} else {
infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
infop->status = status & 0x7f;
}
infop->pid = pid;
infop->uid = uid;
}
return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return &p->signal->group_exit_code;
}
return NULL;
}
/**
* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
* @wo: wait options
* @ptrace: is the wait for ptrace
* @p: task to wait for
*
* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
*
* CONTEXT:
* read_lock(&tasklist_lock), which is released if return value is
* non-zero. Also, grabs and releases @p->sighand->siglock.
*
* RETURNS:
* 0 if wait condition didn't exist and search for other wait conditions
* should continue. Non-zero return, -errno on failure and @p's pid on
* success, implies that tasklist_lock is released and wait condition
* search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
struct waitid_info *infop;
int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
/*
* Traditionally we see ptrace'd stopped tasks regardless of options.
*/
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
if (!task_stopped_code(p, ptrace))
return 0;
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
p_code = task_stopped_code(p, ptrace);
if (unlikely(!p_code))
goto unlock_sig;
exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
return 0;
/*
* Now we are pretty sure this task is interesting.
* Make sure it doesn't get reaped out from under us while we
* give up the lock and then examine it below. We don't want to
* keep holding onto the tasklist_lock while we call getrusage and
* possibly take page faults for user memory.
*/
get_task_struct(p);
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
if (likely(!(wo->wo_flags & WNOWAIT)))
wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
if (infop) {
infop->cause = why;
infop->status = exit_code;
infop->pid = pid;
infop->uid = uid;
}
return pid;
}
/*
* Handle do_wait work for one task in a live, non-stopped state.
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
struct waitid_info *infop;
pid_t pid;
uid_t uid;
if (!unlikely(wo->wo_flags & WCONTINUED))
return 0;
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;
spin_lock_irq(&p->sighand->siglock);
/* Re-check with the lock held. */
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
uid = from_kuid_munged(current_user_ns(), task_uid(p));
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
if (wo->wo_rusage)
getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
put_task_struct(p);
infop = wo->wo_info;
if (!infop) {
wo->wo_stat = 0xffff;
} else {
infop->cause = CLD_CONTINUED;
infop->pid = pid;
infop->uid = uid;
infop->status = SIGCONT;
}
return pid;
}
/*
* Consider @p for a wait by @parent.
*
* -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
* then ->notask_error is 0 if @p is an eligible child,
* or still -ECHILD.
*/
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
/*
* We can race with wait_task_zombie() from another thread.
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, ptrace, p);
if (!ret)
return ret;
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
*/
if (likely(!ptrace))
wo->notask_error = 0;
return 0;
}
if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
* If it is traced by its real parent's group, just pretend
* the caller is ptrace_do_wait() and reap this child if it
* is zombie.
*
* This also hides group stop state from real parent; otherwise
* a single stop can be reported twice as group and ptrace stop.
* If a ptracer wants to distinguish these two events for its
* own children it should create a separate process which takes
* the role of real parent.
*/
if (!ptrace_reparented(p))
ptrace = 1;
}
/* slay zombie? */
if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
* A zombie ptracee is only visible to its ptracer.
* Notification and reaping will be cascaded to the
* real parent when the ptracer detaches.
*/
if (unlikely(ptrace) || likely(!p->ptrace))
return wait_task_zombie(wo, p);
}
/*
* Allow access to stopped/continued state via zombie by
* falling through. Clearing of notask_error is complex.
*
* When !@ptrace:
*
* If WEXITED is set, notask_error should naturally be
* cleared. If not, subset of WSTOPPED|WCONTINUED is set,
* so, if there are live subthreads, there are events to
* wait for. If all subthreads are dead, it's still safe
* to clear - this function will be called again in finite
* amount time once all the subthreads are released and
* will then return without clearing.
*
* When @ptrace:
*
* Stopped state is per-task and thus can't change once the
* target task dies. Only continued and exited can happen.
* Clear notask_error if WCONTINUED | WEXITED.
*/
if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
wo->notask_error = 0;
} else {
/*
* @p is alive and it's gonna stop, continue or exit, so
* there always is something to wait for.
*/
wo->notask_error = 0;
}
/*
* Wait for stopped. Depending on @ptrace, different stopped state
* is used and the two don't interact with each other.
*/
ret = wait_task_stopped(wo, ptrace, p);
if (ret)
return ret;
/*
* Wait for continued. There's only one continued state and the
* ptracer can consume it which can confuse the real parent. Don't
* use WCONTINUED from ptracer. You don't need or want it.
*/
return wait_task_continued(wo, p);
}
/*
* Do the work of do_wait() for one thread in the group, @tsk.
*
* -ECHILD should be in ->notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
* ->notask_error is 0 if there were any eligible children,
* or still -ECHILD.
*/
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
if (ret)
return ret;
}
return 0;
}
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
struct task_struct *p;
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
if (ret)
return ret;
}
return 0;
}
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
{
if (!eligible_pid(wo, p))
return false;
if ((wo->wo_flags & __WNOTHREAD) && wo->child_wait.private != p->parent)
return false;
return true;
}
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
int sync, void *key)
{
struct wait_opts *wo = container_of(wait, struct wait_opts,
child_wait);
struct task_struct *p = key;
if (pid_child_should_wake(wo, p))
return default_wake_function(wait, mode, sync, key);
return 0;
}
void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
__wake_up_sync_key(&parent->signal->wait_chldexit,
TASK_INTERRUPTIBLE, p);
}
static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
struct task_struct *target)
{
struct task_struct *parent =
!ptrace ? target->real_parent : target->parent;
return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
same_thread_group(current, parent));
}
/*
* Optimization for waiting on PIDTYPE_PID. No need to iterate through child
* and tracee lists to find the target task.
*/
static int do_wait_pid(struct wait_opts *wo)
{
bool ptrace;
struct task_struct *target;
int retval;
ptrace = false;
target = pid_task(wo->wo_pid, PIDTYPE_TGID);
if (target && is_effectively_child(wo, ptrace, target)) {
retval = wait_consider_task(wo, ptrace, target);
if (retval)
return retval;
}
ptrace = true;
target = pid_task(wo->wo_pid, PIDTYPE_PID);
if (target && target->ptrace &&
is_effectively_child(wo, ptrace, target)) {
retval = wait_consider_task(wo, ptrace, target);
if (retval)
return retval;
}
return 0;
}
long __do_wait(struct wait_opts *wo)
{
long retval;
/*
* If there is nothing that can match our criteria, just get out.
* We will clear ->notask_error to zero if we see any child that
* might later match our criteria, even if we are not able to reap
* it yet.
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
read_lock(&tasklist_lock);
if (wo->wo_type == PIDTYPE_PID) {
retval = do_wait_pid(wo);
if (retval)
return retval;
} else {
struct task_struct *tsk = current;
do {
retval = do_wait_thread(wo, tsk);
if (retval)
return retval;
retval = ptrace_do_wait(wo, tsk);
if (retval)
return retval;
if (wo->wo_flags & __WNOTHREAD)
break;
} while_each_thread(current, tsk);
}
read_unlock(&tasklist_lock);
notask:
retval = wo->notask_error;
if (!retval && !(wo->wo_flags & WNOHANG))
return -ERESTARTSYS;
return retval;
}
static long do_wait(struct wait_opts *wo)
{
int retval;
trace_sched_process_wait(wo->wo_pid);
init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
wo->child_wait.private = current;
add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
do {
set_current_state(TASK_INTERRUPTIBLE);
retval = __do_wait(wo);
if (retval != -ERESTARTSYS)
break;
if (signal_pending(current))
break;
schedule();
} while (1);
__set_current_state(TASK_RUNNING);
remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
return retval;
}
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
struct waitid_info *infop, int options,
struct rusage *ru)
{
unsigned int f_flags = 0;
struct pid *pid = NULL;
enum pid_type type;
if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
return -EINVAL;
switch (which) {
case P_ALL:
type = PIDTYPE_MAX;
break;
case P_PID:
type = PIDTYPE_PID;
if (upid <= 0)
return -EINVAL;
pid = find_get_pid(upid);
break;
case P_PGID:
type = PIDTYPE_PGID;
if (upid < 0)
return -EINVAL;
if (upid)
pid = find_get_pid(upid);
else
pid = get_task_pid(current, PIDTYPE_PGID);
break;
case P_PIDFD:
type = PIDTYPE_PID;
if (upid < 0)
return -EINVAL;
pid = pidfd_get_pid(upid, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
break;
default:
return -EINVAL;
}
wo->wo_type = type;
wo->wo_pid = pid;
wo->wo_flags = options;
wo->wo_info = infop;
wo->wo_rusage = ru;
if (f_flags & O_NONBLOCK)
wo->wo_flags |= WNOHANG;
return 0;
}
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
int options, struct rusage *ru)
{
struct wait_opts wo;
long ret;
ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);
if (ret)
return ret;
ret = do_wait(&wo);
if (!ret && !(options & WNOHANG) && (wo.wo_flags & WNOHANG))
ret = -EAGAIN;
put_pid(wo.wo_pid);
return ret;
}
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
infop, int, options, struct rusage __user *, ru)
{
struct rusage r;
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
if (err > 0) {
signo = SIGCHLD;
err = 0;
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
if (!infop)
return err;
if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
user_write_access_end();
return err;
Efault:
user_write_access_end();
return -EFAULT;
}
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
enum pid_type type;
long ret;
if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
/* -INT_MIN is not defined */
if (upid == INT_MIN)
return -ESRCH;
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
type = PIDTYPE_PGID;
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);
}
wo.wo_type = type;
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
ret = -EFAULT;
return ret;
}
int kernel_wait(pid_t pid, int *stat)
{
struct wait_opts wo = {
.wo_type = PIDTYPE_PID,
.wo_pid = find_get_pid(pid),
.wo_flags = WEXITED,
};
int ret;
ret = do_wait(&wo);
if (ret > 0 && wo.wo_stat)
*stat = wo.wo_stat;
put_pid(wo.wo_pid);
return ret;
}
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
int, options, struct rusage __user *, ru)
{
struct rusage r;
long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
if (err > 0) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
return err;
}
#ifdef __ARCH_WANT_SYS_WAITPID
/*
* sys_waitpid() remains for compatibility. waitpid() should be
* implemented by calling sys_wait4() from libc.a.
*/
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
return kernel_wait4(pid, stat_addr, options, NULL);
}
#endif
#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(wait4,
compat_pid_t, pid,
compat_uint_t __user *, stat_addr,
int, options,
struct compat_rusage __user *, ru)
{
struct rusage r;
long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
if (err > 0) {
if (ru && put_compat_rusage(&r, ru))
return -EFAULT;
}
return err;
}
COMPAT_SYSCALL_DEFINE5(waitid,
int, which, compat_pid_t, pid,
struct compat_siginfo __user *, infop, int, options,
struct compat_rusage __user *, uru)
{
struct rusage ru;
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
int signo = 0;
if (err > 0) {
signo = SIGCHLD;
err = 0;
if (uru) {
/* kernel_waitid() overwrites everything in ru */
if (COMPAT_USE_64BIT_TIME)
err = copy_to_user(uru, &ru, sizeof(ru));
else
err = put_compat_rusage(&ru, uru);
if (err)
return -EFAULT;
}
}
if (!infop)
return err;
if (!user_write_access_begin(infop, sizeof(*infop)))
return -EFAULT;
unsafe_put_user(signo, &infop->si_signo, Efault);
unsafe_put_user(0, &infop->si_errno, Efault);
unsafe_put_user(info.cause, &infop->si_code, Efault);
unsafe_put_user(info.pid, &infop->si_pid, Efault);
unsafe_put_user(info.uid, &infop->si_uid, Efault);
unsafe_put_user(info.status, &infop->si_status, Efault);
user_write_access_end();
return err;
Efault:
user_write_access_end();
return -EFAULT;
}
#endif
/*
* This needs to be __function_aligned as GCC implicitly makes any
* implementation of abort() cold and drops alignment specified by
* -falign-functions=N.
*
* See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
*/
__weak __function_aligned void abort(void)
{
BUG();
/* if that doesn't kill us, halt */
panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);